## Keras implementation of Yoon Kim's model for sentence classification

##### The following is a keras implementation of Yoon Kim's convolutional neural network model for sentence classification from the paper: https://arxiv.org/abs/1408.5882

In [1]:
import pandas as pd
import re
import numpy as np
from keras.preprocessing import sequence
from keras.regularizers import l2
from keras.models import Model
from keras.layers.merge import concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.layers import Dense, Flatten, GlobalMaxPooling1D, Activation, Dropout, GaussianNoise
from keras.layers import Embedding, Input, BatchNormalization, SpatialDropout1D, Conv1D
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from IPython.display import display
import itertools
from nltk.corpus import words
%matplotlib inline
import matplotlib.pyplot as plt

Using TensorFlow backend.


##### Determine dimension of embedding vector, max size of vocabulary and max length of sentence (crop the rest)

In [2]:
# Set parameters
# These three values fix the shape of everything downstream: the embedding
# matrix, the tokenizer vocabulary cap and the padded input width.
embed_size   = 50    # dimension of each word vector (the GloVe file loaded later is the 50d one)
max_features = 20000 # vocabulary cap handed to the Tokenizer (upper bound on rows in the embedding matrix)
maxlen       = 100   # number of tokens kept per comment; sequences are padded to this length

##### Load data...

In [3]:
# Load data
# The six binary label columns of the training set, in submission order.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

# Raw comment text for each split, plus the label matrix
# (one row per training comment, one column per entry of list_classes).
X_train, X_test = train["comment_text"], test["comment_text"]
y = train[list_classes].values

In [4]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Raw strings keep the regex escapes (\[ \] \|) away from Python's own
# string-escape processing, which otherwise warns about invalid escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')  # punctuation that is turned into a space
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')  # every character outside this whitelist is deleted
# NLTK's English stopword list, held as a set so the membership test in
# text_prepare is a hash lookup rather than a list scan.
STOPWORDS = set(stopwords.words('english'))
def text_prepare(text):
    """Normalise one raw comment before tokenisation.

    Steps, in order: lowercase, expand common English contractions, turn the
    punctuation matched by REPLACE_BY_SPACE_RE into spaces, delete every
    character matched by BAD_SYMBOLS_RE, and drop tokens found in STOPWORDS.

    text: a string

    return: the cleaned string, tokens separated by single spaces
    """
    text = text.lower()
    # Treat the typographic apostrophe like the ASCII one so a single set of
    # contraction rules covers both spellings (e.g. "i’m" and "i'm").
    text = text.replace("’", "'")

    # Contractions must be expanded BEFORE the symbol stripping below:
    # BAD_SYMBOLS_RE deletes apostrophes, after which none of these patterns
    # could ever match. Order matters inside the table too ("what's" before
    # "'s", "can't" before "n't").
    contractions = (
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
    )
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text)

    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # separators -> space
    text = BAD_SYMBOLS_RE.sub('', text)  # drop everything outside the whitelist
    # split()/join also collapses the extra spaces introduced above.
    return ' '.join(word for word in text.split() if word not in STOPWORDS)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/qizichen1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Clean every comment of both splits with text_prepare, keeping the original order.
list_sentences_train = [text_prepare(comment) for comment in X_train]
list_sentences_test = [text_prepare(comment) for comment in X_test]

In [6]:
# Drop the raw text Series; both names are rebound to padded arrays further down.
del X_train, X_test

##### Tokenize sentences, convert to integers and pad sentences shorter than 100 words (longer ones are truncated)

In [7]:
# Convert to integers and pad sentences
# The vocabulary is learned from the training text only; the same fitted
# tokenizer then encodes both splits.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list_sentences_train)

list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

# Bring every sequence to exactly `maxlen` entries, padding at the end.
X_train, X_test = (
    pad_sequences(token_ids, maxlen=maxlen, padding='post')
    for token_ids in (list_tokenized_train, list_tokenized_test)
)

##### Load "glove" pre-trained embeddings and construct vocabulary dictionary

In [8]:
# Read the glove word vectors (space delimited strings) into a dictionary from word->vector
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    word: the token (first field of a whitespace-split GloVe line)
    arr:  the remaining fields, i.e. the vector components as strings/numbers

    return: (word, 1-D float32 numpy array built from arr)
    """
    vector = np.array(arr, dtype='float32')
    return word, vector
# Read through a context manager so the file handle is closed as soon as the
# dictionary is built, and decode explicitly as UTF-8 rather than relying on
# the platform default encoding (which can fail on non-ASCII tokens).
with open('./data/glove.6B/glove.6B.50d.txt', encoding='utf8') as glove_file:
    embeddings_index = dict(get_coefs(*line.strip().split()) for line in glove_file)

##### Create embedding matrix and initialize space for new words not present in "glove"

In [9]:
# Create embeddings matrix
# Pool all pre-trained vectors to get the overall mean / std; these drive the
# random initialisation of rows for words that GloVe does not cover.
# np.stack needs a real sequence: a dict view is rejected by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

# Create embedding matrix using our vocabulary
word_index = tokenizer.word_index
# Tokenizer indices start at 1 (0 is the padding value), so a vocabulary of
# N words needs N + 1 rows; the cap at max_features still applies.
nb_words = min(max_features, len(word_index) + 1)

# Initialize embedding matrix
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

# Loop through each word and get its embedding vector
for word, i in word_index.items():
    if i >= nb_words:
        continue  # index lies beyond the vocabulary cap, so it has no row in the matrix
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Overwrite the random row with the pre-trained vector.
        embedding_matrix[i] = embedding_vector

##### Set the number of convolution filters and weight the label classes to counter class imbalance.
- 100 filters are used for each convolution. I.e. with a kernel size of 3, 100 tri-gram filters are learned, each representing a specific feature. With a kernel size of 4, 100 4-gram filters are learned, and so on.

In [10]:
# Initialize parameters
conv_filters = 100 # No. filters to use for each convolution

# Number of positive examples per label column.
label_counts = np.sum(y, axis=0)
# Weight of a label = (count of the most frequent label) / (its own count),
# so the most frequent label gets 1.0 and rarer labels get proportionally more.
# A label with zero positives would yield an infinite weight.
weight_vec = list(np.max(label_counts) / label_counts)
# Keyed by column position; derived from the vector itself instead of a
# hard-coded label count so it follows the width of y.
# NOTE(review): y is multi-label (several columns can be 1 in one row) --
# confirm that Keras applies `class_weight` to such targets as intended.
class_weight = dict(enumerate(weight_vec))

In [11]:
# Show the per-label weights (keys are column positions in list_classes).
class_weight

{0: 1.0,
 1: 9.58871473354232,
 2: 1.8101550479346669,
 3: 31.99581589958159,
 4: 1.941602132791672,
 5: 10.885409252669039}

##### Construct Convolutional Neural Network

In [117]:
from keras.constraints import max_norm

# Widths of the parallel convolutions, i.e. the n-gram sizes each branch looks at.
kernel_sizes = (1, 2, 3, 4)

inp = Input(shape=(X_train.shape[1],), dtype='int64')
# The vocabulary size is read from the weight matrix itself so the layer's
# input_dim can never disagree with the weights it is initialised from.
emb = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)
emb = Dropout(0.5)(emb)

# One branch per kernel size: convolution -> ReLU -> max over the sequence axis.
# All branches read the same (dropped-out) embedding output.
pooled_branches = []
for kernel_size in kernel_sizes:
    conv = Conv1D(filters=conv_filters, kernel_size=kernel_size)(emb)
    actv = Activation('relu')(conv)
    pooled_branches.append(GlobalMaxPooling1D()(actv))

# Gather all convolution branches into one feature vector
cnct = concatenate(pooled_branches, axis=1)
drp1 = Dropout(0.5)(cnct)
dns1 = Dense(32, activation='relu')(drp1)
# One sigmoid unit per label column of y; output kernel norm is capped at 3.
out = Dense(y.shape[1], kernel_constraint=max_norm(3.), activation='sigmoid')(dns1)

In [118]:
# Print the layer-by-layer summary; the Model built here is used only for
# display and is not kept (the model that gets trained is created separately).
Model(inputs=inp, outputs=out).summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_23 (InputLayer)           (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_23 (Embedding)        (None, 100, 50)      1000000     input_23[0][0]                   
__________________________________________________________________________________________________
dropout_34 (Dropout)            (None, 100, 50)      0           embedding_23[0][0]               
__________________________________________________________________________________________________
conv1d_75 (Conv1D)              (None, 100, 100)     5100        dropout_34[0][0]                 
__________________________________________________________________________________________________
conv1d_76 

In [119]:
# Compile
# Wrap the input/output tensors into a trainable model.
model = Model(inputs=inp, outputs=out)

# Adam with every hyper-parameter spelled out explicitly.
adam = Adam(
    lr=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    decay=0.0,
)

# Binary cross-entropy over the sigmoid outputs; accuracy is reported as a metric.
model.compile(
    optimizer=adam,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [120]:
# Estimate model
# 10% of the training rows are held out for validation; the per-label
# weights computed earlier are passed through `class_weight`.
model.fit(
    X_train,
    y,
    validation_split=0.1,
    epochs=3,
    batch_size=50,
    shuffle=True,
    class_weight=class_weight,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1a7fd1ac18>

##### Predict and finally submit

In [121]:
# Predict
# Score the padded test sequences; `preds` has one column per output unit,
# i.e. one per label in list_classes.
preds = model.predict(X_test)

In [122]:
# Create submission
# Id column taken from the test set, followed by one prediction column per label.
submid = pd.DataFrame({'id': test["id"]})
pred_frame = pd.DataFrame(preds, columns=list_classes)

# Column-wise concatenation (aligned on the row index), then written without the index.
submission = pd.concat([submid, pred_frame], axis=1)
submission.to_csv('conv_glove_simple_sub.csv', index=False)