In [1]:
import numpy as np
import pandas as pd

In [2]:
# Settings shared by the preprocessing, embedding and training cells.
MAX_SEQUENCE_LENGTH = 100     # passed as `maxlen` when padding and as the model input length
MAX_VOCAB_SIZE = 20_000       # passed as `num_words` to the tokenizer; caps the embedding rows
EMBEDDING_DIM = 100           # GloVe vector width; also selects the glove.6B.<dim>d.txt file
VALIDATION_SPLIT = 0.25       # forwarded to model.fit as `validation_split`
batch_size, epochs = 128, 20  # forwarded to model.fit

In [3]:
# Build a token -> vector lookup from the GloVe text file.
# Every line is split on whitespace: the first field is the token and the
# remaining fields are parsed as float32 vector components.
word2vec = {}
with open(f'glove.6B.{EMBEDDING_DIM}d.txt', encoding='utf-8') as f:
    print(f.name)
    word2vec.update(
        (fields[0], np.asarray(fields[1:], dtype='float32'))
        for fields in map(str.split, f)
    )
print('Total {} word vectors collected in the dictionary'.format(len(word2vec)))

glove.6B.100d.txt
Total 400000 word vectors collected in the dictionary


In [4]:
# Read the training set from the working directory and preview the first five rows.
train_df = pd.read_csv(filepath_or_buffer='train.csv')
train_df.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
# Show the full text of the first comment.
# Select the column by name and the row by position: the previous chained
# `iloc[0][1]` used an integer key on a string-labelled Series, a positional
# fallback that newer pandas versions deprecate.
train_df['comment_text'].iloc[0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [6]:
# Display the column labels of the training frame.
train_df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [9]:
# Count missing values per column.
train_df.isna().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [11]:
# Display the (rows, columns) shape of the training frame.
train_df.shape

(159571, 8)

In [8]:
# Label columns, in the order used for the target matrix.
label_list = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Comment texts as an array; missing entries are replaced by a placeholder string.
comments = train_df['comment_text'].fillna('NO_COMMENT').values

# Target matrix: one row per comment, one column per entry in label_list.
targets = train_df[label_list].values

In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Fit the tokenizer on all comments, keeping at most MAX_VOCAB_SIZE words
# when converting texts to integer sequences.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(comments)
sequences = tokenizer.texts_to_sequences(comments)
word2index = tokenizer.word_index

# Report the spread of sequence lengths before padding.
print('Maximum sequence length: {}'.format(max(map(len, sequences))))
print('Minimum sequence length: {}'.format(min(map(len, sequences))))

print('Total {} unique tokens found'.format(len(word2index)))

# Pad/truncate every sequence to MAX_SEQUENCE_LENGTH.
word_matrix = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of word matrix: {}'.format(word_matrix.shape))

Using TensorFlow backend.


Maximum sequence length: 1400
Minimum sequence length: 0
Total 210337 unique tokens found
Shape of word matrix: (159571, 100)


In [12]:
# Build the embedding weight matrix: row i holds the GloVe vector of the word
# whose tokenizer index is i. Rows stay all-zero for words without a GloVe vector.
word_count = min(MAX_VOCAB_SIZE, len(word2index)+1)
embedding_matrix = np.zeros(shape=(word_count, EMBEDDING_DIM))
for word, index in word2index.items():
    # Indices at or above the vocabulary cap are skipped.
    if index >= MAX_VOCAB_SIZE:
        continue
    vector = word2vec.get(word)
    if vector is not None:
        embedding_matrix[index] = vector

In [16]:
from keras.models import Input, Model
from keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, GlobalMaxPooling1D

In [14]:
# Embedding layer initialised from the GloVe matrix; trainable=False keeps
# these weights fixed during training.
embedding_layer = Embedding(
    input_dim=word_count,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    trainable=False,
    input_length=MAX_SEQUENCE_LENGTH,
)

In [17]:
# 1-D CNN over the embedded comment: three Conv1D layers, with max-pooling
# after the first two and global max-pooling after the third, followed by a
# dense layer and one sigmoid output per label in label_list.
input_comment = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(input_comment)

x = Conv1D(filters=128, kernel_size=3, activation='relu')(x)
x = MaxPooling1D(pool_size=3)(x)

x = Conv1D(filters=128, kernel_size=3, activation='relu')(x)
x = MaxPooling1D(pool_size=3)(x)

x = Conv1D(filters=128, kernel_size=3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)

x = Dense(units=128, activation='relu')(x)
output = Dense(units=len(label_list), activation='sigmoid')(x)

model = Model(inputs=input_comment, outputs=output)
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 100)          2000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 98, 128)           38528     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 32, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 30, 128)           49280     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 10, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 8, 128)            4928

In [18]:
# Train the model, passing VALIDATION_SPLIT as the validation fraction;
# the returned History object is kept in `history`.
history = model.fit(
    x=word_matrix,
    y=targets,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=VALIDATION_SPLIT,
)

Train on 119678 samples, validate on 39893 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
