This notebook loads pretrained GloVe word embeddings (`glove.6B.300d.txt`) and uses those embeddings to represent the input words to our CNN model.

In [None]:
import numpy as np
import pandas as pd

from tensorflow import one_hot
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import text, sequence

from sklearn.metrics import roc_auc_score

In [None]:
# Vocabulary cap handed to the tokenizer (`num_words`): only the most
# frequent words are kept.
MAX_NB_WORDS = 35_000

# Token length every comment is padded to (`maxlen` of pad_sequences).
MAX_SEQUENCE_LENGTH = 200

# Number of times to duplicate identity_hate.
# NOTE(review): not referenced by any of the visible cells.
repeats = 1

# Dimensionality of each word-embedding vector.
EMBED_SIZE = 300

First we load in the data. We are using the preprocessed data we created so we avoid redoing the processing every time. 

In [None]:
# Read the preprocessed comments; NaN cells are replaced by empty strings.
train = pd.read_csv('spellcheck.csv').replace(np.nan, '', regex=True)

# Fit the tokenizer on every comment in the file (done before the
# train/validation split below).
X_train_text = train["comment_text"].values
tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(list(X_train_text))
word_index = tokenizer.word_index

# Occurrence count of each word, in the tokenizer's own ordering.
count_list = list(tokenizer.word_counts.items())
counts = [occurrences for _word, occurrences in count_list]

# Shuffle once, then hold out the last tenth for validation and keep the
# first nine tenths for training. Both sizes are computed from the length
# of the full shuffled frame.
shuffled = train.sample(frac=1, random_state=13)
val = shuffled.tail(int(len(shuffled) * 1 / 10))
train = shuffled.head(int(len(shuffled) * 9 / 10))

# Oversample the two rare labels: 500 identity_hate rows and 300 threat rows
# are drawn from the training split and appended to it, then the enlarged
# training frame is reshuffled.
train_idhate = train[train['identity_hate'] == 1].sample(n=500, random_state=13)
train_threat = train[train['threat'] == 1].sample(n=300, random_state=13)
train = pd.concat([train, train_idhate, train_threat])
train = train.sample(frac=1, random_state=25)

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]


def _encode_comments(frame):
    """Convert a frame's `comment_text` column to padded integer sequences."""
    token_ids = tokenizer.texts_to_sequences(frame["comment_text"].values)
    return sequence.pad_sequences(token_ids, maxlen=MAX_SEQUENCE_LENGTH)


# Label matrices (one column per entry of list_classes) and model inputs.
y_train = train[list_classes].to_numpy()
X_train = _encode_comments(train)

y_val = val[list_classes].to_numpy()
X_val = _encode_comments(val)

In [None]:
# Parse the GloVe text file into a {word: vector} lookup. Each line is split
# on whitespace: the first field is the word, the remaining fields are the
# vector components (stored as float32).
embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open('glove.6B.300d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. a trailing newline): nothing to parse.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Build the weight matrix for the Embedding layer: row i holds the pretrained
# vector of the word whose tokenizer index is i. EMBED_SIZE comes from the
# configuration cell; it is deliberately not redefined here so that there is
# a single source of truth for the embedding width.
# The extra row (+1) leaves room for index 0, which word_index never assigns
# (Keras tokenizer indices start at 1).
embedding_matrix = np.zeros((len(word_index) + 1, EMBED_SIZE))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: its row stays all-zeros.
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
# Training schedule for the first phase (embedding weights frozen).
epochs = 10
batch_size = 256

# Embedding layer initialised from the pretrained matrix and kept frozen
# (trainable=False) during this first training phase.
embedding_layer = layers.Embedding(
    len(word_index) + 1,
    EMBED_SIZE,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

# 1-D CNN: three Conv1D(32, 3) + MaxPooling1D(2) stages, a final Conv1D
# reduced by global max pooling, light dropout, and a 6-unit sigmoid output
# (one unit per label column of y_train).
cnn_model = keras.Sequential([
    embedding_layer,
    layers.Conv1D(32, 3, activation='relu', padding='same'),
    layers.MaxPooling1D(2),
    layers.Conv1D(32, 3, activation='relu', padding='same'),
    layers.MaxPooling1D(2),
    layers.Conv1D(32, 3, activation='relu', padding='same'),
    layers.MaxPooling1D(2),
    layers.Conv1D(32, 3, activation='relu', padding='same'),
    layers.GlobalMaxPooling1D(),
    layers.Dropout(0.1),
    layers.Dense(6, activation='sigmoid'),
])
cnn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[keras.metrics.BinaryAccuracy(), keras.metrics.AUC(multi_label=True)],
)

cnn_model.summary()

# Stop as soon as val_loss fails to improve by at least 0.0030 in an epoch
# (patience=0); the weights of the last epoch are kept.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=0,
    min_delta=0.0030,
    restore_best_weights=False,
)

# Fix: the original called `model.fit`, but no `model` is defined in this
# notebook; the network built above is `cnn_model`.
history = cnn_model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_val, y_val),
    callbacks=[early_stop],
)


In [None]:
# Unfreeze the first layer of cnn_model (the Embedding layer, which was added
# first) so the fine-tuning step can update its weights. The model is
# re-compiled in the fine-tuning cell before training resumes.
unfrozen_layer = cnn_model.layers[0]
unfrozen_layer.trainable = True

Some rows of the embedding layer are still missing weights (i.e., they are all zeros), so we want to do some fine-tuning to fix that. Note that this does cause a little overfitting, but it may lead to some increased model robustness.

In [None]:
# Fine-tuning phase: one epoch at a very small learning rate so the
# now-trainable embedding rows are only nudged.
Second_epochs = 1
lr = 0.000001
opt = keras.optimizers.Adam(learning_rate=lr)

# Re-compile so the optimizer and the layer's updated `trainable` flag are
# picked up before training resumes.
cnn_model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=[keras.metrics.BinaryAccuracy(), keras.metrics.AUC(multi_label=True)],
)

# Validate on the dedicated hold-out set, as the first training phase does.
# The original used validation_split=0.1, which slices validation rows out
# of X_train; X_train contains duplicated (oversampled) rows, so such a
# slice can overlap with the rows being trained on.
history = cnn_model.fit(
    X_train,
    y_train,
    epochs=Second_epochs,
    batch_size=batch_size,
    validation_data=(X_val, y_val),
)

# Fix: the original called `model.predict`, but no `model` is defined in
# this notebook; the trained network is `cnn_model`.
out = roc_auc_score(y_val, cnn_model.predict(X_val))
print(out)