This code uses the skip-gram model to train word2vec embeddings and then uses those embeddings to represent the input words to our model.

In [None]:
import numpy as np
import pandas as pd

from tensorflow import one_hot
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import text, sequence

from sklearn.metrics import roc_auc_score

In [None]:
# Vocabulary size: only the most frequent words are kept by the tokenizer.
MAX_NB_WORDS = 35000
# Every comment is padded or truncated to this many tokens.
MAX_SEQUENCE_LENGTH = 200
# Number of times to duplicate identity_hate rows.
# NOTE(review): not referenced by any of the code visible in this file.
repeats = 1
# Dimensionality of each word embedding vector.
EMBEDDING_DIM = 100

In [None]:
# Load the comments and replace missing values with empty strings.
train = pd.read_csv('spellcheck.csv')
train = train.replace(np.nan, '', regex=True)

# Fit the tokenizer on every comment in the file. Note that this happens
# before the train/validation split below, so the vocabulary is built from
# all rows.
X_train_text = train["comment_text"].values
tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(X_train_text.tolist())
word_index = tokenizer.word_index

# Per-word occurrence counts, in the tokenizer's own ordering.
count_list = list(tokenizer.word_counts.items())
counts = [occurrences for _, occurrences in count_list]

# Shuffle, then hold out the last tenth for validation and keep the first
# nine tenths for training.
train = train.sample(frac=1, random_state=13)
val = train.tail(len(train) // 10)
train = train.head(len(train) * 9 // 10)

# Oversample the two rare labels: append extra copies of positive rows
# (drawn from the training split only), then reshuffle.
train_idhate = train[train['identity_hate'] == 1].sample(n=500, random_state=13)
train_threat = train[train['threat'] == 1].sample(n=300, random_state=13)
train = pd.concat([train, train_idhate, train_threat])
train = train.sample(frac=1, random_state=25)

# Label columns, shared by the training and validation targets.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Training targets and fixed-length token-id sequences.
y_train = train[list_classes].to_numpy()
X_train = sequence.pad_sequences(
    tokenizer.texts_to_sequences(train["comment_text"].to_numpy()),
    maxlen=MAX_SEQUENCE_LENGTH,
)

# Validation targets and fixed-length token-id sequences.
y_val = val[list_classes].to_numpy()
X_val = sequence.pad_sequences(
    tokenizer.texts_to_sequences(val["comment_text"].to_numpy()),
    maxlen=MAX_SEQUENCE_LENGTH,
)

In [None]:
def word2vec(x, X, Y, width=2):
    """Append skip-gram (centre, context) token pairs from one sequence.

    For every position ``i`` in ``x`` the centre token ``x[i]`` is paired
    with each neighbour up to ``width`` positions away: first the following
    tokens (nearest first), then the preceding tokens (nearest first).
    A pair is emitted only when both tokens are truthy, so the padding id
    ``0`` never appears as a centre or as a context.

    The window is clamped to the bounds of ``x`` itself, so the function
    works for a sequence of any length (including empty) instead of relying
    on a global sequence-length constant.

    Args:
        x: Indexable sequence of integer token ids (``0`` means padding).
        X: List that receives the centre tokens; mutated in place.
        Y: List that receives the matching context tokens; mutated in place.
        width: Maximum distance between centre and context token.

    Returns:
        Always ``0``; the results are delivered through ``X`` and ``Y``.
    """
    n_tokens = len(x)
    for i in range(n_tokens):
        center = x[i]
        if not center:
            # A padding centre can never form a valid pair.
            continue

        # Clamp the window so it never runs off either end of the sequence.
        last_forward = min(i + width, n_tokens - 1)
        first_backward = max(i - width, 0)

        # Following neighbours, nearest first.
        for k in range(i + 1, last_forward + 1):
            context = x[k]
            if context:
                X.append(center)
                Y.append(context)

        # Preceding neighbours, nearest first.
        for k in range(i - 1, first_backward - 1, -1):
            context = x[k]
            if context:
                X.append(center)
                Y.append(context)
    return 0

In [None]:
# Gather skip-gram (centre, context) pairs from every padded training
# comment, using a window of 2 tokens on each side.
w2vX = []
w2vY = []
for padded_comment in X_train:
    word2vec(padded_comment, w2vX, w2vY, 2)

# Centre tokens become a column vector (one token per sample); context
# tokens stay a flat array of integer labels.
w2vX = np.array(w2vX).reshape(-1, 1)
w2vY = np.array(w2vY)

In [None]:
epochs = 2
batch_size = 256

# Skip-gram network: an embedding lookup for the centre token followed by a
# softmax over the whole vocabulary that predicts the context token.
word2vec_model = keras.Sequential([
    layers.Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=1, mask_zero=True),
    layers.Dense(MAX_NB_WORDS, activation='softmax'),
])

# The context labels are integer ids, hence the sparse loss.
word2vec_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# NOTE(review): with epochs=2 an early-stopping patience of 3 can never
# trigger; the callback is effectively inert here.
history = word2vec_model.fit(
    w2vX,
    w2vY,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.05,
    callbacks=[
        keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001),
    ],
)

In [None]:
# Drop the skip-gram training arrays; they are no longer needed once the
# embeddings have been trained.
del w2vX, w2vY

After training the word2vec embeddings, we can use them as the first layer of our LSTM model.

In [None]:
epochs = 10
batch_size = 256

# Embedding layer initialised with the weights learned by the word2vec model.
embedding_layer = layers.Embedding(
    MAX_NB_WORDS,
    EMBEDDING_DIM,
    weights=word2vec_model.layers[0].get_weights(),
    input_length=MAX_SEQUENCE_LENGTH,
)

# NOTE(review): `dropout` and `rnn_size` are not assigned anywhere in the
# code visible in this file; they must be defined before this cell runs.
# Classifier: embeddings -> spatial dropout -> bidirectional LSTM ->
# max-pool over time -> six independent sigmoid outputs (one per label).
model = keras.Sequential([
    embedding_layer,
    layers.SpatialDropout1D(dropout),
    layers.Bidirectional(layers.LSTM(int(rnn_size), return_sequences=True)),
    layers.GlobalMaxPooling1D(),
    layers.Dense(6, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['binary_accuracy', 'AUC'],
)

# patience=0 stops training at the first epoch whose validation loss fails
# to improve by at least min_delta; the weights of that last epoch are kept
# because restore_best_weights is False.
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_val, y_val),
    callbacks=[
        keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=0,
            min_delta=0.0030,
            restore_best_weights=False,
        ),
    ],
)

# ROC AUC of the predictions on the validation split.
out = roc_auc_score(y_val, model.predict(X_val))
print(out)