In [236]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

# import helper functions
%run -i helper_functions.py

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,GRU
from keras.layers import Bidirectional, GlobalMaxPool1D,GlobalMaxPool2D,Flatten,SpatialDropout1D,Conv1D,GlobalMaxPooling1D,GlobalAveragePooling1D
from keras.layers import concatenate
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers, Sequential
from tensorflow.keras.optimizers import Adam

# Read files

In [2]:
# Load the engineered feature tables and the exported GloVe embedding table.
# (`embedding` is not referenced again in the visible part of this notebook.)
train_features, test_features, embedding = map(
    pd.read_csv,
    (
        "./Data/selected_train.csv",
        "./Data/selected_test.csv",
        "./Data/embeddings_glove.csv",
    ),
)

In [3]:
# Cleaned training text, raw test comments, and the test-set labels.
cleaned_train, test, test_labels = map(
    pd.read_csv,
    ("./Data/cleaned_train.csv", "./Data/test.csv", "Data/test_labels.csv"),
)

# Quick sanity check on the label table: size first, then a preview.
print(test_labels.shape)
test_labels.head()

(153164, 7)


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


In [4]:
# Join text data and labels.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# pd.concat(axis=1) pairs rows purely by index position, so confirm that both
# files list the same ids in the same order before gluing them together.
assert test['id'].equals(test_labels['id']), \
    "test.csv and test_labels.csv are not aligned row-by-row on 'id'"
test_labeled = pd.concat([test, test_labels.drop('id', axis=1)], axis=1)

# Keep only rows in which none of the six labels is -1
# (presumably the marker for rows excluded from scoring -- confirm against the data source).
masking = (test_labeled[label_cols] != -1).all(axis=1)
test_labeled = test_labeled[masking].reset_index(drop=True)

# data_cleaning comes from helper_functions.py; `clean_text` is read from its result later on.
test_cleaned = data_cleaning(test_labeled)

removing noise
further cleaning the text


This notebook mainly tries out Bidirectional-LSTM, LSTM, and GRU models.
Most of the LSTM models are trained on the text only.

# Bi-LSTM on clean text

## Define parameters

In [5]:
# Shared settings for tokenisation, padding and the embedding layer.
maxlen = 100            # each comment is padded / truncated to this many tokens
max_features = 20_000   # vocabulary cap handed to the Tokenizer (rows of the embedding matrix)
embed_size = 50         # width of one word vector

## Tokenize and sequence train and test clean texts

In [40]:

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Targets: one binary column per label.
train_y = train_features[label_cols].values
test_y = test_cleaned[label_cols].values

# An empty cleaned comment comes back from read_csv as NaN (a float), which the
# Tokenizer cannot lower-case; normalise everything to str first.
train_text = cleaned_train['clean_text'].fillna('').astype(str)
test_text = test_cleaned['clean_text'].fillna('').astype(str)

# Text and labels come from two different CSV files; they must describe the same rows.
assert len(train_text) == len(train_y), "cleaned_train and train_features differ in row count"
assert len(test_text) == len(test_y)

# Vectorize text + Prepare GloVe Embedding
# The vocabulary is learned from the training text only, then applied to both splits.
tokenizer = Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(list(train_text))

train_x = pad_sequences(tokenizer.texts_to_sequences(train_text), maxlen=maxlen)
test_x = pad_sequences(tokenizer.texts_to_sequences(test_text), maxlen=maxlen)


## Embedding matrix according to GloVe (How to extract from ours?)

In [7]:
EMBEDDING_FILE = "Data/glove.6B.50d.txt"


def get_coefs(word, *arr):
    """Turn one split GloVe line into a ``(word, float32 vector)`` pair."""
    return word, np.asarray(arr, dtype='float32')


# Read the file inside a context manager so the handle is closed, and pin the
# encoding so the result does not depend on the platform default.
with open(EMBEDDING_FILE, encoding='utf-8') as glove_file:
    embeddings_index = dict(get_coefs(*line.strip().split()) for line in glove_file)

# np.stack needs a real sequence; passing a dict view is deprecated / rejected.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

In [9]:
# Build the weight matrix used to initialise the Embedding layer.
word_index = tokenizer.word_index

# Tokenizer ids start at 1 (0 is the padding value), so a vocabulary of N words
# needs N + 1 rows. Without the "+ 1" the loop below indexes one past the end
# whenever the vocabulary is smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)

# Rows for words missing from GloVe keep a random draw with GloVe's overall mean/std.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue  # beyond the vocabulary cap
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## Hyperparameter tuning

### Define initial model

In [151]:
import tensorflow as tf
from tensorflow import keras
import kerastuner as kt
def build_model(hp):          #hp means hyper parameters
    """Build and compile the tunable Bi-LSTM classifier used by the Keras Tuner.

    Args:
        hp: hyper-parameter container supplied by the tuner. Two values are
            drawn from it: ``num_of_neurons1`` (width of the hidden Dense
            layer, 20-60 in steps of 10) and ``learning_rate`` (1e-2, 1e-3 or
            1e-4 for Adam).

    Returns:
        A compiled model taking ``maxlen`` token ids and producing six
        sigmoid outputs.
    """
    inp = Input(shape=(maxlen,))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    x = GlobalMaxPool1D()(x)
    # Tunable width of the hidden layer.
    x = Dense(units=hp.Int('num_of_neurons1', min_value=20, max_value=60, step=10), activation="relu")(x)
    x = Dropout(0.1)(x)
    # Output layer: one independent probability per output unit.
    x = Dense(6, activation="sigmoid")(x)

    model = Model(inputs=inp, outputs=x)
    model.compile(
        optimizer=keras.optimizers.Adam(hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
        loss='binary_crossentropy',
        # The 'accuracy' string is resolved from the output shape; with a
        # 6-wide output it does not resolve to per-label binary accuracy.
        # Request it explicitly, keeping the name 'accuracy' so the
        # 'accuracy' / 'val_accuracy' history keys and tuner objective still work.
        metrics=[keras.metrics.BinaryAccuracy(name='accuracy')],
    )
    return model

In [152]:
# Hyperband search over build_model's hyper-parameters. Trial state is kept in
# tuner/LSTM, so re-running this cell reloads an earlier search from that folder.
tuner = kt.Hyperband(build_model,
                     objective='val_accuracy', #avoid overfitting
                     max_epochs=5,
                     factor=3,
                     directory='tuner',
                     project_name='LSTM')
# Patience must be smaller than the 5-epoch budget, otherwise the callback can
# never trigger; stop after two epochs without a val_loss improvement.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)

INFO:tensorflow:Reloading Tuner from tuner/LSTM/tuner0.json


### Search

In [32]:
# Run the search, holding out the last 20% of the training rows for validation.
tuner.search(
    train_x,
    train_y,
    epochs=5,
    validation_split=0.2,
    callbacks=[stop_early],
)

# Keep the single best hyper-parameter set the tuner found.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('num_of_neurons1')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")

Trial 24 Complete [01h 29m 52s]
val_accuracy: 0.9939842224121094

Best val_accuracy So Far: 0.9940468668937683
Total elapsed time: 12h 56m 53s
INFO:tensorflow:Oracle triggered exit

The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is 20 and the optimal learning rate for the optimizer
is 0.001.



### Build model with best params

In [153]:
# Instantiate the hypermodel with the winning hyper-parameters and fit it for
# five epochs; `history` keeps the per-epoch metrics used later to pick an epoch count.
model_tuned = tuner.hypermodel.build(best_hps)
history = model_tuned.fit(
    train_x,
    train_y,
    batch_size=32,
    epochs=5,
    validation_split=0.1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [165]:
# Find the best model using different scoring
def build_best_model(score):
    """Retrain the tuned architecture for the epoch count that was best under `score`.

    Reads the notebook-level `history`, `tuner`, `best_hps`, `train_x` and
    `train_y`.

    Args:
        score: key into ``history.history`` (e.g. ``'val_accuracy'``,
            ``'accuracy'``, ``'val_loss'``).

    Returns:
        A freshly built model fitted for the selected number of epochs.

    Raises:
        KeyError: if `score` was not recorded in ``history.history``.
    """
    score_per_epoch = history.history[score]
    # Loss-type scores improve downwards; every other score improves upwards.
    pick_best = min if 'loss' in score else max
    best_epoch = score_per_epoch.index(pick_best(score_per_epoch)) + 1  # epochs are 1-based
    print('Best epoch: %d' % (best_epoch,))
    hypermodel = tuner.hypermodel.build(best_hps)

    # Retrain the model from scratch for exactly that many epochs.
    hypermodel.fit(train_x, train_y, epochs=best_epoch, validation_split=0.1, batch_size=32)
    return hypermodel

In [166]:
# Scores at or above the threshold are treated as label 1
def multi_label(predictions, threshold=0.5):
    """Convert predicted probabilities into hard 0/1 labels.

    Args:
        predictions: array-like of scores (one row per sample, one column per label).
        threshold: cut-off; a score greater than or equal to it becomes 1.
            Defaults to 0.5.

    Returns:
        Integer ndarray of the same shape as `predictions` holding 0s and 1s.
    """
    # One vectorised comparison instead of a Python loop over every score.
    return (np.asarray(predictions) >= threshold).astype(int)

#### Using validation accuracy

In [154]:
# Retrain for the epoch count that gave the highest validation accuracy in `history`.
hypermodel_val_accuracy = build_best_model('val_accuracy')

Best epoch: 1


<keras.callbacks.History at 0x7f87094b5550>

In [218]:
# Score the validation-accuracy model on the labelled test set: probabilities,
# then thresholded labels, then the metric report from helper_functions.py.
predictions_val_accuracy_prob = hypermodel_val_accuracy.predict(
    test_x,
    batch_size=1024,
    verbose=1,
)
y_pred_val_accuracy = multi_label(predictions_val_accuracy_prob)
get_evaluation_score(test_y, y_pred_val_accuracy, predictions_val_accuracy_prob)

Accuracy score:  0.8687986495357779
Precision score:  0.5582364744199816
Recall score:  0.737550006897503
F1 score:  0.612590956322527
Confusion matrix for label toxic:
[[53211  4677]
 [  738  5352]]
Confusion matrix for label severe_toxic:
[[63544    67]
 [  308    59]]
Confusion matrix for label obscene:
[[58467  1820]
 [  750  2941]]
Confusion matrix for label threat:
[[63767     0]
 [  211     0]]
Confusion matrix for label insult:
[[58952  1599]
 [ 1088  2339]]
Confusion matrix for label identity_hate:
[[63264     2]
 [  710     2]]


  _warn_prf(average, modifier, msg_start, len(result))


Logarithmic Loss:  0.29339151012522535
ROC AUC score:  0.970040392088635


#### Using training accuracy

In [159]:
# 'accuracy' is the metric recorded on the training split during fit (not on the
# test set); retrain for the epoch count where it peaked.
hypermodel_accuracy =build_best_model('accuracy')

Best epoch: 2
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f8714ed4e80>

In [219]:
# Same evaluation pipeline for the model selected via training accuracy.
predictions_accuracy_prob = hypermodel_accuracy.predict(
    test_x,
    batch_size=1024,
    verbose=1,
)
y_pred_accuracy = multi_label(predictions_accuracy_prob)
get_evaluation_score(test_y, y_pred_accuracy, predictions_accuracy_prob)

Accuracy score:  0.8744412141673701
Precision score:  0.5675640777774101
Recall score:  0.7634846185680784
F1 score:  0.6452080649904857
Confusion matrix for label toxic:
[[53621  4267]
 [  839  5251]]
Confusion matrix for label severe_toxic:
[[63362   249]
 [  253   114]]
Confusion matrix for label obscene:
[[58559  1728]
 [  827  2864]]
Confusion matrix for label threat:
[[63767     0]
 [  211     0]]
Confusion matrix for label insult:
[[58725  1826]
 [  860  2567]]
Confusion matrix for label identity_hate:
[[63106   160]
 [  439   273]]


  _warn_prf(average, modifier, msg_start, len(result))


Logarithmic Loss:  0.2869767948817004
ROC AUC score:  0.9709464303607276


## Reference

### Baseline Bi-LSTM

In [225]:
# Baseline Bi-LSTM: the embedding is learned from scratch (no pretrained weights).
# Its width lives in a dedicated name so the shared `embed_size`, which must
# keep matching the 50-d GloVe `embedding_matrix`, is not overwritten for the
# cells that follow.
baseline_embed_size = 128
inp = Input(shape=(maxlen, ))
x = Embedding(max_features, baseline_embed_size)(inp)
x = Bidirectional(LSTM(50, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)
model_baseline = Model(inputs=inp, outputs=x)
model_baseline.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
model_baseline.fit(train_x,train_y, batch_size=32, epochs=2, validation_split=0.1)

2023-03-27 22:25:43.974805: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-27 22:25:43.979920: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-27 22:25:43.987948: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 1/2


2023-03-27 22:25:47.070846: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-27 22:25:47.079023: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-27 22:25:47.087417: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-03-27 22:31:21.101889: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-27 22:31:21.108709: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-27 22:31:21.119417: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/2


<keras.callbacks.History at 0x7f871946b3a0>

In [226]:
# Evaluate the baseline on the labelled test set.
y_pred_baseline_prob = model_baseline.predict(
    test_x,
    batch_size=1024,
    verbose=1,
)
y_pred_baseline = multi_label(y_pred_baseline_prob)
get_evaluation_score(test_y, y_pred_baseline, y_pred_baseline_prob)

2023-03-27 22:36:25.612053: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-27 22:36:25.616619: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-27 22:36:25.622575: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Accuracy score:  0.881693707211854
Precision score:  0.5631742457657742
Recall score:  0.6949924127465857


  _warn_prf(average, modifier, msg_start, len(result))


F1 score:  0.6192141609992836
Confusion matrix for label toxic:
[[54273  3615]
 [ 1020  5070]]
Confusion matrix for label severe_toxic:
[[63610     1]
 [  367     0]]
Confusion matrix for label obscene:
[[58845  1442]
 [  973  2718]]
Confusion matrix for label threat:
[[63767     0]
 [  211     0]]
Confusion matrix for label insult:
[[59272  1279]
 [ 1139  2288]]
Confusion matrix for label identity_hate:
[[63266     0]
 [  712     0]]
Logarithmic Loss:  0.2944793280776893
ROC AUC score:  0.9694666832859243


### Baseline + GloVe + dropout

In [211]:
# Baseline architecture initialised with the GloVe matrix, plus LSTM dropout.
inp = Input(shape=(maxlen,))
# Take the embedding dimensions from the matrix itself: the global `embed_size`
# is reassigned by other cells and may no longer match this matrix.
x = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1], weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)
model_refer = Model(inputs=inp, outputs=x)
model_refer.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_refer.fit(train_x,train_y, batch_size=32, epochs=2, validation_split=0.1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f86f92f0be0>

In [216]:
# Evaluate the GloVe-initialised baseline on the labelled test set.
y_pred_refer_prob = model_refer.predict(
    test_x,
    batch_size=1024,
    verbose=1,
)
y_pred_refer = multi_label(y_pred_refer_prob)
get_evaluation_score(test_y, y_pred_refer, y_pred_refer_prob)

Accuracy score:  0.8734877614179875
Precision score:  0.5852391957873924
Recall score:  0.7413436336046352
F1 score:  0.6406206372353421
Confusion matrix for label toxic:
[[53485  4403]
 [  772  5318]]
Confusion matrix for label severe_toxic:
[[63475   136]
 [  247   120]]
Confusion matrix for label obscene:
[[58708  1579]
 [  894  2797]]
Confusion matrix for label threat:
[[63765     2]
 [  211     0]]
Confusion matrix for label insult:
[[59227  1324]
 [ 1048  2379]]
Confusion matrix for label identity_hate:
[[63173    93]
 [  578   134]]
Logarithmic Loss:  0.28462313995254485
ROC AUC score:  0.9720963960118592


### RNN -> CNN 

 -- CNN -> RNN usually performs poorly, but the other direction (RNN -> CNN) works well.

Rebuild the embedding matrix with 300-dimensional GloVe vectors.

In [243]:
# NOTE: this cell rebinds EMBEDDING_FILE, embeddings_index, emb_mean and emb_std
# to the 300-d GloVe values; the 50-d versions from the earlier cell are gone afterwards.
EMBEDDING_FILE = "Data/glove.840B.300d.txt"


def get_coefs(word, *arr):
    """Turn one split GloVe line into a ``(word, float32 vector)`` pair."""
    return word, np.asarray(arr, dtype='float32')


# Split on single spaces only (not arbitrary whitespace), as before. The context
# manager closes the handle and the explicit encoding removes the dependence on
# the platform default.
with open(EMBEDDING_FILE, encoding='utf-8') as glove_file:
    embeddings_index = dict(get_coefs(*line.rstrip().rsplit(' ')) for line in glove_file)

# np.stack needs a real sequence; passing a dict view is deprecated / rejected.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

  if await self.run_code(code, result, async_=asy):


(-0.005838503, 0.48782194)

In [245]:
# NOTE: rebinds the global embed_size to 300 for the model built from this matrix;
# the 50-d `embedding_matrix` no longer matches `embed_size` after this cell runs.
embed_size = 300
word_index = tokenizer.word_index

# Tokenizer ids start at 1 (0 is the padding value), so a vocabulary of N words
# needs N + 1 rows. Without the "+ 1" the loop below indexes one past the end
# whenever the vocabulary is smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)

# Rows for words missing from GloVe keep a random draw with GloVe's overall mean/std.
embedding_matrix_cnn = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue  # beyond the vocabulary cap
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix_cnn[i] = embedding_vector

In [252]:
# Bi-LSTM followed by a 1-D convolution; average- and max-pooled features are
# concatenated before the sigmoid output layer.
sequence_input = Input(shape=(maxlen, ))
# Embedding dimensions are read from the 300-d matrix itself rather than from
# the mutable global `embed_size`.
x = Embedding(embedding_matrix_cnn.shape[0], embedding_matrix_cnn.shape[1],
              weights=[embedding_matrix_cnn], trainable=True)(sequence_input)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(LSTM(128, return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform")(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
# `concatenate` comes from keras.layers (imported in the notebook's import cell).
x = concatenate([avg_pool, max_pool])
# x = Dense(128, activation='relu')(x)
# x = Dropout(0.1)(x)
preds = Dense(6, activation="sigmoid")(x)
model_CNN = Model(sequence_input, preds)
model_CNN.compile(loss='binary_crossentropy',optimizer=Adam(learning_rate=1e-3),metrics=['accuracy'])
model_CNN.fit(train_x,train_y, batch_size=128, epochs=2, validation_split=0.1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f84895d5be0>

In [253]:
# Evaluate the LSTM -> CNN model on the labelled test set.
y_pred_CNN_prob = model_CNN.predict(
    test_x,
    batch_size=1024,
    verbose=1,
)
y_pred_CNN = multi_label(y_pred_CNN_prob)
get_evaluation_score(test_y, y_pred_CNN, y_pred_CNN_prob)

Accuracy score:  0.8762387070555503
Precision score:  0.5996383419281381
Recall score:  0.7658297696233963
F1 score:  0.6613835942374542
Confusion matrix for label toxic:
[[53560  4328]
 [  760  5330]]
Confusion matrix for label severe_toxic:
[[63415   196]
 [  236   131]]
Confusion matrix for label obscene:
[[58300  1987]
 [  727  2964]]
Confusion matrix for label threat:
[[63717    50]
 [  139    72]]
Confusion matrix for label insult:
[[59506  1045]
 [ 1120  2307]]
Confusion matrix for label identity_hate:
[[63133   133]
 [  413   299]]
Logarithmic Loss:  0.27760394065215416
ROC AUC score:  0.9738518647368922


# Bi-LSTM on text + numeric features

In [170]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Targets.
Y_train = train_features[label_cols].values
Y_test = test_cleaned[label_cols].values

# Numeric inputs: every column of the feature tables except the six labels.
X_train = train_features.drop(label_cols, axis=1).values
X_test = test_features.drop(label_cols, axis=1).values

# The numeric features are fed to the model next to the token sequences, so both
# splits need the same columns and X_test must have one row per labelled test comment.
# NOTE(review): this checks counts only -- that selected_test.csv rows are in the
# same order as test_cleaned is assumed, not verified here.
assert X_train.shape[1] == X_test.shape[1], "train/test feature tables have different columns"
assert len(X_test) == len(Y_test), "test_features and test_cleaned differ in row count"

In [189]:
import tensorflow as tf
from tensorflow import keras
import kerastuner as kt
from keras.layers import concatenate
def make_model(n_numeric_features=42):
    """Build the two-input model: stacked Bi-LSTM over text plus numeric features.

    Args:
        n_numeric_features: width of the numeric feature input. Defaults to 42,
            the value that was previously hard-coded.

    Returns:
        An uncompiled model taking ``[token_ids, numeric_features]`` and
        producing six sigmoid outputs.
    """
    # Text branch. Embedding dimensions come from the GloVe matrix itself,
    # because the global `embed_size` is reassigned by other cells and may not match it.
    input1 = Input(shape=(maxlen,))
    embed = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                      weights=[embedding_matrix])(input1)
    bi_lstm_1 = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embed)
    # Second Bi-LSTM returns only its final state, i.e. one vector per comment.
    bi_lstm_2 = Bidirectional(LSTM(50))(bi_lstm_1)

    # Numeric branch.
    input2 = Input(shape=(n_numeric_features,))

    # Merge the text vector with the numeric features.
    x = concatenate([bi_lstm_2, input2])
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(6, activation="sigmoid")(x)

    return Model(inputs=[input1, input2], outputs=[x])

In [192]:
# Build, compile and train the text + numeric-feature model.
combined_model = make_model()
combined_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
combined_model.fit(
    [train_x, X_train],
    train_y,
    batch_size=32,
    epochs=5,
    verbose=1,
    validation_split=0.1,
)

Epoch 1/5


2023-03-27 18:19:23.167981: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-27 18:19:23.171907: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-27 18:19:23.175312: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-03-27 18:31:02.120115: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-27 18:31:02.125525: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-27 18:31:02.134677: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f8701d10580>

In [223]:
# `batch_size` is not defined in any visible cell of this notebook; use the same
# literal as the other predict calls so the cell runs on a fresh kernel.
# NOTE(review): the scores recorded below are far worse than the text-only models
# (nearly every comment is flagged toxic) -- check that X_test rows line up with
# test_x and that the numeric features are on the same scale as in training.
predictions_combine_prob = combined_model.predict([test_x, X_test], batch_size=1024, verbose=1)
y_pred_combine = multi_label(predictions_combine_prob)
get_evaluation_score(test_y, y_pred_combine, predictions_combine_prob)

Accuracy score:  0.04535934227390666
Precision score:  0.33824720495435595
Recall score:  0.7477583114912402
F1 score:  0.3848662598937651
Confusion matrix for label toxic:
[[  244 57644]
 [   98  5992]]
Confusion matrix for label severe_toxic:
[[63560    51]
 [  361     6]]
Confusion matrix for label obscene:
[[58397  1890]
 [ 1113  2578]]
Confusion matrix for label threat:
[[63431   336]
 [  202     9]]
Confusion matrix for label insult:
[[58713  1838]
 [ 1392  2035]]
Confusion matrix for label identity_hate:
[[63045   221]
 [  491   221]]
Logarithmic Loss:  0.2998149186275459
ROC AUC score:  0.6886416074293975


# LSTM on clean text

In [197]:
# Unidirectional LSTM on the cleaned text, initialised with the GloVe matrix.
inp = Input(shape=(maxlen,))
# Take the embedding dimensions from the matrix itself: the global `embed_size`
# is reassigned by other cells and may no longer match this matrix.
x = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1], weights=[embedding_matrix])(inp)
x = LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)(x)
x = GlobalMaxPool1D()(x)
# hidden layer with a fixed width of 50 units
x = Dense(units=50, activation="relu")(x)
x = Dropout(0.1)(x)
# output layer
x = Dense(6, activation="sigmoid")(x)
# assemble, compile and train the model
model_lstm = Model(inputs=inp, outputs=x)
model_lstm.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model_lstm.fit(train_x,train_y, epochs=5, validation_split=0.1, batch_size = 32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f870117c2e0>

In [222]:
# NOTE(review): this does not rebuild the network -- model_lstm already carries the
# weights from the 5-epoch fit in the previous cell, so this call trains the same
# model for one further epoch before it is evaluated on the test set.
model_lstm.fit(train_x,train_y, epochs=1, validation_split=0.1, batch_size = 32)
y_pred_lstm_prob = model_lstm.predict(test_x, batch_size=1024, verbose=1)
y_pred_lstm = multi_label(y_pred_lstm_prob)
get_evaluation_score(test_y, y_pred_lstm,y_pred_lstm_prob)

Accuracy score:  0.876723248616712
Precision score:  0.594981715177469
Recall score:  0.7335494550972548
F1 score:  0.6494588399603419
Confusion matrix for label toxic:
[[53726  4162]
 [  972  5118]]
Confusion matrix for label severe_toxic:
[[63387   224]
 [  225   142]]
Confusion matrix for label obscene:
[[58713  1574]
 [  928  2763]]
Confusion matrix for label threat:
[[63714    53]
 [  176    35]]
Confusion matrix for label insult:
[[59393  1158]
 [ 1190  2237]]
Confusion matrix for label identity_hate:
[[63045   221]
 [  372   340]]
Logarithmic Loss:  0.2859243815956915
ROC AUC score:  0.9678198273545139


# GRU on clean text

More suitable for large data sets. Trains much faster than LSTM.

In [201]:
# GRU on the cleaned text, initialised with the GloVe matrix.
inp = Input(shape=(maxlen,))
# Take the embedding dimensions from the matrix itself: the global `embed_size`
# is reassigned by other cells and may no longer match this matrix.
x = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1], weights=[embedding_matrix])(inp)
x = GRU(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)(x)
x = GlobalMaxPool1D()(x)
# hidden layer with a fixed width of 50 units
x = Dense(units=50, activation="relu")(x)
x = Dropout(0.1)(x)
# output layer
x = Dense(6, activation="sigmoid")(x)
# assemble, compile and train the model
model_gru = Model(inputs=inp, outputs=x)
model_gru.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model_gru.fit(train_x,train_y, epochs=5, validation_split=0.1, batch_size = 32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f86fc8650a0>

In [220]:
# NOTE(review): this does not rebuild the network -- model_gru already carries the
# weights from the 5-epoch fit in the previous cell, so this call trains the same
# model for two further epochs before it is evaluated on the test set.
model_gru.fit(train_x,train_y, epochs=2, validation_split=0.1, batch_size = 32)
y_pred_gru_prob = model_gru.predict(test_x, batch_size=1024, verbose=1)
y_pred_gru= multi_label(y_pred_gru_prob)
get_evaluation_score(test_y, y_pred_gru,y_pred_gru_prob)

Accuracy score:  0.8773640939072807
Precision score:  0.5928174365348512
Recall score:  0.7079597185818733
F1 score:  0.6403746681007675
Confusion matrix for label toxic:
[[54014  3874]
 [ 1215  4875]]
Confusion matrix for label severe_toxic:
[[63331   280]
 [  219   148]]
Confusion matrix for label obscene:
[[58891  1396]
 [ 1095  2596]]
Confusion matrix for label threat:
[[63646   121]
 [  118    93]]
Confusion matrix for label insult:
[[59220  1331]
 [ 1211  2216]]
Confusion matrix for label identity_hate:
[[63060   206]
 [  376   336]]
Logarithmic Loss:  0.29300759715357977
ROC AUC score:  0.9622083085436123


# Export models & Predictions

## Models

In [254]:
# Persist every text-only model under its own directory name.
for export_dir, fitted_model in (
    ("tuned_val_accuracy", hypermodel_val_accuracy),
    ("tuned_accuracy", hypermodel_accuracy),
    ("Baseline", model_baseline),
    ("Baseline_GloVe", model_refer),
    ("LSTM_CNN", model_CNN),
    ("LSTM", model_lstm),
    ("GRU", model_gru),
):
    fitted_model.save(export_dir)

2023-03-28 00:03:17.154928: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'while/Placeholder_2' with dtype float and shape [?,50]
	 [[{{node while/Placeholder_2}}]]
2023-03-28 00:03:18.225557: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'while/Placeholder_2' with dtype float and shape [?,50]
	 [[{{node while/Placeholder_2}}]]
2023-03-28 00:03:18.353957: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and sh

INFO:tensorflow:Assets written to: tuned_val_accuracy/assets


INFO:tensorflow:Assets written to: tuned_val_accuracy/assets


## Predictions

In [263]:
# Write the predicted probabilities of two models to CSV, one column per label.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for csv_name, probabilities in (
    ("Baseline_GloVe_predictions.csv", y_pred_refer_prob),
    ("LSTM_CNN_predictions.csv", y_pred_CNN_prob),
):
    pd.DataFrame(probabilities, columns=labels).to_csv(csv_name)