In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

# import helper functions
%run -i helper_functions.py

from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,GRU,concatenate
from keras.layers import Bidirectional, GlobalMaxPool1D,GlobalMaxPool2D,Flatten,SpatialDropout1D,Conv1D,GlobalMaxPooling1D,GlobalAveragePooling1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers, Sequential
from tensorflow.keras.optimizers import Adam
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import KFold

2023-04-01 10:32:26.380495: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Read files

In [2]:
# Training table; the label columns are read from it when building train_y below.
train_features = pd.read_csv("./Data/selected_train.csv")
# Test-side counterpart of the table above.
test_features = pd.read_csv("./Data/selected_test.csv")
# Cleaned training comments; the 'clean_text' column is the model input below.
# NOTE(review): presumably row-aligned with train_features, because inputs and
# targets are later taken from the two frames independently — confirm.
cleaned_train = pd.read_csv("./Data/cleaned_train.csv")

In [3]:
# Target columns: one binary indicator per category.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Load the raw test comments and their labels.
test = pd.read_csv("./Data/test.csv")
test_labels = pd.read_csv("Data/test_labels.csv")
print(test_labels.shape)

# Join text and labels on the shared 'id' key instead of by row position, so a
# difference in row order between the two files cannot silently attach labels
# to the wrong comment. validate= raises if an id is duplicated on either side.
test_labeled = test.merge(test_labels, on='id', how='inner', validate='one_to_one')

# Keep only rows where no label equals -1 (rows with a -1 label are treated as
# unusable for evaluation).
masking = ~(test_labeled[labels] == -1).any(axis=1)
test_labeled = test_labeled[masking].reset_index(drop=True)

# Clean the text with the helper loaded from helper_functions.py.
test_cleaned = data_cleaning(test_labeled)

(153164, 7)
removing noise
further cleaning the text


This notebook mainly tries out Bidirectional-LSTM, LSTM, and GRU models.
Most of the LSTM models are trained on the sequenced texts only.

# Bi-LSTM on clean text

## Data Preparation

### Define parameters

In [4]:
embed_size = 50 # size of each word vector; matches the 50-d GloVe file loaded below
max_features = 20000 # vocabulary cap: passed to Tokenizer(num_words=...) and used as the Embedding input dimension
maxlen = 100 # number of tokens per comment after pad_sequences (comments are padded/truncated to this length)

### Tokenize and sequence train and test clean texts

In [5]:
# Targets: the label indicator columns of the training table.
train_y = train_features[labels].values

# Cleaned text. A comment that is empty after cleaning is read back from CSV as
# NaN (a float), which the Tokenizer cannot lower-case, so coerce every entry
# to a (possibly empty) string first.
train_x = cleaned_train['clean_text'].fillna('').astype(str)

test_x = test_cleaned['clean_text'].fillna('').astype(str)

# Fit the vocabulary on the training text only, then map both splits to
# integer sequences with the same tokenizer.
tokenizer = Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(list(train_x))

train_x = tokenizer.texts_to_sequences(train_x)
test_x = tokenizer.texts_to_sequences(test_x)

# Pad/truncate every sequence to exactly maxlen tokens.
train_x = pad_sequences(train_x, maxlen=maxlen)
test_x = pad_sequences(test_x, maxlen=maxlen)
test_y = test_cleaned[labels].values


### GloVe Embedding matrix in 50 dimensions

Refer to available embedding data: https://www.kaggle.com/datasets/watts2/glove6b50dtxt

In [6]:
EMBEDDING_FILE = "Data/glove.6B.50d.txt"

def get_coefs(word, *arr):
    """Return ``(word, vector)`` where ``vector`` is the remaining fields as a float32 array."""
    return word, np.asarray(arr, dtype='float32')

# Read the file inside a context manager so the handle is closed, decode it
# explicitly as UTF-8 (the platform default encoding is not guaranteed to be
# UTF-8), and skip blank lines, which would otherwise call get_coefs() with no word.
with open(EMBEDDING_FILE, encoding='utf-8') as glove_file:
    embeddings_index = dict(
        get_coefs(*line.strip().split()) for line in glove_file if line.strip()
    )

# np.stack requires a real sequence; a dict view is rejected by current NumPy.
all_embs = np.stack(list(embeddings_index.values()))
# Global mean/std of the pre-trained vectors, used to initialise words without a GloVe entry.
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

(0.020940498, 0.6441043)

In [7]:
word_index = tokenizer.word_index
# Tokenizer indices start at 1 (0 is the padding index), so holding the whole
# vocabulary takes len(word_index) + 1 rows. Without the +1, a vocabulary
# smaller than max_features would index one row past the end of the matrix.
# NOTE: the Embedding layers below are created with input_dim=max_features, so
# the pre-trained weights only fit when nb_words == max_features.
nb_words = min(max_features, len(word_index) + 1)
# Start from random vectors with the same mean/std as GloVe, then overwrite the
# rows of every word that has a pre-trained vector.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Guard against the actual row count rather than max_features.
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

### Transform predicted probability into labels

In [8]:
# Probabilities at or above the threshold are labelled 1, everything else 0.
def multi_label(predictions, threshold=0.5):
    """Convert predicted probabilities into binary multi-label indicators.

    Parameters
    ----------
    predictions : array-like
        Predicted probabilities, one row per sample and one column per label.
    threshold : float, default 0.5
        Cut-off; a probability ``>= threshold`` becomes 1, otherwise 0.

    Returns
    -------
    numpy.ndarray
        Integer array with the same shape as ``predictions``.
    """
    # One vectorised comparison replaces the per-sample Python loop; the
    # threshold is now actually used instead of a hard-coded 0.5.
    return (np.asarray(predictions) >= threshold).astype(int)

### K-fold cross validation function

In [9]:
# Arrays that cross_validate() splits into folds: the padded token sequences
# and the label matrix.
inputs = train_x
targets = train_y

# 5-fold cross validation with shuffling. The fixed seed makes the fold
# assignment identical on every run, so per-fold scores of different models
# are computed on the same splits.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [10]:
# Column names of the per-fold score table returned by cross_validate().
# NOTE(review): presumably in the same order as the values returned by
# get_return_val_score (defined in helper_functions.py) — confirm.
scores_label = ['accuracy','precision_weighted','recall_weighted','f1_weighted','log loss','ROC AUC']

In [14]:
def cross_validate(model_build,batch_size,no_epochs):
    """Train a fresh model on every fold and collect its held-out scores.

    Uses the notebook-level ``kfold``, ``inputs`` and ``targets``.

    Parameters
    ----------
    model_build : callable
        Zero-argument factory returning a new compiled model.
    batch_size : int
        Batch size used when fitting each fold.
    no_epochs : int
        Number of training epochs per fold.

    Returns
    -------
    pandas.DataFrame
        One row per fold, with the columns named in ``scores_label``.
    """
    per_fold_scores = []
    for fold_no, (fit_idx, holdout_idx) in enumerate(kfold.split(inputs, targets)):
        # A new model per fold so no weights carry over between folds.
        fold_model = model_build()

        print('------------------------------------------------------------------------')
        print(f'Training for fold {fold_no} ...')

        # Fit on the training part of this fold.
        fold_model.fit(
            inputs[fit_idx],
            targets[fit_idx],
            batch_size=batch_size,
            epochs=no_epochs,
            verbose=1,
        )

        # Score on the held-out part of this fold.
        holdout_prob = fold_model.predict(inputs[holdout_idx], batch_size=1024, verbose=1)
        holdout_label = multi_label(holdout_prob)
        per_fold_scores.append(
            get_return_val_score(targets[holdout_idx], holdout_label, holdout_prob)
        )

    return pd.DataFrame(per_fold_scores, columns=scores_label)


## Hyperparameter tuning

### Define initial model and tuner

In [151]:
import tensorflow as tf
from tensorflow import keras
import kerastuner as kt
def build_model(hp):
    """Build and compile the tunable Bi-LSTM classifier for Keras Tuner.

    Two hyperparameters are sampled from ``hp``: ``num_of_neurons1`` (width of
    the hidden dense layer, 20 to 60 in steps of 10) and ``learning_rate``
    (one of 1e-2, 1e-3, 1e-4).

    Returns the compiled Keras ``Model``.
    """
    # Sample the hyperparameters up front, in the same order as before.
    dense_units = hp.Int('num_of_neurons1', min_value=20, max_value=60, step=10)
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    tokens = Input(shape=(maxlen,))
    # Embedding initialised from the pre-built embedding matrix.
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)
    encoded = Bidirectional(
        LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
    )(embedded)
    pooled = GlobalMaxPool1D()(encoded)
    hidden = Dense(units=dense_units, activation="relu")(pooled)
    hidden = Dropout(0.1)(hidden)
    # One sigmoid output per label.
    probabilities = Dense(6, activation="sigmoid")(hidden)

    model = Model(inputs=tokens, outputs=probabilities)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [152]:
# Hyperband search over build_model. Trial results are stored under
# tuner/LSTM and reloaded from there when that directory already exists.
tuner = kt.Hyperband(build_model,
                     objective='val_accuracy', # rank trials by accuracy on the validation split
                     max_epochs=5,
                     factor=3,
                     directory='tuner',
                     project_name='LSTM')
# Patience must be smaller than the epoch budget, otherwise the callback can
# never fire: with at most 5 epochs per trial, patience=5 was a no-op.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)

INFO:tensorflow:Reloading Tuner from tuner/LSTM/tuner0.json


### Search

In [32]:
# Run the hyperparameter search; 20% of the training data is held out for
# validation within each trial, and stop_early is attached as a callback.
tuner.search(train_x,train_y,epochs=5, validation_split=0.2, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

# Report the two tuned values (dense-layer width and learning rate).
print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('num_of_neurons1')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")

Trial 24 Complete [01h 29m 52s]
val_accuracy: 0.9939842224121094

Best val_accuracy So Far: 0.9940468668937683
Total elapsed time: 12h 56m 53s
INFO:tensorflow:Oracle triggered exit

The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is 20 and the optimal learning rate for the optimizer
is 0.001.



### Build model with best params

In [153]:
# Build and train model with best params
# The per-epoch metrics kept in `history` are read by build_best_model() to
# decide how many epochs to retrain for.
model_tuned = tuner.hypermodel.build(best_hps)
history = model_tuned.fit(train_x,train_y, epochs=5, validation_split=0.1, batch_size = 32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [165]:
# Find the best epoch count for a given metric and retrain a fresh model for that long.
def build_best_model(score):
    """Retrain the tuned architecture for the epoch count that was best for ``score``.

    Reads the notebook-level ``history`` (from the 5-epoch training run),
    ``tuner``, ``best_hps``, ``train_x`` and ``train_y``.

    Parameters
    ----------
    score : str
        Key in ``history.history``, e.g. ``'val_accuracy'`` or ``'accuracy'``.
        Keys containing ``'loss'`` are minimised; all others are maximised.

    Returns
    -------
    The freshly built model, trained for the selected number of epochs.

    Raises
    ------
    KeyError
        If ``score`` was not recorded in ``history.history``.
    """
    metric_per_epoch = history.history[score]
    # Lower is better for loss metrics; taking max() there would pick the worst epoch.
    pick_best = min if 'loss' in score else max
    # +1 converts the 0-based list position into an epoch count.
    best_epoch = metric_per_epoch.index(pick_best(metric_per_epoch)) + 1
    print('Best epoch: %d' % (best_epoch,))
    hypermodel = tuner.hypermodel.build(best_hps)

    # Retrain the model
    hypermodel.fit(train_x,train_y, epochs=best_epoch, validation_split=0.1, batch_size = 32)
    return hypermodel

#### Using accuracy of validation data to build model

In [154]:
# Retrain for the epoch count with the highest validation accuracy in `history`.
hypermodel_val_accuracy = build_best_model('val_accuracy')

Best epoch: 1


<keras.callbacks.History at 0x7f87094b5550>

In [218]:
# Predict label probabilities on the labelled test set, binarise them with
# multi_label(), and print the evaluation report from helper_functions.py.
predictions_val_accuracy_prob = hypermodel_val_accuracy.predict(test_x,batch_size=1024, verbose=1)
y_pred_val_accuracy = multi_label(predictions_val_accuracy_prob)
get_evaluation_score(test_y, y_pred_val_accuracy,predictions_val_accuracy_prob ) 

Accuracy score:  0.8687986495357779
Precision score:  0.5582364744199816
Recall score:  0.737550006897503
F1 score:  0.612590956322527
Confusion matrix for label toxic:
[[53211  4677]
 [  738  5352]]
Confusion matrix for label severe_toxic:
[[63544    67]
 [  308    59]]
Confusion matrix for label obscene:
[[58467  1820]
 [  750  2941]]
Confusion matrix for label threat:
[[63767     0]
 [  211     0]]
Confusion matrix for label insult:
[[58952  1599]
 [ 1088  2339]]
Confusion matrix for label identity_hate:
[[63264     2]
 [  710     2]]


  _warn_prf(average, modifier, msg_start, len(result))


Logarithmic Loss:  0.29339151012522535
ROC AUC score:  0.970040392088635


##### Cross Validation

In [22]:
def get_model(dense_units=20, learning_rate=0.001):
    """Build and compile the tuned Bi-LSTM classifier with fixed hyperparameters.

    Same architecture as ``build_model``. The defaults are the values reported
    by the hyperparameter search, so a zero-argument call (as made by
    ``cross_validate``) reproduces the tuned model.

    Parameters
    ----------
    dense_units : int, default 20
        Width of the hidden dense layer.
    learning_rate : float, default 0.001
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    The compiled Keras ``Model``.
    """
    inp = Input(shape=(maxlen,))
    # Embedding initialised from the pre-built embedding matrix.
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    x = GlobalMaxPool1D()(x)
    # hidden dense layer
    x = Dense(units=dense_units, activation="relu")(x)
    x = Dropout(0.1)(x)
    # output layer: one sigmoid unit per label
    x = Dense(6, activation="sigmoid")(x)
    # compiling the model
    model = Model(inputs=inp, outputs=x)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate),loss='binary_crossentropy',metrics=['accuracy'])
    return model

In [24]:
# 5-fold cross validation of the tuned model: batch size 32, 1 epoch per fold.
hypermodel_val_accuracy_scores = cross_validate(get_model, 32, 1)

------------------------------------------------------------------------
Training for fold 1 ...
------------------------------------------------------------------------
Training for fold 2 ...
------------------------------------------------------------------------
Training for fold 3 ...
------------------------------------------------------------------------
Training for fold 4 ...
------------------------------------------------------------------------
Training for fold 5 ...


In [25]:
# Per-fold score table for the 1-epoch run (one row per fold).
hypermodel_val_accuracy_scores

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log loss,ROC AUC
0,0.918283,0.750791,0.674386,0.708539,0.282706,0.978676
1,0.922981,0.771447,0.675351,0.708671,0.280468,0.980468
2,0.91875,0.729379,0.697317,0.701587,0.282518,0.980627
3,0.92157,0.72354,0.688685,0.703765,0.281063,0.9803
4,0.918092,0.774881,0.655531,0.698066,0.291335,0.980409


In [26]:
# Mean of each metric across the folds.
hypermodel_val_accuracy_scores.mean(axis=0)

accuracy              0.919935
precision_weighted    0.750007
recall_weighted       0.678254
f1_weighted           0.704125
log loss              0.283618
ROC AUC               0.980096
dtype: float64

#### Using accuracy of train data to build model

In [159]:
# Retrain for the epoch count with the highest *training* accuracy in `history`.
# NOTE(review): selecting the epoch count on training accuracy does not guard
# against overfitting — confirm this comparison is intended.
hypermodel_accuracy =build_best_model('accuracy')

Best epoch: 2
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f8714ed4e80>

In [219]:
# Predict label probabilities on the labelled test set, binarise them with
# multi_label(), and print the evaluation report from helper_functions.py.
predictions_accuracy_prob = hypermodel_accuracy.predict(test_x,batch_size=1024, verbose=1)
y_pred_accuracy = multi_label(predictions_accuracy_prob)
get_evaluation_score(test_y, y_pred_accuracy,predictions_accuracy_prob) 

Accuracy score:  0.8744412141673701
Precision score:  0.5675640777774101
Recall score:  0.7634846185680784
F1 score:  0.6452080649904857
Confusion matrix for label toxic:
[[53621  4267]
 [  839  5251]]
Confusion matrix for label severe_toxic:
[[63362   249]
 [  253   114]]
Confusion matrix for label obscene:
[[58559  1728]
 [  827  2864]]
Confusion matrix for label threat:
[[63767     0]
 [  211     0]]
Confusion matrix for label insult:
[[58725  1826]
 [  860  2567]]
Confusion matrix for label identity_hate:
[[63106   160]
 [  439   273]]


  _warn_prf(average, modifier, msg_start, len(result))


Logarithmic Loss:  0.2869767948817004
ROC AUC score:  0.9709464303607276


##### Cross Validation

In [27]:
# 5-fold cross validation of the tuned model: batch size 32, 2 epochs per fold.
hypermodel_accuracy_scores = cross_validate(get_model, 32, 2)

------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/2
Epoch 2/2
------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/2
Epoch 2/2
------------------------------------------------------------------------
Training for fold 3 ...
Epoch 1/2
Epoch 2/2
------------------------------------------------------------------------
Training for fold 4 ...
Epoch 1/2
Epoch 2/2
------------------------------------------------------------------------
Training for fold 5 ...
Epoch 1/2
Epoch 2/2


In [28]:
# Per-fold score table for the 2-epoch run (one row per fold).
hypermodel_accuracy_scores

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log loss,ROC AUC
0,0.9222,0.815165,0.685314,0.719875,0.276402,0.982472
1,0.923827,0.752997,0.699233,0.719509,0.267611,0.981879
2,0.919158,0.801813,0.67806,0.720664,0.286807,0.981997
3,0.920442,0.771649,0.699274,0.721499,0.277304,0.983149
4,0.92414,0.823306,0.655829,0.718701,0.273626,0.982027


In [29]:
# Mean of each metric across the folds.
hypermodel_accuracy_scores.mean(axis=0)

accuracy              0.921953
precision_weighted    0.792986
recall_weighted       0.683542
f1_weighted           0.720049
log loss              0.276350
ROC AUC               0.982305
dtype: float64

## Reference

Several excellent problem solvers on Kaggle have published solutions to this problem. The models below were selected because their output shows better performance.

### Baseline Bi-LSTM

Refer to: https://www.kaggle.com/code/CVxTz/keras-bidirectional-lstm-baseline-lb-0-069?scriptVersionId=2188777

In [225]:
# Baseline Bi-LSTM: the embedding is 128-d and learned from scratch (no
# pre-trained weights are passed to the Embedding layer).
# NOTE(review): this rebinds the notebook-level `embed_size` (50 earlier). Any
# cell that pairs `embed_size` with the 50-d `embedding_matrix` would see 128
# if it is re-run after this cell.
embed_size = 128
inp = Input(shape=(maxlen, ))
x = Embedding(max_features, embed_size)(inp)
x = Bidirectional(LSTM(50, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
# One sigmoid output per label.
x = Dense(6, activation="sigmoid")(x)
model_baseline = Model(inputs=inp, outputs=x)
model_baseline.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
# Train for 2 epochs, holding out 10% of the training data for validation.
model_baseline.fit(train_x,train_y, batch_size=32, epochs=2, validation_split=0.1)

2023-03-27 22:25:43.974805: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-27 22:25:43.979920: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-27 22:25:43.987948: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 1/2


2023-03-27 22:25:47.070846: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-27 22:25:47.079023: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-27 22:25:47.087417: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-03-27 22:31:21.101889: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-27 22:31:21.108709: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-27 22:31:21.119417: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/2


<keras.callbacks.History at 0x7f871946b3a0>

In [226]:
# Predict label probabilities on the labelled test set, binarise them with
# multi_label(), and print the evaluation report from helper_functions.py.
y_pred_baseline_prob = model_baseline.predict(test_x, batch_size=1024, verbose=1)
y_pred_baseline = multi_label(y_pred_baseline_prob)
get_evaluation_score(test_y, y_pred_baseline,y_pred_baseline_prob)

2023-03-27 22:36:25.612053: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-27 22:36:25.616619: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-27 22:36:25.622575: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Accuracy score:  0.881693707211854
Precision score:  0.5631742457657742
Recall score:  0.6949924127465857


  _warn_prf(average, modifier, msg_start, len(result))


F1 score:  0.6192141609992836
Confusion matrix for label toxic:
[[54273  3615]
 [ 1020  5070]]
Confusion matrix for label severe_toxic:
[[63610     1]
 [  367     0]]
Confusion matrix for label obscene:
[[58845  1442]
 [  973  2718]]
Confusion matrix for label threat:
[[63767     0]
 [  211     0]]
Confusion matrix for label insult:
[[59272  1279]
 [ 1139  2288]]
Confusion matrix for label identity_hate:
[[63266     0]
 [  712     0]]
Logarithmic Loss:  0.2944793280776893
ROC AUC score:  0.9694666832859243


##### Cross Validation

In [12]:
def get_model():
    """Build and compile the baseline Bi-LSTM classifier.

    The embedding is 128-dimensional and trained from scratch. The model is
    compiled with binary cross-entropy, the 'adam' optimizer and accuracy.

    Returns the compiled Keras ``Model``.
    """
    embed_size = 128  # local value; the notebook-level embed_size is not touched

    tokens = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embed_size)(tokens)
    encoded = Bidirectional(LSTM(50, return_sequences=True))(embedded)
    pooled = GlobalMaxPool1D()(encoded)
    pooled = Dropout(0.1)(pooled)
    hidden = Dense(50, activation="relu")(pooled)
    hidden = Dropout(0.1)(hidden)
    # One sigmoid output per label.
    probabilities = Dense(6, activation="sigmoid")(hidden)

    baseline = Model(inputs=tokens, outputs=probabilities)
    baseline.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return baseline

In [15]:
# 5-fold cross validation of the baseline model: batch size 32, 2 epochs per fold.
baseline_scores = cross_validate(get_model, 32,2)

2023-03-31 19:02:40.703502: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-31 19:02:40.705703: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-31 19:02:40.706964: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

------------------------------------------------------------------------
Training for fold 0 ...
Epoch 1/2


2023-03-31 19:02:41.452910: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-31 19:02:41.454292: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-31 19:02:41.455820: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/2


2023-03-31 19:09:56.182401: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-31 19:09:56.183617: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-31 19:09:56.185069: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-03-31 19:10:05.732970: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-31 19:10:05.734180: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-31 19:10:05.735604: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/2


2023-03-31 19:10:06.234352: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-31 19:10:06.236456: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-31 19:10:06.238515: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/2


2023-03-31 19:17:07.141387: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-31 19:17:07.142960: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-31 19:17:07.144076: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-03-31 19:17:17.143757: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-31 19:17:17.145516: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-31 19:17:17.146664: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/2


2023-03-31 19:17:17.659032: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-31 19:17:17.660567: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-31 19:17:17.661808: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/2


2023-03-31 19:24:17.213727: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-31 19:24:17.215301: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-31 19:24:17.216575: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-03-31 19:24:26.160590: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-31 19:24:26.162535: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-31 19:24:26.164108: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

------------------------------------------------------------------------
Training for fold 3 ...
Epoch 1/2


2023-03-31 19:24:26.626765: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-31 19:24:26.628946: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-31 19:24:26.630462: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/2


2023-03-31 19:31:50.005526: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-31 19:31:50.007112: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-31 19:31:50.008259: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-03-31 19:31:59.409019: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-31 19:31:59.410075: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-31 19:31:59.412149: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

------------------------------------------------------------------------
Training for fold 4 ...
Epoch 1/2


2023-03-31 19:31:59.893794: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-31 19:31:59.895998: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-31 19:31:59.897533: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/2


2023-03-31 19:39:09.821192: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-31 19:39:09.822556: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-31 19:39:09.823876: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



In [16]:
# Display the baseline score table (one row per cross-validation fold in the output).
baseline_scores

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log loss,ROC AUC
0,0.921949,0.79441,0.655538,0.709975,0.282861,0.978156
1,0.920066,0.789504,0.653974,0.713975,0.289893,0.979424
2,0.924359,0.782852,0.671756,0.719105,0.272757,0.980271
3,0.92273,0.785992,0.665005,0.719448,0.283488,0.978786
4,0.918092,0.801615,0.681245,0.717854,0.275867,0.978328


In [17]:
# Column-wise mean: average each metric over all rows of the score table.
baseline_scores.mean(axis=0)

accuracy              0.921439
precision_weighted    0.790875
recall_weighted       0.665504
f1_weighted           0.716071
log loss              0.280973
ROC AUC               0.978993
dtype: float64

### Baseline model with GloVe embedding matrix

Improve on the baseline by initializing the embedding layer with a pre-trained GloVe embedding matrix and adding dropout. Reference: https://www.kaggle.com/code/jhoward/improved-lstm-baseline-glove-dropout/notebook

In [211]:
# Reference architecture: embedding seeded from `embedding_matrix` -> Bi-LSTM
# -> global max-pool -> small dense head with six sigmoid outputs.
inp = Input(shape=(maxlen,))
x = inp
for _layer in (
    Embedding(max_features, embed_size, weights=[embedding_matrix]),
    Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),
    GlobalMaxPool1D(),
    Dense(50, activation="relu"),
    Dropout(0.1),
    Dense(6, activation="sigmoid"),
):
    # Thread the tensor through the stack in declaration order.
    x = _layer(x)

model_refer = Model(inputs=inp, outputs=x)
model_refer.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# 10% of the training rows are held out for validation during fitting.
model_refer.fit(train_x, train_y, batch_size=32, epochs=2, validation_split=0.1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f86f92f0be0>

In [216]:
# Score the test sequences, binarize the probabilities with multi_label, and
# report metrics via get_evaluation_score (defined outside this section).
y_pred_refer_prob = model_refer.predict(test_x, batch_size=1024, verbose=1)
y_pred_refer = multi_label(y_pred_refer_prob)
get_evaluation_score(test_y, y_pred_refer,y_pred_refer_prob)

Accuracy score:  0.8734877614179875
Precision score:  0.5852391957873924
Recall score:  0.7413436336046352
F1 score:  0.6406206372353421
Confusion matrix for label toxic:
[[53485  4403]
 [  772  5318]]
Confusion matrix for label severe_toxic:
[[63475   136]
 [  247   120]]
Confusion matrix for label obscene:
[[58708  1579]
 [  894  2797]]
Confusion matrix for label threat:
[[63765     2]
 [  211     0]]
Confusion matrix for label insult:
[[59227  1324]
 [ 1048  2379]]
Confusion matrix for label identity_hate:
[[63173    93]
 [  578   134]]
Logarithmic Loss:  0.28462313995254485
ROC AUC score:  0.9720963960118592


##### Cross Validation

In [22]:
def get_model():
    """Build and compile a fresh Bi-LSTM classifier seeded with `embedding_matrix`.

    Reads the notebook-level `maxlen`, `max_features`, `embed_size` and
    `embedding_matrix`. Every call constructs new layers, so the returned
    model is untrained.

    Note: a later cell defines another function with the same name
    `get_model`; whichever cell ran last is the one in effect.

    Returns:
        A compiled Keras `Model` with one input of length `maxlen` and six
        sigmoid outputs.
    """
    tokens = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)
    encoded = Bidirectional(
        LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
    )(embedded)
    pooled = GlobalMaxPool1D()(encoded)
    hidden = Dense(50, activation="relu")(pooled)
    hidden = Dropout(0.1)(hidden)
    probabilities = Dense(6, activation="sigmoid")(hidden)

    classifier = Model(inputs=tokens, outputs=probabilities)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

In [23]:
# Cross-validate the model builder above. cross_validate is defined outside
# this section; the positional 32 and 2 are presumably batch size and epochs — confirm.
baseline_glove_scores = cross_validate(get_model, 32,2)

------------------------------------------------------------------------
Training for fold 0 ...
Epoch 1/2
Epoch 2/2
------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/2
Epoch 2/2
------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/2
Epoch 2/2
------------------------------------------------------------------------
Training for fold 3 ...
Epoch 1/2
Epoch 2/2
------------------------------------------------------------------------
Training for fold 4 ...
Epoch 1/2
Epoch 2/2


In [24]:
# Display the score table returned by the cross-validation run.
baseline_glove_scores

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log loss,ROC AUC
0,0.919066,0.77145,0.729597,0.748744,0.277894,0.982075
1,0.923169,0.82159,0.660183,0.727372,0.274133,0.982958
2,0.921288,0.815675,0.672527,0.729615,0.27837,0.982621
3,0.923639,0.774349,0.737703,0.752292,0.268568,0.981193
4,0.920066,0.782347,0.716602,0.732432,0.273222,0.98285


In [25]:
# Column-wise mean: average each metric over all rows of the score table.
baseline_glove_scores.mean(axis=0)

accuracy              0.921446
precision_weighted    0.793082
recall_weighted       0.703322
f1_weighted           0.738091
log loss              0.274437
ROC AUC               0.982339
dtype: float64

### Combine LSTM with CNN

Feeding CNN output into an RNN (CNN -> RNN) usually performs poorly, but the reverse order (RNN -> CNN) works well. Reference: https://www.kaggle.com/code/eashish/bidirectional-gru-with-convolution

In [9]:
# Bi-LSTM encoder followed by a 1-D convolution. The convolution output is
# average-pooled and max-pooled, and both summaries are concatenated before
# the six-unit sigmoid output layer.
sequence_input = Input(shape=(maxlen,))
x = Embedding(
    max_features,
    embed_size,
    weights=[embedding_matrix],
    trainable=True,
)(sequence_input)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(
    LSTM(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
)(x)
x = Conv1D(
    64,
    kernel_size=3,
    padding="valid",
    kernel_initializer="glorot_uniform",
)(x)

avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
preds = Dense(6, activation="sigmoid")(x)

model_CNN = Model(sequence_input, preds)
model_CNN.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=1e-3),
    metrics=['accuracy'],
)
model_CNN.fit(train_x, train_y, batch_size=128, epochs=2, validation_split=0.1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa901766ca0>

In [15]:
# Score the test sequences with the LSTM+CNN model, binarize with multi_label,
# and report the overall metrics (helper defined outside this section).
y_pred_CNN_prob= model_CNN.predict(test_x, batch_size=1024, verbose=1)
y_pred_CNN= multi_label(y_pred_CNN_prob)
get_overall_evaluation_score(test_y, 
                     y_pred_CNN)

Accuracy score:  0.8855700397011472
Precision score:  0.5914517377548998
Recall score:  0.7372051317423093
F1 score:  0.6490358184998823


##### Cross Validation

In [29]:
def get_model():
    """Build and compile a fresh Bi-LSTM + Conv1D classifier.

    Reads the notebook-level `maxlen`, `max_features`, `embed_size` and
    `embedding_matrix`. Every call constructs new layers, so the returned
    model is untrained.

    Note: this definition reuses the name `get_model` from an earlier cell
    and replaces it once this cell has been run.

    Returns:
        A compiled Keras `Model` with one input of length `maxlen` and six
        sigmoid outputs.
    """
    tokens = Input(shape=(maxlen,))
    embedded = Embedding(
        max_features,
        embed_size,
        weights=[embedding_matrix],
        trainable=True,
    )(tokens)
    embedded = SpatialDropout1D(0.2)(embedded)
    encoded = Bidirectional(
        LSTM(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
    )(embedded)
    conv = Conv1D(
        64,
        kernel_size=3,
        padding="valid",
        kernel_initializer="glorot_uniform",
    )(encoded)

    # Summarise the convolution output two ways and join both summaries.
    mean_features = GlobalAveragePooling1D()(conv)
    peak_features = GlobalMaxPooling1D()(conv)
    merged = concatenate([mean_features, peak_features])
    probabilities = Dense(6, activation="sigmoid")(merged)

    classifier = Model(tokens, probabilities)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=1e-3),
        metrics=['accuracy'],
    )
    return classifier

In [30]:
# Cross-validate the LSTM+CNN builder above. cross_validate is defined outside
# this section; the positional 128 and 2 are presumably batch size and epochs — confirm.
baseline_CNN_scores = cross_validate(get_model, 128,2)

------------------------------------------------------------------------
Training for fold 0 ...
Epoch 1/2
Epoch 2/2
------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/2
Epoch 2/2
------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/2
Epoch 2/2
------------------------------------------------------------------------
Training for fold 3 ...
Epoch 1/2
Epoch 2/2
------------------------------------------------------------------------
Training for fold 4 ...
Epoch 1/2
Epoch 2/2


In [31]:
# Display the score table returned by the cross-validation run.
baseline_CNN_scores

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log loss,ROC AUC
0,0.923829,0.832285,0.645101,0.715094,0.274026,0.980717
1,0.923325,0.807687,0.677735,0.732476,0.269611,0.981297
2,0.920693,0.818019,0.656614,0.718863,0.285675,0.981105
3,0.922981,0.816377,0.665692,0.725706,0.270127,0.980814
4,0.922981,0.814566,0.681387,0.733952,0.276026,0.981744


In [32]:
# Column-wise mean: average each metric over all rows of the score table.
baseline_CNN_scores.mean(axis=0)

accuracy              0.922762
precision_weighted    0.817787
recall_weighted       0.665306
f1_weighted           0.725218
log loss              0.275093
ROC AUC               0.981135
dtype: float64

# Bi-LSTM on text + numeric features

Adding numeric features generally decreases model performance considerably.

In [170]:
# Split the feature tables into label matrices and numeric input matrices.
_label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Targets: training labels come from train_features, test labels from test_cleaned.
Y_train = train_features[_label_cols].values
Y_test = test_cleaned[_label_cols].values

# Inputs: every remaining (non-label) column of the feature tables.
X_train = train_features.drop(_label_cols, axis=1).values
X_test = test_features.drop(_label_cols, axis=1).values

In [189]:
import tensorflow as tf
from tensorflow import keras
import kerastuner as kt
from keras.layers import concatenate
def make_model(num_numeric_features=42):
    """Build an (uncompiled) two-input model: text Bi-LSTM branch + numeric features.

    The text branch embeds a sequence of length `maxlen` using
    `embedding_matrix`, runs it through two stacked bidirectional LSTMs and
    keeps the final output of the second one. That vector is concatenated
    with the raw numeric feature vector and passed through a small dense head.

    Reads the notebook-level `maxlen`, `max_features`, `embed_size` and
    `embedding_matrix`.

    Args:
        num_numeric_features: Width of the numeric-feature input. Defaults to
            42, the value previously hard-coded here; it must equal the number
            of columns of the numeric matrix passed at fit/predict time.

    Returns:
        A Keras `Model` taking `[text_sequences, numeric_features]` and
        producing six sigmoid outputs. The caller is responsible for
        compiling it.
    """
    # Text branch.
    text_input = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(text_input)
    sequence_features = Bidirectional(
        LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
    )(embedded)
    # The second Bi-LSTM does not return sequences, so no pooling layer is needed.
    text_features = Bidirectional(LSTM(50))(sequence_features)

    # Numeric branch: features are fed in unchanged.
    # NOTE(review): if these features are not standardized upstream they may
    # dominate the LSTM output after concatenation — confirm.
    numeric_input = Input(shape=(num_numeric_features,))

    # Merge both branches and classify.
    merged = concatenate([text_features, numeric_input])
    hidden = Dense(50, activation="relu")(merged)
    hidden = Dropout(0.1)(hidden)
    output = Dense(6, activation="sigmoid")(hidden)

    return Model(inputs=[text_input, numeric_input], outputs=[output])

In [192]:
# Build the two-input model, compile it, and fit on the pair
# [text sequences, numeric features] against train_y.
combined_model  =  make_model()
combined_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
combined_model.fit([train_x, X_train],train_y, batch_size=32, epochs=5, verbose=1, validation_split=0.1)

Epoch 1/5


2023-03-27 18:19:23.167981: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-27 18:19:23.171907: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-27 18:19:23.175312: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-03-27 18:31:02.120115: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-27 18:31:02.125525: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-27 18:31:02.134677: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f8701d10580>

In [223]:
# Predict with both inputs, binarize with multi_label, and report metrics.
# NOTE(review): this assumes the rows of X_test (from test_features) line up
# one-to-one with test_x / test_y — confirm, since they come from different tables.
predictions_combine_prob = combined_model.predict([test_x, X_test], batch_size=32, verbose=1)
y_pred_combine = multi_label(predictions_combine_prob )
get_evaluation_score(test_y, y_pred_combine,predictions_combine_prob) 

Accuracy score:  0.04535934227390666
Precision score:  0.33824720495435595
Recall score:  0.7477583114912402
F1 score:  0.3848662598937651
Confusion matrix for label toxic:
[[  244 57644]
 [   98  5992]]
Confusion matrix for label severe_toxic:
[[63560    51]
 [  361     6]]
Confusion matrix for label obscene:
[[58397  1890]
 [ 1113  2578]]
Confusion matrix for label threat:
[[63431   336]
 [  202     9]]
Confusion matrix for label insult:
[[58713  1838]
 [ 1392  2035]]
Confusion matrix for label identity_hate:
[[63045   221]
 [  491   221]]
Logarithmic Loss:  0.2998149186275459
ROC AUC score:  0.6886416074293975


# LSTM on clean text

In general, a bidirectional LSTM performs better than a unidirectional (forward-only) LSTM.

In [197]:
# Unidirectional LSTM classifier: embedding seeded from `embedding_matrix`
# -> LSTM -> global max-pool -> 50-unit dense layer -> six sigmoid outputs.
inp = Input(shape=(maxlen,))
x = inp
for _layer in (
    Embedding(max_features, embed_size, weights=[embedding_matrix]),
    LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1),
    GlobalMaxPool1D(),
    Dense(units=50, activation="relu"),   # fixed-width hidden layer
    Dropout(0.1),
    Dense(6, activation="sigmoid"),       # output layer
):
    # Thread the tensor through the stack in declaration order.
    x = _layer(x)

model_lstm = Model(inputs=inp, outputs=x)
model_lstm.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model_lstm.fit(train_x, train_y, epochs=5, validation_split=0.1, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f870117c2e0>

In [222]:
# Fit for one more epoch, then evaluate on the test set.
# NOTE(review): calling fit() on an existing model continues from its current
# weights; it does not reinitialize them. If model_lstm was already fitted by
# the cell above, this adds an epoch on top of that training rather than
# retraining with the best epoch count — rebuild the model first if a fresh
# run is intended.
model_lstm.fit(train_x,train_y, epochs=1, validation_split=0.1, batch_size = 32)
y_pred_lstm_prob = model_lstm.predict(test_x, batch_size=1024, verbose=1)
y_pred_lstm = multi_label(y_pred_lstm_prob)
get_evaluation_score(test_y, y_pred_lstm,y_pred_lstm_prob)

Accuracy score:  0.876723248616712
Precision score:  0.594981715177469
Recall score:  0.7335494550972548
F1 score:  0.6494588399603419
Confusion matrix for label toxic:
[[53726  4162]
 [  972  5118]]
Confusion matrix for label severe_toxic:
[[63387   224]
 [  225   142]]
Confusion matrix for label obscene:
[[58713  1574]
 [  928  2763]]
Confusion matrix for label threat:
[[63714    53]
 [  176    35]]
Confusion matrix for label insult:
[[59393  1158]
 [ 1190  2237]]
Confusion matrix for label identity_hate:
[[63045   221]
 [  372   340]]
Logarithmic Loss:  0.2859243815956915
ROC AUC score:  0.9678198273545139


# GRU on clean text

GRUs are better suited to large datasets and train much faster than LSTMs.

In [201]:
# GRU classifier: embedding seeded from `embedding_matrix` -> GRU
# -> global max-pool -> 50-unit dense layer -> six sigmoid outputs.
inp = Input(shape=(maxlen,))
x = inp
for _layer in (
    Embedding(max_features, embed_size, weights=[embedding_matrix]),
    GRU(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1),
    GlobalMaxPool1D(),
    Dense(units=50, activation="relu"),   # fixed-width hidden layer
    Dropout(0.1),
    Dense(6, activation="sigmoid"),       # output layer
):
    # Thread the tensor through the stack in declaration order.
    x = _layer(x)

model_gru = Model(inputs=inp, outputs=x)
model_gru.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model_gru.fit(train_x, train_y, epochs=5, validation_split=0.1, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f86fc8650a0>

In [220]:
# Fit for two more epochs, then evaluate on the test set.
# NOTE(review): calling fit() on an existing model continues from its current
# weights; it does not reinitialize them. If model_gru was already fitted by
# the cell above, this adds epochs on top of that training rather than
# retraining with the best epoch count — rebuild the model first if a fresh
# run is intended.
model_gru.fit(train_x,train_y, epochs=2, validation_split=0.1, batch_size = 32)
y_pred_gru_prob = model_gru.predict(test_x, batch_size=1024, verbose=1)
y_pred_gru= multi_label(y_pred_gru_prob)
get_evaluation_score(test_y, y_pred_gru,y_pred_gru_prob)

Accuracy score:  0.8773640939072807
Precision score:  0.5928174365348512
Recall score:  0.7079597185818733
F1 score:  0.6403746681007675
Confusion matrix for label toxic:
[[54014  3874]
 [ 1215  4875]]
Confusion matrix for label severe_toxic:
[[63331   280]
 [  219   148]]
Confusion matrix for label obscene:
[[58891  1396]
 [ 1095  2596]]
Confusion matrix for label threat:
[[63646   121]
 [  118    93]]
Confusion matrix for label insult:
[[59220  1331]
 [ 1211  2216]]
Confusion matrix for label identity_hate:
[[63060   206]
 [  376   336]]
Logarithmic Loss:  0.29300759715357977
ROC AUC score:  0.9622083085436123


# Export models & Predictions

## Models

In [16]:
# Persist every trained model under tuner/LSTM/.
# NOTE(review): the load cell further down reads from ./Models/<name>, not
# tuner/LSTM/<name> — confirm which directory is the intended location.
hypermodel_val_accuracy.save("tuner/LSTM/tuned_val_accuracy")
hypermodel_accuracy.save("tuner/LSTM/tuned_accuracy")
model_baseline.save("tuner/LSTM/Baseline")
model_refer.save("tuner/LSTM/Baseline_GloVe")
model_CNN.save("tuner/LSTM/LSTM_CNN")
model_lstm.save("tuner/LSTM/LSTM")
model_gru.save("tuner/LSTM/GRU")

2023-04-01 11:18:38.040049: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,?,?]
	 [[{{node inputs}}]]
2023-04-01 11:18:38.189960: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'while/Placeholder_2' with dtype float and shape [?,128]
	 [[{{node while/Placeholder_2}}]]
2023-04-01 11:18:38.309382: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'while/Placeholder_2' with dtype float and shape [?,128]

2023-04-01 11:18:40.540209: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,100,50]
	 [[{{node inputs}}]]
2023-04-01 11:18:40.552623: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,100,50]
	 [[{{node inputs}}]]
2023-04-01 11:18:41.435594: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,?,?]
	 [[{{node inputs}}]]
2023-04-01

INFO:tensorflow:Assets written to: tuner/LSTM/LSTM_CNN/assets


INFO:tensorflow:Assets written to: tuner/LSTM/LSTM_CNN/assets


## Predictions

In [17]:
# Write the predicted probabilities to CSV, one column per label.
# to_csv is called without index=False, so the row index is written as the
# first (unnamed) column of each file.
labels =['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
pd.DataFrame(y_pred_refer_prob,columns=labels).to_csv("prediction/LSTM/Baseline_GloVe_predictions.csv")
pd.DataFrame(y_pred_CNN_prob,columns=labels).to_csv("prediction/LSTM/LSTM_CNN_predictions.csv")

# Evaluation of each category

In [17]:
# Scores at or above the threshold are treated as a positive (1) label.
def multi_label(predictions, threshold=0.5):
    """Binarize per-label scores into 0/1 predictions.

    Args:
        predictions: Array-like of scores, e.g. one row per sample and one
            column per label.
        threshold: Cut-off applied to every score; values greater than or
            equal to it become 1, all others 0. Defaults to 0.5, the value
            that was previously hard-coded in the comparison.

    Returns:
        An integer numpy array with the same shape as `predictions`
        containing only 0 and 1.
    """
    # Vectorized comparison replaces the per-element Python loop.
    return (np.asarray(predictions) >= threshold).astype(int)

In [24]:
# Reload previously saved models from ./Models/.
# NOTE(review): the save cell earlier in the notebook writes to
# tuner/LSTM/<name>, not ./Models/<name> — confirm these directories hold the same models.
hypermodel_val_accuracy = keras.models.load_model("./Models/tuned_val_accuracy")
hypermodel_accuracy = keras.models.load_model("./Models/tuned_accuracy")
model_baseline= keras.models.load_model("./Models/Baseline")
model_refer= keras.models.load_model("./Models/Baseline_GloVe")
model_CNN = keras.models.load_model("./Models/LSTM_CNN")
model_lstm= keras.models.load_model("./Models/LSTM")
model_gru= keras.models.load_model("./Models/GRU")

2023-03-28 12:43:16.655212: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients_reversev2_grad_reversev2_reversev2_axis' with dtype int32 and shape [1]
	 [[{{node gradients_reversev2_grad_reversev2_reversev2_axis}}]]
2023-03-28 12:43:16.657691: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients_split_2_grad_concat_split_2_split_dim' with dtype int32
	 [[{{node gradients_split_2_grad_concat_split_2_split_dim}}]]
2023-03-28 12:43:16.657838: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message):

# Baseline model on oversampled data

The LSTM model does not perform well on the oversampled data.

In [61]:
# Load the oversampled training set from disk.
oversampled_data = pd.read_csv("./Data/train_oversampled.csv")

In [63]:

# Labels and cleaned text of the oversampled training set.
# NOTE(review): this rebinds the notebook-level train_x, train_y and tokenizer,
# so any earlier cell re-run after this one will see the oversampled data.
train_y = oversampled_data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values
train_x = oversampled_data['clean_text']

# Fit a new tokenizer on the oversampled text (no GloVe matrix is built here).
# NOTE(review): test_x is not re-encoded in this cell; if it was produced by a
# tokenizer fitted on different text, its word indices will not match this
# vocabulary — confirm before evaluating on test_x.
tokenizer = Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(list(train_x))

# Convert text to integer sequences and pad/truncate to maxlen.
train_x = tokenizer.texts_to_sequences(train_x)
train_x = pad_sequences(train_x, maxlen=maxlen)


In [64]:
# Baseline Bi-LSTM trained on the oversampled data, with an embedding layer
# learned from scratch (no pre-trained weights are passed).
# Note: this assignment rebinds the notebook-level embed_size.
embed_size = 128

inp = Input(shape=(maxlen,))
x = inp
for _layer in (
    Embedding(max_features, embed_size),
    Bidirectional(LSTM(50, return_sequences=True)),
    GlobalMaxPool1D(),
    Dropout(0.1),
    Dense(50, activation="relu"),
    Dropout(0.1),
    Dense(6, activation="sigmoid"),
):
    # Thread the tensor through the stack in declaration order.
    x = _layer(x)

model_baseline_oversampled = Model(inputs=inp, outputs=x)
model_baseline_oversampled.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model_baseline_oversampled.fit(train_x, train_y, batch_size=32, epochs=2, validation_split=0.1)

2023-03-28 13:55:48.267007: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-28 13:55:48.270407: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-28 13:55:48.272085: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 1/2


2023-03-28 13:55:49.328185: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-28 13:55:49.329741: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-28 13:55:49.332427: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-03-28 14:07:12.279982: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-28 14:07:12.281573: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-28 14:07:12.283028: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/2


<keras.callbacks.History at 0x7fe1a41a1c10>

In [73]:
# Evaluate the oversampled-data model on the test sequences.
# NOTE(review): the model above was trained on sequences from a tokenizer
# refitted on the oversampled text, while test_x is not re-encoded in the
# visible cells. If test_x uses a different vocabulary, the word indices do not
# line up and these metrics are not meaningful (the reported ROC AUC is below
# 0.5) — confirm and re-encode the test text with the same tokenizer.
y_pred_baseline_oversampled_prob = model_baseline_oversampled.predict(test_x, batch_size=1024, verbose=1)
y_pred_baseline_oversampled= multi_label(y_pred_baseline_oversampled_prob)
get_evaluation_score(test_y, y_pred_baseline_oversampled,y_pred_baseline_oversampled_prob)

Accuracy score:  0.15424052017881146
Precision score:  0.05558650586003321
Recall score:  0.5552489998620499
F1 score:  0.10074835475292736
Logarithmic Loss:  0.4906917475329344
ROC AUC score:  0.3709438143484407
