### Install requirements & import libraries

In [1]:
!pip3 install imbalanced-learn



In [2]:
!pip3 install keras



In [3]:
import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE, RandomOverSampler

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.utils import class_weight

import re

In [4]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation, LSTM, Embedding, Input, GlobalMaxPool1D
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import regularizers

Using TensorFlow backend.


### Data Preprocessing

#### Load data

In [5]:
toxic = pd.read_csv('train_cleaned.csv')

In [6]:
toxic.describe(include='all')

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_tokenize,comment_text_tokenize_stemmed,comment_text_clean
count,159571,159571,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571,159571,159521
unique,159571,159305,,,,,,,158250,158225,157648
top,a711da4623cebf13,jun utc,,,,,,,['january'],['januari'],januari
freq,1,11,,,,,,,21,21,22
mean,,,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,,,
std,,,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,,,
min,,,0.0,0.0,0.0,0.0,0.0,0.0,,,
25%,,,0.0,0.0,0.0,0.0,0.0,0.0,,,
50%,,,0.0,0.0,0.0,0.0,0.0,0.0,,,
75%,,,0.0,0.0,0.0,0.0,0.0,0.0,,,


#### Drop NA

In [7]:
toxic.dropna(axis=0, inplace=True)

#### Train-Test Split

In [8]:
x_train, x_test, y_train, y_test = train_test_split(toxic.loc[:,'comment_text_clean'], toxic.iloc[:,2:8], test_size = .3, random_state = 43)

#### Create word corpus (TF-IDF)

In [9]:
max_features = 2000

In [10]:
#TF-IDF Vectors as features

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=max_features)
tfidf_vect.fit(x_train)
x_train_tfidf =  tfidf_vect.transform(x_train)
x_test_tfidf =  tfidf_vect.transform(x_test)

In [11]:
x_train_tfidf_os_all = []
y_train_tfidf_os_all = []


for i in range(6):   
    sm_tfidf = RandomOverSampler(random_state=40)
    x_train_tfidf_os, y_train_tfidf_os = sm_tfidf.fit_resample(x_train_tfidf, y_train.iloc[:,i])
    x_train_tfidf_os_all.append(x_train_tfidf_os)
    y_train_tfidf_os_all.append(y_train_tfidf_os)

### Neural Network Model

In [12]:
data_dim = max_features
timesteps = 1
model = Sequential()
model.add(LSTM(64, input_shape=(timesteps, data_dim), return_sequences=True))
  
model.add(Dropout(0.5))

model.add(Flatten()) 

model.add(Dense(units=64, activation='relu', kernel_regularizer=regularizers.l2(0.0022)))
model.add(Dropout(0.5))


model.add(Dense(units=64, activation='relu', kernel_regularizer=regularizers.l2(0.0022)))
model.add(Dropout(0.5))

model.add(Dense(units=64, activation='relu', kernel_regularizer=regularizers.l2(0.0022)))
model.add(Dropout(0.5))

model.add(Dense(units=64, activation='relu', kernel_regularizer=regularizers.l2(0.0022)))
model.add(Dropout(0.5))

model.add(Dense(1))
model.add(Activation('sigmoid'))

In [13]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 1, 64)             528640    
_________________________________________________________________
dropout_1 (Dropout)          (None, 1, 64)             0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
__________

In [15]:
prediction_train = []
prediction_test = []

In [16]:
x_test_tfidf = x_test_tfidf.toarray().reshape(x_test_tfidf.shape[0], 1, x_test_tfidf.shape[1])

In [17]:
for i in range(6):
    callbacks = [EarlyStopping(monitor='val_loss', patience=2),
             ModelCheckpoint(filepath='best_model' + str(i) +'1500.h5', monitor='val_loss', save_best_only=True)]
    
    x_train_tfidf_os = x_train_tfidf_os_all[i]
    x_train_tfidf_os = x_train_tfidf_os.toarray().reshape(x_train_tfidf_os.shape[0], 1, x_train_tfidf_os.shape[1])
    history = model.fit(x_train_tfidf_os, y_train_tfidf_os_all[i],
              batch_size=128, epochs=50, callbacks=callbacks,
              verbose=1,
              validation_split=0.1)
    model.save('my_model' + str(i) +'1500.h5')
    prediction_train.append(model.predict_proba(x_train_tfidf.toarray().reshape(x_train_tfidf.shape[0], 1, x_train_tfidf.shape[1])))
    prediction_test.append(model.predict_proba(x_test_tfidf))

Train on 181528 samples, validate on 20170 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Train on 198966 samples, validate on 22108 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Train on 190200 samples, validate on 21134 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Train on 200365 samples, validate on 22263 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Train on 190908 samples, validate on 21212 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Train on 199188 samples, validate on 22132 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


In [18]:
prediction_submission_array = np.asarray(prediction_test).reshape(6, 47857).transpose()

In [19]:
proba = pd.DataFrame(data=prediction_submission_array,columns=['toxic','severe_toxic','obscene','threat','insult','identity_hate'], index=x_test.index.values.tolist())

In [20]:
proba.to_csv('submission_1500_max_feature.csv', index=True)

In [21]:
f1_train = []
cutoff = []
for i in range(6):
    best_f1 = 0
    best_val = 0
    for val in np.arange(0,1,0.01):
        if f1_score(y_train.iloc[:,i], prediction_train[i] > val) > best_f1:
            best_f1 = f1_score(y_train.iloc[:,i], prediction_train[i] > val)
            best_val = val
    f1_train.append(best_f1)
    cutoff.append(best_val)

  'precision', 'predicted', average, warn_for)


In [22]:
f1_train

[0.7883842794759826,
 0.5341074020319303,
 0.8575015693659762,
 0.7604395604395605,
 0.7984862819299906,
 0.6981059141863161]

In [23]:
cutoff

[0.75, 0.9400000000000001, 0.89, 0.99, 0.88, 0.97]

In [24]:
f1_test = [f1_score(y_test.iloc[:,i] , prediction_test[i] > cutoff[i]) for i in range(6)]

In [25]:
f1_test

[0.7284487329021312,
 0.4233750745378652,
 0.782760629004077,
 0.364741641337386,
 0.6867804684398571,
 0.4819027921406412]

In [26]:
roc_auc_train = [roc_auc_score(y_train.iloc[:,i], prediction_train[i]) for i in range(6)]

In [27]:
roc_auc_test = [roc_auc_score(y_test.iloc[:,i], prediction_test[i]) for i in range(6)]

In [28]:
roc_auc_train

[0.9796124889516942,
 0.9920669643080401,
 0.995367522499929,
 0.9992089558751435,
 0.9920307539414144,
 0.9971161756372305]

In [29]:
roc_auc_test

[0.9497496068532831,
 0.9723682675106001,
 0.9711142629152563,
 0.9440396889207819,
 0.9596516371703349,
 0.9489697662452096]