In [1]:
import pandas as pd
import numpy as np

import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from autocorrect import spell

from imblearn.over_sampling import SMOTE, RandomOverSampler

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.utils import class_weight

import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation, LSTM, Embedding, Input, GlobalMaxPool1D
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


### Create feature spaces

In [3]:
# Pre-cleaned training comments together with their label columns.
train_csv_path = 'train_cleaned.csv'
toxic = pd.read_csv(train_csv_path)

In [4]:
# Pre-cleaned comments to score for the submission file.
test_csv_path = 'test_cleaned.csv'
toxic_test = pd.read_csv(test_csv_path)

**Drop NA**

In [5]:
# Summary statistics for every column (numeric and non-numeric alike).
# A `count` lower than the row total in a column indicates missing values there.
toxic.describe(include='all')

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_tokenize,comment_text_tokenize_stemmed,comment_text_clean
count,159571,159571,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571,159571,159521
unique,159571,159305,,,,,,,158250,158225,157648
top,da8b4951ee7f4f03,jun utc,,,,,,,['january'],['januari'],januari
freq,1,11,,,,,,,21,21,22
mean,,,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,,,
std,,,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,,,
min,,,0.0,0.0,0.0,0.0,0.0,0.0,,,
25%,,,0.0,0.0,0.0,0.0,0.0,0.0,,,
50%,,,0.0,0.0,0.0,0.0,0.0,0.0,,,
75%,,,0.0,0.0,0.0,0.0,0.0,0.0,,,


In [6]:
# Discard every row that has a missing value in any column.
toxic = toxic.dropna(how='any')

In [7]:
# Hold out 20% of the cleaned comments for evaluation.
# The six target columns are selected by name rather than by position
# (previously `iloc[:, 2:8]`), so the split keeps working if the column order
# of the cleaned CSV ever changes (e.g. an extra saved index column).
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

x_train, x_test, y_train, y_test = train_test_split(
    toxic.loc[:, 'comment_text_clean'],
    toxic.loc[:, label_columns],
    test_size=.2,
    random_state=43,
)

In [8]:
# Number of comments that ended up in the training split.
x_train.shape

(127616,)

In [9]:
# Cleaned comment text of the submission set, as a Series.
x_submission = toxic_test['comment_text_clean']

In [10]:
# Replace missing cleaned comments with a single space so that every row is a
# string. Unlike the training frame, submission rows are filled rather than
# dropped, so the row count of `toxic_test` is preserved.
# NOTE(review): presumably needed because the TF-IDF vectorizer cannot handle NaN — confirm.
x_submission = x_submission.fillna(' ')

In [11]:
# Sanity check: show the container type of x_submission after the fill.
type(x_submission)

pandas.core.series.Series

In [12]:
# Passed to TfidfVectorizer as `max_features`; the same value is reused as the
# network's input width (`data_dim`) in the model definition.
max_features = 1000

In [13]:
# Word-level TF-IDF features.
# The vectorizer is fitted on the training split only; the held-out split is
# transformed with that same fitted vocabulary.
tfidf_vect = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    max_features=max_features,
)
x_train_tfidf = tfidf_vect.fit_transform(x_train)
x_test_tfidf = tfidf_vect.transform(x_test)

In [14]:
# Vectorise the submission comments with the vectorizer that was fitted on x_train.
x_submission_tfidf = tfidf_vect.transform(x_submission)

In [15]:
# Build one randomly oversampled copy of the training features per label
# column. Entry k of each list belongs to column k of y_train.
n_labels = 6
x_train_tfidf_os_all, y_train_tfidf_os_all = [], []

for label_idx in range(n_labels):
    oversampler = RandomOverSampler(random_state=40)
    x_resampled, y_resampled = oversampler.fit_resample(
        x_train_tfidf, y_train.iloc[:, label_idx]
    )
    x_train_tfidf_os_all.append(x_resampled)
    y_train_tfidf_os_all.append(y_resampled)

### Neural Network

In [16]:
# Network input is (timesteps, data_dim): each comment is presented as a
# one-step sequence holding its whole TF-IDF vector.
data_dim = max_features
timesteps = 1

# LSTM front end, flattened to a plain feature vector.
layer_stack = [
    LSTM(64, input_shape=(timesteps, data_dim), return_sequences=True),
    Dropout(0.5),
    Flatten(),
]

# Three identical hidden blocks: Dense(64) -> ReLU -> Dropout(0.5).
for _ in range(3):
    layer_stack.extend([Dense(64), Activation('relu'), Dropout(0.5)])

# Single sigmoid unit: the network scores one binary label at a time.
layer_stack.extend([Dense(1), Activation('sigmoid')])

model = Sequential(layer_stack)

In [17]:
# The network ends in a single sigmoid unit, hence binary cross-entropy as the loss.
# Accuracy is tracked during training alongside the loss.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
# Print each layer with its output shape and parameter count.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 1, 64)             272640    
_________________________________________________________________
dropout_1 (Dropout)          (None, 1, 64)             0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
activation_1 (Activation)    (None, 64)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
__________

In [None]:
# Per-label prediction arrays are appended to these lists by the training loop.
prediction_test, prediction_submission = [], []

In [None]:
def _as_lstm_input(features):
    """Return `features` as a dense (n_samples, 1, n_features) array.

    Accepts a sparse matrix (anything exposing `.toarray()`) or an array.
    An input that is already 3-D is returned unchanged, so re-running this
    cell is harmless; the previous version failed on a second run because the
    variables had already been replaced by ndarrays without `.toarray()`.
    """
    dense = features.toarray() if hasattr(features, 'toarray') else np.asarray(features)
    if dense.ndim == 3:
        return dense
    return dense.reshape(dense.shape[0], 1, dense.shape[1])


# The LSTM expects (samples, timesteps, features); each comment is one timestep.
x_test_tfidf = _as_lstm_input(x_test_tfidf)
x_submission_tfidf = _as_lstm_input(x_submission_tfidf)

In [None]:
# Train one independent binary classifier per label and collect its
# predictions on the held-out split and on the submission set.
#
# Start from empty lists so that re-running this cell does not keep appending
# to the results of a previous run.
prediction_test = []
prediction_submission = []

for i in range(6):
    # Every label gets its own freshly initialised copy of the architecture.
    # Fitting the shared `model` object repeatedly (as before) made each
    # label's classifier start from the weights learned for the previous label.
    label_model = keras.models.clone_model(model)
    label_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Use a local name so the sparse `x_train_tfidf` matrix is not clobbered.
    x_train_os = x_train_tfidf_os_all[i]
    # Dense (samples, 1, features) layout expected by the LSTM input layer.
    x_train_os = x_train_os.toarray().reshape(x_train_os.shape[0], 1, x_train_os.shape[1])

    # NOTE(review): `validation_split` holds out the *last* 10% of the rows as
    # passed in. If the oversampler appends its duplicated minority rows at the
    # end, the validation set is not representative of the data — confirm and
    # consider shuffling or an explicit validation set.
    history = label_model.fit(x_train_os, y_train_tfidf_os_all[i],
                              batch_size=128, epochs=30,
                              verbose=1,
                              validation_split=0.1)
    label_model.save('my_model' + str(i) + '.h5')

    # With a single sigmoid output unit, `predict` already returns the
    # positive-class probability; `predict_proba` is not available on current
    # Keras models.
    prediction_test.append(label_model.predict(x_test_tfidf))
    prediction_submission.append(label_model.predict(x_submission_tfidf))

Train on 207637 samples, validate on 23071 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Train on 227395 samples, validate on 25267 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30


Epoch 29/30
Epoch 30/30
Train on 217483 samples, validate on 24165 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Train on 228994 samples, validate on 25444 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30


Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Train on 218282 samples, validate on 24254 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

In [None]:
# Inspect the held-out predictions: a list with one array per label.
prediction_test

In [None]:
# Inspect the submission-set predictions: a list with one array per label.
prediction_submission

In [None]:
# Arrange the per-label predictions as one (n_rows, n_labels) matrix, with one
# column per entry of `prediction_submission`, in list order.
# The row count is taken from the arrays themselves instead of the previously
# hard-coded 153164, so this works for a submission set of any size.
prediction_submission_array = np.column_stack(
    [np.ravel(label_scores) for label_scores in prediction_submission]
)

In [None]:
# Sanity check: expect (number of submission rows, 6).
prediction_submission_array.shape

In [None]:
# Inspect the stacked prediction matrix.
prediction_submission_array

In [None]:
# Label the prediction matrix: one column per target, rows keyed by comment id.
submission_columns = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
submission = pd.DataFrame(
    prediction_submission_array,
    index=toxic_test['id'],
    columns=submission_columns,
)


In [None]:
# Display the submission frame.
submission

In [None]:
# Write the predictions to disk. index=True keeps the frame's index (built from
# toxic_test['id']) as the first CSV column.
submission.to_csv('submission_1501.csv', index=True)

In [None]:
# ROC AUC on the held-out split, one value per label column of y_test.
per_label_auc = []
for label_idx in range(6):
    per_label_auc.append(
        roc_auc_score(y_test.iloc[:, label_idx], prediction_test[label_idx])
    )
per_label_auc

In [None]:
# Unweighted mean of the six per-label ROC AUC scores.
label_aucs = [roc_auc_score(y_test.iloc[:, i], prediction_test[i]) for i in range(6)]
sum(label_aucs) / 6

In [None]:
# Precision/recall report for the first label column, calling a comment
# positive only when its predicted score exceeds the threshold below.
toxic_threshold = 0.92
toxic_predicted = prediction_test[0] > toxic_threshold
print(classification_report(y_test.iloc[:, 0], toxic_predicted))