# MsCA 31009 - Machine Learning and Predictive Analytics

## Project - Toxic Comment Classification

## Import files and libraries.

In [None]:
!pip3 install autocorrect
!pip3 install nltk
!pip3 install imblearn
!pip3 install keras

In [2]:
import pandas as pd
import numpy as np

import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from autocorrect import spell

from imblearn.over_sampling import SMOTE, RandomOverSampler

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.utils import class_weight

import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation, LSTM, Embedding, Input, GlobalMaxPool1D
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


**Download train data.**

In [None]:
toxic = pd.read_csv('train.csv')

In [None]:
toxic_test = pd.read_csv('test.csv')

In [None]:
toxic_test.head()

## Data Preprocessing

### Text Cleaning

In [None]:
toxic.head(10)

**Remove non-alphabet characters**

In [None]:
toxic['comment_text'] = [re.sub('[^A-Za-z]', ' ', i).lower() for i in toxic['comment_text']]

In [None]:
toxic_test['comment_text'] = [re.sub('[^A-Za-z]', ' ', i).lower() for i in toxic_test['comment_text']]

**Tokenization**

In [None]:
toxic['comment_text_tokenize'] = [word_tokenize(i) for i in toxic['comment_text']]

In [None]:
toxic_test['comment_text_tokenize'] = [word_tokenize(i) for i in toxic_test['comment_text']]

In [None]:
toxic.head()

**Standardize contraction**

In [None]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"cant", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

**Stemming**

In [None]:
stemmer = SnowballStemmer('english')
stentence_placeholder = []
for sentence in toxic.loc[:,'comment_text_tokenize']:
    sentence_stemmed = [stemmer.stem(clean_text(word)) for word in sentence]
    stentence_placeholder.append(sentence_stemmed)
toxic['comment_text_tokenize_stemmed'] = stentence_placeholder

In [None]:
stentence_placeholder = []
for sentence in toxic_test.loc[:,'comment_text_tokenize']:
    sentence_stemmed = [stemmer.stem(clean_text(word)) for word in sentence]
    stentence_placeholder.append(sentence_stemmed)
toxic_test['comment_text_tokenize_stemmed'] = stentence_placeholder

**Stopwords Removal**

In [None]:
stentence_placeholder = []
for sentence in toxic.loc[:,'comment_text_tokenize_stemmed']:
    sentence_clean = [word for word in sentence if word not in stopwords.words('english')]
    stentence_placeholder.append(sentence_clean)
toxic['comment_text_clean'] = stentence_placeholder
toxic['comment_text_clean'] = [' '.join(i) for i in toxic['comment_text_clean']]

In [None]:
f1_scorestentence_placeholder = []
for sentence in toxic_test.loc[:,'comment_text_tokenize_stemmed']:
    sentence_clean = [word for word in sentence if word not in stopwords.words('english')]
    stentence_placeholder.append(sentence_clean)
toxic_test['comment_text_clean'] = stentence_placeholder
toxic_test['comment_text_clean'] = [' '.join(i) for i in toxic_test['comment_text_clean']]

In [None]:
toxic

In [None]:
toxic.to_csv('train_cleaned.csv', index=False)

In [None]:
toxic_test.to_csv('test_cleaned.csv', index=False)

### Create feature spaces

In [4]:
toxic = pd.read_csv('train_cleaned.csv')

In [5]:
toxic_test = pd.read_csv('test_cleaned.csv')

**Drop NA**

In [6]:
toxic.describe(include='all')

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_tokenize,comment_text_tokenize_stemmed,comment_text_clean
count,159571,159571,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571,159571,159521
unique,159571,159305,,,,,,,158250,158225,157648
top,8a186dcddac73a41,jun utc,,,,,,,['january'],['januari'],januari
freq,1,11,,,,,,,21,21,22
mean,,,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,,,
std,,,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,,,
min,,,0.0,0.0,0.0,0.0,0.0,0.0,,,
25%,,,0.0,0.0,0.0,0.0,0.0,0.0,,,
50%,,,0.0,0.0,0.0,0.0,0.0,0.0,,,
75%,,,0.0,0.0,0.0,0.0,0.0,0.0,,,


In [7]:
toxic.dropna(axis=0, inplace=True)

In [8]:
x_train, x_test, y_train, y_test = train_test_split(toxic.loc[:,'comment_text_clean'], toxic.iloc[:,2:8], test_size = .3, random_state = 43)

In [9]:
x_train.shape

(111664,)

In [10]:
x_submission = toxic_test.loc[:,'comment_text_clean']

In [11]:
x_submission = x_submission.fillna(' ')

In [12]:
type(x_submission)

pandas.core.series.Series

In [13]:
#TF-IDF Vectors as features

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=1000)
tfidf_vect.fit(x_train)
x_train_tfidf =  tfidf_vect.transform(x_train)
x_test_tfidf =  tfidf_vect.transform(x_test)

In [14]:
x_submission_tfidf = tfidf_vect.transform(x_submission)

In [15]:
x_train_tfidf_os_all = []
y_train_tfidf_os_all = []


for i in range(6):   
    sm_tfidf = RandomOverSampler(random_state=40)
    x_train_tfidf_os, y_train_tfidf_os = sm_tfidf.fit_resample(x_train_tfidf, y_train.iloc[:,i])
    x_train_tfidf_os_all.append(x_train_tfidf_os)
    y_train_tfidf_os_all.append(y_train_tfidf_os)

### Neural Network

In [16]:
data_dim = 1000
timesteps = 1
model = Sequential()
model.add(LSTM(64, input_shape=(timesteps, data_dim), return_sequences=True))
  
model.add(Dropout(0.5))

model.add(Flatten()) 

model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(1))
model.add(Activation('sigmoid'))

In [17]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 1, 64)             272640    
_________________________________________________________________
dropout_1 (Dropout)          (None, 1, 64)             0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
activation_1 (Activation)    (None, 64)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
__________

In [19]:
prediction_test = []
prediction_submission = []

In [20]:
x_test_tfidf = x_test_tfidf.toarray().reshape(x_test_tfidf.shape[0], 1, x_test_tfidf.shape[1])
x_submission_tfidf = x_submission_tfidf.toarray().reshape(x_submission_tfidf.shape[0], 1, x_submission_tfidf.shape[1])

In [21]:
for i in range(6):
    x_train_tfidf = x_train_tfidf_os_all[i]
    x_train_tfidf = x_train_tfidf.toarray().reshape(x_train_tfidf.shape[0], 1, x_train_tfidf.shape[1])
    history = model.fit(x_train_tfidf, y_train_tfidf_os_all[i],
              batch_size=128, epochs=20,
              verbose=1,
              validation_split=0.1)
    model.save('my_model' + str(i) +'.h5')
    prediction_test.append(model.predict_proba(x_test_tfidf))
    prediction_submission.append(model.predict_proba(x_submission_tfidf))

Train on 181528 samples, validate on 20170 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 198966 samples, validate on 22108 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 190200 samples, validate on 21134 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 200365 samples, validate on 22263 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoc

Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [22]:
prediction_test

[array([[5.5190416e-19],
        [1.6109453e-24],
        [1.6417996e-14],
        ...,
        [5.4007669e-18],
        [1.0000000e+00],
        [3.3381710e-08]], dtype=float32), array([[0.        ],
        [0.        ],
        [0.        ],
        ...,
        [0.        ],
        [0.98083216],
        [0.        ]], dtype=float32), array([[0.       ],
        [0.       ],
        [0.       ],
        ...,
        [0.       ],
        [0.9999995],
        [0.       ]], dtype=float32), array([[0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        ...,
        [0.0000000e+00],
        [1.9591413e-34],
        [0.0000000e+00]], dtype=float32), array([[0.        ],
        [0.        ],
        [0.        ],
        ...,
        [0.        ],
        [0.99968696],
        [0.        ]], dtype=float32), array([[0.        ],
        [0.        ],
        [0.        ],
        ...,
        [0.        ],
        [0.00040722],
        [0.        ]], dtype=float32)]

In [None]:
prediction_submission

In [43]:
prediction_submission_array = np.asarray(prediction_submission).reshape(6, 153164).transpose()

In [44]:
prediction_submission_array.shape

(153164, 6)

In [45]:
prediction_submission_array

array([[9.9991369e-01, 1.4971228e-08, 9.9985361e-01, 2.4681628e-06,
        9.9993050e-01, 8.0658978e-01],
       [1.0266882e-02, 4.3951426e-21, 0.0000000e+00, 0.0000000e+00,
        4.4095069e-26, 0.0000000e+00],
       [1.7676908e-01, 0.0000000e+00, 5.9478807e-01, 0.0000000e+00,
        9.8777525e-19, 0.0000000e+00],
       ...,
       [1.6229183e-23, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [4.6333018e-19, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [9.9302691e-01, 2.3762806e-36, 2.7801642e-01, 0.0000000e+00,
        1.0262520e-07, 0.0000000e+00]], dtype=float32)

In [54]:
submission = pd.DataFrame(data=prediction_submission_array,columns=['toxic','severe_toxic','obscene','threat','insult','identity_hate'], index=toxic_test['id'])


In [55]:
submission

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,9.999137e-01,1.497123e-08,9.998536e-01,2.468163e-06,9.999305e-01,8.065898e-01
0000247867823ef7,1.026688e-02,4.395143e-21,0.000000e+00,0.000000e+00,4.409507e-26,0.000000e+00
00013b17ad220c46,1.767691e-01,0.000000e+00,5.947881e-01,0.000000e+00,9.877752e-19,0.000000e+00
00017563c3f7919a,1.308640e-13,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
00017695ad8997eb,2.358867e-22,0.000000e+00,5.881332e-20,0.000000e+00,1.140444e-35,0.000000e+00
0001ea8717f6de06,1.032862e-14,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
00024115d4cbde0f,4.020327e-11,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
000247e83dcc1211,8.580169e-01,0.000000e+00,1.136559e-06,0.000000e+00,7.640940e-21,0.000000e+00
00025358d4737918,7.900949e-10,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
00026d1092fe71cc,2.008152e-25,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00


In [57]:
submission.to_csv('submission.csv', index=True)

In [70]:
[roc_auc_score(y_test.iloc[:,i], prediction_test[i]) for i in range(6)]

[0.9306707051233531,
 0.9538281585776786,
 0.9561398474092062,
 0.7975416871817973,
 0.9280171997120963,
 0.8327414038235497]

In [35]:
sum([roc_auc_score(y_test.iloc[:,i], prediction_test[i]) for i in range(6)]) / 6

0.8998231669712803

In [69]:
print(classification_report(y_test.iloc[:,0], prediction_test[0] > 0.92))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97     43378
           1       0.84      0.60      0.70      4479

   micro avg       0.95      0.95      0.95     47857
   macro avg       0.90      0.79      0.84     47857
weighted avg       0.95      0.95      0.95     47857

