# MsCA 31009 - Machine Learning and Predictive Analytics

## Project - Toxic Comment Classification

## Import files and libraries.

In [None]:
#!pip3 install autocorrect

In [1]:
import pandas as pd
import numpy as np

import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from autocorrect import spell

from imblearn.over_sampling import SMOTE

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import re

[nltk_data] Error loading punkt: <urlopen error [Errno -2] Name or
[nltk_data]     service not known>
[nltk_data] Error loading stopwords: <urlopen error [Errno -2] Name or
[nltk_data]     service not known>


**Download train data.**

In [None]:
!wget 'https://drive.google.com/uc?export=download&id=1hcoewV5fpD0kx8ysZsZi8EnSjxIgC0lp'
!unzip -o 'uc?export=download&id=1hcoewV5fpD0kx8ysZsZi8EnSjxIgC0lp'

In [3]:
toxic = pd.read_csv('train.csv')

## Data Preprocessing

### Text Cleaning

In [4]:
toxic.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


**Remove ID column.**

In [5]:
toxic.drop(['id'], axis=1, inplace=True)

**Remove non-alphabet characters**

In [6]:
toxic['comment_text'] = [re.sub('[^A-Za-z]', ' ', i).lower() for i in toxic['comment_text']]

**Tokenization**

In [7]:
toxic['comment_text_tokenize'] = [word_tokenize(i) for i in toxic['comment_text']]

In [12]:
toxic.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_tokenize,comment_text_tokenize_stemmed
0,explanation why the edits made under my userna...,0,0,0,0,0,0,"[explanation, why, the, edits, made, under, my...","[explan, whi, the, edit, made, under, my, user..."
1,d aww he matches this background colour i m s...,0,0,0,0,0,0,"[d, aww, he, matches, this, background, colour...","[d, aww, he, match, this, background, colour, ..."
2,hey man i m really not trying to edit war it...,0,0,0,0,0,0,"[hey, man, i, m, really, not, trying, to, edit...","[hey, man, i, m, realli, not, tri, to, edit, w..."
3,more i can t make any real suggestions on im...,0,0,0,0,0,0,"[more, i, can, t, make, any, real, suggestions...","[more, i, can, t, make, ani, real, suggest, on..."
4,you sir are my hero any chance you remember...,0,0,0,0,0,0,"[you, sir, are, my, hero, any, chance, you, re...","[you, sir, are, my, hero, ani, chanc, you, rem..."


**Standardize contraction**

In [13]:
for i in range(6):
    print(confusion_matrix_test_cv_all[i])def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"cant", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

**Stemming**

In [14]:
stemmer = SnowballStemmer('english')
stentence_placeholder = []
for sentence in toxic.loc[:,'comment_text_tokenize']:
    sentence_stemmed = [stemmer.stem(clean_text(word)) for word in sentence]
    stentence_placeholder.append(sentence_stemmed)
toxic['comment_text_tokenize_stemmed'] = stentence_placeholder

**Stopwords Removal**

In [15]:
stentence_placeholder = []
for sentence in toxic.loc[:,'comment_text_tokenize_stemmed']:
    sentence_clean = [word for word in sentence if word not in stopwords.words('english')]
    stentence_placeholder.append(sentence_clean)
toxic['comment_text_clean'] = stentence_placeholder
toxic['comment_text_clean'] = [' '.join(i) for i in toxic['comment_text_clean']]

In [16]:
toxic

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_tokenize,comment_text_tokenize_stemmed,comment_text_clean
0,explanation why the edits made under my userna...,0,0,0,0,0,0,"[explanation, why, the, edits, made, under, my...","[explan, whi, the, edit, made, under, my, user...",explan whi edit made usernam hardcor metallica...
1,d aww he matches this background colour i m s...,0,0,0,0,0,0,"[d, aww, he, matches, this, background, colour...","[d, aww, he, match, this, background, colour, ...",aww match background colour seem stuck thank t...
2,hey man i m really not trying to edit war it...,0,0,0,0,0,0,"[hey, man, i, m, really, not, trying, to, edit...","[hey, man, i, m, realli, not, tri, to, edit, w...",hey man realli tri edit war guy constant remov...
3,more i can t make any real suggestions on im...,0,0,0,0,0,0,"[more, i, can, t, make, any, real, suggestions...","[more, i, can, t, make, ani, real, suggest, on...",make ani real suggest improv wonder section st...
4,you sir are my hero any chance you remember...,0,0,0,0,0,0,"[you, sir, are, my, hero, any, chance, you, re...","[you, sir, are, my, hero, ani, chanc, you, rem...",sir hero ani chanc rememb page
5,congratulations from me as well use the to...,0,0,0,0,0,0,"[congratulations, from, me, as, well, use, the...","[congratul, from, me, as, well, use, the, tool...",congratul well use tool well talk
6,cocksucker before you piss around on my work,1,1,1,0,1,0,"[cocksucker, before, you, piss, around, on, my...","[cocksuck, befor, you, piss, around, on, my, w...",cocksuck befor piss around work
7,your vandalism to the matt shirvington article...,0,0,0,0,0,0,"[your, vandalism, to, the, matt, shirvington, ...","[your, vandal, to, the, matt, shirvington, art...",vandal matt shirvington articl revert pleas ban
8,sorry if the word nonsense was offensive to ...,0,0,0,0,0,0,"[sorry, if, the, word, nonsense, was, offensiv...","[sorri, if, the, word, nonsens, was, offens, t...",sorri word nonsens offens anyway intend write ...
9,alignment on this subject and which are contra...,0,0,0,0,0,0,"[alignment, on, this, subject, and, which, are...","[align, on, this, subject, and, which, are, co...",align subject contrari dulithgow


In [17]:
toxic.to_csv('train_cleaned.csv', index=False)

### Create feature spaces

In [3]:
toxic = pd.read_csv('train_cleaned.csv')

**Drop NA**

In [25]:
toxic.describe(include='all')

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_tokenize,comment_text_tokenize_stemmed,comment_text_clean
count,159521,159521.0,159521.0,159521.0,159521.0,159521.0,159521.0,159521,159521,159521
unique,159255,,,,,,,158206,158181,157648
top,jun utc,,,,,,,['january'],['januari'],thank experi wikipedia test work revert remov ...
freq,11,,,,,,,21,21,22
mean,,0.095875,0.009999,0.052965,0.002996,0.049379,0.008808,,,
std,,0.29442,0.099493,0.223964,0.054658,0.216659,0.093435,,,
min,,0.0,0.0,0.0,0.0,0.0,0.0,,,
25%,,0.0,0.0,0.0,0.0,0.0,0.0,,,
50%,,0.0,0.0,0.0,0.0,0.0,0.0,,,
75%,,0.0,0.0,0.0,0.0,0.0,0.0,,,


In [5]:
toxic.dropna(axis=0, inplace=True)

**Split Train and Test**

In [6]:
x_train, x_test, y_train, y_test = train_test_split(toxic.loc[:,'comment_text_clean'], toxic.iloc[:,1:7], test_size = .3, random_state = 43)

In [7]:
x_train.head()

21524    thank note worri wait period get permiss owner...
56229    page need massiv edit initi section befor hit ...
93765                                       okaaaaaay test
87443    apologis make remark sidaway return perhap cou...
73667    newspap headlin newspap headlin blank adult sw...
Name: comment_text_clean, dtype: object

In [8]:
x_train.shape

(111664,)

In [9]:
y_train.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
21524,0,0,0,0,0,0
56229,0,0,0,0,0,0
93765,0,0,0,0,0,0
87443,0,0,0,0,0,0
73667,0,0,0,0,0,0


**Create feature spaces**

In [10]:
#Count Vectors as features

count_vect = CountVectorizer(max_features=5000)
count_vect.fit(x_train)
x_train_cv = count_vect.transform(x_train)
x_test_cv = count_vect.transform(x_test)

#TF-IDF Vectors as features

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(x_train)
x_train_tfidf =  tfidf_vect.transform(x_train)
x_test_tfidf =  tfidf_vect.transform(x_test)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(x_train)
x_train_tfidf_ngram =  tfidf_vect_ngram.transform(x_train)
x_test_tfidf_ngram =  tfidf_vect_ngram.transform(x_test)

In [11]:
feature_name_cv = count_vect.get_feature_names()
feature_name_tfidf = tfidf_vect.get_feature_names()
feature_name_ngram = tfidf_vect_ngram.get_feature_names()

In [12]:
print(feature_name_tfidf)

['aa', 'aaron', 'ab', 'abandon', 'abbrevi', 'abc', 'abid', 'abil', 'abl', 'abort', 'abov', 'abraham', 'abroad', 'absenc', 'absent', 'absolut', 'abstract', 'absurd', 'abund', 'abus', 'ac', 'academ', 'academi', 'acceler', 'accent', 'accept', 'access', 'accid', 'accident', 'accommod', 'accompani', 'accomplish', 'accord', 'account', 'accur', 'accuraci', 'accus', 'ace', 'achiev', 'acid', 'acknowledg', 'acquir', 'acronym', 'across', 'act', 'action', 'activ', 'activist', 'actor', 'actress', 'actual', 'ad', 'adam', 'adapt', 'add', 'addict', 'addit', 'address', 'adequ', 'adher', 'adject', 'adjust', 'admin', 'administ', 'administr', 'adminship', 'admir', 'admiss', 'admit', 'adolf', 'adopt', 'adress', 'adult', 'advanc', 'advantag', 'adventur', 'advert', 'advertis', 'advic', 'advis', 'advoc', 'advocaci', 'ae', 'aesthet', 'afc', 'afd', 'affair', 'affect', 'affili', 'affirm', 'afford', 'afghan', 'afghanistan', 'aforement', 'afraid', 'africa', 'african', 'afternoon', 'afterward', 'age', 'agenc', 'age

### Oversampling (SMOTE)

In [13]:
x_train_cv_os_all = []
y_train_cv_os_all = []

x_train_tfidf_os_all = []
y_train_tfidf_os_all = []

x_train_ngram_os_all = []
y_train_ngram_os_all = []


for i in range(6):
    sm_cv = SMOTE(random_state=40)
    x_train_cv_os, y_train_cv_os = sm_cv.fit_resample(x_train_cv, y_train.iloc[:,i])
    x_train_cv_os_all.append(x_train_cv_os)
    y_train_cv_os_all.append(y_train_cv_os)
    
    sm_tfidf = SMOTE(random_state=40)
    x_train_tfidf_os, y_train_tfidf_os = sm_tfidf.fit_resample(x_train_tfidf, y_train.iloc[:,i])
    x_train_tfidf_os_all.append(x_train_tfidf_os)
    y_train_tfidf_os_all.append(y_train_tfidf_os)
    
    sm_ngram = SMOTE(random_state=40)
    x_train_ngram_os, y_train_ngram_os = sm_ngram.fit_resample(x_train_tfidf_ngram, y_train.iloc[:,i])
    x_train_ngram_os_all.append(x_train_ngram_os)
    y_train_ngram_os_all.append(y_train_ngram_os)

In [14]:
x_train_y_train_all = [x_train_cv_os_all, y_train_cv_os_all, x_train_tfidf_os_all, y_train_tfidf_os_all, x_train_ngram_os_all, y_train_ngram_os_all]

In [15]:
for i in x_train_cv_os_all:
    print(i.shape)

(201698, 5000)
(221074, 5000)
(211334, 5000)
(222628, 5000)
(212120, 5000)
(221320, 5000)


In [35]:
import pickle

# where do I want to store this file?
# Open the file to save as pkl file
train_data_path = 'train_data_array.pkl'
train_data_path_pkl = open(train_data_path, 'wb')
pickle.dump(x_train_y_train_all, train_data_path_pkl)

# Close the pickle instances
train_data_path_pkl.close()

In [49]:
x_test_y_test_all = [x_test_cv, y_test, x_test_tfidf, y_test, x_test_tfidf_ngram, y_test]

In [50]:
test_data_path = 'test_data_array.pkl'
test_data_path_pkl = open(test_data_path, 'wb')
pickle.dump(x_test_y_test_all, test_data_path_pkl)

In [52]:
x_train_y_train_all_load = pickle.load(open('train_data_array.pkl', 'rb'))
x_test_y_test_all_load = pickle.load(open('test_data_array.pkl', 'rb'))

In [44]:
print(x_train_y_train_all_test[0][0].shape)

(201698, 5000)


## Model Selection

### Logistic Regression

#### Count Vector Feature Space

In [18]:
y_predict_train_cv_all = []
y_predict_test_cv_all = []

acc_score_train_cv_all = []
acc_score_test_cv_all = []

f1_score_train_cv_all = []
f1_score_test_cv_all = []

confusion_matrix_train_cv_all = []
confusion_matrix_test_cv_all = []

classification_report_train_cv_all = []
classification_report_test_cv_all = []

for i in range(6):
    log_reg = LogisticRegression(solver='lbfgs')
    log_reg.fit(x_train_cv_os_all[i], y_train_cv_os_all[i])
    
    y_predict_train_cv_os = log_reg.predict(x_train_cv_os_all[i])
    y_predict_test_cv_os = log_reg.predict(x_test_cv)  

    y_predict_train_cv_all.append(y_predict_train_cv_os)
    y_predict_test_cv_all.append(y_predict_test_cv_os)
    
    acc_score_train_cv_all.append(accuracy_score(y_train_cv_os_all[i], y_predict_train_cv_os))
    acc_score_test_cv_all.append(accuracy_score(y_test.iloc[:,i], y_predict_test_cv_os))
    
    f1_score_train_cv_all.append(f1_score(y_train_cv_os_all[i], y_predict_train_cv_os))
    f1_score_train_cv_all.append(f1_score(y_test.iloc[:,i], y_predict_test_cv_os))
    
    confusion_matrix_train_cv_all.append(confusion_matrix(y_train_cv_os_all[i], y_predict_train_cv_os))
    confusion_matrix_test_cv_all.append(confusion_matrix(y_test.iloc[:,i], y_predict_test_cv_os))
    
    classification_report_train_cv_all.append(classification_report(y_train_cv_os_all[i], y_predict_train_cv_os))
    classification_report_test_cv_all.append(classification_report(y_test.iloc[:,i], y_predict_test_cv_os))



In [30]:
y_predict_train_tfidf_all = []
y_predict_test_tfidf_all = []

acc_score_train_tfidf_all = []
acc_score_test_tfidf_all = []

f1_score_train_tfidf_all = []
f1_score_test_tfidf_all = []

confusion_matrix_train_tfidf_all = []
confusion_matrix_test_tfidf_all = []

classification_report_train_tfidf_all = []
classification_report_test_tfidf_all = []

for i in range(6):
    log_reg = LogisticRegression(solver='lbfgs')
    log_reg.fit(x_train_tfidf_os_all[i], y_train_tfidf_os_all[i])
    
    y_predict_train_tfidf_os = log_reg.predict(x_train_tfidf_os_all[i])
    y_predict_test_tfidf_os = log_reg.predict(x_test_tfidf)  

    y_predict_train_tfidf_all.append(y_predict_train_tfidf_os)
    y_predict_test_tfidf_all.append(y_predict_test_tfidf_os)
    
    acc_score_train_tfidf_all.append(accuracy_score(y_train_tfidf_os_all[i], y_predict_train_tfidf_os))
    acc_score_test_tfidf_all.append(accuracy_score(y_test.iloc[:,i], y_predict_test_tfidf_os))
    
    f1_score_train_tfidf_all.append(f1_score(y_train_tfidf_os_all[i], y_predict_train_tfidf_os))
    f1_score_train_tfidf_all.append(f1_score(y_test.iloc[:,i], y_predict_test_tfidf_os))
    
    confusion_matrix_train_tfidf_all.append(confusion_matrix(y_train_tfidf_os_all[i], y_predict_train_tfidf_os))
    confusion_matrix_test_tfidf_all.append(confusion_matrix(y_test.iloc[:,i], y_predict_test_tfidf_os))
    
    classification_report_train_tfidf_all.append(classification_report(y_train_tfidf_os_all[i], y_predict_train_tfidf_os))
    classification_report_test_tfidf_all.append(classification_report(y_test.iloc[:,i], y_predict_test_tfidf_os))



In [27]:
y_predict_train_ngram_all = []
y_predict_test_ngram_all = []

acc_score_train_ngram_all = []
acc_score_test_ngram_all = []

f1_score_train_ngram_all = []
f1_score_test_ngram_all = []

confusion_matrix_train_ngram_all = []
confusion_matrix_test_ngram_all = []

classification_report_train_ngram_all = []
classification_report_test_ngram_all = []

for i in range(6):
    log_reg = LogisticRegression(solver='lbfgs')
    log_reg.fit(x_train_ngram_os_all[i], y_train_ngram_os_all[i])
    
    y_predict_train_ngram_os = log_reg.predict(x_train_ngram_os_all[i])
    y_predict_test_ngram_os = log_reg.predict(x_test_tfidf_ngram)  

    y_predict_train_ngram_all.append(y_predict_train_ngram_os)
    y_predict_test_ngram_all.append(y_predict_test_ngram_os)
    
    acc_score_train_ngram_all.append(accuracy_score(y_train_ngram_os_all[i], y_predict_train_ngram_os))
    acc_score_test_ngram_all.append(accuracy_score(y_test.iloc[:,i], y_predict_test_ngram_os))
    
    f1_score_train_ngram_all.append(f1_score(y_train_ngram_os_all[i], y_predict_train_ngram_os))
    f1_score_train_ngram_all.append(f1_score(y_test.iloc[:,i], y_predict_test_ngram_os))
    
    confusion_matrix_train_ngram_all.append(confusion_matrix(y_train_ngram_os_all[i], y_predict_train_ngram_os))
    confusion_matrix_test_ngram_all.append(confusion_matrix(y_test.iloc[:,i], y_predict_test_ngram_os))
    
    classification_report_train_ngram_all.append(classification_report(y_train_ngram_os_all[i], y_predict_train_ngram_os))
    classification_report_test_ngram_all.append(classification_report(y_test.iloc[:,i], y_predict_test_ngram_os))



In [48]:
for i in range(6): print(classification_report_test_tfidf_all[i])

              precision    recall  f1-score   support

           0       0.98      0.92      0.95     43378
           1       0.53      0.85      0.65      4479

   micro avg       0.91      0.91      0.91     47857
   macro avg       0.76      0.89      0.80     47857
weighted avg       0.94      0.91      0.92     47857

              precision    recall  f1-score   support

           0       1.00      0.96      0.98     47389
           1       0.18      0.89      0.30       468

   micro avg       0.96      0.96      0.96     47857
   macro avg       0.59      0.92      0.64     47857
weighted avg       0.99      0.96      0.97     47857

              precision    recall  f1-score   support

           0       0.99      0.95      0.97     45405
           1       0.50      0.89      0.64      2452

   micro avg       0.95      0.95      0.95     47857
   macro avg       0.75      0.92      0.81     47857
weighted avg       0.97      0.95      0.96     47857

              preci

In [32]:
for i in range(6):
    print(confusion_matrix_test_tfidf_all[i])

[[39973  3405]
 [  673  3806]]
[[45479  1910]
 [   52   416]]
[[43245  2160]
 [  268  2184]]
[[47269   460]
 [   40    88]]
[[42379  3205]
 [  306  1967]]
[[45550  1906]
 [   85   316]]


In [33]:
for i in range(6):
    print(confusion_matrix_test_cv_all[i])

[[38609  4769]
 [  831  3648]]
[[45064  2325]
 [  148   320]]
[[42008  3397]
 [  424  2028]]
[[45891  1838]
 [   76    52]]
[[41322  4262]
 [  556  1717]]
[[44520  2936]
 [  206   195]]


In [34]:
for i in range(6):
    print(confusion_matrix_test_ngram_all[i])

[[28224 15154]
 [ 1019  3460]]
[[33405 13984]
 [   71   397]]
[[30261 15144]
 [  552  1900]]
[[33741 13988]
 [   38    90]]
[[30364 15220]
 [  509  1764]]
[[32493 14963]
 [  109   292]]
