# MsCA 31009 - Machine Learning and Predictive Analytics

## Project

## Import files and libraries.

In [2]:
#!pip3 install autocorrect

In [3]:
import pandas as pd
import numpy as np

import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from autocorrect import spell

from sklearn.feature_extraction.text import CountVectorizer

import re

[nltk_data] Downloading package punkt to /home/targoon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/targoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Download train data.**

In [4]:
!wget 'https://drive.google.com/uc?export=download&id=1hcoewV5fpD0kx8ysZsZi8EnSjxIgC0lp'
!unzip -o 'uc?export=download&id=1hcoewV5fpD0kx8ysZsZi8EnSjxIgC0lp'

--2018-10-21 08:56:33--  https://drive.google.com/uc?export=download&id=1hcoewV5fpD0kx8ysZsZi8EnSjxIgC0lp
Resolving drive.google.com (drive.google.com)... 172.217.10.14, 2607:f8b0:4009:80f::200e
Connecting to drive.google.com (drive.google.com)|172.217.10.14|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-00-4c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/02m0hirjsujhgkfrghpoj5v49icojk0l/1540123200000/00285997938321528797/*/1hcoewV5fpD0kx8ysZsZi8EnSjxIgC0lp?e=download [following]
--2018-10-21 08:56:37--  https://doc-00-4c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/02m0hirjsujhgkfrghpoj5v49icojk0l/1540123200000/00285997938321528797/*/1hcoewV5fpD0kx8ysZsZi8EnSjxIgC0lp?e=download
Resolving doc-00-4c-docs.googleusercontent.com (doc-00-4c-docs.googleusercontent.com)... 216.58.192.225, 2607:f8b0:4009:80f::2001
Connecting to doc-00-4c-docs.googleusercontent.com (doc-00-4c-docs.go

In [5]:
toxic = pd.read_csv('train.csv')

## Data Pre-Processing

In [6]:
toxic.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


**Remove ID column.**

In [7]:
toxic.drop(['id'], axis=1, inplace=True)

**Remove non-alphabet characters**

In [8]:
toxic['comment_text'] = [re.sub('[^A-Za-z]', ' ', i).lower() for i in toxic['comment_text']]

**Tokenization**

In [9]:
toxic['comment_text_tokenize'] = [word_tokenize(i) for i in toxic['comment_text']]

In [10]:
toxic.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_tokenize
0,explanation why the edits made under my userna...,0,0,0,0,0,0,"[explanation, why, the, edits, made, under, my..."
1,d aww he matches this background colour i m s...,0,0,0,0,0,0,"[d, aww, he, matches, this, background, colour..."
2,hey man i m really not trying to edit war it...,0,0,0,0,0,0,"[hey, man, i, m, really, not, trying, to, edit..."
3,more i can t make any real suggestions on im...,0,0,0,0,0,0,"[more, i, can, t, make, any, real, suggestions..."
4,you sir are my hero any chance you remember...,0,0,0,0,0,0,"[you, sir, are, my, hero, any, chance, you, re..."


**Standardize contraction**

In [13]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"cant", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

**Stemming**

In [30]:
stemmer = SnowballStemmer('english')
stentence_placeholder = []
for sentence in toxic.loc[:,'comment_text_tokenize']:
    sentence_stemmed = [stemmer.stem(clean_text(word)) for word in sentence]
    stentence_placeholder.append(sentence_stemmed)
toxic['comment_text_tokenize_stemmed'] = stentence_placeholder

**Stopwords Removal**

In [31]:
stentence_placeholder = []
for sentence in toxic.loc[:,'comment_text_tokenize_stemmed']:
    sentence_clean = [word for word in sentence if word not in stopwords.words('english')]
    stentence_placeholder.append(sentence_clean)
toxic['comment_text_clean'] = stentence_placeholder
toxic['comment_text_clean'] = [' '.join(i) for i in toxic['comment_text_clean']]

In [34]:
toxic

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_tokenize,comment_text_clean,comment_text_tokenize_stemmed
0,explanation why the edits made under my userna...,0,0,0,0,0,0,"[explanation, why, the, edits, made, under, my...",explan whi edit made usernam hardcor metallica...,"[explan, whi, the, edit, made, under, my, user..."
1,d aww he matches this background colour i m s...,0,0,0,0,0,0,"[d, aww, he, matches, this, background, colour...",aww match background colour seem stuck thank t...,"[d, aww, he, match, this, background, colour, ..."
2,hey man i m really not trying to edit war it...,0,0,0,0,0,0,"[hey, man, i, m, really, not, trying, to, edit...",hey man realli tri edit war guy constant remov...,"[hey, man, i, m, realli, not, tri, to, edit, w..."
3,more i can t make any real suggestions on im...,0,0,0,0,0,0,"[more, i, can, t, make, any, real, suggestions...",make ani real suggest improv wonder section st...,"[more, i, can, t, make, ani, real, suggest, on..."
4,you sir are my hero any chance you remember...,0,0,0,0,0,0,"[you, sir, are, my, hero, any, chance, you, re...",sir hero ani chanc rememb page,"[you, sir, are, my, hero, ani, chanc, you, rem..."
5,congratulations from me as well use the to...,0,0,0,0,0,0,"[congratulations, from, me, as, well, use, the...",congratul well use tool well talk,"[congratul, from, me, as, well, use, the, tool..."
6,cocksucker before you piss around on my work,1,1,1,0,1,0,"[cocksucker, before, you, piss, around, on, my...",cocksuck befor piss around work,"[cocksuck, befor, you, piss, around, on, my, w..."
7,your vandalism to the matt shirvington article...,0,0,0,0,0,0,"[your, vandalism, to, the, matt, shirvington, ...",vandal matt shirvington articl revert pleas ban,"[your, vandal, to, the, matt, shirvington, art..."
8,sorry if the word nonsense was offensive to ...,0,0,0,0,0,0,"[sorry, if, the, word, nonsense, was, offensiv...",sorri word nonsens offens anyway intend write ...,"[sorri, if, the, word, nonsens, was, offens, t..."
9,alignment on this subject and which are contra...,0,0,0,0,0,0,"[alignment, on, this, subject, and, which, are...",align subject contrari dulithgow,"[align, on, this, subject, and, which, are, co..."


In [35]:
toxic.to_csv('train_cleaned.csv')

**Convert text into corpus & extract labels**

In [36]:
vectorizer = CountVectorizer(max_features=1000)
X = vectorizer.fit_transform(toxic['comment_text_clean']).toarray()
y = toxic.iloc[:,1:7]

In [37]:
print(vectorizer.get_feature_names())

['abl', 'abov', 'absolut', 'abus', 'academ', 'accept', 'access', 'accord', 'account', 'accur', 'accus', 'across', 'act', 'action', 'activ', 'actual', 'ad', 'add', 'addit', 'address', 'admin', 'administr', 'admit', 'advertis', 'advic', 'afd', 'age', 'ago', 'agre', 'ahead', 'aid', 'air', 'al', 'album', 'align', 'alleg', 'allow', 'almost', 'alon', 'along', 'alreadi', 'also', 'altern', 'although', 'alway', 'america', 'american', 'among', 'amount', 'ani', 'anim', 'anonym', 'anoth', 'answer', 'anti', 'anyon', 'anyth', 'anyway', 'apolog', 'appar', 'appear', 'appli', 'appreci', 'appropri', 'april', 'arab', 'arbitr', 'archiv', 'area', 'argu', 'argument', 'armi', 'around', 'art', 'articl', 'artist', 'ask', 'ass', 'assert', 'asshol', 'associ', 'assum', 'attack', 'attempt', 'attent', 'august', 'author', 'automat', 'avail', 'avoid', 'awar', 'award', 'away', 'back', 'background', 'bad', 'ball', 'ban', 'band', 'bark', 'barnstar', 'base', 'basi', 'basic', 'battl', 'becaus', 'becom', 'befor', 'begin', 