# Loading the data

In [1]:
import numpy as np
import pandas as pd
import pylab as plt
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

In [2]:
# Read train.csv into a DataFrame; the path is relative to the notebook's
# working directory.
df = pd.read_csv('../../dataset/train.csv')

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# List the column names of the loaded frame.
print ("The Columns:", df.columns.values)

The Columns: ['id' 'comment_text' 'toxic' 'severe_toxic' 'obscene' 'threat' 'insult'
 'identity_hate']


In [5]:
# Summary statistics; with the default arguments only the numeric (label)
# columns are summarised.
df.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Show the distinct values that occur in each label column.
label_columns = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
for col in label_columns:
    print(col, "\tunique values:", df[col].unique())


toxic 	unique values: [0 1]
severe_toxic 	unique values: [0 1]
obscene 	unique values: [0 1]
threat 	unique values: [0 1]
insult 	unique values: [0 1]
identity_hate 	unique values: [0 1]


In [7]:
# Lower-case every comment, then split it into a list of tokens with NLTK's
# TweetTokenizer; the lists are stored in a new `tokens` column.
tokenizer = TweetTokenizer()
df['tokens'] = (
    df['comment_text']
    .str.lower()
    .apply(tokenizer.tokenize)
)

In [8]:
# Preview the frame with the new `tokens` column.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tokens
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,"[explanation, why, the, edits, made, under, my..."
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,"[d'aww, !, he, matches, this, background, colo..."
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"[hey, man, ,, i'm, really, not, trying, to, ed..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,"["", more, i, can't, make, any, real, suggestio..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"[you, ,, sir, ,, are, my, hero, ., any, chance..."


In [9]:
# NLTK returns the English stop words as a list. The list is kept under its
# original name, and a frozenset copy is used for the actual filtering so that
# each membership test is O(1) instead of a linear scan of the list (the test
# runs once per token of every comment).
cachedStopWords = stopwords.words("english")
_stopword_set = frozenset(cachedStopWords)


def remove_stopwords(tokens):
    """Return the tokens that are not English stop words.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single comment (already lower-cased by the caller).

    Returns
    -------
    list of str
        The non-stop-word tokens, in their original order.
    """
    return [token for token in tokens if token not in _stopword_set]


df['tokens_nostop'] = df['tokens'].apply(remove_stopwords)

In [10]:
# Preview the frame with the new `tokens_nostop` column.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tokens,tokens_nostop
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,"[explanation, why, the, edits, made, under, my...","[explanation, edits, made, username, hardcore,..."
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,"[d'aww, !, he, matches, this, background, colo...","[d'aww, !, matches, background, colour, i'm, s..."
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"[hey, man, ,, i'm, really, not, trying, to, ed...","[hey, man, ,, i'm, really, trying, edit, war, ..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,"["", more, i, can't, make, any, real, suggestio...","["", can't, make, real, suggestions, improvemen..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"[you, ,, sir, ,, are, my, hero, ., any, chance...","[,, sir, ,, hero, ., chance, remember, page, t..."


In [11]:
def join_tokens(tokens):
    """Concatenate string tokens into a single space-delimited string.

    An empty sequence yields the empty string.
    """
    separator = ' '
    return separator.join(tokens)

# Rebuild one space-separated string per comment from the stop-word-filtered
# token lists.
df['join_tokens'] = df['tokens_nostop'].apply(join_tokens)

In [18]:
# Drop the identifier and raw-text columns now that the token columns exist.
# Reassigning (instead of inplace=True) together with errors='ignore' makes
# the cell idempotent: re-running it after the columns are already gone no
# longer raises a KeyError.
df = df.drop(columns=['id', 'comment_text'], errors='ignore')
df.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,tokens,tokens_nostop,join_tokens
0,0,0,0,0,0,0,"[explanation, why, the, edits, made, under, my...","[explanation, edits, made, username, hardcore,...",explanation edits made username hardcore metal...
1,0,0,0,0,0,0,"[d'aww, !, he, matches, this, background, colo...","[d'aww, !, matches, background, colour, i'm, s...",d'aww ! matches background colour i'm seemingl...
2,0,0,0,0,0,0,"[hey, man, ,, i'm, really, not, trying, to, ed...","[hey, man, ,, i'm, really, trying, edit, war, ...","hey man , i'm really trying edit war . guy con..."
3,0,0,0,0,0,0,"["", more, i, can't, make, any, real, suggestio...","["", can't, make, real, suggestions, improvemen...",""" can't make real suggestions improvement - wo..."
4,0,0,0,0,0,0,"[you, ,, sir, ,, are, my, hero, ., any, chance...","[,, sir, ,, hero, ., chance, remember, page, t...",", sir , hero . chance remember page that's ?"


Bag of Words

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Keep the document-term counts as the scipy sparse matrix that
# fit_transform() returns. Calling .toarray() on a 159571 x 188076 int64
# matrix would need roughly 240 GB of memory, although almost every entry
# is zero.
# NOTE: consumers of `bow_matrix` must accept sparse input (np.shape() and
# most scikit-learn estimators do; StandardScaler needs with_mean=False).
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(df['join_tokens'].to_list())

In [14]:
# (rows, columns) of the bag-of-words matrix.
np.shape(bow_matrix)

(159571, 188076)

Too many features: the vocabulary contains 188,076 distinct terms. Reduce the dimensionality with a PCA-style decomposition.

In [None]:
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Number of latent components to keep; tune as needed.
N_COMPONENTS = 100

# A plain StandardScaler() + PCA() pipeline is not workable on a matrix with
# ~188k columns: centring destroys sparsity (and is rejected outright for
# sparse input), and PCA() with default arguments tries to compute every
# component. StandardScaler(with_mean=False) only rescales, and TruncatedSVD
# computes just the leading components; both accept dense arrays as well as
# scipy sparse matrices.
# The step keeps the name 'pca' so that pca_pipe.named_steps['pca'] still
# resolves. random_state pins the randomized solver for reproducible output.
pca_pipe = Pipeline([
    ('scale', StandardScaler(with_mean=False)),
    ('pca', TruncatedSVD(n_components=N_COMPONENTS, random_state=0)),
])

pca_pipe.fit(bow_matrix)

n-grams

In [None]:
# Count only two-word sequences (bigrams) in the joined comment strings.
# fit_transform() output is stored as returned, without densifying.
bigrams = CountVectorizer(ngram_range=(2, 2))
bigrams_matrix = bigrams.fit_transform(df['join_tokens'].tolist())

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps `names` a plain Python list, as the old method returned.
if hasattr(bigrams, 'get_feature_names_out'):
    names = bigrams.get_feature_names_out().tolist()
else:
    names = bigrams.get_feature_names()

In [None]:
import random

# Peek at ten bigram features (sampled with replacement). A dedicated, seeded
# generator makes the displayed sample identical on every run without
# touching the global `random` state.
random.Random(42).choices(names, k=10)

In [None]:
# (rows, columns) of the bigram count matrix.
np.shape(bigrams_matrix)

TF-IDF

In [None]:
# NOTE: this rebinds `vectorizer`, replacing whatever vectorizer the name
# referred to before this cell.
vectorizer = TfidfVectorizer()

# Keep the TF-IDF weights as the scipy sparse matrix returned by
# fit_transform(). A dense float array with one row per comment and one
# column per vocabulary term would be far too large to hold in memory, and
# nearly all of its entries would be zero.
tfidf_matrix = vectorizer.fit_transform(df['join_tokens'].to_list())

In [None]:
# (rows, columns) of the TF-IDF matrix.
np.shape(tfidf_matrix)