In [1]:
import datetime
import os
import re
import warnings
from collections import Counter
from operator import itemgetter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
import seaborn as sns

import nltk
from nltk.tokenize import (sent_tokenize, TreebankWordTokenizer, WhitespaceTokenizer)
from nltk.stem.porter import PorterStemmer
from nltk.util import ngrams
from nltk.corpus import stopwords

from textblob import TextBlob

# from sqlalchemy import create_engine

from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB, BernoulliNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import (accuracy_score, classification_report,confusion_matrix, precision_score, 
                             recall_score, f1_score, roc_curve, roc_auc_score, average_precision_score, 
                             precision_recall_curve, auc)
from sklearn.model_selection import KFold  
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler 

# `sklearn.cross_validation` was removed in scikit-learn 0.20; prefer
# `model_selection` and keep the old module only as a fallback for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split 

from xgboost import XGBClassifier

warnings.filterwarnings("ignore")



In [2]:
# helper functions
def sentence_tokenizer(text):
    """Split ``text`` into sentences using NLTK's ``sent_tokenize``."""
    return sent_tokenize(text)

def whitespace_tokenizer(sentences):
    """Tokenize every sentence on whitespace.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to split. Any iterable works; it is walked once, in order.

    Returns
    -------
    list of list of str
        One token list per input sentence, in input order.
    """
    tokenizer = WhitespaceTokenizer()
    # Iterate the sentences directly instead of indexing through range(len(...)).
    return [tokenizer.tokenize(sentence) for sentence in sentences]

def polarity(sentences):
    """Score each sentence with TextBlob and summarise the scores.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to score.

    Returns
    -------
    tuple
        ``(min, max, mean, scores)`` where ``scores`` is the list of per-sentence
        polarity values in input order. When ``sentences`` is empty the three
        summary values are ``NaN`` and ``scores`` is ``[]``.
    """
    scores = [TextBlob(sentence).polarity for sentence in sentences]
    if not scores:
        # np.min / np.max raise ValueError on an empty sequence; a comment with no
        # detectable sentences has no defined polarity, so report NaN instead.
        return np.nan, np.nan, np.nan, scores
    return np.min(scores), np.max(scores), np.mean(scores), scores

def stemmer(text):
    """Return the Porter stem of each word TextBlob extracts from ``text``."""
    porter = PorterStemmer()
    return [porter.stem(word) for word in TextBlob(text).words]

# def count_vectorizer(text):
#     vectorizer = CountVectorizer(ngram_range=(1,2))
#     _ = vectorizer.fit(text)
#     x = vectorizer.transform(text)
#     x_back = x.toarray()
#     return x_back

def token_clean(text):
    """Lower-case ``text``, drop non-alphanumeric symbols and split it into words.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    list of str
        Lower-cased tokens containing only ``a-z`` and ``0-9``.
    """
    # Turn every whitespace run (newline, tab, carriage return, ...) into a single
    # space *before* stripping symbols. Otherwise the strip below deletes tabs and
    # carriage returns outright and glues the neighbouring words together.
    spaced = re.sub(r'\s+', ' ', text)
    stripped = re.sub('[^A-Za-z0-9 ]+', '', spaced)
    return stripped.lower().split()

In [3]:
# Load the labelled training comments. `df` is a second name for the same frame
# (not a copy), so columns added to `df` below also appear on `df_train`.
df_train = pd.read_csv('../data/train.csv') # train data
# df_test = pd.read_csv('../data/test.csv') # test data
df = df_train
print(df.shape)
df.head()

(159571, 8)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Column-wise totals of the numeric columns. The label columns hold 0/1 flags in the
# preview above, so each total is the number of comments carrying that label.
df.sum(axis=0,numeric_only=True)

toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64

In [5]:
# The six labels are not mutually exclusive: a comment may carry any combination of them.
# Each label may therefore need its own classifier, unless a multi-label approach is used.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# rating = how many of the six labels are set on the comment
df['rating'] = df[label_columns].sum(axis=1, skipna=False)

# Text-derived columns: cleaned tokens, sentences, per-sentence polarity summary, length
df['token_clean'] = df['comment_text'].apply(token_clean)
df['sent_token'] = df['comment_text'].apply(sentence_tokenizer)
df['polarity_sent_token'] = df['sent_token'].apply(polarity)
df['word_count'] = df['token_clean'].apply(len)

In [6]:
# Unpack the first three entries of each polarity tuple (min, max, mean) into scalar columns
for position, column in enumerate(['polarity_min', 'polarity_max', 'polarity_mean']):
    df[column] = list(map(itemgetter(position), df['polarity_sent_token']))
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,rating,token_clean,sent_token,polarity_sent_token,word_count,polarity_min,polarity_max,polarity_mean
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0,"[explanation, why, the, edits, made, under, my...",[Explanation\nWhy the edits made under my user...,"(0.0, 0.136363636364, 0.0454545454545, [0.0, 0...",43,0.0,0.136364,0.045455
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0,"[daww, he, matches, this, background, colour, ...","[D'aww!, He matches this background colour I'm...","(0.0, 0.375, 0.14375, [0.375, 0.0, 0.2, 0.0])",17,0.0,0.375,0.14375
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0,"[hey, man, im, really, not, trying, to, edit, ...","[Hey man, I'm really not trying to edit war., ...","(-0.1, 0.25, 0.116666666667, [-0.1, 0.2, 0.25])",42,-0.1,0.25,0.116667
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0,"[more, i, cant, make, any, real, suggestions, ...","[""\nMore\nI can't make any real suggestions on...","(0.0, 0.4, 0.17875, [0.19, 0.125, 0.0, 0.4])",109,0.0,0.4,0.17875
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0,"[you, sir, are, my, hero, any, chance, you, re...","[You, sir, are my hero., Any chance you rememb...","(0.0, 0.0, 0.0, [0.0, 0.0])",13,0.0,0.0,0.0


In [7]:
# df['stemmer'] = df['comment_text'].apply(stemmer)

In [8]:
# Preview the frame again; the stemming step in the previous cell is commented out,
# so no column has been added since the last preview.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,rating,token_clean,sent_token,polarity_sent_token,word_count,polarity_min,polarity_max,polarity_mean
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0,"[explanation, why, the, edits, made, under, my...",[Explanation\nWhy the edits made under my user...,"(0.0, 0.136363636364, 0.0454545454545, [0.0, 0...",43,0.0,0.136364,0.045455
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0,"[daww, he, matches, this, background, colour, ...","[D'aww!, He matches this background colour I'm...","(0.0, 0.375, 0.14375, [0.375, 0.0, 0.2, 0.0])",17,0.0,0.375,0.14375
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0,"[hey, man, im, really, not, trying, to, edit, ...","[Hey man, I'm really not trying to edit war., ...","(-0.1, 0.25, 0.116666666667, [-0.1, 0.2, 0.25])",42,-0.1,0.25,0.116667
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0,"[more, i, cant, make, any, real, suggestions, ...","[""\nMore\nI can't make any real suggestions on...","(0.0, 0.4, 0.17875, [0.19, 0.125, 0.0, 0.4])",109,0.0,0.4,0.17875
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0,"[you, sir, are, my, hero, any, chance, you, re...","[You, sir, are my hero., Any chance you rememb...","(0.0, 0.0, 0.0, [0.0, 0.0])",13,0.0,0.0,0.0


In [9]:
# df.groupby('rating').nunique()['id'] # class imbalance issue

In [10]:
# Put the comments with the most labels set at the top
df = df.sort_values(by='rating', ascending=False)
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,rating,token_clean,sent_token,polarity_sent_token,word_count,polarity_min,polarity_max,polarity_mean
106617,3a4c7758fad18de3,", I hope your retarded kids get anal raped and...",1,1,1,1,1,1,6,"[i, hope, your, retarded, kids, get, anal, rap...","[, I hope your retarded kids get anal raped an...","(-0.6, 1.0, -0.01875, [-0.4, -0.4, 0.55, 0.15,...",94,-0.6,1.0,-0.01875
13964,24d2b50726b67167,I am going to murder ZimZalaBim ST47 for being...,1,1,1,1,1,1,6,"[i, am, going, to, murder, zimzalabim, st47, f...",[I am going to murder ZimZalaBim ST47 for bein...,"(-1.0, -1.0, -1.0, [-1.0])",12,-1.0,-1.0,-1.0
73821,c586b7a2fd575b13,"Shut up you asswipe, we don't care. I'll decap...",1,1,1,1,1,1,6,"[shut, up, you, asswipe, we, dont, care, ill, ...","[Shut up you asswipe, we don't care., I'll dec...","(-0.2, 0.0, -0.0755555555556, [0.0, -0.1777777...",24,-0.2,0.0,-0.075556
151131,77d84b1321c22d9a,"LGBT \n\nyou little fuck , are you a fag , tha...",1,1,1,1,1,1,6,"[lgbt, you, little, fuck, are, you, a, fag, th...","[LGBT \n\nyou little fuck , are you a fag , th...","(-0.1775, 0.0, -0.08875, [-0.17750000000000005...",56,-0.1775,0.0,-0.08875
7299,1368c10281978876,"You're a stupid cunt \n\nFuck you dumb arse, y...",1,1,1,1,1,1,6,"[youre, a, stupid, cunt, fuck, you, dumb, arse...","[You're a stupid cunt \n\nFuck you dumb arse, ...","(-0.4125, -0.4125, -0.4125, [-0.41250000000000...",59,-0.4125,-0.4125,-0.4125


In [None]:
# Bag-of-words features: token counts for every comment in `df`.
# NOTE(review): the vocabulary is fitted on all rows before the train/test split in the
# next cell, so held-out comments contribute to the vocabulary; confirm that is acceptable.
count_vect = CountVectorizer()
X = count_vect.fit_transform(df.comment_text)
# X = X.toarray()
print(X.shape) 
# count_vect.vocabulary_
# X = df[['word_count','polarity_min','polarity_max','polarity_mean']]
# Target: only the `toxic` label is modelled; the multi-label variant is kept below for reference
y = df['toxic']
# y = df[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]

(159571, 189775)


In [None]:
# 70/30 hold-out split with a fixed seed, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42, stratify=y)

# NOTE(review): k_range is not referenced by any other visible cell
k_range = list(range(1, 101))
print(X_train.shape, y_train.shape)
print(X_test.shape,y_test.shape)

In [None]:
# https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn
# The scaler is fitted on the training rows only and then applied to both parts.
# with_mean=False disables centering, which StandardScaler cannot apply to sparse input.
scaler = StandardScaler(with_mean=False).fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

In [None]:
class DenseTransformer(TransformerMixin):
    """Pipeline step that turns a sparse feature matrix into a dense array.

    Intended to sit between a vectorizer and an estimator that only accepts
    dense input. The step is stateless: ``fit`` learns nothing.
    """

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self`` so the step can be chained in a pipeline."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        ``toarray()`` is used instead of ``todense()`` because the latter returns a
        ``numpy.matrix``, which recent scikit-learn releases reject as estimator
        input. Input without a ``toarray`` method is assumed to be dense already
        and is passed through ``np.asarray``.
        """
        if hasattr(X, 'toarray'):
            return X.toarray()
        return np.asarray(X)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit (a no-op) and return the dense version of ``X``."""
        return self.fit(X, y, **fit_params).transform(X)

In [None]:
# Raw text -> token counts -> dense array -> random forest.
# NOTE(review): the following cell rebinds `pipeline`, so this definition is replaced
# before any visible cell fits or uses it.
pipeline = Pipeline([
     ('vectorizer', CountVectorizer()), 
     ('to_dense', DenseTransformer()), 
     ('classifier', RandomForestClassifier())
])

In [None]:
from sklearn.svm import LinearSVC

# Raw text -> token counts -> linear SVM
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', LinearSVC()),
])

In [None]:
# Define models to test
model_list = [['GaussianNB', GaussianNB()],
                ['BernoulliNB', BernoulliNB()],
#                 ['MultinomialNB', MultinomialNB()],
                ['DecisionTree', DecisionTreeClassifier()],
                ['KNN', KNeighborsClassifier(10)],
                ['RandomForest', RandomForestClassifier()],
                ['GradientBoost', GradientBoostingClassifier()],
                ['AdaBoost', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=DecisionTreeClassifier())],
                ['XGBoost', XGBClassifier()],
                ['LogisticRegression', LogisticRegression()],
                ['SVM', SVC(probability=True)]] # scale data; F1 0.57

model_list_s = ['KNN','LogisticRegression','SVM'] # standardize/normalize data
dense_only = ['GaussianNB'] # these estimators reject scipy sparse matrices


def _take_rows(data, positions):
    """Select rows by integer position from a pandas object, NumPy array or sparse matrix."""
    if hasattr(data, 'iloc'):
        # pandas: plain [] would look the positions up as labels
        return data.iloc[positions]
    return data[positions]


# Calculate metrics for each model
roc = {}
results_dict = {}

# Fixed seed, so every model is scored on the same five shuffled folds
kf = KFold(5, random_state=42, shuffle=True)

for model_name, model in model_list:
    if model_name in dense_only and hasattr(X, 'toarray'):
        # Densifying the full bag-of-words matrix is not feasible, so skip rather than crash
        print(model_name, 'skipped: needs dense input but X is sparse')
        print('\n')
        continue

    fold_scores = {"accuracy": [], "precision_s": [], "precision_f": [], "recall_s": [],
                   "recall_f": [], "f1_s": [], "f1_f": [], "auc": []}

    # Perform K-Fold CV and calculate metrics for each fold
    for train_idx, test_idx in kf.split(X, y=y):
        # Slice with the fold indices so that each fold really trains and scores on
        # different rows (previously the indices were ignored).
        X_fit, X_eval = _take_rows(X, train_idx), _take_rows(X, test_idx)
        y_fit, y_eval = _take_rows(y, train_idx), _take_rows(y, test_idx)

        if model_name in model_list_s:
            # Scale inside the fold, using local names only: the scaler never sees the
            # evaluation rows and models that do not need scaling keep the raw features.
            fold_scaler = StandardScaler(with_mean=False).fit(X_fit)
            X_fit = fold_scaler.transform(X_fit)
            X_eval = fold_scaler.transform(X_eval)

        model.fit(X_fit, y_fit)
        y_pred = model.predict(X_eval)
        y_score = model.predict_proba(X_eval)[:,1]

        fold_scores["accuracy"].append(accuracy_score(y_eval, y_pred))
        fold_scores["precision_s"].append(precision_score(y_eval, y_pred, pos_label=1))
        fold_scores["precision_f"].append(precision_score(y_eval, y_pred, pos_label=0))
        fold_scores["recall_s"].append(recall_score(y_eval, y_pred, pos_label=1))
        fold_scores["recall_f"].append(recall_score(y_eval, y_pred, pos_label=0))
        fold_scores["f1_s"].append(f1_score(y_eval, y_pred, pos_label=1))
        fold_scores["f1_f"].append(f1_score(y_eval, y_pred, pos_label=0))
        # AUC is a ranking metric: feed it the positive-class probability, not hard labels
        fold_scores["auc"].append(roc_auc_score(y_eval, y_score))

    # Calculate mean metric across K-folds
    mean_scores = {metric: np.mean(values) for metric, values in fold_scores.items()}
    mean_accuracy = mean_scores["accuracy"]
    mean_auc = mean_scores["auc"]

    # Capture TPR and FPR from last fold for plotting
    roc[model_name] = roc_curve(y_eval, y_score), mean_auc
    results_dict[model_name] = mean_scores

    # Print formatted results (confusion matrix and report describe the last fold)
    print(model)
    print('\t==============================')
    print('\tAccuracy:', mean_accuracy)
    print('\tAUC:', mean_auc)
    print('\n')
    print(confusion_matrix(y_eval,y_pred))
    print(classification_report(y_eval,y_pred))

In [None]:
# Plot the ROC curve from the last K-Fold split
fig, ax = plt.subplots(figsize=(10, 10))

# Chance-level diagonal for reference
ax.plot([0,1],[0,1], ls='--', color='k', label='50-50')

# One curve per classifier; each entry is ((fpr, tpr, thresholds), mean AUC)
for model_name, (curve, mean_auc) in roc.items():
    fpr, tpr = curve[0], curve[1]
    ax.plot(fpr, tpr, label='{}, AUC: {}%'.format(model_name, round(100*mean_auc,1)))

ax.set(xlabel='FPR', ylabel='TPR')
ax.set_title('ROC Curve - All Models',fontweight='bold',fontsize=15)
ax.legend(loc='best')
plt.savefig('../charts/toxic_roc.png')

In [None]:
# One row per model, one column per metric
rd = pd.DataFrame(results_dict).T
# Sort while the values are still numeric: once they are "xx.x%" strings the order is
# lexicographic (for example "9.5%" would rank above "85.0%").
rd = rd.sort_values(['auc'],ascending=[False])
rd = rd.apply(lambda x: round(100*x,1).astype(str) + "%")
rd

In [None]:
# cv = CountVectorizer()

# r = pd.SparseDataFrame(cv.fit_transform(text), 
#                        df.index,
#                        cv.get_feature_names(), 
#                        default_fill_value=0)

In [None]:
# text = df['comment_text'].iloc[0]
# x_back = count_vectorizer(text)
# df1 = pd.DataFrame(x_back,columns=vectorizer.get_feature_names())

In [None]:
# stop = stopwords.words('english')
# stop += ['.', ',', '(', ')', "'", '"']
# stop = set(stop)

# counter = Counter()

# n = 2
# for doc in df['comment_text']:
#     words = TextBlob(doc).words
#     words = [w for w in words if w not in stop]
#     bigrams = ngrams(words, n)
#     counter += Counter(bigrams)

# for phrase, count in counter.most_common(30):
#     print('%20s %i' % (" ".join(phrase), count))

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer

# Three-sentence toy corpus for the vectorizer examples
text = [
    'That is should come to this!',
    'This above all: to thine own self be true.',
    'Something is rotten in the state of Denmark.',
]

# # CountVectorizer is a class; so `vectorizer` below represents an instance of that object.
# vectorizer = CountVectorizer(ngram_range=(1,2)) # selects uni and bigrams

# # call `fit` to build the vocabulary
# vectorizer.fit(text)

# # then, use `get_feature_names` to return the tokens
# print(vectorizer.get_feature_names())

# # finally, call `transform` to convert text to a bag of words
# x = vectorizer.transform(text)

In [None]:
# print('Sparse Matrix')
# # A compressed version; the "sparse" matrix.
# print(type(x))
# print(x)

# print ('Matrix')
# x_back = x.toarray()
# print(type(x_back))
# print(x_back)

In [None]:
# x_back = df['token_clean'].apply(count_vectorizer)
# Token counts for every comment. This repeats the CountVectorizer step from the earlier
# feature cell and rebinds `count_vect`.
# NOTE(review): despite the name, X_train_counts is built from all of `df`, not a train split.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df['comment_text'])
print(X_train_counts.shape)
X_train_counts

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw token counts from the previous cell with TF-IDF; the displayed
# shape should match the shape of X_train_counts.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB
# The TF-IDF rows follow df['comment_text'], so the target must come from the same frame.
# The previous target, `twenty_train.target`, is a tutorial leftover that is not defined
# in this notebook. `toxic` is the label used for `y` in the earlier modelling cells.
clf = MultinomialNB().fit(X_train_tfidf, df['toxic'])

In [None]:
# Dense bag-of-words table for the toy `text` corpus. `vectorizer` and `x_back` are
# built here because the cells that used to create them are commented out.
vectorizer = CountVectorizer(ngram_range=(1,2)) # selects uni and bigrams
x_back = vectorizer.fit_transform(text).toarray()
# Newer scikit-learn replaces get_feature_names() with get_feature_names_out()
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())
pd.DataFrame(x_back, columns=feature_names)

In [None]:
# x_back = count_vectorizer(text)
# pd.DataFrame(x_back, columns=vectorizer.get_feature_names())

In [None]:
#### TF: frequency in this document
#### IDF: inverse frequency in the corpus

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Vectorize the notebook's comments rather than the three-sentence toy `text`:
# the classifier needs exactly one label per vectorized document.
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,2))
doc_vectors = vectorizer.fit_transform(df['comment_text'])

# Labels aligned row-for-row with doc_vectors. The previous hard-coded 50 'pos' + 50 'neg'
# array did not match the number of documents, which makes fit() fail.
classes = df['toxic'].values


model = MultinomialNB().fit(doc_vectors, classes)

In [None]:
# sentences = df['sent_token'].iloc[0]
# whitespace_tokenizer(sentences)

In [None]:
# tokenizer = TreebankWordTokenizer()
# tokenizer.tokenize(sentences[2])

In [None]:
# tokenizer = WhitespaceTokenizer()
# tokenizer.tokenize(sentences[2])

In [None]:
# df.head()

In [None]:
# Persist the working frame, including the columns added above, to disk
df.to_pickle('../data/toxictrain.pkl')