In [1]:
import pandas as pd
import numpy as np

import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

import nltk
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

[nltk_data] Error loading stopwords: <urlopen error Tunnel connection
[nltk_data]     failed: 407 Proxy Authentication Required>


## Import Data

In [2]:
data = pd.read_csv("train.csv",usecols=["id","comment_text", "toxic"])

In [3]:
data.head()

Unnamed: 0,id,comment_text,toxic
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0


## Data Cleaning and Pre-Processing

In [4]:
data.toxic.value_counts()

0    144277
1     15294
Name: toxic, dtype: int64

In [5]:
# 10.6 percent of comments are toxic 
data[data["toxic"]==1].count()[1] / data[data["toxic"]==0].count()[1] *100

10.600442204925248

## Process text data 

In [40]:
# Create a new column to work on the text
data['clean_text'] = data['comment_text']

In [41]:
# remove new line, make lower case, remove apostrophe 

def remove_new_line(mystring):
    mystring = mystring.replace("\n", " ") #  removes the new line characters 
    mystring = mystring.lower()            # turns the whole string to lower case letters 
    mystring = mystring.replace("'", "")   # removes appostrophes 
    return(mystring)

In [42]:
# create regex that will allow for grabing only the words in the comments
prog = re.compile("[a-z]+")
#result = prog.findall(test_s_2)

In [43]:
def grab_words(words):
    '''grabs only the words from the comments and returns them in a string'''
    return(' '.join(prog.findall(words)))

In [44]:
def remove_stop_words(word_list):
    new_words = []
    for word in word_list:
        if not word in stopwords.words('english'):
            new_words.append(word)
    return(new_words)

### Run data pre-processing -- Use bi- grams only

In [74]:
data["clean_text"] = data["clean_text"].map(remove_new_line)

In [75]:
data["clean_text"] = data["clean_text"].map(grab_words)

In [76]:
# run the cleaned data through a vectorizer which also removes the english stopwords 
vectorizer = CountVectorizer(stop_words='english',ngram_range=(2, 2) )
X = vectorizer.fit_transform(data.clean_text)
X = X.tocsc()
y = np.array(data.toxic , dtype=int)

In [77]:
# Split the data for training and testing  
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [78]:
# reshape for ml to work 
y_train = np.array(Y_train, dtype=int)
y_test = np.array(Y_test, dtype=int)

## Run Naive Bayes Classifier 

In [79]:
# create an instance of the Naive Bayes classifier 
mNB = MultinomialNB()
# fit the training data. 
mNB.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [80]:
mNB.score(X_train, y_train)

0.9869336341417559

In [81]:
#  Accruacy score for the test set  
mNB.score(X_test, y_test)

0.7159329468901771

In [82]:
metrics.roc_auc_score(y_test,mNB.predict_proba(X_test)[:,1])

0.7406532317991665

In [83]:
print(metrics.f1_score(y_test, mNB.predict(X_test)))

0.29949003245248035


### Run data pre-processing -- Use tri- grams only

In [84]:
data["clean_text"] = data["clean_text"].map(remove_new_line)

In [85]:
data["clean_text"] = data["clean_text"].map(grab_words)

In [86]:
# run the cleaned data through a vectorizer which also removes the english stopwords 
vectorizer = CountVectorizer(stop_words='english',ngram_range=(3, 3) )
X = vectorizer.fit_transform(data.clean_text)
X = X.tocsc()
y = np.array(data.toxic , dtype=int)

In [87]:
# Split the data for training and testing  
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [88]:
# reshape for ml to work 
y_train = np.array(Y_train, dtype=int)
y_test = np.array(Y_test, dtype=int)

## Run Naive Bayes Classifier 

In [89]:
# create an instance of the Naive Bayes classifier 
mNB = MultinomialNB()
# fit the training data. 
mNB.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [90]:
mNB.score(X_train, y_train)

0.9878188255937833

In [91]:
#  Accruacy score for the test set  
mNB.score(X_test, y_test)

0.32928090239699204

In [92]:
metrics.roc_auc_score(y_test,mNB.predict_proba(X_test)[:,1])

0.48373121100261984

In [93]:
print(metrics.f1_score(y_test, mNB.predict(X_test)))

0.169988367584335


### Run data pre-processing -- Use singles, bi-grams and tri-grams 

In [94]:
data["clean_text"] = data["clean_text"].map(remove_new_line)

In [95]:
data["clean_text"] = data["clean_text"].map(grab_words)

In [96]:
# run the cleaned data through a vectorizer which also removes the english stopwords 
vectorizer = CountVectorizer(stop_words='english',ngram_range=(1, 3) )
X = vectorizer.fit_transform(data.clean_text)
X = X.tocsc()
y = np.array(data.toxic , dtype=int)

In [97]:
# Split the data for training and testing  
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [98]:
# reshape for ml to work 
y_train = np.array(Y_train, dtype=int)
y_test = np.array(Y_test, dtype=int)

## Run Naive Bayes Classifier 

In [99]:
# create an instance of the Naive Bayes classifier 
mNB = MultinomialNB()
# fit the training data. 
mNB.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [100]:
mNB.score(X_train, y_train)

0.988320172964843

In [101]:
#  Accruacy score for the test set  
mNB.score(X_test, y_test)

0.9378035406548645

In [102]:
metrics.roc_auc_score(y_test,mNB.predict_proba(X_test)[:,1])

0.8753900871886763

In [103]:
print(metrics.f1_score(y_test, mNB.predict(X_test)))

0.6258246936852027


## RESULTS OF ADDING BI-GRAMS and TRI_GRAMS

Type of Vectorizer |Train Accuracy| Test Accuracy| AUC | F1-score
--- | --- | --- | --- | ---
SINGLES  | 95.7% | 94.1% |0.90 | 0.67
ALL THREE | 98.8% | 93.8% | 0.88 | 0.63 
BI-GRAMS | 98.7% | 71.6% | 0.74 | 0.30
TRI-GRAMS | 98.8% | 32.9% | 0.48 | 0.17


## We can see from the above chart that adding bi-grams and tri-grams did not add to the effectiveness of the model. 