# Toxic comments

This notebook takes you though a complete iteration of Machine Learning Assignment 1 - Toxic comments. The assignment details (including links to download the data) can be found [here](https://docs.google.com/document/d/1WGYw99e5q6j5V0Zrf2HveagU6URt_kVvdR8B9HYQ99E/edit?usp=sharing). 

In [None]:
# %%time
# # need to activate deeplearning environment in anaconda prompt each time
# # https://www.youtube.com/watch?v=MIkZ6cDE53w
# import keras

In [1]:
# all imports and magic commands
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from my_measures import BinaryClassificationPerformance
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import train_test_split
%matplotlib inline


from numpy import array
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
# from keras.preprocessing.text import Tokenizer
# from keras.models import Sequential
# from keras.layers import Dense
# from keras.layers import Dropout

### IMPORTANT!!! Make sure you are using `BinaryClassificationPerformance` v1.02

In [None]:
help(BinaryClassificationPerformance)

# Function for feature building and extraction on natural language data

In [2]:
# function that takes raw data and completes all preprocessing required before model fits
def process_raw_data(fn, my_random_seed, test=False):
    toxic_data = pd.read_csv(fn)
    if (not test):
        # add an indicator for any toxic, severe toxic, obscene, threat, insult, or indentity hate
        toxic_data['any_toxic'] = (toxic_data['toxic'] + toxic_data['severe_toxic'] + toxic_data['obscene'] + toxic_data['threat'] + toxic_data['insult'] + toxic_data['identity_hate'] > 0)
    print("toxic_data is:", type(toxic_data))
    print("toxic_data has", toxic_data.shape[0], "rows and", toxic_data.shape[1], "columns", "\n")
    print("the data types for each of the columns in toxic_data:")
    print(toxic_data.dtypes, "\n")
    print("the first 5 rows in toxic_data:")
    print(toxic_data.head(5))

    if (not test):
        print("The rate of 'toxic' Wikipedia comments in the dataset: ")
        print(toxic_data['any_toxic'].mean())

    if (not test): # fit_transform()
        hv = HashingVectorizer(n_features=2 ** 14, alternate_sign=False, 
                               ngram_range=(1,2), token_pattern=r'\b\w+\b', 
                               stop_words='english')       
        X_hv = hv.fit_transform(toxic_data.comment_text)
        fitted_transformations.append(hv)
        print("Shape of HashingVectorizer X:")
        print(X_hv.shape)
    else: # transform() 
        X_hv = fitted_transformations[0].transform(toxic_data.comment_text)
        print("Shape of HashingVectorizer X:")
        print(X_hv.shape)
    # submission condition is test: what's the diff between test and not test here?  
    # 'fitted_transformations.append' vs. '.transform'.  Why in the test condition does transform the data
    # "fit" computes the mean and std to be used for later scaling. (just a computation) 
    # "transform" uses a previously computed mean and std to autoscale the data
        
    if (not test):
        chv = HashingVectorizer(n_features=2 ** 14, alternate_sign=False, 
                                analyzer='char_wb', ngram_range=(2,4))
        X_chv = chv.fit_transform(toxic_data.comment_text)
        fitted_transformations.append(chv)
        print("Shape of HashingVectorizer char n_gram X:")
        print(X_chv.shape)
    else:
        X_chv = fitted_transformations[1].transform(toxic_data.comment_text)
        print("Shape of HashingVectorizer char n_gram X:")
        print(X_chv.shape)
    
#     X_cv = hstack([X_hv, X_chv])
#     print("Shape of Vectorizer combined X:")
#     print(X_cv.shape)
    
    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
#     if (not test):
#         transformer = TfidfTransformer()
#         X_tfidf = transformer.fit_transform(X_cv)
#         fitted_transformations.append(transformer)
#     else:
#         X_tfidf = fitted_transformations[1].transform(X_cv)
    
    if (not test):
        transformer = TfidfTransformer()
        X_tfidf = transformer.fit_transform(X_hv)
        fitted_transformations.append(transformer)
    else:
        X_tfidf = fitted_transformations[2].transform(X_hv)

    # character n-grams   
    if (not test):
        transformer_chv = TfidfTransformer()
        X_tfidf_chv = transformer_chv.fit_transform(X_chv)
        fitted_transformations.append(transformer_chv)
    else:
        X_tfidf_chv = fitted_transformations[3].transform(X_chv)
    
    # create additional quantitative features

    # what does the form toxic_data['comment_text'] mean?
    toxic_data['char_count'] = toxic_data['comment_text'].str.len()
    toxic_data['Caps_count'] = toxic_data['comment_text'].str.count(r'[A-Z]')
    toxic_data['Caps_ratio'] = pow((toxic_data['Caps_count']+1)/toxic_data['char_count'], 3)
    
    toxic_data['word_count'] = toxic_data['comment_text'].str.split(' ').str.len()
    toxic_data['char_ratio'] = toxic_data['char_count']/toxic_data['word_count']
    
    toxic_data['punc_count_p'] = toxic_data['comment_text'].str.count("\.")
    toxic_data['punc_count_exc'] = toxic_data['comment_text'].str.count("\!")
    toxic_data['punc_count_q'] = toxic_data['comment_text'].str.count("\?")
    toxic_data['punc_count_c'] = toxic_data['comment_text'].str.count("\,")
    toxic_data['punc_count_a'] = toxic_data['comment_text'].str.count("\*")
    toxic_data['punc_count_s'] = toxic_data['comment_text'].str.count("\;")
    
    toxic_data['punc_count'] = toxic_data['punc_count_s'] + toxic_data['punc_count_a'] + toxic_data['punc_count_c'] + toxic_data['punc_count_p'] + toxic_data['punc_count_exc'] + toxic_data['punc_count_q']
    toxic_data['punc_ratio'] = toxic_data['punc_count']/toxic_data['word_count']
    toxic_data['punc_exc_ratio'] = (toxic_data['punc_count_exc']+0.1) / (toxic_data['punc_count']+0.1)
    toxic_data['punc_q_ratio'] = (toxic_data['punc_count_q']+0.1) / (toxic_data['punc_count']+0.1)
    
    toxic_data['Capsword_ratio'] = (toxic_data['Caps_count'] + toxic_data['punc_count'])/toxic_data['word_count']
    
    toxic_data['spaces_count'] = toxic_data['comment_text'].str.count(" ")
    toxic_data['spaces_ratio'] = toxic_data['spaces_count']/toxic_data['char_count']
    toxic_data['spaceswords_ratio'] = toxic_data['spaces_count']/toxic_data['word_count']
    
    # count the number of consecutive caps letters
    toxic_data['consCaps_count'] = toxic_data['comment_text'].str.count(r'[A-Z]{3}')
    toxic_data['consCaps_ratio'] = toxic_data['consCaps_count']/toxic_data['char_count']
    toxic_data['consCapsword_ratio'] = toxic_data['consCaps_count']/toxic_data['word_count']
    
#     toxic_data['word_count_f'] =(toxic_data['comment_text'].str.lower().str.count("fuck"))
#     toxic_data['word_count_s'] =(toxic_data['comment_text'].str.lower().str.count("shit"))
#     toxic_data['word_count_h'] =(toxic_data['comment_text'].str.lower().str.count("hell"))
    
    # current count: 22 quant features
    X_quant_features = toxic_data[["consCaps_count", "consCaps_ratio", "consCapsword_ratio", "punc_q_ratio", "punc_exc_ratio", "punc_count_s", "punc_count_a", "punc_count_c", "spaceswords_ratio", "spaces_ratio", "spaces_count", "char_count", "Caps_count", "Caps_ratio", "word_count", "char_ratio", "punc_count_p", "punc_count_exc", "punc_count_q", "punc_count", "punc_ratio", "Capsword_ratio"]]
    # for specific quant_feat
#     X_quant_features = toxic_data[["char_count"]]
    print("Look at a few rows of the new quantitative features: ")
    print(X_quant_features.head(10))
    
    # RBF sampler; what does this do?  Why is it returning only n_components as 
    # more on RBF and kernels: https://towardsdatascience.com/an-intro-to-kernels-9ff6c6a6a8dc
    # features and not n_components per feature?
#     if (not test):
#         rbf = RBFSampler(n_components=10)
#         X_rbf = rbf.fit_transform(X_tfidf)
#         print(X_rbf.shape)
#         y = toxic_data['any_toxic']
#     else:
#         X_rbf = fitted_transformations[4].transform(X_tfidf)
#         print(X_rbf.shape)       
    
#     # make rbf non-negative for Naive Bayes.  Not sure why I'm doing this, as presumably rbf is for SVM or SVC purposes.
#     if (not test):
#         mms = MinMaxScaler(feature_range=(0,1))
#         X_mms = mms.fit_transform(X_rbf)
#         y = toxic_data['any_toxic']
#     else:
#         X_mms = fitted_transformations[5].transform(X_rbf)
#         print(X_mms.shape)  

        
    # Combine all quantitative features into a single sparse matrix
    X_quant_features_csr = csr_matrix(X_quant_features)
    # hstack wo char n-grams
#     X_combined = hstack([X_tfidf, X_quant_features_csr])
    X_combined = hstack([X_tfidf, X_tfidf_chv, X_quant_features_csr])
    X_matrix = csr_matrix(X_combined) # convert to sparse matrix
    print("Size of combined bag of words and new quantitative variables matrix:")
    print(X_matrix.shape)
     
        
    # Create `X`, scaled matrix of features
    # feature scaling
    if (not test):
        sc = StandardScaler(with_mean=False)
        X = sc.fit_transform(X_matrix)
        fitted_transformations.append(sc)
        print(X.shape)
        y = toxic_data['any_toxic']
    else:
        X = fitted_transformations[4].transform(X_matrix)
        print(X.shape)
        
#     # kernel approximation is an alternative to PolyFeat (which fails due to insufficient memory)
#     # RBF sampler; what does this do?
#     if (not test):
#         rbf = RBFSampler(n_components=10)
#         X = rbf.fit_transform(X_matrix)
#         fitted_transformations.append(rbf)
#         print(X.shape)
#         y = toxic_data['any_toxic']
#     else:
#         X = fitted_transformations[4].transform(X_matrix)
#         print(X.shape)


    # Create Training and Test Sets
    # enter an integer for the random_state parameter; any integer will work
    if (test):
        X_submission_test = X
        print("Shape of X_test for submission:")
        print(X_submission_test.shape)
        print('SUCCESS!')
        return(toxic_data, X_submission_test)
    else: 
        X_train, X_test, y_train, y_test, X_raw_train, X_raw_test = train_test_split(X, y, toxic_data, test_size=0.2, random_state=my_random_seed)
        print("Shape of X_train and X_test:")
        print(X_train.shape)
        print(X_test.shape)
        print("Shape of y_train and y_test:")
        print(y_train.shape)
        print(y_test.shape)
        print("Shape of X_raw_train and X_raw_test:")
        print(X_raw_train.shape)
        print(X_raw_test.shape)
        print('SUCCESS!')
        return(X_train, X_test, y_train, y_test, X_raw_train, X_raw_test)

# Clean Data and Create Vocabulary
#### Turns out CountVectorizer already creates a vocabulary, and HashingV creates a sparse matrix rep of vocab
#### The below is therefore irrelevant
#### It may still be important to see if there's a way to clean dataset before HashingV or with HV
#### Using Keras https://machinelearningmastery.com/deep-learning-bag-of-words-model-sentiment-analysis/

In [56]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
import string
import csv

# load doc into memory
def load_doc(fn):
#     # open the file as read only
#     file = open(fn, 'r')
#     # read all text
#     text = file.read()
#     # close the file
#     file.close()
#     return text

#     with open(fn,'r', encoding="utf8") as csv_file:
#         csv_reader = csv.reader(csv_file)
#         for lines in csv_reader:
#             text = lines[1]
        
    csv_text = pd.read_csv(fn, dtype={"comment_text": "string"})
#     csv_text = pd.read_csv(fn)
    text = csv_text.comment_text
    # need 'return text'; otherwise Nonetype object means nothing is being returned???
    return text.fillna('')
#     text = text.astype(str)

# # turn a doc into clean tokens
# # don't think I need '(doc)' here
def clean_doc(doc):
    # split into tokens by white space
#     tokens = text.apply(word_tokenize)
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

# load the document
# fn = '/Users/Charles/Desktop/ML/A1/Test sample code/txt_sentoken/pos/cv000_29590.txt'
fn = '/Users/Charles/Desktop/ML/A1/toxiccomments_train_for testing.csv'
text = load_doc(fn)
# print(text.head(5))
tokenized = [clean_doc(i) for i in text]
# tokens = clean_doc(text)
# tokens = text.apply(clean_doc(text), axis=1, raw=True)
print(tokenized)

[['Explanation', 'Why', 'edits', 'made', 'username', 'Hardcore', 'Metallica', 'Fan', 'reverted', 'They', 'werent', 'vandalisms', 'closure', 'GAs', 'voted', 'New', 'York', 'Dolls', 'FAC', 'And', 'please', 'dont', 'remove', 'template', 'talk', 'page', 'since', 'Im', 'retired'], ['Daww', 'He', 'matches', 'background', 'colour', 'Im', 'seemingly', 'stuck', 'Thanks', 'talk', 'January', 'UTC'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], 

In [None]:
# to fill NAN (blank) cells with empty string
# https://stackoverflow.com/questions/10867028/get-pandas-read-csv-to-read-empty-values-as-empty-string-instead-of-nan

# apply function to pandas dataframe
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html

# specifying data types when using pd.read_csv
# https://stackoverflow.com/questions/24251219/pandas-read-csv-low-memory-and-dtype-options
# https://stackoverflow.com/questions/53527230/python-pandas-nltk-tokenize-column-in-pandas-dataframe-expected-string-or-bytes

# to process list of lists in certain functions, need to use for loops 
# https://stackoverflow.com/questions/50048472/attributeerror-list-object-has-no-attribute-translate-while-removing-punctu

In [None]:
f_in = open('/Users/Charles/Desktop/ML/A1/toxiccomments_train.csv','r', encoding="utf8")
for line in f_in.readlines():
    print(line)
f_in.close()

In [None]:
import csv

# using a 'with' statement does not require opening and closing file, which if neglected could impair code
# https://www.geeksforgeeks.org/with-statement-in-python/
# https://www.foxinfotech.in/2018/09/python-read-csv-columns-into-list.html

with open('/Users/Charles/Desktop/ML/A1/toxiccomments_train.csv','r') as csv_file:
    csv_reader = csv.reader(csv_file)
    for lines in csv_reader:
        print(lines[1])

In [32]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import csv
import pandas as pd

fn = '/Users/Charles/Desktop/ML/A1/toxiccomments_train.csv'

td = pd.read_csv(fn)
t = td.comment_text

t.apply(word_tokenize)

0         [Explanation, Why, the, edits, made, under, my...
1         [D'aww, !, He, matches, this, background, colo...
2         [Hey, man, ,, I, 'm, really, not, trying, to, ...
3         [``, More, I, ca, n't, make, any, real, sugges...
4         [You, ,, sir, ,, are, my, hero, ., Any, chance...
                                ...                        
159566    [``, :, :, :, :, :, And, for, the, second, tim...
159567    [You, should, be, ashamed, of, yourself, That,...
159568    [Spitzer, Umm, ,, theres, no, actual, article,...
159569    [And, it, looks, like, it, was, actually, you,...
159570    [``, And, ..., I, really, do, n't, think, you,...
Name: comment_text, Length: 159571, dtype: object

In [27]:
import csv

fn = '/Users/Charles/Desktop/ML/A1/toxiccomments_train.csv'

with open(fn,'r', encoding="utf8") as csv_file:
    csv_reader = csv.reader(csv_file)
    for lines in csv_reader:
        text = lines[1]
        print(text[1])

o
x
'
e


o


O
o
o
l


b
e
e
h


y
E
h
o










a
h
e


=
o
o
r
 






r
i
o


F
o
U
'
 
o
n


:


E
r
h
w
t
T


y
h
o
 
E




l


 
h




 
h




a
h
n
 
i




c
p


a
o
h
e
h
e
r


o
A




o
h
o
o
h
 


h
 
 


'
e
n




h
a
l
i
l






o
i
h
=
e
h


.
h
o
C
e
 
e
h
e




M
6
e


P
h


z
s


r




h






 


n


=




r
h


n
i
o
t


o
r
h
o
k
 
6
o
E


o
)
=
 


o


e


b
 


b
o
h
e
 
o
l


o


 


'
i
n
 
a
 
u
t




e
i
n
i
e
e


:
 
l


'


t


 
u
e
a


l
e
 
 
t
=
l
a
o
i
t
h
 
 


o


 
o
l


*
o
a
 


#


i
d
 
e
o
t


o
h
e
 
 


l
 
e
 
h
0
e
e
:
e


h


'
n
r
 
e
 
h
t


u


e


i
t
'


i
h
p
U






o
e
e
 


e
h
v


h
o
A
o
f
E
h
t
o
e


n


a


h
H
R


e
h


u


k
o


a


u


h


h


'
'
k


i
P
 
r
o
a


a
A


P
p




O
E
P
r
h


E


o
h
o




 
s
h
p
e
"


f


n




e




a
i


'


n








i
u
h


o
c
e


e
a
e






 
 
a


o
e
x
i
e


 
a
u
h
:
i
l
e
h
h
U
n


i
d
m
0




 


u
,


0
u
p
e
h
s
t
n


o
t
p
a
h
e
*
s
h
r
a


n
e
=


r
a




 
i
t
o
h
a
e
"
n


r
i
r
o


i
i
 


W
o




o
o
e
o
 
a
l
a






k
:
m
J
h
h








o


h
l
r




9
 
}
a
.
h


n
h
o
o
:
o
O
 


 






h






u
f




 
o


s




 


w
i
:


i
u


o




h
l
u
h
t
l
a
h
s
l
 
y




e
o
i
e
e
l
s
h




h
o
e
F
U


e
o


a
h


 
o
h
 
 
i


t
:
n
e
h
o
h
h


Y
h
 
h
c
h
a
e
:
h
n
o




 
o
a
o


n
M
h


u
h


 
a
 
'
I
 


e


e


h
h
 
W
h


o


 


h
I
w


e
o
A
 


m
E
o
 
s


m


 
'
i
h
:
N
o
e
K
E
i
i
 
h
 
 
e
.
m
i


h
o
h
l
e


e
b


e
o
o
r
 
o
o
B
R
o
I


t






o




h
i
e
h
n
 
e
 




 
e


e
o
i


e
a
o
h
e


a
,
d
s
i
n
e
m
 


e


k
o
r
o


h
S


m
i
o


l
o


m


:
h
t
i
e


'
 
X
e
u




o




o






e
T


4
f
d
h
o
c
:
h
u
h
a


o
a
o


e
a
e


n


l
T
n
h
o
N
y
 
n
n


 


u
s
 


r


o
M
r
r
h
n
e
o
s
e
:
'


E
i
 
'
v
I


E
F


h
h
h
 
a
a
 
i
O


e






i
 
u
a
h
:
h
r
o


 
h
h
h
e




i
a
h
 
h
U
p
o
 


 


t




e
O


e
f
o
e
S


a






e
y


o
r


H




p
h
a
o


h


E
r
b
p
o
o
a


l
I
]


h
o


e
a
s
e
 
e
t
P


'
x
x


 
n


 
o
f


]


i






e
 
o
A
e
i
i
I
o
h
h


h


s








 
4
k
f


o
l


n
n
e
S
e
i
e






e
'
o
O
h
u




a
:
e
 




 
f
 
a
h
o
e
 
 
h
i
:
 


h
h
h
k


e
o


i
 
i
A
y
o
f
 
h
a
l
o
w




e
o
o


o
e
@
o
o
i
s
e
o
l
o
h
O
h
e
B


o
o
A
o
E






r
o
h
e
h
h
s
e


h
n


e
u
n


H


h


o
h
r






i


o
t
l
r






o
h
h
E


e
e
e
a


T




'


A
e
O


h
o






t
c
t
i
,
t
e


o
l
h
e
 
h
M




o




P


s
i
n
n
o
o
a


o
h


 
K
e
E
h
:




h






T
a
i
h
o


p
h
u
h
m
e




h
h
e
e
h
 
d
.
i
i


O
l




p
e
e
l
t


h
h


T


u
 


c


e


m


e
R
h
h
e
h
l
l
E
o
 
a
 


'


h






a


a
f
H


o
a
h
o
:
i
 
p
o
o
i




l
a


h






i
y


 


B




h
u


b
a
 
 


e
 


e
'


h
 
e
r
o


e




'


 
h


h
f
 


r
a




o
a


l
r


i




i
r
s




r


r
=
s
o
I
i
h
n


E
m
e
u
m
h
i
h




'
u
h
 
'


o
'


x
U
I


 
h




h
e
h
t
 


a
g
h


a
h
p


t
h
h
 
o
u
U
 


r


e
o
E
M
u
 
h
I


a
r
h
e
 


'
'
 
h
 
o
:
K
e
h


a
O




h
y


o
h


 
n




a
T
e
4
U
o
i


e
t


=


o
 
i
n


w
 




0
e
t
u
h


h
.
n
h
h
1


h
 
a
o






e
h


f
T
i


m
v
f


o
t
e
n
u
I
a
h
:
 




i
h


s
o
 
o
U
K






h


a
o


{


V


o
a
I


t
h
f
=
h


:


d
o
d


g




p


 
a
t
o
 








i
h
a
a


e


U
u
a
w
i
h
n
a


 


l
m
 
 
:
 
h
a


 
h
o
l
l
 
t
o
e
i
i
{
o


e
 
o


f
9


o
o
A
s
h
s
:
 
o
t
9
|
o
n






U


h
 
v
'
e
e


u


h
W
i
l
 
t
o


e
n
h
a
x
t
t




h
o
c
h
e
 


u
h
 
r
 
 
o
e
 
f






a
h


O


 
'
i


l
n
T




a
s
e
 
e
d
a












e
o
i


E
t
x


o


h
e


h
f


f
u
f






p




n
A


o
r
 


u
u
o
o
e


o
o
t
n


L
 
e




 
 
2
S
E
o




h
o


o
o


e
,


r
o
S
 




o
o
e




:


t
f
e
 
h


h
H
 
E
t
o




*
o




n
o
h
0
O
a


a
t
S
 




o


p
a
a


o
D
l
o
A
=
o




p
 


e
e
 


o
u


a
t
K
i
h
i


l






a
e
n


e
h
e
r
o
 
l
l
 
n
k


n
k
o
i
a




e
h
m
o


o
i
n


e


e
E
m
 
I
e
l
 
i
l
Y


S






n
Y
e


T




o
 






o
h


i
 
e


 
 
 
e


T
E
T


a
o
o
u
e
.
i
o
e
f
k
e
t
l
 
S


o
l
o
o
'
n
P
L
h
o


h




 
c
t
u
o
o
l
t


 
o
o
t
r




n
h
l
h
h
 


h




o
p


i


a


 
h
e


r
e
:
 
Y
_
=


o
n
e
r


h
o
u


e
 
t
B


 




o
 
n
I
T
i
o
i


 
h


i
h
f
I


E
e
a
 
 


e


e
u
 
h
 




f
e
=
e
i
O


u
h








e
o
a
h
p
u
o
a


f
a
u
a
u
h










:


F
e
e
m
O
_


S
T
c


h
I
l
l


o


o
o
r


f
H
e
o
 
u


o
o


 
a
h


 
=
o
e
 
S
 
y
u
m


h
t


A


 
s
 


 
o
 
t
P
i




H
 
h


h
’




,


S
a
h
e


r


e
'
t


e


 


o
'
m
U
O
 


 
e
s
o
 
e


[


o
h


d
A
l
h


 




o
i
o
I
h
d


K
h
~


 
o


h


 
t
n
u
 
a
l


 
'
e
 
 
l
t




m
 
I
 


r
o


h


o


a
e
o
:
:




o
i
I
 
t
h
r
a






e
=
r
 




x
o


h
f
 
i
h


C


O
I
l


g
e
 
o
g


e
a
u
p


u
h
A
 


 
 
O
 


1
u
u
l
c
e
m
e


h
e


P


l








i


a


d
s
D
r
 
 
o
o
n
e
=
o
 
i
'
o


e


t
i
e
h
a
 
k
l
i
o


h


e




=
h


t
h
E
:
a
a
p
a




o
l
e


h


n




p


i
e
o


h
e
'
 
e
 
o
 
e
o
4








o


e
h






 
e
h


l
o
 
.


n
u
o
i
o
e
h




e
o


t


 




e


d
o
x


E


o
 
i
a
P
u
O
o
h
E
E


 
'


p
p


e


u
l
e


u




o
o
o


a


c




.
o
y
h
a


h
e


 
h
 
O
 








 


h


d
 
e


h


k
h
o
h
4
s
o
o
 


o




a
h
h
P
r
m
n


e


 




t
h
:
n
o
i
'
u
=
o
 
S
w
h
n


o
t


t
h
a
h
'
n
h
o
x






h




h




h
i
r


a
o
e
e
c
l
K
 
s




e




 
i


 


n


 
o


e
l


:
 
U
h
.
o
A
l




 






 




u
 
I
E
 


i


e
"
a
m


 


a


h
u
r
i
m
a
h
p


 
h
0
e
h


e
A
 
o


:


n
p


i
e
h
 


o
e
h
e
|
o
 
o
h
W




:


o
t
a
e
 
e


u
n
i
k


e
t
l


o
h
o
o






a
O
I
h


=


 
=
e
o


l
a
o
h
n






h
 
e
e
 
e
i
i
o
u
e
e
 
 




 
u




h
h
h








 
a
r
l
W
o
i
0
o
s
t






E
e




s
T


h
h


f
e
h


e
f
e
o
t


 
a
a
e
h
o
o
l
t
=
f
h
l








o


f


C
m
=
w
e


 




 
o
d
t
O
P
r




o


 
P
n


y
o
o
 
a
O
e


f
O
 




u
y
u


e
h
o


o
:
e
h
a
f


i
a
l
u
>




p
i


e
:
u
e
h


h
i




o
o
n
r




o


E
0
 




t
e


 
'
 
y


o
a
A
h
 




h
h
i


l
e
h
h






d


 
h
c
i
A


A
 
 


t


o
i






l


h
v
o
e


:


=
n
h


o


i




l


 
a
h
C
i


s



o




e
i
o
E
'
h
h
a
n


 
f
h
[
a


e
o
G
 
 
y
I
o




e
n
b


e
 
 
C


l
l
I
p
 
A
 
'
 


p
e
o
e
d
H
n
a
P
a


h
:




k
e
O
,
e
a
h
l
 


"
s
n
O
e
o
e
n
o


a
h




o
r
e
h
t


=
o


o
e
*


e
e
P
y
h
l
e
N
 
h
t
.


H
h
o
 


 
r


t
t
o
 
g


i
r
 
 
a
 
t


~
l
d




o
}
 


o
e
h
=
a
E
s
o
a
I
i
r


 


a
 
o


o
r
 
h




P
h




f
H
o
 
B




N
O


:
O
 


 
h






u


y
H




`
x
 




l


o
 
U
e
e
h
o
t
E
W
t
h
 
e
y
a
c
 




h
n
u


o


V


i
e
f


l
,
o
n
f


u
h
h


o
5
 
t


r


e
 
1
7
e


H
a
h


i
 
a
 


h
p


L


f


U




o




:
o
m


o
o
i
h


O
h
a
 
o
 
o


o
I
e
o
o
 
U
 
h
n
 


h
o




O
o
g
 


h


g
u


s
4
o
l
o
 
d
h




'
t


o
x
u


h
u
e
t
0
 
 
r
o
 
t


e
 
h
k
e
a
h
O
A
"
:
 
 
 
b
f
,
 
r




o
l
ƒ
o
h
a
r
h


h
c
i
o
e
u




2
e
:




,


r
 
t
l
l


 
o
i
E


e
o
o
A


m
e
L
 
l




e
o
i
T
m
 


e
d
y
o
h


a
 
 
h
n


o


h
m


n
i
e
m
r
u
'
s
=
h


a
h
i
e
=
o
h


o
a


h
i
i
l
 


O


e
o


s
:
r
r
e
o


p


 


 


o
i
 
 
O
t
o
o

h
h


o
e
e
a
i
n


:




m
y
i
n
h
n
t
:
O
u
h
e
a
a
e


h
A
o






t
.


h


h
h
 
B


 
h
i
e
i
}
m
o
 
=


e
{
 


o
 


n


r


o


i
 


'




t


h
n
 
 


e
h


y




*
y
 
0
T
l
h
n
h
'
t




*
h
a
5
h
h
o






s
h
h
.
o


 


o
u
o
u
e
e
i
a


l
o




 


u
e
W
A
 
'
o




n




i
i
h


d
 
h
n


h
e




E
:
h
e
o
e
N


m
a
p


|




e


l
 
h




 


u




o
i
k
=
h
 




i
e


m
E


d
f
I
o
o
h
a


i
a
a
i




e


 
 
e




f


7
h
o


s


h
o
n
e








1
 
r


o
O
s
E
u
e
s
 




r
h
 
.


i


e
i


f
o


r
 
t
e
o
o
U
l
c
:


n
o


e
t
 
:
u


 
u
o
X
h
n
I
u
o
o
e
 
 










6
p
o




n


u
e
i
E
u
e
,
 
.
s
o




u
i
r
y
y
e
a
t
h
h
E
o


o






m
i
e
o
 
'
u
o
h
n
:
:
r
l
u
h
u
o
i
t


 






u
i
n


U
e
h




 
 
p
t
l
.
h
o
n


F
r
 
=
b
a
o
 


 
h
=
n


i




o
e




T
 
:




e




n
i
f
e
e
h
T
A




u
i
c


(


t
o
=
t
h
y




'
o




e


i
e
o
h
p
r
a
r
x


r
o
 


o
 


c
t
o
o
n
u




y
t
O
h


i


h
C


'
f
h


i


l


r
a
a
e
o
f
e
h




W
w
T
a
a


KeyboardInterrupt: 

# Create training and test sets from function

In [3]:
# create an empty list to store any use of fit_transform() to transform() later
# it is a global list to store model and feature extraction fits
fitted_transformations = []

# CHANGE FILE PATH and my_random_seed number (any integer other than 74 will do): 
X_train, X_test, y_train, y_test, X_raw_train, X_raw_test = process_raw_data(fn='/Users/Charles/Desktop/ML/A1/toxiccomments_train.csv', my_random_seed=36)

print("Number of fits stored in `fitted_transformations` list: ")
print(len(fitted_transformations))

toxic_data is: <class 'pandas.core.frame.DataFrame'>
toxic_data has 159571 rows and 9 columns 

the data types for each of the columns in toxic_data:
id               object
comment_text     object
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
any_toxic          bool
dtype: object 

the first 5 rows in toxic_data:
                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  any_toxic  
0             0        0      

# Fit (and tune) Various Models

### MODEL: SGD Classifier experiments

In [87]:
from sklearn import linear_model
sgd = linear_model.SGDClassifier(loss='perceptron', learning_rate='invscaling', eta0=0.00000000000001, average=True)
sgd.fit(X_train, y_train)

sgd_performance_train = BinaryClassificationPerformance(sgd.predict(X_train), y_train, 'sgd_train')
sgd_performance_train.compute_measures()
print(sgd_performance_train.performance_measures)

{'Pos': 12951, 'Neg': 114705, 'TP': 10943, 'TN': 113201, 'FP': 1504, 'FN': 2008, 'Accuracy': 0.9724885630130977, 'Precision': 0.8791676709247208, 'Recall': 0.8449540576017296, 'desc': 'sgd_train'}


In [88]:
# sgd quick test
sgd_performance_test = BinaryClassificationPerformance(sgd.predict(X_test), y_test, 'sgd_test')
sgd_performance_test.compute_measures()
print(sgd_performance_test.performance_measures)
print("True Positive Rate:")
print(sgd_performance_test.performance_measures['TP'] / sgd_performance_test.performance_measures['Pos'] * 100)
print("False Positive Rate:")
print(sgd_performance_test.performance_measures['FP'] / sgd_performance_test.performance_measures['Neg'] * 100)
print("False Negative Rate:")
print(sgd_performance_test.performance_measures['FN'] / sgd_performance_test.performance_measures['Pos'] * 100)

{'Pos': 3274, 'Neg': 28641, 'TP': 2373, 'TN': 27301, 'FP': 1340, 'FN': 901, 'Accuracy': 0.9297822340592198, 'Precision': 0.6391058443307299, 'Recall': 0.724801466096518, 'desc': 'sgd_test'}
True Positive Rate:
72.4801466096518
False Positive Rate:
4.678607590517091
False Negative Rate:
27.5198533903482


### MODEL: ordinary least squares

In [None]:
from sklearn import linear_model
ols = linear_model.SGDClassifier(loss="squared_loss")
ols.fit(X_train, y_train)

ols_performance_train = BinaryClassificationPerformance(ols.predict(X_train), y_train, 'ols_train')
ols_performance_train.compute_measures()
print(ols_performance_train.performance_measures)
print("False Positive Rate:")
print(ols_performance_train.performance_measures['FP'] / ols_performance_train.performance_measures['Neg'])
print("True Positive Rate:")
print(ols_performance_train.performance_measures['TP'] / ols_performance_train.performance_measures['Pos'])

### MODEL: SVM, linear

In [44]:
from sklearn import linear_model
svm = linear_model.SGDClassifier()
svm.fit(X_train, y_train)

svm_performance_train = BinaryClassificationPerformance(svm.predict(X_train), y_train, 'svm_train')
svm_performance_train.compute_measures()
print(svm_performance_train.performance_measures)
print("False Positive Rate:")
print(svm_performance_train.performance_measures['FP'] / svm_performance_train.performance_measures['Neg'])
print("True Positive Rate:")
print(svm_performance_train.performance_measures['TP'] / svm_performance_train.performance_measures['Pos'])

{'Pos': 12951, 'Neg': 114705, 'TP': 12930, 'TN': 114603, 'FP': 102, 'FN': 21, 'Accuracy': 0.9990364730212445, 'Precision': 0.9921731123388582, 'Recall': 0.9983785035904563, 'desc': 'svm_train'}
False Positive Rate:
0.0008892376095200733
True Positive Rate:
0.9983785035904563


### MODEL: SVC

In [45]:
from sklearn.svm import LinearSVC
svc_0 = LinearSVC(C=1.0)
svc_0.fit(X_train, y_train)

svc_performance_train_0 = BinaryClassificationPerformance(svc_0.predict(X_train), y_train, 'svc_train_0')
svc_performance_train_0.compute_measures()
print(svc_performance_train_0.performance_measures)
print("False Positive Rate:")
print(svc_performance_train_0.performance_measures['FP'] / svc_performance_train_0.performance_measures['Neg'])
print("True Positive Rate:")
print(svc_performance_train_0.performance_measures['TP'] / svc_performance_train_0.performance_measures['Pos'])



{'Pos': 12951, 'Neg': 114705, 'TP': 12930, 'TN': 114534, 'FP': 171, 'FN': 21, 'Accuracy': 0.9984959578868209, 'Precision': 0.986947561254866, 'Recall': 0.9983785035904563, 'desc': 'svc_train_0'}
False Positive Rate:
0.001490780698313064
True Positive Rate:
0.9983785035904563


### MODEL: logistic regression

In [None]:
from sklearn import linear_model
lgs = linear_model.SGDClassifier(loss='log')
lgs.fit(X_train, y_train)

lgs_performance_train = BinaryClassificationPerformance(lgs.predict(X_train), y_train, 'lgs_train')
lgs_performance_train.compute_measures()
print(lgs_performance_train.performance_measures)
print("False Positive Rate:")
print(lgs_performance_train.performance_measures['FP'] / lgs_performance_train.performance_measures['Neg'])
print("True Positive Rate:")
print(lgs_performance_train.performance_measures['TP'] / lgs_performance_train.performance_measures['Pos'])

### MODEL: Naive Bayes

In [71]:
from sklearn.naive_bayes import MultinomialNB
nbs = MultinomialNB()
nbs.fit(X_train, y_train)

nbs_performance_train = BinaryClassificationPerformance(nbs.predict(X_train), y_train, 'nbs_train')
nbs_performance_train.compute_measures()
print(nbs_performance_train.performance_measures)
print("False Positive Rate:")
print(nbs_performance_train.performance_measures['FP'] / nbs_performance_train.performance_measures['Neg'])
print("True Positive Rate:")
print(nbs_performance_train.performance_measures['TP'] / nbs_performance_train.performance_measures['Pos'])

{'Pos': 12951, 'Neg': 114705, 'TP': 11336, 'TN': 103432, 'FP': 11273, 'FN': 1615, 'Accuracy': 0.8990411731528483, 'Precision': 0.5013932504754743, 'Recall': 0.8752992046946182, 'desc': 'nbs_train'}
False Positive Rate:
0.09827819188352731
True Positive Rate:
0.8752992046946182


In [54]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

gnb_performance_train = BinaryClassificationPerformance(gnb.predict(X_train), y_train, 'gnb_train')
gnb_performance_train.compute_measures()
print(gnb_performance_train.performance_measures)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [29]:
# Complement Naive Bayes
from sklearn.naive_bayes import ComplementNB
cnb = ComplementNB(alpha=36, norm=True)
cnb.fit(X_train, y_train)

cnb_performance_train = BinaryClassificationPerformance(cnb.predict(X_train), y_train, 'cnb_train')
cnb_performance_train.compute_measures()
print(cnb_performance_train.performance_measures)
print("False Positive Rate:")
print(cnb_performance_train.performance_measures['FP'] / cnb_performance_train.performance_measures['Neg'])
print("True Positive Rate:")
print(cnb_performance_train.performance_measures['TP'] / cnb_performance_train.performance_measures['Pos'])

{'Pos': 12951, 'Neg': 114705, 'TP': 12312, 'TN': 92500, 'FP': 22205, 'FN': 639, 'Accuracy': 0.8210503227423701, 'Precision': 0.3566938030535678, 'Recall': 0.9506601806810285, 'desc': 'cnb_train'}
False Positive Rate:
0.1935835403862081
True Positive Rate:
0.9506601806810285


### MODEL: Alternative Models

In [34]:
# K Neighbors Classifier
# too long runtime
from sklearn import neighbors
knb = neighbors.KNeighborsClassifier(n_neighbors=2)
knb.fit(X_train, y_train)

knb_performance_train = BinaryClassificationPerformance(knb.predict(X_train), y_train, 'knb_train')
knb_performance_train.compute_measures()
print(knb_performance_train.performance_measures)

KeyboardInterrupt: 

In [30]:
# Passive Aggressive Classifier
from sklearn import linear.model
pac = linear_model.PassiveAggressiveClassifier(loss='squared_hinge', C=0.000001)
pac.fit(X_train, y_train)

pac_performance_train = BinaryClassificationPerformance(pac.predict(X_train), y_train, 'pac_train')
pac_performance_train.compute_measures()
print(pac_performance_train.performance_measures)
print("False Positive Rate:")
print(pac_performance_train.performance_measures['FP'] / pac_performance_train.performance_measures['Neg'])
print("True Positive Rate:")
print(pac_performance_train.performance_measures['TP'] / pac_performance_train.performance_measures['Pos'])

{'Pos': 12951, 'Neg': 114705, 'TP': 12344, 'TN': 114565, 'FP': 140, 'FN': 607, 'Accuracy': 0.9941483361534122, 'Precision': 0.9887856456264018, 'Recall': 0.9531310323527141, 'desc': 'pac_train'}
False Positive Rate:
0.0012205222091451985
True Positive Rate:
0.9531310323527141


In [31]:
# PAC quick test
pac_performance_test = BinaryClassificationPerformance(pac.predict(X_test), y_test, 'pac_test')
pac_performance_test.compute_measures()
print(pac_performance_test.performance_measures)
print("False Positive Rate:")
print(pac_performance_test.performance_measures['FP'] / pac_performance_test.performance_measures['Neg'])
print("True Positive Rate:")
print(pac_performance_test.performance_measures['TP'] / pac_performance_test.performance_measures['Pos'])

{'Pos': 3274, 'Neg': 28641, 'TP': 2333, 'TN': 28193, 'FP': 448, 'FN': 941, 'Accuracy': 0.9564781450728498, 'Precision': 0.8389068680330817, 'Recall': 0.7125839951130116, 'desc': 'pac_test'}
False Positive Rate:
0.01564191194441535
True Positive Rate:
0.7125839951130116


### MODEL: Perceptron

In [93]:
# multilayer perceptron
# more on MLP: https://towardsdatascience.com/deep-neural-multilayer-perceptron-mlp-with-scikit-learn-2698e77155e#_=_
from sklearn import neural_network
mlp = neural_network.MLPClassifier(activation='identity', hidden_layer_sizes=(10,10,10))
mlp.fit(X_train, y_train)

mlp_performance_train = BinaryClassificationPerformance(mlp.predict(X_train), y_train, 'mlp_train')
mlp_performance_train.compute_measures()
print(mlp_performance_train.performance_measures)



{'Pos': 12951, 'Neg': 114705, 'TP': 12736, 'TN': 114653, 'FP': 52, 'FN': 215, 'Accuracy': 0.9979084414363603, 'Precision': 0.9959336878323428, 'Recall': 0.9833989653308625, 'desc': 'mlp_train'}


In [91]:
# mlp quick test
mlp_performance_test = BinaryClassificationPerformance(mlp.predict(X_test), y_test, 'mlp_test')
mlp_performance_test.compute_measures()
print(mlp_performance_test.performance_measures)
print("False Positive Rate:")
print(mlp_performance_test.performance_measures['FP'] / mlp_performance_test.performance_measures['Neg'])
print("True Positive Rate:")
print(mlp_performance_test.performance_measures['TP'] / mlp_performance_test.performance_measures['Pos'])

{'Pos': 3274, 'Neg': 28641, 'TP': 2267, 'TN': 28037, 'FP': 604, 'FN': 1007, 'Accuracy': 0.9495221682594391, 'Precision': 0.7896203413444792, 'Recall': 0.692425167990226, 'desc': 'mlp_test'}
False Positive Rate:
0.021088649139345692
True Positive Rate:
0.692425167990226


In [8]:
# direct linear_model.perceptron
from sklearn import linear_model
pcp = linear_model.Perceptron(penalty='l1', alpha=.00001)
pcp.fit(X_train, y_train)

pcp_performance_train = BinaryClassificationPerformance(pcp.predict(X_train), y_train, 'pcp_train')
pcp_performance_train.compute_measures()
print(pcp_performance_train.performance_measures)
print("False Positive Rate:")
print(pcp_performance_train.performance_measures['FP'] / pcp_performance_train.performance_measures['Neg'])
print("True Positive Rate:")
print(pcp_performance_train.performance_measures['TP'] / pcp_performance_train.performance_measures['Pos'])

{'Pos': 12951, 'Neg': 114705, 'TP': 12918, 'TN': 114626, 'FP': 79, 'FN': 33, 'Accuracy': 0.9991226421006455, 'Precision': 0.9939216742325152, 'Recall': 0.9974519342135743, 'desc': 'pcp_train'}
False Positive Rate:
0.0006887232465890763
True Positive Rate:
0.9974519342135743


In [5]:
from sklearn import linear_model
prc = linear_model.SGDClassifier(loss='perceptron')
prc.fit(X_train, y_train)

prc_performance_train = BinaryClassificationPerformance(prc.predict(X_train), y_train, 'prc_train')
prc_performance_train.compute_measures()
print(prc_performance_train.performance_measures)
print("False Positive Rate:")
print(prc_performance_train.performance_measures['FP'] / prc_performance_train.performance_measures['Neg'])
print("True Positive Rate:")
print(prc_performance_train.performance_measures['TP'] / prc_performance_train.performance_measures['Pos'])

{'Pos': 12951, 'Neg': 114705, 'TP': 12896, 'TN': 114681, 'FP': 24, 'FN': 55, 'Accuracy': 0.9993811493388481, 'Precision': 0.9981424148606811, 'Recall': 0.9957532236892904, 'desc': 'prc_train'}
False Positive Rate:
0.00020923237871060547
True Positive Rate:
0.9957532236892904


### MODEL: Ridge Regression Classifier

In [None]:
from sklearn import linear_model
rdg = linear_model.RidgeClassifier(alpha=1.0)
rdg.fit(X_train, y_train)

rdg_performance_train = BinaryClassificationPerformance(rdg.predict(X_train), y_train, 'tr0')
rdg_performance_train.compute_measures()
print(rdg_performance_train.performance_measures)

### MODEL: Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
rdf_60 = RandomForestClassifier(max_depth=60, random_state=0)
# manipulate max_depth
rdf_60.fit(X_train, y_train)

rdf_performance_train_60 = BinaryClassificationPerformance(rdf_60.predict(X_train), y_train, 'rdf_train_60')
rdf_performance_train_60.compute_measures()
print(rdf_performance_train_60.performance_measures)

### ROC plot to compare performance of various models and fits

In [None]:
fits = [ols_performance_train, svm_performance_train, lgs_performance_train, nbs_performance_train, prc_performance_train]

for fit in fits:
    plt.plot(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], 'bo')
    plt.text(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], fit.desc)
plt.axis([0, 1, 0, 1])
plt.title('ROC plot: training set')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.show()

### looking at reviews based on their classification

Let's say we decide that Ordinary Least Squares (OLS) Regression is the best model for generalization. Let's take a look at some of the reviews and try to make a (subjective) determination of whether it's generalizing well. 

### let's look at some examples of each classification:

In [None]:
# false positives

print("Examples of false positives:")

import random, time

for i in range(0, len(nbs_predictions)):
    if (nbs_predictions[i] == 1):
        if (X_raw_train.iloc[i]['any_toxic'] == 0):
            if (random.uniform(0, 1) < 0.05): # to print only 5% of the false positives
                print(i)
                print(X_raw_train.iloc[i]['comment_text'])
                print('* * * * * * * * * ')


In [None]:
# true positives

print("Examples of true positives:")

import random, time

for i in range(0, len(nbs_predictions)):
    if (ols_predictions[i] == 1):
        if (X_raw_train.iloc[i]['any_toxic'] == 1):
            if (random.uniform(0, 1) < 0.05): # to print only 5% of the false positives
                print(i)
                print(X_raw_train.iloc[i]['comment_text'])
                print('* * * * * * * * * ')


In [None]:
# false negatives

print("Examples of false negatives:")

import random, time

for i in range(0, len(nbs_predictions)):
    if (nbs_predictions[i] == 0):
    # model predicts negative
        if (X_raw_train.iloc[i]['any_toxic'] == 1):
        # but training data says should have been positive; thus, false negative
            if (random.uniform(0, 1) < 0.05): # to print only 5% of the false positives
                print(i)
                print(X_raw_train.iloc[i]['comment_text'])
                print('* * * * * * * * * ')


---

# <span style="color:red">WARNING: Don't look at test set performance too much!</span>

---

The following cells show performance on your test set. Do not look at this too often! 

# Look at performance on the test set

### MODEL: ordinary least squares

In [None]:
sgd_performance_test = BinaryClassificationPerformance(sgd.predict(X_test), y_test, 'sgd_test')
sgd_performance_test.compute_measures()
print(sgd_performance_test.performance_measures)
print("True Positive Rate:")
print(sgd_performance_test.performance_measures['TP'] / sgd_performance_test.performance_measures['Pos'] * 100)
print("False Positive Rate:")
print(sgd_performance_test.performance_measures['FP'] / sgd_performance_test.performance_measures['Neg'] * 100)
print("False Negative Rate:")
print(sgd_performance_test.performance_measures['FN'] / sgd_performance_test.performance_measures['Pos'] * 100)

### MODEL: ordinary least squares

In [None]:
ols_performance_test = BinaryClassificationPerformance(ols.predict(X_test), y_test, 'ols_test')
ols_performance_test.compute_measures()
print(ols_performance_test.performance_measures)
print("False Positive Rate:")
print(ols_performance_test.performance_measures['FP'] / ols_performance_test.performance_measures['Neg'])
print("True Positive Rate:")
print(ols_performance_test.performance_measures['TP'] / ols_performance_test.performance_measures['Pos'])

### MODEL: SVM, linear

In [46]:
svm_performance_test = BinaryClassificationPerformance(svm.predict(X_test), y_test, 'svm_test')
svm_performance_test.compute_measures()
print(svm_performance_test.performance_measures)
print("False Positive Rate:")
print(svm_performance_test.performance_measures['FP'] / svm_performance_test.performance_measures['Neg'])
print("True Positive Rate:")
print(svm_performance_test.performance_measures['TP'] / svm_performance_test.performance_measures['Pos'])

{'Pos': 3274, 'Neg': 28641, 'TP': 2385, 'TN': 27685, 'FP': 956, 'FN': 889, 'Accuracy': 0.9421901926993577, 'Precision': 0.7138581263094882, 'Recall': 0.7284667073915699, 'desc': 'svm_test'}
False Positive Rate:
0.03337872280995775
True Positive Rate:
0.7284667073915699


### MODEL: SVC

In [47]:
svc_performance_test_0 = BinaryClassificationPerformance(svc_0.predict(X_test), y_test, 'svc_test_0')
svc_performance_test_0.compute_measures()
print(svc_performance_test_0.performance_measures)
print("False Positive Rate:")
print(svc_performance_test_0.performance_measures['FP'] / svc_performance_test_0.performance_measures['Neg'])
print("True Positive Rate:")
print(svc_performance_test_0.performance_measures['TP'] / svc_performance_test_0.performance_measures['Pos'])

{'Pos': 3274, 'Neg': 28641, 'TP': 2339, 'TN': 27547, 'FP': 1094, 'FN': 935, 'Accuracy': 0.936424878583738, 'Precision': 0.6813282842994466, 'Recall': 0.7144166157605376, 'desc': 'svc_test_0'}
False Positive Rate:
0.03819699032854998
True Positive Rate:
0.7144166157605376


### MODEL: logistic regression

In [None]:
lgs_performance_test = BinaryClassificationPerformance(lgs.predict(X_test), y_test, 'lgs_test')
lgs_performance_test.compute_measures()
print(lgs_performance_test.performance_measures)
print("False Positive Rate:")
print(lgs_performance_test.performance_measures['FP'] / lgs_performance_test.performance_measures['Neg'])
print("True Positive Rate:")
print(lgs_performance_test.performance_measures['TP'] / lgs_performance_test.performance_measures['Pos'])

### MODEL: Naive Bayes

In [73]:
nbs_performance_test = BinaryClassificationPerformance(nbs.predict(X_test), y_test, 'nbs_test')
nbs_performance_test.compute_measures()
print(nbs_performance_test.performance_measures)
print("True Positive Rate:")
print(nbs_performance_test.performance_measures['TP'] / nbs_performance_test.performance_measures['Pos'] * 100)
print("False Positive Rate:")
print(nbs_performance_test.performance_measures['FP'] / nbs_performance_test.performance_measures['Neg'] * 100)
print("False Negative Rate:")
print(nbs_performance_test.performance_measures['FN'] / nbs_performance_test.performance_measures['Pos'] * 100)

{'Pos': 3274, 'Neg': 28641, 'TP': 2705, 'TN': 25720, 'FP': 2921, 'FN': 569, 'Accuracy': 0.8906470311765627, 'Precision': 0.48080341272662636, 'Recall': 0.8262064752596212, 'desc': 'nbs_test'}
True Positive Rate:
82.62064752596213
False Positive Rate:
10.198666247686884
False Negative Rate:
17.379352474037873


In [30]:
# Complement NB
cnb_performance_test = BinaryClassificationPerformance(cnb.predict(X_test), y_test, 'cnb_test')
cnb_performance_test.compute_measures()
print(cnb_performance_test.performance_measures)
print("True Positive Rate:")
print(cnb_performance_test.performance_measures['TP'] / cnb_performance_test.performance_measures['Pos'] * 100)
print("False Positive Rate:")
print(cnb_performance_test.performance_measures['FP'] / cnb_performance_test.performance_measures['Neg'] * 100)
print("False Negative Rate:")
print(cnb_performance_test.performance_measures['FN'] / cnb_performance_test.performance_measures['Pos'] * 100)

{'Pos': 3274, 'Neg': 28641, 'TP': 3022, 'TN': 22886, 'FP': 5755, 'FN': 252, 'Accuracy': 0.8117812940623531, 'Precision': 0.3443089894041244, 'Recall': 0.9230299328039095, 'desc': 'cnb_test'}
True Positive Rate:
92.30299328039095
False Positive Rate:
20.093572151810342
False Negative Rate:
7.697006719609041


### MODEL: Perceptron

In [87]:
mlp_performance_test = BinaryClassificationPerformance(mlp.predict(X_test), y_test, 'mlp_test')
mlp_performance_test.compute_measures()
print(mlp_performance_test.performance_measures)
print("False Positive Rate:")
print(mlp_performance_test.performance_measures['FP'] / mlp_performance_test.performance_measures['Neg'])
print("True Positive Rate:")
print(mlp_performance_test.performance_measures['TP'] / mlp_performance_test.performance_measures['Pos'])

{'Pos': 3274, 'Neg': 28641, 'TP': 2254, 'TN': 27959, 'FP': 682, 'FN': 1020, 'Accuracy': 0.9466708444305185, 'Precision': 0.7677111716621253, 'Recall': 0.6884544899205864, 'desc': 'mlp_test'}
False Positive Rate:
0.023812017736810867
True Positive Rate:
0.6884544899205864


In [12]:
pac_performance_test = BinaryClassificationPerformance(pac.predict(X_test), y_test, 'pac_test')
pac_performance_test.compute_measures()
print(pac_performance_test.performance_measures)
print("False Positive Rate:")
print(pac_performance_test.performance_measures['FP'] / pac_performance_test.performance_measures['Neg'])
print("True Positive Rate:")
print(pac_performance_test.performance_measures['TP'] / pac_performance_test.performance_measures['Pos'])

{'Pos': 3274, 'Neg': 28641, 'TP': 2359, 'TN': 27827, 'FP': 814, 'FN': 915, 'Accuracy': 0.9458248472505092, 'Precision': 0.7434604475260006, 'Recall': 0.7205253512522908, 'desc': 'pac_test'}
False Positive Rate:
0.02842079536329039
True Positive Rate:
0.7205253512522908


In [9]:
pcp_performance_test = BinaryClassificationPerformance(pcp.predict(X_test), y_test, 'pcp_test')
pcp_performance_test.compute_measures()
print(pcp_performance_test.performance_measures)
print("False Positive Rate:")
print(pcp_performance_test.performance_measures['FP'] / pcp_performance_test.performance_measures['Neg'])
print("True Positive Rate:")
print(pcp_performance_test.performance_measures['TP'] / pcp_performance_test.performance_measures['Pos'])

{'Pos': 3274, 'Neg': 28641, 'TP': 2371, 'TN': 27763, 'FP': 878, 'FN': 903, 'Accuracy': 0.9441955193482688, 'Precision': 0.7297630040012312, 'Recall': 0.7241905925473427, 'desc': 'pcp_test'}
False Positive Rate:
0.030655354212492582
True Positive Rate:
0.7241905925473427


In [6]:
prc_performance_test = BinaryClassificationPerformance(prc.predict(X_test), y_test, 'prc_test')
prc_performance_test.compute_measures()
print(prc_performance_test.performance_measures)
print("False Positive Rate:")
print(prc_performance_test.performance_measures['FP'] / prc_performance_test.performance_measures['Neg'])
print("True Positive Rate:")
print(prc_performance_test.performance_measures['TP'] / prc_performance_test.performance_measures['Pos'])

{'Pos': 3274, 'Neg': 28641, 'TP': 2336, 'TN': 27557, 'FP': 1084, 'FN': 938, 'Accuracy': 0.9366442111859627, 'Precision': 0.6830409356725147, 'Recall': 0.7135003054367746, 'desc': 'prc_test'}
False Positive Rate:
0.03784784050836214
True Positive Rate:
0.7135003054367746


### MODEL: Ridge Regression Classifier

In [None]:
rdg_performance_test = BinaryClassificationPerformance(rdg.predict(X_test), y_test, 'test_0')
rdg_performance_test.compute_measures()
print(rdg_performance_test.performance_measures)

### MODEL: Random Forest Classifier

In [None]:
rdf_performance_test_20 = BinaryClassificationPerformance(rdf_20.predict(X_test), y_test, 'rdf_test_20')
rdf_performance_test_20.compute_measures()
print(rdf_performance_test_20.performance_measures)

### ROC plot to compare performance of various models and fits

In [None]:
fits = [svm_performance_test, lgs_performance_test, nbs_performance_test, prc_performance_test]

for fit in fits:
    plt.plot(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], 'bo')
    plt.text(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], fit.desc)
plt.axis([0, .125, 0.685, 0.79])
plt.title('ROC plot: test set')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.rcParams['figure.figsize'] = (15, 15)
plt.show()

### ROC plot: test NB

In [None]:
fits = [nbs_performance_test, cnb_performance_test]

for fit in fits:
    plt.plot(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], 'rx')
    plt.text(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], fit.desc)
    
# fits = [nbs_performance_train, nbs_performance_train_1, nbs_performance_train_3, nbs_performance_train_5, nbs_performance_train_n1, nbs_performance_train_n3, nbs_performance_train_n5, nbs_performance_train_n7]

# for fit in fits:
#     plt.plot(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
#              fit.performance_measures['TP'] / fit.performance_measures['Pos'], 'bo')
#     plt.text(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
#              fit.performance_measures['TP'] / fit.performance_measures['Pos'], fit.desc)    

plt.axis([0, 1, 0, 1])
plt.title('ROC plot: test set')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.rcParams['figure.figsize'] = (15, 15)
plt.show()

In [None]:
fits = [nbs_performance_test, cnb_performance_test_n5, cnb_performance_test, cnb_performance_test_50, cnb_performance_test_100, cnb_performance_test_150, cnb_performance_test_200, cnb_performance_test_250, cnb_performance_test_300, cnb_performance_test_350]

for fit in fits:
    plt.plot(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], 'rx')
    plt.text(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], fit.desc)
    
# fits = [nbs_performance_train, nbs_performance_train_1, nbs_performance_train_3, nbs_performance_train_n1, nbs_performance_train_n3, nbs_performance_train_n5, nbs_performance_train_n7]

# for fit in fits:
#     plt.plot(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
#              fit.performance_measures['TP'] / fit.performance_measures['Pos'], 'bo')
#     plt.text(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
#              fit.performance_measures['TP'] / fit.performance_measures['Pos'], fit.desc)    

plt.axis([0, 1, 0, 1])
plt.title('ROC plot: test set')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.rcParams['figure.figsize'] = (15, 15)
plt.grid(b=True)
plt.show()

---

# <span style="color:red">SUBMISSION</span>

---

In [31]:
# read in test data for submission
# CHANGE FILE PATH and my_random_seed number (any integer other than 74 will do): 
raw_data, X_test_submission = process_raw_data(fn='/Users/Charles/Desktop/ML/A1/toxiccomments_test.csv', my_random_seed=36, test=True)
print("Number of rows in the submission test set (should be 153,164): ")

toxic_data is: <class 'pandas.core.frame.DataFrame'>
toxic_data has 153164 rows and 2 columns 

the data types for each of the columns in toxic_data:
id              object
comment_text    object
dtype: object 

the first 5 rows in toxic_data:
                 id                                       comment_text
0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...
1  0000247867823ef7  == From RfC == \n\n The title is fine as it is...
2  00013b17ad220c46  " \n\n == Sources == \n\n * Zawe Ashton on Lap...
3  00017563c3f7919a  :If you have a look back at the source, the in...
4  00017695ad8997eb          I don't anonymously edit articles at all.
Shape of HashingVectorizer X:
(153164, 16384)
Shape of HashingVectorizer char n_gram X:
(153164, 16384)
Look at a few rows of the new quantitative features: 
   consCaps_count  consCaps_ratio  consCapsword_ratio  punc_q_ratio  \
0               0        0.000000            0.000000      0.009009   
1               1        0.020

In [32]:
# for 2 HashingV passed through respective tfidf transformer and then hstacked with quantfeat: AttributeError: lower not found
# for 2 HashingV hstacked and passed through tfidf transformer: TypeError: 'coo_matrix' object is not subscriptable

---

Choose a <span style="color:red">*single*</span> model for your submission. In this code, I am choosing the Ordinary Least Squares model fit, which is in the `ols` object. But you should choose the model that is performing the best for you! 

In [33]:
# store the id from the raw data
my_submission = pd.DataFrame(raw_data["id"])
# concatenate predictions to the id
my_submission["prediction"] = cnb.predict(X_test_submission)
# look at the proportion of positive predictions
print(my_submission['prediction'].mean())

0.3857499151236583


In [34]:
raw_data.head()

Unnamed: 0,id,comment_text,char_count,Caps_count,Caps_ratio,word_count,char_ratio,punc_count_p,punc_count_exc,punc_count_q,...,punc_ratio,punc_exc_ratio,punc_q_ratio,Capsword_ratio,spaces_count,spaces_ratio,spaceswords_ratio,consCaps_count,consCaps_ratio,consCapsword_ratio
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,367,4,3e-06,72,5.097222,10,0,0,...,0.152778,0.009009,0.009009,0.208333,71,0.19346,0.986111,0,0.0,0.0
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,50,7,0.004096,13,3.846154,1,0,0,...,0.153846,0.047619,0.047619,0.692308,12,0.24,0.923077,1,0.02,0.076923
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",54,4,0.000794,16,3.375,0,0,0,...,0.0625,0.090909,0.090909,0.3125,15,0.277778,0.9375,0,0.0,0.0
3,00017563c3f7919a,":If you have a look back at the source, the in...",205,4,1.5e-05,38,5.394737,3,0,0,...,0.105263,0.02439,0.02439,0.210526,37,0.180488,0.973684,0,0.0,0.0
4,00017695ad8997eb,I don't anonymously edit articles at all.,41,1,0.000116,7,5.857143,1,0,0,...,0.142857,0.090909,0.090909,0.285714,6,0.146341,0.857143,0,0.0,0.0


In [35]:
my_submission.head()

Unnamed: 0,id,prediction
0,00001cee341fdb12,True
1,0000247867823ef7,False
2,00013b17ad220c46,True
3,00017563c3f7919a,False
4,00017695ad8997eb,False


In [36]:
my_submission.shape

(153164, 2)

In [37]:
# export submission file as pdf
# CHANGE FILE PATH: 
my_submission.to_csv('/Users/Charles/Desktop/ML/A1/toxiccomments_submission.csv', index=False)

# Submit to Canvas: 1) the CSV file that was written in the previous cell and 2) the url to the repository (GitHub or other) that contains your code and documentation