In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.max_colwidth = 1000
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
import string
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV, SGDClassifier
from sklearn.metrics import precision_recall_fscore_support, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import pickle

%load_ext autotime

# Training Set Cleaning

In [2]:
train_orig = pd.read_csv("/home/sroberts/train.csv", engine='python')

time: 59 s


In [3]:
train_orig[:2]

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:41.987077+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:42.870083+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4


time: 38.9 ms


In [3]:
# toxic comments
train_orig[train_orig.target >= 0.5][:2]

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015-09-29 10:50:48.488476+00,2,,2006,rejected,0,0,0,1,0,0.0,4,47
5,59859,0.666667,ur a sh*tty comment.,0.047619,0.638095,0.0,0.333333,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:50.865549+00,2,,2006,rejected,0,0,0,0,0,0.009524,0,105


time: 112 ms


### Fill NaNs with Zeros

In [4]:
train_orig.fillna(0, inplace=True)

time: 846 ms


### describe/summarize

In [None]:
desc_df = train_orig.copy()
desc_df.loc[desc_df.target >= 0.5, "toxic"] = True
desc_df.toxic.fillna(False,inplace=True)

In [None]:
rel_col = list(set(desc_df.columns.tolist()) - set(['target','id']))
desc_df[rel_col].groupby("toxic").describe()

## lower case, stopwords, puncutation

In [None]:
#import nltk
#nltk.download('all')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('punkt')

In [6]:
english_stopwords = set(stopwords.words('english'))
w_tokenizer = WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

time: 12.3 ms


In [7]:
def clean_message(msg):
    msg = msg.lower()
    msg_tokens = nltk.word_tokenize(msg)
    clean_msg_tokens = [w for w in msg_tokens if w not in english_stopwords]
    clean_msg_tokens_puct = [w for w in clean_msg_tokens if w not in string.punctuation]
    lemmatized_token = [lemmatizer.lemmatize(w) for w in clean_msg_tokens_puct]
    return lemmatized_token

time: 1.01 ms


In [8]:
train_orig["clean"] = train_orig.comment_text.apply(lambda x: " ".join([words.strip(string.punctuation).lower()
for words in [word for word in x.split() if (word not in string.punctuation) & (word not in english_stopwords)]]))
train_orig["clean"] = train_orig["clean"].apply(lambda x: " ".join([lemmatizer.lemmatize(w) 
                                                                    for w in w_tokenizer.tokenize(x)]))

time: 4min 35s


In [9]:
train_orig['clean'][:2]

0          this cool it's like would want mother read this really great idea well done
1    thank you this would make life lot le anxiety-inducing keep up let anyone get way
Name: clean, dtype: object

time: 4.34 ms


## vectorize

In [11]:
voc_counter = Counter(" ".join(train_orig.clean).split(" "))

time: 16.5 s


In [12]:
voc_set = dict((k, v) for k, v in voc_counter.items() if v > 10)
voc_dict = {w:i for w,i in zip(voc_set, range(len(voc_set)))}

time: 139 ms


In [13]:
train_orig["clean"] = train_orig["clean"].apply(lambda x: x.split())
train_orig.loc[train_orig.target >= 0.5, "toxic"] = True
train_orig.toxic.fillna(False,inplace=True)

time: 12.5 s


In [14]:
X = np.zeros((len(train_orig), len(voc_set)), dtype = np.int8 )
for msg_idx, msg in enumerate(train_orig['clean']):
    for word in set(msg):
        if word in voc_dict:
            X[msg_idx][voc_dict[word]] = msg.count(word)
            
y = np.where(train_orig.toxic == True, 1, 0)

time: 1min 33s


In [None]:
X.shape

# Test Data

### Apply same methods to test data

In [None]:
test_orig = pd.read_csv("/home/sroberts/test.csv", engine='python')
test_orig.fillna(0, inplace=True)
test_orig['clean'] = test_orig.comment_text.apply(lambda x: " ".join([words.strip(string.punctuation).lower()
for words in [word for word in x.split() if (word not in string.punctuation) & (word not in english_stopwords)]]))
test_orig["clean"] = test_orig["clean"].apply(lambda x: " ".join([lemmatizer.lemmatize(w) 
                                                                    for w in w_tokenizer.tokenize(x)]))
test_orig["clean"] = test_orig["clean"].apply(lambda x: x.split())

X_final = np.zeros((len(test_orig), len(voc_set)), dtype = np.int8 )
for msg_idx, msg in enumerate(test_orig['clean']):
    for word in set(msg):
        if word in voc_dict:
            X_final[msg_idx][voc_dict[word]] = msg.count(word)

# Model

## SGDClassifier

Split data into train and test

In [17]:
train_X,test_X,train_y,test_y = train_test_split(X,y,test_size=.2,random_state=2)

time: 1min 28s


In [18]:
lr = SGDClassifier(loss="log", n_jobs=-1, class_weight={0: 0.1,1:0.9})

#classes = np.unique(["toxic", "not_toxic"])
classes = np.unique([0, 1])

time: 958 µs


Create batches of data for training model

In [19]:
minibatches = [(train_X[:100000,:],train_y[:100000]),
               (train_X[100001:200000,:],train_y[100001:200000]),
               (train_X[200001:300000,:],train_y[200001:300000]),
               (train_X[300001:400000,:],train_y[300001:400000]),
               (train_X[400001:500000,:],train_y[400001:500000]),
               (train_X[500001:600000,:],train_y[500001:600000]),
               (train_X[600001:700000,:],train_y[600001:700000]),
               (train_X[700001:800000,:],train_y[700001:800000]),
               (train_X[800001:900000,:],train_y[800001:900000]),
               (train_X[900001:1000000,:],train_y[900001:1000000]),
               (train_X[1000001:1100000,:],train_y[1000001:1100000]),
               (train_X[1100001:1200000,:],train_y[1100001:1200000]),
               (train_X[1200001:1300000,:],train_y[1200001:1300000]),
               (train_X[1300001:1400000,:],train_y[1300001:1400000]),
               (train_X[1400001:,:],train_y[1400001:])]

time: 5.53 ms


In [20]:
for xs, ys in minibatches:
    lr.partial_fit(xs, ys, classes=classes)

time: 15min 9s


#### Save model

In [None]:
with open('SGDC.pkl', 'wb') as fid:
    pickle.dump(lr, fid)    

In [None]:
with open('SGDC.pkl', 'rb') as fid:
    lr = pickle.load(fid)

### Metrics

See how model did on our test sets

In [21]:
print(lr.score(test_X, test_y))
print(lr.intercept_)

0.9188918900200845

time: 3min 9s


Let's look deeper at our model's accuracy

In [26]:
pred_y = lr.predict(test_X)
print(confusion_matrix(test_y, pred_y))
print(precision_score(test_y, pred_y))
print(recall_score(test_y, pred_y))
print(f1_score(test_y, pred_y))

time: 3min


In [None]:
sgdc_test = lr.predict(X_final)
with open('sgdc_test.pkl', 'wb') as fid:
    pickle.dump(sgdc_test, fid) 

## Use class weights

In [47]:
lr = SGDClassifier(loss="log", n_jobs=-1, class_weight={0: 0.15,1:0.85})

for xs, ys in minibatches:
    lr.partial_fit(xs, ys, classes=classes)
    
with open('SGDC_classwieghts_15_85.pkl', 'wb') as fid:
    pickle.dump(lr, fid) 

0.9335521850543667
[[320218  11972]
 [ 12014  16771]]
0.5830552078987623
time: 21min


In [None]:
with open('SGDC_classwieghts_15_85.pkl', 'rb') as fid:
    lr = pickle.load(fid)

In [None]:
print(lr.score(test_X, test_y))
print(lr.intercept_)
pred_y = lr.predict(test_X)
print(confusion_matrix(test_y, pred_y))
print(precision_score(test_y, pred_y))
print(recall_score(test_y, pred_y))
print(f1_score(test_y, pred_y))

In [None]:
sgdc_test = lr.predict(X_final)
with open('sgdc_weights_test.pkl', 'wb') as fid:
    pickle.dump(sgdc_test, fid) 

# MLP Classifier

In [25]:
clf = MLPClassifier(solver='sgd', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
classes = np.unique([0, 1])

time: 862 µs


In [26]:
for xs, ys in minibatches:
    clf.partial_fit(xs, ys, classes=classes)

time: 58min 44s


Save model

In [None]:
with open('mlpclassifier1.pkl', 'wb') as fid:
    pickle.dump(clf, fid)

In [None]:
with open('mlpclassifier1.pkl', 'rb') as fid:
    clf = pickle.load(fid)

### Metrics

In [27]:
print(clf.score(test_X, test_y))
print(clf.coefs_)
print(clf.intercepts_)

0.07974236442966964

time: 2min 27s


In [None]:
pred_y = clf.predict(test_X)
print(confusion_matrix(test_y, pred_y))
print(precision_score(test_y, pred_y))
print(recall_score(test_y, pred_y))
print(f1_score(test_y, pred_y))

In [None]:
mlp_test = clf.predict(X_final)
with open('mlp_test.pkl', 'wb') as fid:
    pickle.dump(mlp_test, fid) 

## Use class weights

In [None]:
mlp_test = clf.predict(X_final)
with open('mlp_weights_test.pkl', 'wb') as fid:
    pickle.dump(mlp_test, fid) 

# Naive Bayes

In [31]:
nb = MultinomialNB()

time: 5.47 ms


In [34]:
for xs, ys in minibatches:
    nb.partial_fit(xs, ys, classes=classes)

MemoryError: 

time: 15min 38s


Save model

In [None]:
with open('naivebayes.pkl', 'wb') as fid:
    pickle.dump(nb, fid) 

In [None]:
with open('naivebayes.pkl', 'rb') as fid:
    nb = pickle.load(fid)

### Metrics

In [None]:
print(nb.score(test_X, test_y))
print(nb.intercept_)
print(nb.coef_)
print(nb.feature_count_)
print(nb.class_count_)

In [None]:
pred_y = nb.predict(test_X)
print(confusion_matrix(test_y, pred_y))
print(precision_score(test_y, pred_y))
print(recall_score(test_y, pred_y))
print(f1_score(test_y, pred_y))

In [None]:
nb_test = nb.predict(X_final)
with open('nb_test.pkl', 'wb') as fid:
    pickle.dump(nb_test, fid) 

## Use class weights

In [43]:
nb= MultinomialNB(alpha= 0.1, fit_prior = True, class_prior = [0.15, 0.85])
for xs, ys in minibatches:
    nb.partial_fit(xs, ys, classes=classes)

0.45791813837523376
[[138623 193567]
 [  2111  26674]]
0.2142266269385526
time: 19min 33s


In [None]:
with open('naivebayes_weights_class_weights.pkl', 'wb') as fid:
    pickle.dump(nb, fid) 

In [None]:
with open('naivebayes_weights_class_weights.pkl', 'rb') as fid:
    nb = pickle.load(fid)

In [None]:
print(nb.score(test_X, test_y))
print(nb.intercept_)
print(nb.coef_)
print(nb.feature_count_)
print(nb.class_count_)

In [None]:
pred_y = nb.predict(test_X)
print(confusion_matrix(test_y, pred_y))
print(precision_score(test_y, pred_y))
print(recall_score(test_y, pred_y))
print(f1_score(test_y, pred_y))

In [None]:
nb_test = nb.predict(X_final)
with open('nb_weights_test.pkl', 'wb') as fid:
    pickle.dump(nb_test, fid) 

# RandomForestClassifier

In [None]:
rf = RandomForestClassifier(warm_start=True, n_estimators=100, max_depth=8, n_jobs=-1)

In [None]:
for xs, ys in minibatches:
    rf.fit(xs,ys)
    rf.n_estimators += 1

save model

In [None]:
with open('randomforest.pkl', 'wb') as fid:
    pickle.dump(rf, fid)

In [None]:
with open('randomforest.pkl', 'rb') as fid:
    rf = pickle.load(fid)

In [None]:
print(rf.score(test_X, test_y))
print(rf.estimators_)
print(rf.n_features_)
print(rf.n_outputs_)
print(rf.feature_importances_)

In [None]:
pred_y = rf.predict(test_X)
print(confusion_matrix(test_y, pred_y))
print(precision_score(test_y, pred_y))
print(recall_score(test_y, pred_y))
print(f1_score(test_y, pred_y))

In [None]:
rf_test = rf.predict(X_final)
with open('rf_test.pkl', 'wb') as fid:
    pickle.dump(rf_test, fid) 

## Use class weights

In [None]:
rf = RandomForestClassifier(warm_start=True, n_estimators=100, max_depth=8, n_jobs=-1, class_weight={0: 0.15,1:0.85})

In [None]:
for xs, ys in minibatches:
    rf.fit(xs,ys)
    rf.n_estimators += 1

save model

In [None]:
with open('randomforest_classweights.pkl', 'wb') as fid:
    pickle.dump(rf, fid)

In [None]:
with open('randomforest_classweights.pkl', 'rb') as fid:
    rf = pickle.load(fid)

In [None]:
print(rf.score(test_X, test_y))
print(rf.estimators_)
print(rf.n_features_)
print(rf.n_outputs_)
print(rf.feature_importances_)

In [None]:
pred_y = rf.predict(test_X)
print(confusion_matrix(test_y, pred_y))
print(precision_score(test_y, pred_y))
print(recall_score(test_y, pred_y))
print(f1_score(test_y, pred_y))

In [None]:
rf_test = rf.predict(X_final)
with open('rf_weights_test.pkl', 'wb') as fid:
    pickle.dump(rf_test, fid) 