Import necessary packages

In [1]:
import pandas as pd
import os
import io
import numpy as np
from pandas import DataFrame
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

Read in training data

In [2]:
df = pd.read_csv('train.csv')

Split up train dataframe into training and testing sets for model selection

In [3]:
msk = int(round(0.8*len(df),0 ))

train = df[:msk]
test = df[msk:]

Vectorize comment_text and put in TF-IDF form

In [4]:
vectorizer = TfidfVectorizer(sublinear_tf = True, max_df = 0.5, stop_words='english')

train_tfidf = vectorizer.fit_transform(train['comment_text'].values)
test_tfidf = vectorizer.transform(test['comment_text'].values)

Perform feature selection

In [5]:
selector = SelectPercentile(f_classif, percentile = 4)

selector.fit(train_tfidf,train['toxic'].values)
features_train_toxic = selector.transform(train_tfidf).toarray()
features_test_toxic = selector.transform(test_tfidf).toarray()

selector.fit(train_tfidf,train['severe_toxic'].values)
features_train_severe_toxic = selector.transform(train_tfidf).toarray()
features_test_severe_toxic = selector.transform(test_tfidf).toarray()

selector.fit(train_tfidf,train['obscene'].values)
features_train_obscene = selector.transform(train_tfidf).toarray()
features_test_obscene = selector.transform(test_tfidf).toarray()

selector.fit(train_tfidf,train['threat'].values)
features_train_threat = selector.transform(train_tfidf).toarray()
features_test_threat = selector.transform(test_tfidf).toarray()

selector.fit(train_tfidf,train['insult'].values)
features_train_insult = selector.transform(train_tfidf).toarray()
features_test_insult = selector.transform(test_tfidf).toarray()

selector.fit(train_tfidf,train['identity_hate'].values)
features_train_identity_hate = selector.transform(train_tfidf).toarray()
features_test_identity_hate = selector.transform(test_tfidf).toarray()

Create bayesian classifiers for: toxic, severe_toxic, obscene, threat, insult, and identity_hate comments

In [6]:
toxic_classifier = MultinomialNB()
toxic_NB = toxic_classifier.fit(features_train_toxic, train['toxic'].values)

In [7]:
severe_toxic_classifier = MultinomialNB()
severe_toxic_NB = severe_toxic_classifier.fit(features_train_severe_toxic, train['severe_toxic'].values)

In [8]:
obscene_classifier = MultinomialNB()
obscene_NB = obscene_classifier.fit(features_train_obscene, train['obscene'].values)

In [9]:
threat_classifier = MultinomialNB()
threat_NB = threat_classifier.fit(features_train_threat, train['threat'].values)

In [10]:
insult_classifier = MultinomialNB()
insult_NB = insult_classifier.fit(features_train_insult, train['insult'].values)

In [11]:
identity_hate_classifier = MultinomialNB()
identity_hate_NB = identity_hate_classifier.fit(features_train_identity_hate, train['identity_hate'].values)

Make predictions

In [12]:
predicted_toxic = toxic_NB.predict(features_test_toxic)
predicted_severe_toxic = severe_toxic_NB.predict(features_test_severe_toxic)
predicted_obscene = obscene_NB.predict(features_test_obscene)
predicted_threat = threat_NB.predict(features_test_threat)
predicted_insult = insult_NB.predict(features_test_insult)
predicted_identity_hate = identity_hate_NB.predict(features_test_identity_hate)

Confusion matrix

In [13]:
conf_mats=[]

toxic_matrix = pd.crosstab(predicted_toxic, test['toxic'].values)
severe_toxic_matrix = pd.crosstab(predicted_severe_toxic, test['severe_toxic'].values)
obscene_matrix = pd.crosstab(predicted_obscene, test['obscene'].values)
threat_matrix = pd.crosstab(predicted_threat, test['threat'].values)
insult_matrix = pd.crosstab(predicted_insult, test['insult'].values)
identity_hate_matrix = pd.crosstab(predicted_identity_hate, test['identity_hate'].values)

conf_mats.append(toxic_matrix)
conf_mats.append(severe_toxic_matrix)
conf_mats.append(obscene_matrix)
conf_mats.append(threat_matrix)
conf_mats.append(insult_matrix)
conf_mats.append(identity_hate_matrix)

out = pd.concat(conf_mats,axis=1,keys = ['toxic','severe_toxic','obscene','threat','insult'])

out

Unnamed: 0_level_0,toxic,toxic,severe_toxic,severe_toxic,obscene,obscene,threat,threat,insult,insult
col_0,0,1,0,1,0,1,0,1,0,1
row_0,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
0,28815,1727,31575,274,30205,1012,31821,90,30261,1115
1,62,1310,28,37,40,657,1,2,71,467


Try non-linear loss function

In [17]:
toxic_classifier = SGDClassifier(loss="hinge", max_iter=10).fit(train_tfidf, train['toxic'].values)
predicted_toxic = toxic_classifier.predict(test_tfidf)  

severe_toxic_classifier = SGDClassifier(loss="hinge", max_iter=10).fit(train_tfidf, train['severe_toxic'].values)
predicted_severe_toxic = severe_toxic_classifier.predict(test_tfidf)   

obscene_classifier = SGDClassifier(loss="hinge", max_iter=10).fit(train_tfidf, train['obscene'].values)
predicted_obscene = obscene_classifier.predict(test_tfidf)   

threat_classifier = SGDClassifier(loss="hinge", max_iter=10).fit(train_tfidf, train['threat'].values)
predicted_threat = threat_classifier.predict(test_tfidf)   

insult_classifier = SGDClassifier(loss="hinge", max_iter=10).fit(train_tfidf, train['insult'].values)
predicted_insult = insult_classifier.predict(test_tfidf)   

identity_hate_classifier = SGDClassifier(loss="hinge", max_iter=10).fit(train_tfidf, train['identity_hate'].values)
predicted_identity_hate = identity_hate_classifier.predict(test_tfidf)   

Confusion Matrix

In [19]:
conf_mats=[]

toxic_matrix = pd.crosstab(predicted_toxic, test['toxic'].values)
severe_toxic_matrix = pd.crosstab(predicted_severe_toxic, test['severe_toxic'].values)
obscene_matrix = pd.crosstab(predicted_obscene, test['obscene'].values)
threat_matrix = pd.crosstab(predicted_threat, test['threat'].values)
insult_matrix = pd.crosstab(predicted_insult, test['insult'].values)
identity_hate_matrix = pd.crosstab(predicted_identity_hate, test['identity_hate'].values)

conf_mats.append(toxic_matrix)
conf_mats.append(severe_toxic_matrix)
conf_mats.append(obscene_matrix)
conf_mats.append(threat_matrix)
conf_mats.append(insult_matrix)
conf_mats.append(identity_hate_matrix)

out = pd.concat(conf_mats,axis=1,keys = ['toxic','severe_toxic','obscene','threat','insult'])

out


Unnamed: 0_level_0,toxic,toxic,severe_toxic,severe_toxic,obscene,obscene,threat,threat,insult,insult
col_0,0,1,0,1,0,1,0,1,0,1
row_0,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
0,28824,1646,31603.0,311.0,30189,700,31822.0,92.0,30188,925
1,53,1391,,,56,969,,,144,657


Calculate test predictions for competition

In [20]:
test = pd.read_csv('test.csv')

In [21]:
test_tfidf = vectorizer.transform(test['comment_text'].values)

Classify test data set

In [22]:
predicted_toxic = toxic_classifier.predict(test_tfidf)  
predicted_severe_toxic = severe_toxic_classifier.predict(test_tfidf)   
predicted_obscene = obscene_classifier.predict(test_tfidf)   
predicted_threat = threat_classifier.predict(test_tfidf)   
predicted_insult = insult_classifier.predict(test_tfidf)   
predicted_identity_hate = identity_hate_classifier.predict(test_tfidf)   

Compile Results

In [23]:
test['predicted_toxic'] = predicted_toxic
test['predicted_severe_toxic'] = predicted_severe_toxic
test['predicted_obscene'] = predicted_obscene
test['predicted_threat'] = predicted_threat
test['predicted_insult'] = predicted_insult
test['predicted_identity_hate'] = predicted_identity_hate




Potential improvements