In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
from sklearn.feature_extraction.text import TfidfVectorizer 
from IPython.display import display
from imblearn.over_sampling import SVMSMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset
import multiprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
%matplotlib inline

In [2]:
test_data = pd.read_csv("test.csv")
test_labels = pd.read_csv("test_labels.csv")
train_data = pd.read_csv("train.csv")
test_merged = pd.merge(test_data, test_labels,on='id')

test_merged = test_merged.drop(test_merged[test_merged.toxic == -1].index)

In [3]:
display(test_merged)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
5,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0
7,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0
11,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0
13,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0
14,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
153150,fff8f64043129fa2,":Jerome, I see you never got around to this…! ...",0,0,0,0,0,0
153151,fff9d70fe0722906,==Lucky bastard== \n http://wikimediafoundatio...,0,0,0,0,0,0
153154,fffa8a11c4378854,==shame on you all!!!== \n\n You want to speak...,0,0,0,0,0,0
153155,fffac2a094c8e0e2,MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...,1,0,1,0,1,0


In [4]:
cores = multiprocessing.cpu_count()

y = train_data.iloc[:, 2:]
X = train_data[['comment_text']]

y_test_test = test_merged.iloc[:, 2:]
X_test_test = test_merged[['comment_text']]

lp = LabelPowerset()
y = lp.transform(y)
y_test_test = lp.transform(y_test_test)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 27)

tvec = TfidfVectorizer(min_df=.0025, stop_words='english', strip_accents='unicode', analyzer='word', max_features=1000)
X_train = tvec.fit_transform(X_train['comment_text'])
X_test = tvec.transform(X_test['comment_text'])
X_test_test = tvec.transform(X_test_test['comment_text'])

In [8]:
lr = LogisticRegression(solver='saga', random_state = 27, n_jobs = cores).fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_pred2 = lr.predict(X_test_test)

print("Test data from train csv")
print("Accuracy score: " + str(accuracy_score(y_test, lr_pred)))
print("F1 score: " + str(f1_score(y_test, lr_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, lr_pred, average = 'weighted')))
print("\nActual test data")
print("Accuracy score: " + str(accuracy_score(y_test_test, lr_pred2)))
print("F1 score: " + str(f1_score(y_test_test, lr_pred2, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test_test, lr_pred2, average = 'weighted')))

Test data from train csv
Accuracy score: 0.9143662854457152
F1 score: 0.8897001929107817
Recall score: 0.9143662854457152

Actual test data
Accuracy score: 0.8879458563881334
F1 score: 0.8728097677223877
Recall score: 0.8879458563881334


In [9]:
rfc = RandomForestClassifier(random_state = 27, n_jobs = cores).fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
rfc_pred2 = lr.predict(X_test_test)

print("Test data from train csv")
print("Accuracy score: " + str(accuracy_score(y_test, rfc_pred)))
print("F1 score: " + str(f1_score(y_test, rfc_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, rfc_pred, average = 'weighted')))
print("\nActual test data")
print("Accuracy score: " + str(accuracy_score(y_test_test, rfc_pred2)))
print("F1 score: " + str(f1_score(y_test_test, rfc_pred2, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test_test, rfc_pred2, average = 'weighted')))

Test data from train csv
Accuracy score: 0.9115776280745731
F1 score: 0.8874720609412652
Recall score: 0.9115776280745731

Actual test data
Accuracy score: 0.8879458563881334
F1 score: 0.8728097677223877
Recall score: 0.8879458563881334


In [10]:
rfc = RandomForestClassifier(class_weight = 'balanced', random_state = 27, n_jobs = cores).fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
rfc_pred2 = lr.predict(X_test_test)

print("Test data from train csv")
print("Accuracy score: " + str(accuracy_score(y_test, rfc_pred)))
print("F1 score: " + str(f1_score(y_test, rfc_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, rfc_pred, average = 'weighted')))
print("\nActual test data")
print("Accuracy score: " + str(accuracy_score(y_test_test, rfc_pred2)))
print("F1 score: " + str(f1_score(y_test_test, rfc_pred2, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test_test, rfc_pred2, average = 'weighted')))

Test data from train csv
Accuracy score: 0.8699671001096663
F1 score: 0.8642303255663182
Recall score: 0.8699671001096663

Actual test data
Accuracy score: 0.8879458563881334
F1 score: 0.8728097677223877
Recall score: 0.8879458563881334


  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
ovoc = OneVsOneClassifier(LinearSVC(random_state = 27), n_jobs = cores).fit(X_train, y_train)
ovoc_pred = ovoc.predict(X_test)
ovoc_pred2 = lr.predict(X_test_test)

print("Test data from train csv")
print("Accuracy score: " + str(accuracy_score(y_test, ovoc_pred)))
print("F1 score: " + str(f1_score(y_test, ovoc_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, ovoc_pred, average = 'weighted')))
print("\nActual test data")
print("Accuracy score: " + str(accuracy_score(y_test_test, ovoc_pred2)))
print("F1 score: " + str(f1_score(y_test_test, ovoc_pred2, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test_test, ovoc_pred2, average = 'weighted')))

Test data from train csv
Accuracy score: 0.9140529531568228
F1 score: 0.8898666636700854
Recall score: 0.9140529531568228

Actual test data
Accuracy score: 0.8879458563881334
F1 score: 0.8728097677223877
Recall score: 0.8879458563881334


In [13]:
nbc = MultinomialNB().fit(X_train, y_train)
nbc_pred = nbc.predict(X_test)
nbc_pred2 = lr.predict(X_test_test)

print("Test data from train csv")
print("Accuracy score: " + str(accuracy_score(y_test, nbc_pred)))
print("F1 score: " + str(f1_score(y_test, nbc_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, nbc_pred, average = 'weighted')))
print("\nActual test data")
print("Accuracy score: " + str(accuracy_score(y_test_test, nbc_pred2)))
print("F1 score: " + str(f1_score(y_test_test, nbc_pred2, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test_test, nbc_pred2, average = 'weighted')))

Test data from train csv
Accuracy score: 0.9090396365345449
F1 score: 0.8734082954870714
Recall score: 0.9090396365345449

Actual test data
Accuracy score: 0.8879458563881334
F1 score: 0.8728097677223877
Recall score: 0.8879458563881334


In [None]:
sm = SVMSMOTE(sampling_strategy = 'minority', random_state = 27, n_jobs = cores)
X_train_resampled, y_train_resampled = sm.fit_sample(X_train, y_train)

In [None]:
lr = LogisticRegression(solver='saga', random_state = 27, n_jobs = cores).fit(X_train_resampled, y_train_resampled)
lr_pred = lr.predict(X_test)
lr_pred2 = lr.predict(X_test_test)

print("Test data from train csv")
print("Accuracy score: " + str(accuracy_score(y_test, lr_pred)))
print("F1 score: " + str(f1_score(y_test, lr_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, lr_pred, average = 'weighted')))
print("\nActual test data")
print("Accuracy score: " + str(accuracy_score(y_test_test, lr_pred2)))
print("F1 score: " + str(f1_score(y_test_test, lr_pred2, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test_test, lr_pred2, average = 'weighted')))

In [None]:
rfc = RandomForestClassifier(random_state = 27, n_jobs = cores).fit(X_train_resampled, y_train_resampled)
rfc_pred = rfc.predict(X_test)
rfc_pred2 = lr.predict(X_test_test)

print("Test data from train csv")
print("Accuracy score: " + str(accuracy_score(y_test, rfc_pred)))
print("F1 score: " + str(f1_score(y_test, rfc_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, rfc_pred, average = 'weighted')))
print("\nActual test data")
print("Accuracy score: " + str(accuracy_score(y_test_test, rfc_pred2)))
print("F1 score: " + str(f1_score(y_test_test, rfc_pred2, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test_test, rfc_pred2, average = 'weighted')))

In [None]:
rfc = RandomForestClassifier(class_weight = 'balanced', random_state = 27, n_jobs = cores).fit(X_train_resampled, y_train_resampled)
rfc_pred = rfc.predict(X_test)
rfc_pred2 = lr.predict(X_test_test)

print("Test data from train csv")
print("Accuracy score: " + str(accuracy_score(y_test, rfc_pred)))
print("F1 score: " + str(f1_score(y_test, rfc_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, rfc_pred, average = 'weighted')))
print("\nActual test data")
print("Accuracy score: " + str(accuracy_score(y_test_test, rfc_pred2)))
print("F1 score: " + str(f1_score(y_test_test, rfc_pred2, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test_test, rfc_pred2, average = 'weighted')))

In [None]:
ovoc = OneVsOneClassifier(LinearSVC(random_state = 27), n_jobs = cores).fit(X_train_resampled, y_train_resampled)
ovoc_pred = ovoc.predict(X_test)
ovoc_pred2 = lr.predict(X_test_test)

print("Test data from train csv")
print("Accuracy score: " + str(accuracy_score(y_test, ovoc_pred)))
print("F1 score: " + str(f1_score(y_test, ovoc_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, ovoc_pred, average = 'weighted')))
print("\nActual test data")
print("Accuracy score: " + str(accuracy_score(y_test_test, ovoc_pred2)))
print("F1 score: " + str(f1_score(y_test_test, ovoc_pred2, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test_test, ovoc_pred2, average = 'weighted')))

In [None]:
nbc = MultinomialNB().fit(X_train_resampled, y_train_resampled)
nbc_pred = nbc.predict(X_test)
nbc_pred2 = lr.predict(X_test_test)

print("Test data from train csv")
print("Accuracy score: " + str(accuracy_score(y_test, nbc_pred)))
print("F1 score: " + str(f1_score(y_test, nbc_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, nbc_pred, average = 'weighted')))
print("\nActual test data")
print("Accuracy score: " + str(accuracy_score(y_test_test, nbc_pred2)))
print("F1 score: " + str(f1_score(y_test_test, nbc_pred2, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test_test, nbc_pred2, average = 'weighted')))