In [96]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
from sklearn.feature_extraction.text import TfidfVectorizer 
from IPython.display import display
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset
import multiprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from collections import Counter
%matplotlib inline

In [97]:
test_data = pd.read_csv("test.csv")
test_labels = pd.read_csv("test_labels.csv")
train_data = pd.read_csv("train.csv")
test_merged = pd.merge(test_data, test_labels,on='id')

test_merged = test_merged.drop(test_merged[test_merged.toxic == -1].index)

In [98]:
cores = multiprocessing.cpu_count()

y_train = train_data.iloc[:, 2:]
X_train = train_data[['comment_text']]

y_test = test_merged.iloc[:, 2:]
X_test = test_merged[['comment_text']]

lp = LabelPowerset()
y_train = lp.transform(y_train)
y_test = lp.transform(y_test)

tvec = TfidfVectorizer(min_df=.0025, stop_words='english', strip_accents='unicode', analyzer='word', max_features=1000)
X_train = tvec.fit_transform(X_train['comment_text'])
X_test = tvec.transform(X_test['comment_text'])

In [99]:
lr = LogisticRegression(solver='saga', random_state = 27, n_jobs = cores).fit(X_train, y_train)
lr_pred = lr.predict(X_test)

print("Test data")
print("Accuracy score: " + str(accuracy_score(y_test, lr_pred)))
print("F1 score: " + str(f1_score(y_test, lr_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, lr_pred, average = 'weighted')))

Test data
Accuracy score: 0.8869611428928694
F1 score: 0.8725668263497255
Recall score: 0.8869611428928694


In [100]:
rfc = RandomForestClassifier(random_state = 27, n_jobs = cores).fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)

print("Test data")
print("Accuracy score: " + str(accuracy_score(y_test, rfc_pred)))
print("F1 score: " + str(f1_score(y_test, rfc_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, rfc_pred, average = 'weighted')))

Test data
Accuracy score: 0.8838350683047298
F1 score: 0.8705797126873954
Recall score: 0.8838350683047298


  _warn_prf(average, modifier, msg_start, len(result))


In [101]:
rfc = RandomForestClassifier(class_weight = 'balanced', random_state = 27, n_jobs = cores).fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)

print("Test data")
print("Accuracy score: " + str(accuracy_score(y_test, rfc_pred)))
print("F1 score: " + str(f1_score(y_test, rfc_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, rfc_pred, average = 'weighted')))

Test data
Accuracy score: 0.8133420863421801
F1 score: 0.8351263442135775
Recall score: 0.8133420863421801


  _warn_prf(average, modifier, msg_start, len(result))


In [102]:
ovoc = OneVsOneClassifier(LinearSVC(random_state = 27), n_jobs = cores).fit(X_train, y_train)
ovoc_pred = ovoc.predict(X_test)

print("Test data")
print("Accuracy score: " + str(accuracy_score(y_test, ovoc_pred)))
print("F1 score: " + str(f1_score(y_test, ovoc_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, ovoc_pred, average = 'weighted')))

Test data
Accuracy score: 0.8856794523117322
F1 score: 0.8722232824910766
Recall score: 0.8856794523117322


In [103]:
nbc = MultinomialNB().fit(X_train, y_train)
nbc_pred = nbc.predict(X_test)

print("Test data")
print("Accuracy score: " + str(accuracy_score(y_test, nbc_pred)))
print("F1 score: " + str(f1_score(y_test, nbc_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, nbc_pred, average = 'weighted')))

Test data
Accuracy score: 0.903670011566476
F1 score: 0.8714419768553344
Recall score: 0.903670011566476


In [104]:
counter = Counter(y_train)
for elem in counter.most_common():
    count = elem[1]
    while count < 6:
        count += 1
        counter.update({elem[0], 1})

ros = RandomOverSampler(sampling_strategy = counter, random_state = 27)
X_tmp, y_tmp = ros.fit_sample(X_train, y_train)

sm = SMOTE(random_state = 27, n_jobs = cores)
X_train_resampled, y_train_resampled = sm.fit_sample(X_tmp, y_tmp)

In [None]:
lr = LogisticRegression(solver='saga', random_state = 27, n_jobs = cores).fit(X_train_resampled, y_train_resampled)
lr_pred = lr.predict(X_test)

print("Test data")
print("Accuracy score: " + str(accuracy_score(y_test, lr_pred)))
print("F1 score: " + str(f1_score(y_test, lr_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, lr_pred, average = 'weighted')))

In [None]:
rfc = RandomForestClassifier(random_state = 27, n_jobs = cores).fit(X_train_resampled, y_train_resampled)
rfc_pred = rfc.predict(X_test)

print("Test data")
print("Accuracy score: " + str(accuracy_score(y_test, rfc_pred)))
print("F1 score: " + str(f1_score(y_test, rfc_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, rfc_pred, average = 'weighted')))

In [None]:
rfc = RandomForestClassifier(class_weight = 'balanced', random_state = 27, n_jobs = cores).fit(X_train_resampled, y_train_resampled)
rfc_pred = rfc.predict(X_test)

print("Test data")
print("Accuracy score: " + str(accuracy_score(y_test, rfc_pred)))
print("F1 score: " + str(f1_score(y_test, rfc_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, rfc_pred, average = 'weighted')))

In [None]:
ovoc = OneVsOneClassifier(LinearSVC(random_state = 27), n_jobs = cores).fit(X_train_resampled, y_train_resampled)
ovoc_pred = ovoc.predict(X_test)

print("Test data from train csv")
print("Accuracy score: " + str(accuracy_score(y_test, ovoc_pred)))
print("F1 score: " + str(f1_score(y_test, ovoc_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, ovoc_pred, average = 'weighted')))

In [None]:
nbc = MultinomialNB().fit(X_train_resampled, y_train_resampled)
nbc_pred = nbc.predict(X_test)

print("Test data")
print("Accuracy score: " + str(accuracy_score(y_test, nbc_pred)))
print("F1 score: " + str(f1_score(y_test, nbc_pred, average = 'weighted')))
print("Recall score: " + str(recall_score(y_test, nbc_pred, average = 'weighted')))