In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
import multiprocessing
from sklearn.feature_extraction.text import TfidfVectorizer 
from IPython.display import display
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, hamming_loss, label_ranking_average_precision_score
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from skmultilearn.problem_transform import BinaryRelevance
from collections import Counter
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix

%matplotlib inline
cores = multiprocessing.cpu_count()

In [3]:
test_data = pd.read_csv("test.csv")
test_labels = pd.read_csv("test_labels.csv")
train_data = pd.read_csv("train.csv")
test_merged = pd.merge(test_data, test_labels,on='id')

test_merged = test_merged.drop(test_merged[test_merged.toxic == -1].index)

toxic_train_data = train_data[(train_data['toxic'] == 1) | (train_data['severe_toxic'] == 1) | (train_data['obscene'] == 1) |
                       (train_data['threat'] == 1) | (train_data['insult'] == 1) | (train_data['identity_hate'] == 1)]

toxic_test_data = test_merged[(test_merged['toxic'] == 1) | (test_merged['severe_toxic'] == 1) | (test_merged['obscene'] == 1) |
                       (test_merged['threat'] == 1) | (test_merged['insult'] == 1) | (test_merged['identity_hate'] == 1)]

In [4]:
y_train = toxic_train_data.iloc[:, 2:]
X_train = toxic_train_data[['comment_text']]

y_test = toxic_test_data.iloc[:, 2:]
X_test = toxic_test_data[['comment_text']]

tvec = TfidfVectorizer(min_df=.0025, stop_words='english', strip_accents='unicode', analyzer='word', max_features=1000)
X_train = tvec.fit_transform(X_train['comment_text'])
X_test = tvec.transform(X_test['comment_text'])

In [5]:
classifier = BinaryRelevance(GaussianNB())
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print("GaussianNB")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("Hamming loss: " + str(hamming_loss(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))
print("LRAP: " + str(label_ranking_average_precision_score(y_test, y_pred.toarray())))

GaussianNB
Accuracy score: 0.038923594425756845
Hamming loss: 0.4663623258049015
F1 score: 0.6602677061947089
LRAP: 0.45383820990620427


In [6]:
classifier = BinaryRelevance(MultinomialNB())
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print("MultinomialNB")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("Hamming loss: " + str(hamming_loss(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))
print("LRAP: " + str(label_ranking_average_precision_score(y_test, y_pred.toarray())))

MultinomialNB
Accuracy score: 0.4015697581291046
Hamming loss: 0.13940947194190828
F1 score: 0.7858618158151908
LRAP: 0.7947329898375096


In [7]:
classifier = BinaryRelevance(LogisticRegression(solver='saga', random_state = 27, n_jobs = cores))
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print("LogisticRegression, solver = saga")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("Hamming loss: " + str(hamming_loss(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))
print("LRAP: " + str(label_ranking_average_precision_score(y_test, y_pred.toarray())))

LogisticRegression, solver = saga
Accuracy score: 0.44081371135671954
Hamming loss: 0.12552725719472477
F1 score: 0.818340057332087
LRAP: 0.8188458629220295


In [8]:
classifier = BinaryRelevance(RandomForestClassifier(random_state = 27, n_jobs = cores))
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print("RandomForestClassifier")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("Hamming loss: " + str(hamming_loss(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))
print("LRAP: " + str(label_ranking_average_precision_score(y_test, y_pred.toarray())))

RandomForestClassifier
Accuracy score: 0.4263975652731059
Hamming loss: 0.1288643280474131
F1 score: 0.8127047537104652
LRAP: 0.8087762293769047


In [9]:
classifier = BinaryRelevance(DecisionTreeClassifier(random_state = 27))
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print("DecisionTreeClassifier")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("Hamming loss: " + str(hamming_loss(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))
print("LRAP: " + str(label_ranking_average_precision_score(y_test, y_pred.toarray())))

DecisionTreeClassifier
Accuracy score: 0.34118212397885633
Hamming loss: 0.1565753644081371
F1 score: 0.7933657971451523
LRAP: 0.7612575417801286


In [10]:
classifier = BinaryRelevance(OneVsOneClassifier(LinearSVC(random_state = 27), n_jobs = cores))
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print("OneVsOneClassifier, LinearSVC")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("Hamming loss: " + str(hamming_loss(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))
print("LRAP: " + str(label_ranking_average_precision_score(y_test, y_pred.toarray())))

OneVsOneClassifier, LinearSVC
Accuracy score: 0.4411340701585776
Hamming loss: 0.12558065032836777
F1 score: 0.8222190384807935
LRAP: 0.8159572054033849


In [11]:
lp = LabelPowerset()
y_tmp = lp.transform(y_train)

counter = Counter(y_tmp)
for elem in counter.most_common():
    count = elem[1]
    while count < 6:
        count += 1
        counter.update({elem[0], 1})
        
ros = RandomOverSampler(sampling_strategy = counter, random_state = 27)
X_tmp, y_tmp = ros.fit_sample(X_train, y_tmp)

sm = SMOTE(random_state = 27, n_jobs = cores)
X_train_resampled, y_train_resampled = sm.fit_sample(X_tmp, y_tmp)

y_train_resampled = lp.inverse_transform(y_train_resampled)

  n_samples_majority,


In [12]:
classifier = BinaryRelevance(GaussianNB())
classifier.fit(X_train_resampled, y_train_resampled)
y_pred = classifier.predict(X_test)

print("GaussianNB resampled")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("Hamming loss: " + str(hamming_loss(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))
print("LRAP: " + str(label_ranking_average_precision_score(y_test, y_pred.toarray())))

GaussianNB resampled
Accuracy score: 0.06839660419670031
Hamming loss: 0.3742591702707032
F1 score: 0.6725697623426748
LRAP: 0.5182015857760663


In [13]:
classifier = BinaryRelevance(MultinomialNB())
classifier.fit(X_train_resampled, y_train_resampled)
y_pred = classifier.predict(X_test)

print("MultinomialNB resampled")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("Hamming loss: " + str(hamming_loss(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))
print("LRAP: " + str(label_ranking_average_precision_score(y_test, y_pred.toarray())))

MultinomialNB resampled
Accuracy score: 0.18164344065353197
Hamming loss: 0.23933472155480806
F1 score: 0.761220740250326
LRAP: 0.631666711160945


In [14]:
classifier = BinaryRelevance(LogisticRegression(solver='saga', random_state = 27, n_jobs = cores))
classifier.fit(X_train_resampled, y_train_resampled)
y_pred = classifier.predict(X_test)

print("LogisticRegression, solver = saga resampled")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("Hamming loss: " + str(hamming_loss(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))
print("LRAP: " + str(label_ranking_average_precision_score(y_test, y_pred.toarray())))

LogisticRegression, solver = saga resampled
Accuracy score: 0.19477815152971328
Hamming loss: 0.2288963639275989
F1 score: 0.7643645165666698
LRAP: 0.6464257746453782


In [None]:
classifier = BinaryRelevance(RandomForestClassifier(random_state = 27, n_jobs = cores))
classifier.fit(X_train_resampled, y_train_resampled)
y_pred = classifier.predict(X_test)

print("RandomForestClassifier resampled")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("Hamming loss: " + str(hamming_loss(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))
print("LRAP: " + str(label_ranking_average_precision_score(y_test, y_pred.toarray())))

In [None]:
classifier = BinaryRelevance(DecisionTreeClassifier(random_state = 27))
classifier.fit(X_train_resampled, y_train_resampled)
y_pred = classifier.predict(X_test)

print("DecisionTreeClassifier resampled")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("Hamming loss: " + str(hamming_loss(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))
print("LRAP: " + str(label_ranking_average_precision_score(y_test, y_pred.toarray())))

In [None]:
classifier = BinaryRelevance(OneVsOneClassifier(LinearSVC(random_state = 27), n_jobs = cores))
classifier.fit(X_train_resampled, y_train_resampled)
y_pred = classifier.predict(X_test)

print("OneVsOneClassifier, LinearSVC resampled")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("Hamming loss: " + str(hamming_loss(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))
print("LRAP: " + str(label_ranking_average_precision_score(y_test, y_pred.toarray())))