In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
import multiprocessing
from sklearn.feature_extraction.text import TfidfVectorizer 
from IPython.display import display
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, hamming_loss, label_ranking_average_precision_score
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from skmultilearn.problem_transform import BinaryRelevance
from collections import Counter
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix

%matplotlib inline
cores = multiprocessing.cpu_count()

In [3]:
test_data = pd.read_csv("test.csv")
test_labels = pd.read_csv("test_labels.csv")
train_data = pd.read_csv("train.csv")
test_merged = pd.merge(test_data, test_labels,on='id')

test_merged = test_merged.drop(test_merged[test_merged.toxic == -1].index)

toxic_train_data = train_data[(train_data['toxic'] == 1) | (train_data['severe_toxic'] == 1) | (train_data['obscene'] == 1) |
                       (train_data['threat'] == 1) | (train_data['insult'] == 1) | (train_data['identity_hate'] == 1)]

toxic_test_data = test_merged[(test_merged['toxic'] == 1) | (test_merged['severe_toxic'] == 1) | (test_merged['obscene'] == 1) |
                       (test_merged['threat'] == 1) | (test_merged['insult'] == 1) | (test_merged['identity_hate'] == 1)]

In [4]:
y_train = toxic_train_data.iloc[:, 2:]
X_train = toxic_train_data[['comment_text']]

y_test = toxic_test_data.iloc[:, 2:]
X_test = toxic_test_data[['comment_text']]

tvec = TfidfVectorizer(min_df=.0025, stop_words='english', strip_accents='unicode', analyzer='word', max_features=1000)
X_train = tvec.fit_transform(X_train['comment_text'])
X_test = tvec.transform(X_test['comment_text'])

X_train = lil_matrix(X_train).toarray()
y_train = lil_matrix(y_train).toarray()
X_test = lil_matrix(X_test).toarray()

In [6]:
classifier = RandomForestClassifier(random_state = 27, n_jobs = cores)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print("MLkNN")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("Hamming loss: " + str(hamming_loss(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))
print("LRAP: " + str(label_ranking_average_precision_score(y_test, y_pred)))

MLkNN
Accuracy score: 0.4300816914944738
Hamming loss: 0.13185434353142186
F1 score: 0.8015773558774071
LRAP: 0.8115061758057941


In [8]:
classifier = DecisionTreeClassifier(random_state = 27)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print("MLkNN")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("Hamming loss: " + str(hamming_loss(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))
print("LRAP: " + str(label_ranking_average_precision_score(y_test, y_pred)))

MLkNN
Accuracy score: 0.36777190453307707
Hamming loss: 0.15748304768006835
F1 score: 0.7860626651935952
LRAP: 0.7730135529570901
