In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
from sklearn.feature_extraction.text import TfidfVectorizer 
from IPython.display import display
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset
import multiprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB
from collections import Counter
%matplotlib inline

In [None]:
# Read the training set, the test comments, and the separately stored test labels.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
test_labels = pd.read_csv("test_labels.csv")

# Attach each test comment's labels by joining the two test files on 'id'.
test_merged = test_data.merge(test_labels, on='id')

# Remove every merged test row whose 'toxic' value is -1.
rows_to_drop = test_merged.index[test_merged.toxic == -1]
test_merged = test_merged.drop(rows_to_drop)

In [None]:
# CPU count as reported by the multiprocessing module.
cores = multiprocessing.cpu_count()

# Targets: every column from position 2 onward of the train / merged-test frames.
y_train = train_data.iloc[:, 2:]
y_test = test_merged.iloc[:, 2:]

# (A LabelPowerset transform of y_train / y_test is left disabled here.)

# TF-IDF over word tokens with English stop words removed and accents stripped.
# The vocabulary is fitted on the training comments only and reused for the test comments.
tvec = TfidfVectorizer(
    analyzer='word',
    strip_accents='unicode',
    stop_words='english',
    min_df=.0025,
    max_features=1000,
)
X_train = tvec.fit_transform(train_data['comment_text'])
X_test = tvec.transform(test_merged['comment_text'])

In [31]:
# Display the test label frame (columns of test_merged from position 2 onward).
y_test

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
5,0,0,0,0,0,0
7,0,0,0,0,0,0
11,0,0,0,0,0,0
13,0,0,0,0,0,0
14,0,0,0,0,0,0
...,...,...,...,...,...,...
153150,0,0,0,0,0,0
153151,0,0,0,0,0,0
153154,0,0,0,0,0,0
153155,1,0,1,0,1,0


In [32]:
# Display the training label frame (columns of train_data from position 2 onward).
y_train

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
159566,0,0,0,0,0,0
159567,0,0,0,0,0,0
159568,0,0,0,0,0,0
159569,0,0,0,0,0,0


In [None]:
class MyClassifier:
    """Plain holder for two attributes, ``clf0`` and ``clfs``.

    The class only stores whatever objects it is given; it does no fitting
    or prediction itself.

    Attributes:
        clf0: object passed as ``clf0`` (``None`` unless supplied).
        clfs: object passed as ``clfs`` (``None`` unless supplied).
    """

    def __init__(self, clf0=None, clfs=None):
        """Store both arguments by routing them through the setters."""
        self.set_clf0(clf0)
        self.set_clfs(clfs)

    def set_clf0(self, clf0):
        """Replace the stored ``clf0``."""
        self.clf0 = clf0

    def set_clfs(self, clfs):
        """Replace the stored ``clfs``."""
        self.clfs = clfs

In [None]:
# Empty holder: clf0 and clfs both start as None (the constructor defaults).
my_classifier = MyClassifier()

In [None]:
# First-stage estimator, constructed with SVC's default arguments.
clf0 = SVC()

In [None]:
# Stage-0 training target: 1 when a comment has at least one truthy (non-zero) label, else 0.
# The check runs as one vectorised pass over the label matrix instead of iterrows(),
# which builds a Series object for every single row.
y_train_0_stage = (y_train.to_numpy() != 0).any(axis=1).astype(int).tolist()

In [None]:
# Stage-0 test target: 1 when a comment has at least one truthy (non-zero) label, else 0.
# The check runs as one vectorised pass over the label matrix instead of iterrows(),
# which builds a Series object for every single row.
y_test_0_stage = (y_test.to_numpy() != 0).any(axis=1).astype(int).tolist()

In [None]:
# Fit the first-stage estimator on the TF-IDF training matrix against the 0/1 stage-0 target.
clf0.fit(X_train, y_train_0_stage)

In [None]:
# First-stage prediction for every row of the TF-IDF test matrix.
y_pred_0_stage = clf0.predict(X_test)

In [160]:
# Score the first-stage predictions against the stage-0 test target.
# NOTE(review): for a single-label target, recall with average='weighted' is
# arithmetically the same number as accuracy, so the first and third lines
# report one quantity; it does not isolate performance on the positive class.
print("Test data - first stage classification")
stage0_accuracy = accuracy_score(y_test_0_stage, y_pred_0_stage)
stage0_f1 = f1_score(y_test_0_stage, y_pred_0_stage, average = 'weighted')
stage0_recall = recall_score(y_test_0_stage, y_pred_0_stage, average = 'weighted')
print("Accuracy score: " + str(stage0_accuracy))
print("F1 score: " + str(stage0_f1))
print("Recall score: " + str(stage0_recall))

Test data - first stage classification
Accuracy score: 0.9257244677858014
F1 score: 0.9247995559503928
Recall score: 0.9257244677858014


In [72]:
# Column names of the test label frame (one per category) and how many there are.
features = y_test.columns
features_len = len(features)

In [49]:
# Test labels as a NumPy array; used as the ground truth in the final scoring cell.
y_test_as_np = y_test.to_numpy()

In [50]:
# Prediction matrix with the same shape as the test labels, initialised to zeros;
# its rows are filled in once both stages have produced predictions.
y_pred_as_np = np.zeros(y_test.shape)

In [73]:
# One independent, default-argument SVC per label column.
clfs = [SVC() for _ in range(features_len)]

In [113]:
# Second-stage training subset: only the comments that carry at least one truthy label.
# The selection is vectorised instead of walking the frame with iterrows().
has_any_label = (y_train.to_numpy() != 0).any(axis=1)

# Positional row numbers of the selected comments. These are used to slice the TF-IDF
# matrix, which is addressed by position, so positions are taken directly rather than
# index labels (the two coincide only when y_train has a default 0..n-1 index).
iis = np.flatnonzero(has_any_label).tolist()

# One label sequence per category (outer length == number of label columns),
# restricted to the selected rows and kept in their original order.
y_train_1_stages = list(y_train.to_numpy()[has_any_label].T)

In [114]:
# Stack the per-category label sequences into one array (one row per category).
y_train_1_stages = np.array(y_train_1_stages)

In [115]:
# Rows of the TF-IDF training matrix selected by iis. The matrix is sliced by row
# position, so iis is assumed to hold positional row numbers
# — TODO confirm y_train's index is the default 0..n-1 range.
X_train_1_stage = X_train[iis]

In [116]:
# Sanity check: the number of selected rows in the feature matrix should equal the
# second dimension of the (category x row) label array.
X_train_1_stage.shape, y_train_1_stages.shape

((16225, 1000), (6, 16225))

In [117]:
# Train classifier number i on row i of the second-stage label array,
# all against the same second-stage feature subset.
for i, clf in enumerate(clfs):
    stage1_target = y_train_1_stages[i]
    clf.fit(X_train_1_stage, stage1_target)

In [118]:
# Every second-stage classifier predicts on the full test matrix;
# entry k of the list holds the predictions for label column k.
y_pred_1_stages = [clf.predict(X_test) for clf in clfs]

In [143]:
# Stack the per-category prediction vectors into one array (row k = label column k).
y_pred_1_stages = np.array(y_pred_1_stages)

In [144]:
# Inspect the stacked second-stage predictions.
y_pred_1_stages

array([[1, 1, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [159]:
# Combine the two stages for every test comment:
#   * first stage predicted 0 (no label at all) -> the whole row is 0;
#   * otherwise -> the row takes the second-stage predictions for that comment
#     (y_pred_1_stages is laid out category x comment, hence the transpose).
# This is a single vectorised assignment rather than a Python loop over every test
# row; writing through [:] fills the existing y_pred_as_np array in place.
stage0_positive = np.asarray(y_pred_0_stage) != 0
y_pred_as_np[:] = np.where(stage0_positive[:, None], np.asarray(y_pred_1_stages).T, 0)

In [166]:
def percent_of_labels_correctly_predicted(y_test, y_pred):
    """Return the fraction of individual label entries on which the inputs agree.

    This is ``1.0 - Hamming loss``: the number of positions where the two
    inputs are equal, divided by the total number of positions.

    Args:
        y_test: array-like of true labels (NumPy array, nested list, ...).
        y_pred: array-like of predicted labels with the same shape as ``y_test``.

    Returns:
        The agreeing fraction, between 0 and 1. Empty input yields ``nan``.

    Raises:
        ValueError: if the two inputs do not have the same shape. Without this
            check broadcasting could silently produce a meaningless ratio.
    """
    # Coerce first so plain lists work too (lists have no .size attribute).
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)
    if y_test.shape != y_pred.shape:
        raise ValueError(
            "y_test and y_pred must have the same shape, got "
            + str(y_test.shape) + " and " + str(y_pred.shape)
        )
    return np.sum(y_test == y_pred) / y_test.size

In [168]:
# Per-category report: column i of the truth and prediction matrices is scored
# as its own binary problem.
# NOTE(review): for a single-label target, recall with average='weighted' is
# arithmetically the same number as accuracy, so the "Recall score" line repeats
# the "Accuracy score" line rather than describing the positive class.
for i, f in enumerate(features):
    y_test_f, y_pred_f = y_test_as_np[:, i], y_pred_as_np[:, i]
    category_accuracy = accuracy_score(y_test_f, y_pred_f)
    category_f1 = f1_score(y_test_f, y_pred_f, average = 'weighted')
    category_recall = recall_score(y_test_f, y_pred_f, average = 'weighted')
    print("Tested category:", f)
    print("Accuracy score: " + str(category_accuracy))
    print("F1 score: " + str(category_f1))
    print("Recall score: " + str(category_recall))
    print()

# Whole-matrix scores: accuracy_score over the full label matrices, followed by the
# fraction of individual label entries that match.
exact_match_ratio = accuracy_score(y_test_as_np, y_pred_as_np)
print("Overall accuracy (subset) score (exact match ratio):", exact_match_ratio)
print()
label_agreement = percent_of_labels_correctly_predicted(y_test_as_np, y_pred_as_np)
print("Total percent of labels correctly predicted:", label_agreement)

Tested category: toxic
Accuracy score: 0.926209009346963
F1 score: 0.9256988284363842
Recall score: 0.926209009346963

Tested category: severe_toxic
Accuracy score: 0.9939666760448904
F1 score: 0.9920031945437757
Recall score: 0.9939666760448904

Tested category: obscene
Accuracy score: 0.9569070618024946
F1 score: 0.9566656347288859
Recall score: 0.9569070618024946

Tested category: threat
Accuracy score: 0.9964987964612836
F1 score: 0.996012011839632
Recall score: 0.9964987964612836

Tested category: insult
Accuracy score: 0.9536403138578886
F1 score: 0.9531235611773381
Recall score: 0.9536403138578886

Tested category: identity_hate
Accuracy score: 0.9891525211791553
F1 score: 0.9866333784140313
Recall score: 0.9891525211791553

Overall accuracy (subset) score (exact match ratio): 0.8925880771515209

Total percent of labels correctly predicted: 0.9693957297821126
