In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
from sklearn.feature_extraction.text import TfidfVectorizer 
from IPython.display import display
# from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset
import multiprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB
from collections import Counter
%matplotlib inline

In [2]:
# Read the three raw CSV inputs: the training table, the test comments, and
# the test labels (which live in a separate file).
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
test_labels = pd.read_csv("test_labels.csv")

# Join test comments with their labels on the shared 'id' column, then keep
# only the rows whose 'toxic' label is not -1.
test_merged = pd.merge(test_data, test_labels, on='id')
test_merged = test_merged[test_merged.toxic != -1]

In [3]:
# Number of CPU cores reported by the OS.
cores = multiprocessing.cpu_count()

# Label tables: every column from the third one onwards
# (presumably the first two are 'id' and 'comment_text' -- verify against the CSVs).
y_train = train_data.iloc[:, 2:]
y_test = test_merged.iloc[:, 2:]

# TF-IDF bag-of-words features. The vocabulary is learned from the training
# comments only and then re-used unchanged to encode the test comments.
tvec = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    strip_accents='unicode',
    min_df=.0025,
    max_features=1000,
)
X_train = tvec.fit_transform(train_data['comment_text'])
X_test = tvec.transform(test_merged['comment_text'])

In [4]:
y_test

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
5,0,0,0,0,0,0
7,0,0,0,0,0,0
11,0,0,0,0,0,0
13,0,0,0,0,0,0
14,0,0,0,0,0,0
...,...,...,...,...,...,...
153150,0,0,0,0,0,0
153151,0,0,0,0,0,0
153154,0,0,0,0,0,0
153155,1,0,1,0,1,0


In [5]:
y_train

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
159566,0,0,0,0,0,0
159567,0,0,0,0,0,0
159568,0,0,0,0,0,0
159569,0,0,0,0,0,0


In [6]:
y_test_as_np = y_test.to_numpy()

In [7]:
features = y_train.columns

In [8]:
for i, f in enumerate(features):
    print(i, f)

0 toxic
1 severe_toxic
2 obscene
3 threat
4 insult
5 identity_hate


In [9]:
def change_y_to_0_stage_version(y):
    """Collapse a multi-label indicator table into one binary label per row.

    Parameters
    ----------
    y : pandas.DataFrame or 2-D array-like
        One row per sample, one column per label.

    Returns
    -------
    list of int
        ``1`` for each row that holds at least one truthy (non-zero) entry,
        ``0`` for each row whose entries are all zero.
    """
    # Vectorised over the whole table instead of a Python-level ``iterrows``
    # loop, which is very slow on tables with many rows. ``np.asarray`` also
    # lets plain 2-D arrays be passed in, not just DataFrames.
    has_any_label = np.asarray(y).astype(bool).any(axis=1)
    return has_any_label.astype(int).tolist()

In [10]:
class TwoStageClassifier:
    """Two-stage multi-label classifier.

    Stage 0 is a single binary classifier that decides whether a sample
    carries any label at all. Stage 1 holds one binary classifier per label
    column; these are trained only on the training samples that have at least
    one label. A test sample that stage 0 marks as label-free gets an all-zero
    row; every other test sample gets the stage-1 predictions.

    Fitting and predicting happen lazily on the first ``predict*`` call, and
    the results are cached until a classifier is replaced through
    ``set_clf0`` / ``set_clfs``.

    Parameters
    ----------
    X_train, X_test
        Feature matrices exposing ``.shape`` and supporting row selection
        with an integer index array (e.g. NumPy arrays or SciPy CSR matrices).
    y_train : pandas.DataFrame or 2-D numpy.ndarray
        Indicator table with one column per label.
    clf0 : estimator with ``fit`` / ``predict``, optional
        Stage-0 classifier. Defaults to ``SVC()``.
    clfs : list of estimators with ``fit`` / ``predict``, optional
        Stage-1 classifiers, one per label column. Defaults to one ``SVC()``
        per column of ``y_train``.
    """

    def __init__(self, X_train, X_test, y_train, clf0=None, clfs=None):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train

        if clf0 is None:
            clf0 = SVC()
        if clfs is None:
            clfs = [SVC() for _ in range(self.y_train.shape[1])]

        self.clf0 = clf0
        self.clfs = clfs

        # Cached predictions; ``None`` means "not computed yet".
        self.y_pred_0_stage = None
        self.y_pred_1_stage = None

    def set_clf0(self, clf0):
        """Replace the stage-0 classifier and drop all cached predictions."""
        self.clf0 = clf0
        self._clear_y_pred()

    def set_clfs(self, clfs):
        """Replace the stage-1 classifiers and drop all cached predictions."""
        self.clfs = clfs
        self._clear_y_pred()

    def predict(self):
        """Return the final (stage-1) predictions; alias of ``predict_1_stage``."""
        return self.predict_1_stage()

    def predict_1_stage(self):
        """Return the ``(n_test, n_labels)`` prediction array, computing it on first use."""
        if self.y_pred_1_stage is None:
            self._proceed_1_stage()
        return self.y_pred_1_stage

    def predict_0_stage(self):
        """Return the stage-0 predictions for ``X_test``, computing them on first use."""
        if self.y_pred_0_stage is None:
            self._proceed_0_stage()
        return self.y_pred_0_stage

    def _proceed_1_stage(self):
        """Fit the per-label classifiers and assemble the final prediction array.

        Raises
        ------
        ValueError
            If the number of stage-1 classifiers differs from the number of
            label columns in ``y_train``.
        """
        if self.y_pred_0_stage is None:
            self._proceed_0_stage()

        labels = np.asarray(self.y_train)
        n_labels = labels.shape[1]
        if len(self.clfs) != n_labels:
            raise ValueError(
                "Expected %d stage-1 classifiers (one per label column), got %d"
                % (n_labels, len(self.clfs))
            )

        # Select labelled training rows by POSITION. The feature matrix is
        # indexed positionally, so using the DataFrame's index labels here
        # would pick the wrong rows (or fail) whenever y_train does not have
        # a default 0..n-1 index.
        labelled_rows = np.flatnonzero(labels.astype(bool).any(axis=1))
        X_train_1_stage = self.X_train[labelled_rows]
        y_train_1_stage = labels[labelled_rows]

        for j, clf in enumerate(self.clfs):
            clf.fit(X_train_1_stage, y_train_1_stage[:, j])

        # Rows that stage 0 marked as label-free stay all-zero; only the
        # remaining rows are handed to the stage-1 classifiers.
        y_pred = np.zeros((self.X_test.shape[0], n_labels))
        flagged_rows = np.flatnonzero(np.asarray(self.y_pred_0_stage) != 0)
        if flagged_rows.size:
            X_flagged = self.X_test[flagged_rows]
            for j, clf in enumerate(self.clfs):
                y_pred[flagged_rows, j] = clf.predict(X_flagged)

        self.y_pred_1_stage = y_pred

    def _proceed_0_stage(self):
        """Fit the stage-0 classifier on "has any label" targets and predict ``X_test``."""
        # change_y_to_0_stage_version is a module-level helper that maps each
        # label row to 1 if any label is set and 0 otherwise.
        y_train_0_stage = change_y_to_0_stage_version(self.y_train)
        self.clf0.fit(self.X_train, y_train_0_stage)
        self.y_pred_0_stage = self.clf0.predict(self.X_test)

    def _clear_y_pred(self):
        """Invalidate both cached prediction arrays."""
        self.y_pred_0_stage = None
        self.y_pred_1_stage = None

In [11]:
def print_predicted_values_statistics(y_test, y_pred, title="Classification results:", algorithm=None):
    """Print accuracy, weighted F1 and weighted recall for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.
    title : str
        Heading printed before the metrics.
    algorithm : object, optional
        When given, printed on a "Used algorithm:" line under the heading.
    """
    print(title)
    if algorithm is not None:
        print("Used algorithm:", algorithm)

    # (line prefix, metric function, extra keyword arguments); each metric is
    # computed and printed in turn.
    metric_specs = (
        ("Accuracy score: ", accuracy_score, {}),
        ("F1 score: ", f1_score, {"average": 'weighted'}),
        ("Recall score: ", recall_score, {"average": 'weighted'}),
    )
    for prefix, metric, options in metric_specs:
        print(prefix + str(metric(y_test, y_pred, **options)))

In [12]:
def percent_of_labels_correctly_predicted(y_test, y_pred):
    """Return the fraction of individual label entries that were predicted correctly.

    This equals ``1.0 - Hamming loss``: the number of matching entries
    divided by the total number of entries.

    Parameters
    ----------
    y_test, y_pred : array-like
        Ground-truth and predicted label arrays of identical shape.

    Returns
    -------
    float
        Fraction in ``[0, 1]`` of entries where ``y_test == y_pred``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    truth = np.asarray(y_test)
    predicted = np.asarray(y_pred)
    # Without this check, shapes such as (n, 1) vs (n,) would broadcast to an
    # (n, n) comparison and silently yield a meaningless score.
    if truth.shape != predicted.shape:
        raise ValueError(
            "y_test and y_pred must have the same shape, got %s and %s"
            % (truth.shape, predicted.shape)
        )
    return np.sum(truth == predicted) / truth.size

In [13]:
def test_two_stage_classifier_with_algorithms(clf0=None, clfs=None):
    """Run a TwoStageClassifier on the notebook's data and print its test metrics.

    Uses the module-level ``X_train``, ``X_test``, ``y_train``, ``y_test``,
    ``y_test_as_np`` and ``features``.

    Parameters
    ----------
    clf0 : estimator, optional
        Stage-0 classifier; ``None`` keeps the classifier's default.
    clfs : list of estimators, optional
        Stage-1 classifiers, one per label; ``None`` keeps the defaults.

    Returns
    -------
    TwoStageClassifier
        The classifier after both stages have been evaluated.
    """
    # ``None`` arguments make the constructor fall back to its own defaults.
    my_classifier = TwoStageClassifier(
        X_train=X_train, X_test=X_test, y_train=y_train, clf0=clf0, clfs=clfs
    )

    # Stage 0: "does this comment carry any label at all?"
    stage0_pred = my_classifier.predict_0_stage()
    stage0_truth = change_y_to_0_stage_version(y_test)
    print_predicted_values_statistics(
        y_test=stage0_truth,
        y_pred=stage0_pred,
        title="Test data - first stage classification",
        algorithm=my_classifier.clf0,
    )

    # Final predictions, reported one label column at a time.
    final_pred = my_classifier.predict()
    for col, label_name in enumerate(features):
        print_predicted_values_statistics(
            y_test=y_test_as_np[:, col],
            y_pred=final_pred[:, col],
            title="Tested category: " + str(label_name),
            algorithm=my_classifier.clfs[col],
        )
        print()

    subset_accuracy = accuracy_score(y_test_as_np, final_pred)
    print("Overall accuracy (subset) score (exact match ratio):", subset_accuracy)
    print()
    label_accuracy = percent_of_labels_correctly_predicted(y_test_as_np, final_pred)
    print("Total percent of labels correctly predicted:", label_accuracy)
    return my_classifier

In [14]:
random_state = 44

In [15]:
svc_clf0 = SVC(random_state=random_state)
svc_clfs = [SVC(random_state=random_state) for f in features]

In [16]:
svc_for_both_stages_classifier = test_two_stage_classifier_with_algorithms(clf0=svc_clf0, clfs=svc_clfs)

Test data - first stage classification
Used algorithm: SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=44, shrinking=True, tol=0.001,
    verbose=False)
Accuracy score: 0.9257244677858014
F1 score: 0.9247995559503928
Recall score: 0.9257244677858014
Tested category: toxic
Used algorithm: SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=44, shrinking=True, tol=0.001,
    verbose=False)
Accuracy score: 0.926209009346963
F1 score: 0.9256988284363842
Recall score: 0.926209009346963

Tested category: severe_toxic
Used algorithm: SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, proba

In [17]:
rf_clf0 = RandomForestClassifier(random_state=random_state)
rf_clfs = [RandomForestClassifier(random_state=random_state) for f in features]

In [18]:
rf_for_both_stages_classifier = test_two_stage_classifier_with_algorithms(clf0=rf_clf0, clfs=rf_clfs)

Test data - first stage classification
Used algorithm: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=44, verbose=0,
                       warm_start=False)
Accuracy score: 0.918847103691894
F1 score: 0.9201532462516301
Recall score: 0.918847103691894
Tested category: toxic
Used algorithm: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min

In [19]:
mnb_clf0 = MultinomialNB()
mnb_clfs = [MultinomialNB() for f in features]

In [20]:
mnb_for_both_stages_classifier = test_two_stage_classifier_with_algorithms(clf0=mnb_clf0, clfs=mnb_clfs)

Test data - first stage classification
Used algorithm: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Accuracy score: 0.9321641814373691
F1 score: 0.9255215267541723
Recall score: 0.9321641814373691
Tested category: toxic
Used algorithm: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Accuracy score: 0.9334927631373284
F1 score: 0.927330806684636
Recall score: 0.9334927631373284

Tested category: severe_toxic
Used algorithm: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Accuracy score: 0.9942949138766451
F1 score: 0.9917449664997433
Recall score: 0.9942949138766451

Tested category: obscene
Used algorithm: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Accuracy score: 0.9595173340835912
F1 score: 0.9581363001822835
Recall score: 0.9595173340835912

Tested category: threat
Used algorithm: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Accuracy score: 0.9967019913095126
F1 score: 0.9950866521138687
Recall score: 0.99670199130951

In [21]:
lr_clf0 = LogisticRegression(solver='saga', random_state=random_state)
lr_clfs = [LogisticRegression(solver='saga', random_state=random_state) for f in features]

In [22]:
lr_for_both_stages_classifier = test_two_stage_classifier_with_algorithms(clf0=lr_clf0, clfs=lr_clfs)

Test data - first stage classification
Used algorithm: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=44, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)
Accuracy score: 0.9258807715152083
F1 score: 0.9249521720362812
Recall score: 0.9258807715152083
Tested category: toxic
Used algorithm: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=44, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)
Accuracy score: 0.9262715308387258
F1 score: 0.9257561745313889
Recall score: 0.9262715308387258

Tested category: severe_toxic
Used algorithm: LogisticRegression(

In [11]:
my_classifier = TwoStageClassifier(X_train=X_train, X_test=X_test, y_train=y_train)

In [12]:
clf0 = SVC()

In [13]:
my_classifier.set_clf0(clf0)

In [18]:
clfs = [SVC() for f in features]

In [19]:
my_classifier.set_clfs(clfs)

In [None]:
y_pred_0_stage = my_classifier.predict_0_stage()

In [None]:
y_test_0_stage = change_y_to_0_stage_version(y_test)

In [None]:
# Score the first-stage predictions against the ground truth. Passing the
# ground truth as y_pred as well would always report perfect metrics.
print_predicted_values_statistics(y_test=y_test_0_stage, y_pred=y_pred_0_stage, title="Test data - first stage classification", algorithm=my_classifier.clf0)

In [None]:
y_pred_as_np = my_classifier.predict()

In [None]:
# Report the metrics separately for every label column.
for (i, f) in enumerate(features):
    y_test_f = y_test_as_np[:,i]    # ground-truth column for label f
    y_pred_f = y_pred_as_np[:,i]    # predicted column for label f
    print_predicted_values_statistics(y_test=y_test_f, y_pred=y_pred_f, title="Tested category: "+str(f), algorithm=my_classifier.clfs[i])
    print()
    
# Score computed over the full label arrays at once (labelled "exact match ratio" below).
print("Overall accuracy (subset) score (exact match ratio):", accuracy_score(y_test_as_np, y_pred_as_np))
print()
# Share of individual label entries that match the ground truth.
print("Total percent of labels correctly predicted:", percent_of_labels_correctly_predicted(y_test_as_np, y_pred_as_np))