In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
from sklearn.feature_extraction.text import TfidfVectorizer 
from IPython.display import display
# from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset
import multiprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from collections import Counter
%matplotlib inline

In [6]:
# Read the three raw CSV files: training set, test comments and test labels.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
test_labels = pd.read_csv("test_labels.csv")

# Attach the labels to the test comments through the shared `id` column, then
# discard every row whose `toxic` label is -1.
# NOTE(review): -1 presumably marks rows without a usable label - confirm
# against the dataset description.
test_merged = pd.merge(test_data, test_labels, on='id')
rows_to_drop = test_merged.index[test_merged.toxic == -1]
test_merged = test_merged.drop(rows_to_drop)

In [3]:
import re
import string
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def lowercase(input_str):
    """Return a lower-cased copy of ``input_str``."""
    lowered = input_str.lower()
    return lowered

def remove_numbers(input_str):
    """Return ``input_str`` with every run of digits deleted."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', input_str)

def remove_punctuation(input_str):
    """Return ``input_str`` with each ``string.punctuation`` character replaced by a space.

    The replacement is one-for-one, so the length of the text is unchanged.
    """
    to_space = {ord(symbol): ' ' for symbol in string.punctuation}
    return input_str.translate(to_space)

def remove_whitespaces(input_str):
    """Return ``input_str`` without leading and trailing whitespace.

    Whitespace inside the text is left as it is.
    """
    trimmed = input_str.strip()
    return trimmed

def tokenize(input_str):
    """Split ``input_str`` into tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(input_str)
    return tokens

def porter_stemming(input_str):
    """Return a list with the ``PorterStemmer`` stem of every item in ``input_str``.

    Despite the parameter name, the argument is iterated item by item; the
    notebook passes it a list of tokens.
    """
    stem = PorterStemmer().stem
    return list(map(stem, input_str))

def snowball_stemming(input_str):
    """Return a list with the English ``SnowballStemmer`` stem of every item in ``input_str``.

    Despite the parameter name, the argument is iterated item by item; the
    notebook passes it a list of tokens.
    """
    stemmer = SnowballStemmer("english")
    stems = []
    for token in input_str:
        stems.append(stemmer.stem(token))
    return stems

def lemmatization(input_str):
    """Return a list with the ``WordNetLemmatizer`` lemma of every item in ``input_str``.

    Despite the parameter name, the argument is iterated item by item; the
    notebook passes it a list of tokens.
    """
    lemmatize_word = WordNetLemmatizer().lemmatize
    return [lemmatize_word(token) for token in input_str]

def lancaster_stemming(input_str):
    """Return a list with the ``LancasterStemmer`` stem of every item in ``input_str``.

    Despite the parameter name, the argument is iterated item by item; the
    notebook passes it a list of tokens.
    """
    stem = LancasterStemmer().stem
    return list(map(stem, input_str))

def remove_stop_words(input_str):
    """Return the items of ``input_str`` that are not in ``ENGLISH_STOP_WORDS``.

    Order is preserved. Despite the parameter name, the argument is iterated
    item by item; the notebook passes it a list of tokens.
    """
    kept = []
    for token in input_str:
        if token in ENGLISH_STOP_WORDS:
            continue
        kept.append(token)
    return kept

def preprocessing(dataframe):
    """Clean and tokenize the ``comment_text`` column of ``dataframe``.

    Every comment is lower-cased, stripped of digits, has its punctuation
    replaced by spaces, is trimmed, tokenized and finally filtered of English
    stop words, so each cell ends up holding a list of tokens.

    The column is replaced on ``dataframe`` itself (callers that need the
    original frame must pass a copy) and the same object is returned.
    """
    def clean(text):
        # Same step order as before: normalise the string, then tokenize,
        # then drop stop words from the token list.
        text = lowercase(text)
        text = remove_numbers(text)
        text = remove_punctuation(text)
        text = remove_whitespaces(text)
        return remove_stop_words(tokenize(text))

    # Assign the transformed column back explicitly. Writing into the rows
    # yielded by ``iterrows()`` is not guaranteed to reach the frame: pandas
    # may hand out copies, in which case the cleaning is silently lost.
    dataframe['comment_text'] = dataframe['comment_text'].apply(clean)
    return dataframe
    
def porter_stem(dataframe):
    """Return a preprocessed copy of ``dataframe`` whose tokens are Porter-stemmed.

    ``dataframe`` is copied first, so the caller's frame is not modified.
    The copy goes through ``preprocessing`` and then each cell of
    ``comment_text`` is passed to ``porter_stemming``.
    """
    stemmed = preprocessing(dataframe.copy())
    # Write the column back explicitly; assignments to rows produced by
    # ``iterrows()`` may land on copies and never reach the frame.
    stemmed['comment_text'] = stemmed['comment_text'].apply(porter_stemming)
    return stemmed
    
def snowball_stem(dataframe):
    """Return a preprocessed copy of ``dataframe`` whose tokens are Snowball-stemmed.

    ``dataframe`` is copied first, so the caller's frame is not modified.
    The copy goes through ``preprocessing`` and then each cell of
    ``comment_text`` is passed to ``snowball_stemming``.
    """
    stemmed = preprocessing(dataframe.copy())
    # Write the column back explicitly; assignments to rows produced by
    # ``iterrows()`` may land on copies and never reach the frame.
    stemmed['comment_text'] = stemmed['comment_text'].apply(snowball_stemming)
    return stemmed

def lemmatize(dataframe):
    """Return a preprocessed copy of ``dataframe`` whose tokens are lemmatized.

    ``dataframe`` is copied first, so the caller's frame is not modified.
    The copy goes through ``preprocessing`` and then each cell of
    ``comment_text`` is passed to ``lemmatization``.
    """
    lemmatized = preprocessing(dataframe.copy())
    # Write the column back explicitly; assignments to rows produced by
    # ``iterrows()`` may land on copies and never reach the frame.
    lemmatized['comment_text'] = lemmatized['comment_text'].apply(lemmatization)
    return lemmatized

def lancaster_stem(dataframe):
    """Return a preprocessed copy of ``dataframe`` whose tokens are Lancaster-stemmed.

    ``dataframe`` is copied first, so the caller's frame is not modified.
    The copy goes through ``preprocessing`` and then each cell of
    ``comment_text`` is passed to ``lancaster_stemming``.
    """
    stemmed = preprocessing(dataframe.copy())
    # Write the column back explicitly; assignments to rows produced by
    # ``iterrows()`` may land on copies and never reach the frame.
    stemmed['comment_text'] = stemmed['comment_text'].apply(lancaster_stemming)
    return stemmed


In [7]:
# Number of CPU cores reported by the machine (not used in the cells shown here).
cores = multiprocessing.cpu_count()

# Features are the single `comment_text` column (kept as a DataFrame); the
# label matrix is every column from position 2 onward.
X_train, y_train = train_data[['comment_text']], train_data.iloc[:, 2:]
X_test, y_test = test_merged[['comment_text']], test_merged.iloc[:, 2:]

def identity_tokenizer(text):
    """Return ``text`` unchanged.

    Passed to ``TfidfVectorizer`` as its ``tokenizer`` so that the vectorizer
    uses the value it receives as-is instead of splitting it again.
    """
    return text

# `lemmatize` is expected to leave a list of tokens in every `comment_text`
# cell, so the vectorizer gets a pass-through tokenizer; lower-casing is
# switched off here because `preprocessing` already lower-cases the text.
tvec = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False)
# NOTE: from here on X_train / X_test are rebound from DataFrames to the
# matrices returned by the vectorizer.
X_train = lemmatize(X_train)
# The vectorizer is fitted on the training comments only ...
X_train = tvec.fit_transform(X_train['comment_text'])
X_test = lemmatize(X_test)
# ... and the already-fitted vectorizer is then applied to the test comments.
X_test = tvec.transform(X_test['comment_text'])

In [8]:
# NumPy view of the test labels, used later for per-column slicing and
# element-wise comparison with the predicted label matrix.
y_test_as_np = y_test.to_numpy()

In [9]:
# Label column names (despite the name, these are the target columns of
# y_train, not input features); their order defines the column order of the
# prediction matrices.
features = y_train.columns

In [10]:
def change_y_to_0_stage_version(y):
    """Collapse a label DataFrame into a binary "has any label" target.

    Returns a list with one entry per row of ``y``: 1 when at least one value
    in the row is truthy, 0 otherwise.
    """
    return [int(any(row_values)) for row_values in y.values]

In [11]:
class TwoStageClassifier:
    """Two-stage multi-label classifier bound to fixed train/test data.

    Stage 0 trains ``clf0`` on a binary target ("does the sample carry at
    least one label?"). Stage 1 trains one classifier per label column, using
    only the training samples that carry at least one label. The final
    prediction for a test sample is all zeros when stage 0 predicts 0, and
    the per-label stage-1 predictions otherwise.

    Predictions are computed lazily on first request and cached; replacing a
    classifier through ``set_clf0`` / ``set_clfs`` discards the cache.

    Parameters
    ----------
    X_train, X_test
        Feature matrices exposing ``.shape`` and supporting row selection
        with an integer array.
    y_train : pandas.DataFrame
        One column per label; a truthy cell means the label is set.
    clf0 : estimator, optional
        Stage-0 classifier with ``fit`` / ``predict``. Defaults to ``SVC()``.
    clfs : list of estimators, optional
        One stage-1 classifier per column of ``y_train``. Defaults to a list
        of ``SVC()`` instances.
    """

    def __init__(self, X_train, X_test, y_train, clf0=None, clfs=None):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train

        if clf0 is None:
            clf0 = SVC()
        if clfs is None:
            clfs = [SVC() for i in range(self.y_train.shape[1])]

        self.clf0 = clf0
        self.clfs = clfs

        # Cached predictions; None means "not computed yet".
        self.y_pred_0_stage = None
        self.y_pred_1_stage = None

    def set_clf0(self, clf0):
        """Replace the stage-0 classifier and drop cached predictions."""
        self.clf0 = clf0
        self._clear_y_pred()

    def set_clfs(self, clfs):
        """Replace the stage-1 classifiers and drop cached predictions."""
        self.clfs = clfs
        self._clear_y_pred()

    def predict(self):
        """Return the final label matrix (alias of ``predict_1_stage``)."""
        return self.predict_1_stage()

    def predict_1_stage(self):
        """Return the final ``(n_test, n_labels)`` prediction matrix, computing it if needed."""
        if self.y_pred_1_stage is None:
            self._proceed_1_stage()
        return self.y_pred_1_stage

    def predict_0_stage(self):
        """Return the stage-0 predictions for ``X_test``, computing them if needed."""
        if self.y_pred_0_stage is None:
            self._proceed_0_stage()
        return self.y_pred_0_stage

    def _proceed_1_stage(self):
        """Fit the per-label classifiers and assemble the final predictions.

        Raises
        ------
        ValueError
            If the number of stage-1 classifiers differs from the number of
            label columns in ``y_train``.
        """
        if self.y_pred_0_stage is None:
            self._proceed_0_stage()

        labels = self.y_train.to_numpy()
        features_num = labels.shape[1]
        if len(self.clfs) != features_num:
            raise ValueError(
                "expected %d stage-1 classifiers (one per label column), got %d"
                % (features_num, len(self.clfs)))

        # Positions of the training rows that carry at least one label.
        # These must be positions, not DataFrame index labels: X_train is
        # indexed positionally, and the two only coincide for a default
        # 0..n-1 index.
        labelled_pos = np.flatnonzero(labels.astype(bool).any(axis=1))
        X_train_1_stage = self.X_train[labelled_pos]

        for (j, clf) in enumerate(self.clfs):
            clf.fit(X_train_1_stage, labels[labelled_pos, j])

        # One row of predictions per label: shape (n_labels, n_test).
        y_pred_1_stages = np.array([clf.predict(self.X_test) for clf in self.clfs])

        # Rows rejected by stage 0 stay all-zero; the others take the
        # stage-1 predictions.
        y_pred_as_np = np.zeros((self.X_test.shape[0], features_num))
        passed_stage_0 = np.asarray(self.y_pred_0_stage) != 0
        y_pred_as_np[passed_stage_0] = y_pred_1_stages.T[passed_stage_0]

        self.y_pred_1_stage = y_pred_as_np

    def _proceed_0_stage(self):
        """Fit ``clf0`` on the "has any label" target and predict it for ``X_test``."""
        y_train_0_stage = change_y_to_0_stage_version(self.y_train)
        self.clf0.fit(self.X_train, y_train_0_stage)
        self.y_pred_0_stage = self.clf0.predict(self.X_test)

    def _clear_y_pred(self):
        """Forget cached predictions so they are recomputed on next request."""
        self.y_pred_0_stage = None
        self.y_pred_1_stage = None

In [12]:
def print_predicted_values_statistics(y_test, y_pred, title="Classification results:", algorithm=None):
    """Print and return accuracy, weighted F1 and weighted recall for one prediction set.

    ``title`` is printed first; when ``algorithm`` is given, its class name is
    printed on a second line. Each metric is printed as soon as it has been
    computed, and a blank line closes the report.

    Returns a dict with the keys ``"accuracy_score"``, ``"f1_score"`` and
    ``"recall_score"``.
    """
    print(title)
    if algorithm is not None:
        print("Used algorithm:", algorithm.__class__.__name__)

    # (result key, printed prefix, deferred computation) in reporting order.
    metrics = (
        ("accuracy_score", "Accuracy score: ",
         lambda: accuracy_score(y_test, y_pred)),
        ("f1_score", "F1 score: ",
         lambda: f1_score(y_test, y_pred, average = 'weighted')),
        ("recall_score", "Recall score: ",
         lambda: recall_score(y_test, y_pred, average = 'weighted')),
    )

    results = {}
    for key, prefix, compute in metrics:
        value = compute()
        print(prefix + str(value))
        results[key] = value
    print()
    return results

In [13]:
def percent_of_labels_correctly_predicted(y_test, y_pred):
    """Return the fraction of individual label cells where ``y_pred`` equals ``y_test``.

    This is ``1.0 - Hamming loss``: the number of element-wise matches
    divided by ``y_test.size``.
    """
    matches = y_test == y_pred
    return np.sum(matches) / y_test.size

In [14]:
def test_two_stage_classifier_with_algorithms(clf0=None, clfs=None):
    """Train, predict and report scores for one ``TwoStageClassifier`` setup.

    Works on the notebook-level ``X_train``, ``X_test``, ``y_train``,
    ``y_test``, ``y_test_as_np`` and ``features``.

    Parameters
    ----------
    clf0 : estimator, optional
        Stage-0 classifier; ``None`` keeps the ``TwoStageClassifier`` default.
    clfs : list of estimators, optional
        Stage-1 classifiers, one per label; ``None`` keeps the default.

    Returns
    -------
    (TwoStageClassifier, dict)
        The classifier with its cached predictions, and a score dictionary
        with the keys ``'first_stage'``, one key per label name,
        ``'overall_accuracy_score'`` and ``'total_correct_labels'``.
    """
    # The constructor already substitutes its defaults for ``None``, so the
    # classifiers are handed over directly instead of building default
    # estimators first and replacing them afterwards.
    my_classifier = TwoStageClassifier(X_train=X_train, X_test=X_test, y_train=y_train,
                                       clf0=clf0, clfs=clfs)

    scores_dict = {}

    # Stage 0: "does the comment carry any label at all?"
    y_pred_0_stage = my_classifier.predict_0_stage()
    y_test_0_stage = change_y_to_0_stage_version(y_test)
    scores_dict['first_stage'] = print_predicted_values_statistics(
        y_test=y_test_0_stage, y_pred=y_pred_0_stage,
        title="Test data - first stage classification",
        algorithm=my_classifier.clf0)

    # Final predictions, scored one label column at a time.
    y_pred_as_np = my_classifier.predict()
    for (i, f) in enumerate(features):
        scores_dict[str(f)] = print_predicted_values_statistics(
            y_test=y_test_as_np[:,i], y_pred=y_pred_as_np[:,i],
            title="Tested category: "+str(f),
            algorithm=my_classifier.clfs[i])

    # Whole-matrix scores: exact-match ratio and per-cell agreement.
    overall_acc_score = accuracy_score(y_test_as_np, y_pred_as_np)
    print("Overall accuracy (subset) score (exact match ratio):",
          overall_acc_score)
    scores_dict['overall_accuracy_score'] = overall_acc_score
    total_percent_of_correct_labels = percent_of_labels_correctly_predicted(y_test_as_np, y_pred_as_np)
    print("Total percent of labels correctly predicted:",
          total_percent_of_correct_labels)
    scores_dict['total_correct_labels'] = total_percent_of_correct_labels
    return my_classifier, scores_dict

In [15]:
# Seed passed to the estimators below that accept `random_state`, so repeated
# runs use the same pseudo-random sequence.
random_state = 44

In [16]:
# Configuration: MultinomialNB for stage 0 and for every label in stage 1,
# except the last label column, which gets LogisticRegression instead.
mnb_clf0_ih = MultinomialNB()
mnb_clfs_ih = [MultinomialNB() for f in features]
# NOTE(review): assumes the last column of `features` is identity_hate - confirm.
mnb_clfs_ih[-1] = LogisticRegression(solver='saga', random_state=random_state)  # LR for identity_hate

In [17]:
# Run the train/predict/report cycle for the MultinomialNB (+ LogisticRegression
# on the last label) configuration and keep the classifier and its scores.
mnb_for_both_stages_except_ih_classifier, mnb_for_both_stages_except_ih_results = \
    test_two_stage_classifier_with_algorithms(clf0=mnb_clf0_ih, clfs=mnb_clfs_ih)

Test data - first stage classification
Used algorithm: MultinomialNB
Accuracy score: 0.9237550407952734
F1 score: <function f1_score at 0x7f034bbea9d8>
Recall score: 0.9237550407952734

Tested category: toxic
Used algorithm: MultinomialNB
Accuracy score: 0.9258026196505048
F1 score: <function f1_score at 0x7f034bbea9d8>
Recall score: 0.9258026196505048

Tested category: severe_toxic
Used algorithm: MultinomialNB
Accuracy score: 0.994248022757823
F1 score: <function f1_score at 0x7f034bbea9d8>
Recall score: 0.994248022757823

Tested category: obscene
Used algorithm: MultinomialNB
Accuracy score: 0.9597205289318204
F1 score: <function f1_score at 0x7f034bbea9d8>
Recall score: 0.9597205289318204

Tested category: threat
Used algorithm: MultinomialNB
Accuracy score: 0.9967019913095126
F1 score: <function f1_score at 0x7f034bbea9d8>
Recall score: 0.9967019913095126

Tested category: insult
Used algorithm: MultinomialNB
Accuracy score: 0.958548250961268
F1 score: <function f1_score at 0x7f03

In [18]:
# Configuration: MultinomialNB for stage 0, one seeded SVC per label for stage 1.
mnb_svc_clf0 = MultinomialNB()
mnb_svc_clfs = [SVC(random_state=random_state) for f in features]

In [None]:
# Run the train/predict/report cycle for the MultinomialNB -> SVC configuration
# and keep the classifier and its scores.
mnb_svc_classifier, mnb_svc_results = \
    test_two_stage_classifier_with_algorithms(clf0=mnb_svc_clf0, clfs=mnb_svc_clfs)

Test data - first stage classification
Used algorithm: MultinomialNB
Accuracy score: 0.9237550407952734
F1 score: <function f1_score at 0x7f034bbea9d8>
Recall score: 0.9237550407952734

