In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
import multiprocessing
from sklearn.feature_extraction.text import TfidfVectorizer 
from IPython.display import display
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, hamming_loss, label_ranking_average_precision_score
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from skmultilearn.problem_transform import BinaryRelevance
from collections import Counter
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# CPU count for this machine; passed as `n_jobs` to the scikit-learn estimators built in `test_classifiers`.
cores = multiprocessing.cpu_count()

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# The six label columns a comment can be flagged with.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Load the raw splits; the test labels live in their own file and are joined on `id`.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
test_labels = pd.read_csv("test_labels.csv")
test_merged = pd.merge(test_data, test_labels, on='id')

# Discard test rows whose `toxic` label is -1.
test_merged = test_merged[test_merged['toxic'] != -1]

# Keep only the comments that carry at least one positive label.
toxic_train_data = train_data[(train_data[label_columns] == 1).any(axis=1)]
toxic_test_data = test_merged[(test_merged[label_columns] == 1).any(axis=1)]

In [3]:
# Targets: every column from position 2 onward
# (assumes columns 0-1 are `id` and `comment_text` — TODO confirm against the CSVs).
y_train = toxic_train_data.iloc[:, 2:]
y_test = toxic_test_data.iloc[:, 2:]

# Features: single-column frames holding the comment text.
X_train = toxic_train_data[['comment_text']]
X_test = toxic_test_data[['comment_text']]

# Baseline TF-IDF features built with the vectorizer's own word analyzer.
# The vocabulary is fitted on the training comments only and reused for the test comments.
tvec = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    strip_accents='unicode',
    min_df=.0025,
    max_features=1000,
)
X_train_standard = tvec.fit_transform(X_train['comment_text'])
X_test_standard = tvec.transform(X_test['comment_text'])

In [4]:
import re
import string
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def lowercase(input_str):
    """Return a lower-cased copy of ``input_str``."""
    lowered = input_str.lower()
    return lowered

def remove_numbers(input_str):
    """Delete every run of digits from ``input_str``; nothing is inserted in its place."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', input_str)

def remove_punctuation(input_str):
    """Swap each character of ``string.punctuation`` found in ``input_str`` for one space."""
    # Translation table keyed by code point; every punctuation mark maps to a space.
    blank_out = dict.fromkeys(map(ord, string.punctuation), ' ')
    return input_str.translate(blank_out)

def remove_whitespaces(input_str):
    """Trim leading and trailing whitespace; interior whitespace is left untouched."""
    trimmed = input_str.strip()
    return trimmed

def tokenize(input_str):
    """Split ``input_str`` into tokens by delegating to NLTK's ``word_tokenize``."""
    tokens = word_tokenize(input_str)
    return tokens

def porter_stemming(input_str):
    """Stem each element of ``input_str`` with a fresh ``PorterStemmer``.

    Despite the parameter name, the argument is iterated element by element,
    so callers pass a sequence of tokens. Returns a list of stems.
    """
    stem = PorterStemmer().stem
    return list(map(stem, input_str))

def snowball_stemming(input_str):
    """Stem each element of ``input_str`` with an English ``SnowballStemmer``.

    Despite the parameter name, the argument is iterated element by element,
    so callers pass a sequence of tokens. Returns a list of stems.
    """
    stem = SnowballStemmer("english").stem
    return list(map(stem, input_str))

def lemmatization(input_str):
    """Lemmatize each element of ``input_str`` with a fresh ``WordNetLemmatizer``.

    Despite the parameter name, the argument is iterated element by element,
    so callers pass a sequence of tokens. Returns a list of lemmas.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    return list(map(to_lemma, input_str))

def lancaster_stemming(input_str):
    """Stem each element of ``input_str`` with a fresh ``LancasterStemmer``.

    Despite the parameter name, the argument is iterated element by element,
    so callers pass a sequence of tokens. Returns a list of stems.
    """
    stem = LancasterStemmer().stem
    return list(map(stem, input_str))

def remove_stop_words(input_str):
    """Return the elements of ``input_str`` that are not in ``ENGLISH_STOP_WORDS``.

    The argument is iterated element by element (a sequence of tokens);
    order and duplicates of the surviving tokens are preserved.
    """
    kept = []
    for token in input_str:
        if token in ENGLISH_STOP_WORDS:
            continue
        kept.append(token)
    return kept

def preprocessing(dataframe):
    """Clean and tokenise the ``comment_text`` column of ``dataframe`` in place.

    Each comment is lower-cased, stripped of digits, has punctuation replaced
    by spaces, is trimmed, tokenised and finally filtered of stop words, so
    every cell ends up holding a list of tokens.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``comment_text`` column of strings. It is modified in
        place; pass a copy to keep the original text.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, for call chaining.
    """
    def _clean(text):
        # Same step order as before: normalise the raw string, then tokenise and filter.
        text = lowercase(text)
        text = remove_numbers(text)
        text = remove_punctuation(text)
        text = remove_whitespaces(text)
        return remove_stop_words(tokenize(text))

    # Assign the transformed column back explicitly. Rows yielded by
    # ``iterrows()`` may be copies, so writing to them is not guaranteed to
    # reach the frame and the cleaning could silently be lost.
    dataframe['comment_text'] = dataframe['comment_text'].map(_clean)
    return dataframe
    
def porter_stem(dataframe):
    """Return a preprocessed copy of ``dataframe`` with Porter-stemmed tokens.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``comment_text`` column of strings. It is not modified;
        all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        Copy whose ``comment_text`` cells hold lists of stems.
    """
    stemmed = preprocessing(dataframe.copy())
    # Write the column back explicitly: rows yielded by ``iterrows()`` may be
    # copies, so assigning to them is not guaranteed to update the frame.
    stemmed['comment_text'] = stemmed['comment_text'].map(porter_stemming)
    return stemmed
    
def snowball_stem(dataframe):
    """Return a preprocessed copy of ``dataframe`` with Snowball-stemmed tokens.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``comment_text`` column of strings. It is not modified;
        all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        Copy whose ``comment_text`` cells hold lists of stems.
    """
    stemmed = preprocessing(dataframe.copy())
    # Write the column back explicitly: rows yielded by ``iterrows()`` may be
    # copies, so assigning to them is not guaranteed to update the frame.
    stemmed['comment_text'] = stemmed['comment_text'].map(snowball_stemming)
    return stemmed

def lemmatize(dataframe):
    """Return a preprocessed copy of ``dataframe`` with WordNet-lemmatized tokens.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``comment_text`` column of strings. It is not modified;
        all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        Copy whose ``comment_text`` cells hold lists of lemmas.
    """
    lemmatized = preprocessing(dataframe.copy())
    # Write the column back explicitly: rows yielded by ``iterrows()`` may be
    # copies, so assigning to them is not guaranteed to update the frame.
    lemmatized['comment_text'] = lemmatized['comment_text'].map(lemmatization)
    return lemmatized

def lancaster_stem(dataframe):
    """Return a preprocessed copy of ``dataframe`` with Lancaster-stemmed tokens.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``comment_text`` column of strings. It is not modified;
        all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        Copy whose ``comment_text`` cells hold lists of stems.
    """
    stemmed = preprocessing(dataframe.copy())
    # Write the column back explicitly: rows yielded by ``iterrows()`` may be
    # copies, so assigning to them is not guaranteed to update the frame.
    stemmed['comment_text'] = stemmed['comment_text'].map(lancaster_stemming)
    return stemmed


In [5]:
def identity_tokenizer(text):
    """Return ``text`` unchanged.

    Passed as ``tokenizer=`` to ``TfidfVectorizer`` in this cell so that
    documents which are already lists of tokens are used as-is.
    """
    return text

def _tfidf_from_tokens(normalize, train_frame, test_frame):
    """Build TF-IDF matrices from text that ``normalize`` has already tokenised.

    ``normalize`` is a frame-level normaliser such as ``porter_stem``. The
    vocabulary is fitted on the normalised training frame and then applied to
    the normalised test frame. Returns ``(vectorizer, train_matrix, test_matrix)``.
    """
    vectorizer = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False)
    train_matrix = vectorizer.fit_transform(normalize(train_frame)['comment_text'])
    test_matrix = vectorizer.transform(normalize(test_frame)['comment_text'])
    return vectorizer, train_matrix, test_matrix


# One feature set per normaliser. `tvec` is rebound on every line, so after
# this cell it refers to the vectorizer fitted on the Lancaster-stemmed text.
tvec, X_train_porter_stem, X_test_porter_stem = _tfidf_from_tokens(porter_stem, X_train, X_test)
tvec, X_train_snowball_stem, X_test_snowball_stem = _tfidf_from_tokens(snowball_stem, X_train, X_test)
tvec, X_train_lemmatize, X_test_lemmatize = _tfidf_from_tokens(lemmatize, X_train, X_test)
tvec, X_train_lancaster_stem, X_test_lancaster_stem = _tfidf_from_tokens(lancaster_stem, X_train, X_test)

In [9]:
def classify(classifier, X_train, y_train, X_test, y_test, title):
    """Fit ``classifier``, predict on the test split and report multi-label metrics.

    Parameters
    ----------
    classifier : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Training features and label indicator matrix.
    X_test, y_test
        Evaluation features and label indicator matrix.
    title : str
        Heading printed above the metrics.

    Returns
    -------
    dict
        Metric name -> value, in the order printed, so runs can be compared
        programmatically as well as read from the output.
    """
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # LRAP needs a dense array. Predictions may arrive as a scipy sparse
    # matrix or as a plain array, so only densify when necessary instead of
    # calling ``.toarray()`` unconditionally, which fails on dense output.
    if hasattr(y_pred, "toarray"):
        y_pred_dense = y_pred.toarray()
    else:
        y_pred_dense = np.asarray(y_pred)

    print(title)
    scores = {
        "Accuracy score": accuracy_score(y_test, y_pred),
        "Hamming loss": hamming_loss(y_test, y_pred),
        "F1 score": f1_score(y_test, y_pred, average = 'weighted'),
        # NOTE(review): hard predictions are passed where LRAP takes scores;
        # kept as-is so the reported numbers stay comparable.
        "LRAP": label_ranking_average_precision_score(y_test, y_pred_dense),
    }
    for metric_name, value in scores.items():
        print(metric_name + ": " + str(value))
    print()
    return scores
    
def test_classifiers(X_train, y_train, X_test, y_test):
    """Evaluate every benchmark estimator on the given split via ``classify``.

    Each base estimator is wrapped in ``BinaryRelevance`` and reported under
    its title, in the order listed below.
    """
    seed = 27
    # Title and base estimator are kept side by side so they cannot drift apart.
    candidates = [
        ("GaussianNB", GaussianNB()),
        ("MultinomialNB", MultinomialNB()),
        ("LogisticRegression, solver = saga",
         LogisticRegression(solver='saga', random_state=seed, n_jobs=cores)),
        ("RandomForestClassifier",
         RandomForestClassifier(random_state=seed, n_jobs=cores)),
        ("DecisionTreeClassifier", DecisionTreeClassifier(random_state=seed)),
        ("OneVsOneClassifier, LinearSVC",
         OneVsOneClassifier(LinearSVC(random_state=seed), n_jobs=cores)),
    ]
    # Build all wrapped models before any fitting starts.
    wrapped = [(label, BinaryRelevance(base)) for label, base in candidates]
    for label, model in wrapped:
        classify(model, X_train, y_train, X_test, y_test, label)

In [12]:
# Baseline run: features come from TfidfVectorizer's own word analyzer (the *_standard matrices).
print("Using TfidfVectorizer tokenizer")
test_classifiers(X_train_standard, y_train, X_test_standard, y_test)

Using TfidfVectorizer tokenizer
GaussianNB
Accuracy score: 0.038923594425756845
Hamming loss: 0.4663623258049015
F1 score: 0.6602677061947089
LRAP: 0.45383820990620427
MultinomialNB
Accuracy score: 0.4015697581291046
Hamming loss: 0.13940947194190828
F1 score: 0.7858618158151908
LRAP: 0.7947329898375096
LogisticRegression, solver = saga
Accuracy score: 0.44081371135671954
Hamming loss: 0.12552725719472477
F1 score: 0.818340057332087
LRAP: 0.8188458629220295
RandomForestClassifier
Accuracy score: 0.4263975652731059
Hamming loss: 0.1288643280474131
F1 score: 0.8127047537104652
LRAP: 0.8087762293769047
DecisionTreeClassifier
Accuracy score: 0.34118212397885633
Hamming loss: 0.1565753644081371
F1 score: 0.7933657971451523
LRAP: 0.7612575417801286
OneVsOneClassifier, LinearSVC
Accuracy score: 0.4411340701585776
Hamming loss: 0.12558065032836777
F1 score: 0.8222190384807935
LRAP: 0.8159572054033849


In [None]:
# Same benchmark on the Porter-stemmed TF-IDF features.
print("Using PorterStemmer")
test_classifiers(X_train_porter_stem, y_train, X_test_porter_stem, y_test)

In [None]:
# Same benchmark on the Snowball-stemmed TF-IDF features.
print("Using SnowballStemmer")
test_classifiers(X_train_snowball_stem, y_train, X_test_snowball_stem, y_test)

In [None]:
# Same benchmark on the WordNet-lemmatized TF-IDF features.
print("Using WordNetLemmatizer")
test_classifiers(X_train_lemmatize, y_train, X_test_lemmatize, y_test)

In [7]:
# Same benchmark on the Lancaster-stemmed TF-IDF features.
# NOTE(review): the saved output for this cell lists only the OneVsOneClassifier/LinearSVC
# result, so it appears to predate the current test_classifiers — re-run to refresh.
print("Using LancasterStemmer")
test_classifiers(X_train_lancaster_stem, y_train, X_test_lancaster_stem, y_test)

Using LancasterStemmer
OneVsOneClassifier, LinearSVC
Accuracy score: 0.4172673394201506
Hamming loss: 0.13046612205670352
F1 score: 0.8174009240641745
LRAP: 0.8055636090198817
