In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
import multiprocessing
from sklearn.feature_extraction.text import TfidfVectorizer 
from IPython.display import display
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, hamming_loss, label_ranking_average_precision_score
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from skmultilearn.problem_transform import BinaryRelevance
from collections import Counter
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix

%matplotlib inline
cores = multiprocessing.cpu_count()

In [3]:
test_data = pd.read_csv("test.csv")
test_labels = pd.read_csv("test_labels.csv")
train_data = pd.read_csv("train.csv")
test_merged = pd.merge(test_data, test_labels,on='id')

test_merged = test_merged.drop(test_merged[test_merged.toxic == -1].index)

toxic_train_data = train_data[(train_data['toxic'] == 1) | (train_data['severe_toxic'] == 1) | (train_data['obscene'] == 1) |
                       (train_data['threat'] == 1) | (train_data['insult'] == 1) | (train_data['identity_hate'] == 1)]

toxic_test_data = test_merged[(test_merged['toxic'] == 1) | (test_merged['severe_toxic'] == 1) | (test_merged['obscene'] == 1) |
                       (test_merged['threat'] == 1) | (test_merged['insult'] == 1) | (test_merged['identity_hate'] == 1)]

In [4]:
y_train = toxic_train_data.iloc[:, 2:]
X_train = toxic_train_data[['comment_text']]

y_test = toxic_test_data.iloc[:, 2:]
X_test = toxic_test_data[['comment_text']]

lp = LabelPowerset()
y_train = lp.transform(y_train)
y_test = lp.transform(y_test)

tvec = TfidfVectorizer(min_df=.0025, stop_words='english', strip_accents='unicode', analyzer='word', max_features=1000)
X_train = tvec.fit_transform(X_train['comment_text'])
X_test = tvec.transform(X_test['comment_text'])

In [10]:
classifier = GaussianNB().fit(X_train.toarray(), y_train)
y_pred = classifier.predict(X_test.toarray())

print("GaussianNB")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))

GaussianNB
Accuracy score: 0.033317315393240426
F1 score: 0.05226270985131518


In [11]:
classifier = MultinomialNB().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print("MultinomialNB")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))

MultinomialNB
Accuracy score: 0.08649687650168188
F1 score: 0.03757710203924395


In [12]:
classifier = LogisticRegression(solver='saga', random_state = 27, n_jobs = cores).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print("LogisticRegression, solver = saga")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))

LogisticRegression, solver = saga
Accuracy score: 0.0675957071920551
F1 score: 0.03646263448415717


In [13]:
classifier = RandomForestClassifier(random_state = 27, n_jobs = cores).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print("RandomForestClassifier")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))

RandomForestClassifier
Accuracy score: 0.08249239147845587
F1 score: 0.04071882819709735


In [14]:
classifier = DecisionTreeClassifier(random_state = 27).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print("DecisionTreeClassifier")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))

DecisionTreeClassifier
Accuracy score: 0.07720647124779753
F1 score: 0.05470179132176864


In [15]:
classifier = OneVsOneClassifier(LinearSVC(random_state = 27), n_jobs = cores).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print("OneVsOneClassifier, LinearSVC")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))

OneVsOneClassifier, LinearSVC
Accuracy score: 0.06775588659298415
F1 score: 0.04041210504518198


In [16]:
counter = Counter(y_train)
for elem in counter.most_common():
    count = elem[1]
    while count < 6:
        count += 1
        counter.update({elem[0], 1})
        
ros = RandomOverSampler(sampling_strategy = counter, random_state = 27)
X_tmp, y_tmp = ros.fit_sample(X_train, y_train)

sm = SMOTE(random_state = 27, n_jobs = cores)
X_train_resampled, y_train_resampled = sm.fit_sample(X_tmp, y_tmp)

  n_samples_majority,


In [18]:
classifier = GaussianNB().fit(X_train_resampled.toarray(), y_train_resampled)
y_pred = classifier.predict(X_test.toarray())

print("GaussianNB resampled")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))

GaussianNB resampled
Accuracy score: 0.025468524747717443
F1 score: 0.03719505809398279


In [19]:
classifier = MultinomialNB().fit(X_train_resampled, y_train_resampled)
y_pred = classifier.predict(X_test)

print("MultinomialNB resampled")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))

MultinomialNB resampled
Accuracy score: 0.03315713599231139
F1 score: 0.049320245553750273


In [20]:
classifier = LogisticRegression(solver='saga', random_state = 27, n_jobs = cores).fit(X_train_resampled, y_train_resampled)
y_pred = classifier.predict(X_test)

print("LogisticRegression, solver = saga resampled")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))

LogisticRegression, solver = saga resampled
Accuracy score: 0.035399647605317955
F1 score: 0.05117801189731768


In [22]:
classifier = RandomForestClassifier(random_state = 27, n_jobs = cores).fit(X_train_resampled, y_train_resampled)
y_pred = classifier.predict(X_test)

print("RandomForestClassifier resampled")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))

RandomForestClassifier resampled
Accuracy score: 0.05157776709915105
F1 score: 0.04955811843935246


In [23]:
classifier = DecisionTreeClassifier(random_state = 27).fit(X_train_resampled, y_train_resampled)
y_pred = classifier.predict(X_test)

print("DecisionTreeClassifier resampled")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))

DecisionTreeClassifier resampled
Accuracy score: 0.05990709594746116
F1 score: 0.061515018814397714


In [21]:
classifier = OneVsOneClassifier(LinearSVC(random_state = 27), n_jobs = cores).fit(X_train_resampled, y_train_resampled)
y_pred = classifier.predict(X_test)

print("OneVsOneClassifier, LinearSVC resampled")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred, average = 'weighted')))

OneVsOneClassifier, LinearSVC resampled
Accuracy score: 0.04372897645362806
F1 score: 0.05457094738804421
