In [None]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import re, string
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_score
from statistics import mean


In [None]:
# Wall-clock start; the last cell of the notebook prints the elapsed time from here.
start = time.time()

# Fixed seed so the train/test split, and therefore every metric computed
# from it, is identical on every Restart & Run All.
RANDOM_SEED = 42

# reading the data file
data = pd.read_csv('dataset.csv')

# 6 class labels
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# assigning the first two columns to X
# (presumably the id and comment text; later cells read X_train["comment_text"])
cols = [0, 1]
X = data[data.columns[cols]]

# assigning the class-label columns to Y
# NOTE(review): selected by position; later cells index y_train by the names in
# label_cols, so columns 2-7 are assumed to be exactly those labels - confirm.
cols1 = [2, 3, 4, 5, 6, 7]
Y = data[data.columns[cols1]]

# splitting the data: 10% of the rows are held out as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.10, random_state=RANDOM_SEED
)


In [None]:
# Character class matching any single ASCII punctuation mark or one of the listed
# typographic symbols. string.punctuation must be escaped before it is placed in
# the class: it contains the sequence "\]", which an unescaped class reads as an
# escaped "]" and therefore silently leaves the backslash itself out of the set.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split a string into tokens, treating each punctuation mark as its own token.

    Every character matched by ``re_tok`` is padded with a space on both sides,
    then the result is split on whitespace.

    Parameters
    ----------
    s : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in their original order; an empty list for empty or
        whitespace-only input.
    """
    return re_tok.sub(r' \1 ', s).split()

# TF-IDF over unigrams and bigrams of the tokens produced by `tokenize`.
# The on/off options are passed as real booleans: scikit-learn releases that
# validate constructor parameter types reject the integer 1 for these flags.
tf_idf = TfidfVectorizer(
    ngram_range=(1, 2),
    tokenizer=tokenize,
    strip_accents='unicode',
    min_df=3,       # drop terms that appear in fewer than 3 documents
    max_df=0.9,     # drop terms that appear in more than 90% of documents
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# Fit the vocabulary and IDF weights on the training comments only, then apply
# that same fitted transform to the test comments.
train_tf_idf = tf_idf.fit_transform(X_train["comment_text"])
test_tf_idf = tf_idf.transform(X_test["comment_text"])


In [None]:
# Helpers for the naive-bayes / logistic-regression ensemble live in this cell.
# They read the training matrix through the global `x`; the prediction loop
# further down reads the test matrix through `test_x`.
x, test_x = train_tf_idf, test_tf_idf

def naive_bayes(y_i, y):
    """Smoothed per-feature total for the rows whose label equals ``y_i``.

    Reads the training feature matrix from the global ``x``.

    Parameters
    ----------
    y_i : scalar
        Label value selecting the rows of interest.
    y : array-like
        One label per row of ``x``; must support elementwise ``==`` and
        produce a mask with a ``.sum()`` method (e.g. a NumPy array).

    Returns
    -------
    Column-wise sum of the selected rows plus 1, divided by the number of
    selected rows plus 1.
    """
    in_class = (y == y_i)
    feature_totals = x[in_class].sum(0)
    return (feature_totals + 1) / (in_class.sum() + 1)

def ensemble(y):
    """Fit a logistic regression on naive-bayes-scaled features for one label.

    The training matrix is read from the global ``x``. Each feature is scaled
    by the log of the ratio between its ``naive_bayes`` value for label 1 and
    for label 0, and a ``LogisticRegression(C=4)`` is fitted on the result.

    Parameters
    ----------
    y : object exposing ``.values``
        Labels for one class, one per row of ``x`` (the caller passes a
        column of ``y_train``).

    Returns
    -------
    tuple
        ``(fitted_classifier, log_ratio)``; the same ``log_ratio`` must be
        applied to any matrix passed to the classifier for prediction.
    """
    labels = y.values

    positive_stats = naive_bayes(1, labels)
    negative_stats = naive_bayes(0, labels)
    log_ratio = np.log(positive_stats / negative_stats)

    scaled_features = x.multiply(log_ratio)

    model = LogisticRegression(C=4)
    model.fit(scaled_features, labels)
    return model, log_ratio


In [None]:
# variables for storing results: one row per test comment, one column per label
preds = np.zeros((len(X_test), len(label_cols)))    # predict_proba output, column 1
preds1 = np.zeros((len(X_test), len(label_cols)))   # hard predictions from predict()

# fit and evaluate one independent binary model per class label
for i, j in enumerate(label_cols):
    # print each class-label
    print('fit', j)

    # calling ensemble function: returns the fitted model and its log-ratio vector
    m, r = ensemble(y_train[j])

    # Scale the test features with this label's ratio once and reuse the result
    # for both prediction calls instead of repeating the sparse multiply.
    scaled_test_x = test_x.multiply(r)

    # generating outputs
    preds1[:, i] = m.predict(scaled_test_x)
    # column 1 of predict_proba is the second entry of m.classes_
    # (presumably label 1 for these 0/1 targets - confirm)
    preds[:, i] = m.predict_proba(scaled_test_x)[:, 1]


In [None]:
# taking the values
y_train_new = y_train.values
y_test_new = y_test.values
y_score = preds

# Number of label columns to evaluate, derived from label_cols so the loops
# below stay correct if the label list changes (was a hard-coded 6).
n_classes = len(label_cols)

# generating precision-recall curve, one line per label column
precision = dict()
recall = dict()

for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(y_test_new[:, i], y_score[:, i])
    plt.plot(recall[i], precision[i], lw=2, label='class {}'.format(i))

plt.xlabel("recall")
plt.ylabel("precision")
plt.legend(loc="best")
plt.title("Precision-Recall Curve")
plt.show()


# generating roc curve and roc score, one line / one score per label column
fpr = dict()
tpr = dict()
roc_score = []

for i in range(n_classes):
    roc_score.append(roc_auc_score(y_test_new[:, i], y_score[:, i]))

    fpr[i], tpr[i], _ = roc_curve(y_test_new[:, i], y_score[:, i])
    plt.plot(fpr[i], tpr[i], lw=2, label='class {}'.format(i))

plt.xlabel("false positive rate")
plt.ylabel("true positive rate")
plt.legend(loc="best")
plt.title("ROC curve for Ensemble-learning")
plt.show()

# unweighted mean of the per-label ROC AUC values
print("ROC Score: ",mean(roc_score))


In [None]:
# Per-label classification report comparing the hard predictions (preds1)
# with the held-out labels; rows are named after label_cols.
report = classification_report(
    y_test.values,
    preds1,
    target_names=label_cols,
)
print(report)


In [None]:
# Overall accuracy as exact match: a test row counts as correct only when the
# predicted value equals the given value in every label column.
# The comparison is done on the whole arrays at once instead of converting both
# to nested Python lists and comparing them row by row.
exact_match = (y_test.values == preds1).all(axis=1)
count = int(exact_match.sum())
print("Accuracy: ",count/len(y_test)*100)


In [None]:
# Wall-clock seconds elapsed since `start` was recorded in the data-loading cell.
end = time.time()
elapsed_seconds = end - start

print("Time: ", elapsed_seconds)
