In [1]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data_raw = pd.read_csv('trainMaster.csv')

## Comments per Category

In [3]:
categories = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

In [4]:

# sns.set(font_scale = 2)
# plt.figure(figsize=(15,8))
# ax= sns.barplot(categories, data_raw.iloc[:,2:].sum().values)
# plt.title("Comments in each category", fontsize=24)
# plt.ylabel('Number of comments', fontsize=18)
# plt.xlabel('Comment Type ', fontsize=18)
# #adding the text labels
# rects = ax.patches
# labels = data_raw.iloc[:,2:].sum().values
# for rect, label in zip(rects, labels):
#     height = rect.get_height()
#     ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom', fontsize=18)
# plt.show()


## Comments with Multiple Labels

In [5]:
# rowSums = data_raw.iloc[:,2:].sum(axis=1)
# multiLabel_counts = rowSums.value_counts()
# multiLabel_counts = multiLabel_counts.iloc[1:]
# sns.set(font_scale = 2)
# plt.figure(figsize=(15,8))
# ax = sns.barplot(multiLabel_counts.index, multiLabel_counts.values)
# plt.title("Comments having multiple labels ")
# plt.ylabel('Number of comments', fontsize=18)
# plt.xlabel('Number of labels', fontsize=18)
# #adding the text labels
# rects = ax.patches
# labels = multiLabel_counts.values
# for rect, label in zip(rects, labels):
#     height = rect.get_height()
#     ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')
# plt.show()

# Data Pre-processing

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import sys
import warnings
data = data_raw
if not sys.warnoptions:
    warnings.simplefilter("ignore")



In [7]:
def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext
def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned
def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

def clean_total(sentence):
    return cleanHtml(sentence), cleanPunc(sentence), keepAlpha(sentence)

In [8]:
data['comment_text'] = data['comment_text'].str.lower()
data['comment_text'] = data['comment_text'].apply(cleanHtml)
data['comment_text'] = data['comment_text'].apply(cleanPunc)
data['comment_text'] = data['comment_text'].apply(keepAlpha)

In [9]:
stop_words = set(stopwords.words('english'))
stop_words.update(['zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)
data['comment_text'] = data['comment_text'].apply(removeStopWords)

# Train Test Split

In [10]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, random_state=42, test_size=0.30, shuffle=True)
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer.fit(train['comment_text'])
# vectorizer.fit(test['comment_text'])
x_train = vectorizer.transform(train['comment_text'])
y_train = train.drop(labels = ['id','comment_text'], axis=1)
x_test = vectorizer.transform(test['comment_text'])
y_test = test.drop(labels = ['id','comment_text'], axis=1)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, matthews_corrcoef, f1_score, log_loss
from sklearn.multiclass import OneVsRestClassifier

# One vs Rest

In [12]:
len(train)

111699

In [13]:
# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag', class_weight='automatic'))),
            ])

def pipeline_model_training(category, dataTrain, targetTrain):
    # Training logistic regression model on train data
    return LogReg_pipeline.fit(dataTrain, targetTrain)

def pipeline_predict(category, dictPred, dictPredProba, dictHasil, dataTest, targetTest):
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(dataTest)
    dictPredProba['{}'.format(category)] = LogReg_pipeline.predict_proba(dataTest)
    dictPred[category] = prediction 
    dictHasil['{}_accuracy'.format(category)] = accuracy_score(targetTest, prediction)
    dictHasil['{}_f1score'.format(category)] = f1_score(targetTest, prediction, average = 'macro')
    dictHasil['{}_logloss'.format(category)] = log_loss(targetTest, prediction)

In [14]:
dHasil_OvR = {}
dPrediction_OvR = {}
dPredictionProba_OvR = {}

for category in categories:
    pipeline_model_training(category, x_train, train[category])
    pipeline_predict(category, dPrediction_OvR, dPredictionProba_OvR, dHasil_OvR, x_train, train[category])

In [15]:
dHasil_OvR

{'toxic_accuracy': 0.948298552359466,
 'toxic_f1score': 0.8027261959430682,
 'toxic_logloss': 1.7857050901161455,
 'severe_toxic_accuracy': 0.9909220315311685,
 'severe_toxic_f1score': 0.6232705830962695,
 'severe_toxic_logloss': 0.31354245995261193,
 'obscene_accuracy': 0.9724617051182195,
 'obscene_f1score': 0.8186531737208504,
 'obscene_logloss': 0.9511394387290054,
 'threat_accuracy': 0.9969471526155114,
 'threat_f1score': 0.5021510664295756,
 'threat_logloss': 0.10544161318064312,
 'insult_accuracy': 0.968218157727464,
 'insult_f1score': 0.7633842117615259,
 'insult_logloss': 1.097707182090157,
 'identity_hate_accuracy': 0.9916830052193842,
 'identity_hate_f1score': 0.5508465506756818,
 'identity_hate_logloss': 0.28725890174884167}

In [16]:
listItem = []

for i in range(len(dPrediction_OvR['toxic'])):
    listItem.append([dPrediction_OvR['toxic'][i],
                    dPrediction_OvR['severe_toxic'][i],
                    dPrediction_OvR['obscene'][i],
                    dPrediction_OvR['threat'][i],
                    dPrediction_OvR['insult'][i],
                    dPrediction_OvR['identity_hate'][i],])

dfPredovr = pd.DataFrame(columns=['ToxicPred', 'SToxicPred','ObscenePred','ThreatPred','InsultPred','IdHatePred'],
                     data=listItem)

In [17]:
dfResultovr = pd.DataFrame

dfResultovr = pd.concat([testLabel,dfPred], axis=1)

NameError: name 'testLabel' is not defined

In [None]:
dataToxicovr = dfResultovr[(dfResult['toxic'] != 0) | (dfResultovr['severe_toxic'] != 0) | (dfResultovr['obscene'] != 0)
         | (dfResultovr['threat'] != 0) | (dfResultovr['insult'] != 0) | (dfResultovr['identity_hate'] != 0)]

tebakanToxicTrueovr = dfResultovr[((dfResult['ToxicPred'] == dfResultovr['toxic']) & (dfResultovr['SToxicPred'] == dfResultovr['severe_toxic']) & (dfResultovr['ObscenePred'] == dfResultovr['obscene'])
         & (dfResultovr['ThreatPred'] == dfResultovr['threat']) & (dfResultovr['InsultPred'] == dfResultovr['insult']) & (dfResultovr['IdHatePred'] == dfResultovr['identity_hate'])) & 
             ((dfResultovr['toxic'] != 0) | (dfResultovr['severe_toxic'] != 0) | (dfResultovr['obscene'] != 0)
         | (dfResultovr['threat'] != 0) | (dfResultovr['insult'] != 0) | (dfResultovr['identity_hate'] != 0))]

tebakanCleanTrueovr = dfResultovr[((dfResultovr['ToxicPred'] == dfResultovr['toxic']) & (dfResultovr['SToxicPred'] == dfResultovr['severe_toxic']) & (dfResultovr['ObscenePred'] == dfResultovr['obscene'])
         & (dfResultovr['ThreatPred'] == dfResultovr['threat']) & (dfResultovr['InsultPred'] == dfResultovr['insult']) & (dfResultovr['IdHatePred'] == dfResultovr['identity_hate'])) & 
             ((dfResultovr['toxic'] == 0) & (dfResultovr['severe_toxic'] == 0) & (dfResultovr['obscene']== 0)
         & (dfResultovr['threat'] == 0) & (dfResultovr['insult'] == 0) & (dfResultovr['identity_hate'] == 0))]

tebakanTrueovr = dfResultovr[(dfResultovr['ToxicPred'] == dfResultovr['toxic']) & (dfResultovr['SToxicPred'] == dfResultovr['severe_toxic']) & (dfResultovr['ObscenePred'] == dfResultovr['obscene'])
         & (dfResultovr['ThreatPred'] == dfResultovr['threat']) & (dfResultovr['InsultPred'] == dfResultovr['insult']) & (dfResultovr['IdHatePred'] == dfResultovr['identity_hate'])]

In [None]:
len(dfResultovr)

In [None]:
print('Persentasi Tebakan Benar = ', round(len(tebakanTrueovr)/len(dfResultovr)*100, 2),'%')
print('Persentasi Tebakan Toxic Benar = ', round(len(tebakanToxicTrueovr)/len(dataToxicovr)*100, 2),'%')

In [None]:
len(tebakanTrueovr)

## Test

In [None]:
testLabel = pd.read_csv('testLabelFix.csv')

In [None]:
dfTest = pd.read_csv('testDataFix.csv')

In [None]:
dfTest['comment_text'] = dfTest['comment_text'].str.lower()
dfTest['comment_text'] = dfTest['comment_text'].apply(cleanHtml)
dfTest['comment_text'] = dfTest['comment_text'].apply(cleanPunc)
dfTest['comment_text'] = dfTest['comment_text'].apply(keepAlpha)
dfTest['comment_text'] = dfTest['comment_text'].apply(removeStopWords)

In [None]:
dataTest = vectorizer.transform(dfTest['comment_text'])

In [None]:
dHasil_test = {}
dPrediction_test = {}
dPredictionProba_test = {}

for category in categories:
    pipeline_predict(category, dPrediction_test, dPredictionProba_test, dHasil_test, dataTest, testLabel[category])

In [None]:
for category in categories:    
    print(dHasil_test['{}_accuracy'.format(category)])

In [None]:
listItem = []

for i in range(len(dPrediction_test['toxic'])):
    listItem.append([dPrediction_test['toxic'][i],
                    dPrediction_test['severe_toxic'][i],
                    dPrediction_test['obscene'][i],
                    dPrediction_test['threat'][i],
                    dPrediction_test['insult'][i],
                    dPrediction_test['identity_hate'][i],])

dfPred = pd.DataFrame(columns=['ToxicPred', 'SToxicPred','ObscenePred','ThreatPred','InsultPred','IdHatePred'],
                     data=listItem)

In [None]:
dfResult = pd.DataFrame

dfResult = pd.concat([testLabel,dfPred], axis=1)

In [None]:
a = len(dfResult[(dfResult['ToxicPred'] == dfResult['toxic']) & (dfResult['SToxicPred'] == dfResult['severe_toxic']) & (dfResult['ObscenePred'] == dfResult['obscene'])
         & (dfResult['ThreatPred'] == dfResult['threat']) & (dfResult['InsultPred'] == dfResult['insult']) & (dfResult['IdHatePred'] == dfResult['identity_hate'])])

In [None]:
print('Persentasi Tebakan Benar = ', round(a/len(dfResult)*100, 2),'%')

In [None]:
len(dfResult[(dfResult['toxic'] == 0) & (dfResult['severe_toxic'] == 0) & (dfResult['obscene']== 0)
         & (dfResult['threat'] == 0) & (dfResult['insult'] == 0) & (dfResult['identity_hate'] == 0)])

In [None]:
len(dfResult[(dfResult['toxic'] != 0) | (dfResult['severe_toxic'] != 0) | (dfResult['obscene'] != 0)
         | (dfResult['threat'] != 0) | (dfResult['insult'] != 0) | (dfResult['identity_hate'] != 0)])

In [None]:
dataToxic = dfResult[(dfResult['toxic'] != 0) | (dfResult['severe_toxic'] != 0) | (dfResult['obscene'] != 0)
         | (dfResult['threat'] != 0) | (dfResult['insult'] != 0) | (dfResult['identity_hate'] != 0)]

tebakanToxicTrue = dfResult[((dfResult['ToxicPred'] == dfResult['toxic']) & (dfResult['SToxicPred'] == dfResult['severe_toxic']) & (dfResult['ObscenePred'] == dfResult['obscene'])
         & (dfResult['ThreatPred'] == dfResult['threat']) & (dfResult['InsultPred'] == dfResult['insult']) & (dfResult['IdHatePred'] == dfResult['identity_hate'])) & 
             ((dfResult['toxic'] != 0) | (dfResult['severe_toxic'] != 0) | (dfResult['obscene'] != 0)
         | (dfResult['threat'] != 0) | (dfResult['insult'] != 0) | (dfResult['identity_hate'] != 0))]

tebakanCleanTrue = dfResult[((dfResult['ToxicPred'] == dfResult['toxic']) & (dfResult['SToxicPred'] == dfResult['severe_toxic']) & (dfResult['ObscenePred'] == dfResult['obscene'])
         & (dfResult['ThreatPred'] == dfResult['threat']) & (dfResult['InsultPred'] == dfResult['insult']) & (dfResult['IdHatePred'] == dfResult['identity_hate'])) & 
             ((dfResult['toxic'] == 0) & (dfResult['severe_toxic'] == 0) & (dfResult['obscene']== 0)
         & (dfResult['threat'] == 0) & (dfResult['insult'] == 0) & (dfResult['identity_hate'] == 0))]

tebakanTrue = dfResult[(dfResult['ToxicPred'] == dfResult['toxic']) & (dfResult['SToxicPred'] == dfResult['severe_toxic']) & (dfResult['ObscenePred'] == dfResult['obscene'])
         & (dfResult['ThreatPred'] == dfResult['threat']) & (dfResult['InsultPred'] == dfResult['insult']) & (dfResult['IdHatePred'] == dfResult['identity_hate'])]

In [None]:
len(tebakanToxicTrue)

In [None]:
print('Persentasi Tebakan Benar = ', round(len(tebakanTrue)/len(dfResult)*100, 2),'%')
print('Persentasi Tebakan Toxic Benar = ', round(len(tebakanToxicTrue)/len(dataToxic)*100, 2),'%')

## Pake Proba

In [None]:
test_toxic = []
test_severe_toxic = []
test_obscene = []
test_threat = []
test_insult = []
test_identity_hate = []
lTest = [test_toxic, test_severe_toxic, test_obscene, test_threat, test_insult, test_identity_hate]
for category, item in zip(categories,lTest):
    for i in range(len(dPredictionProba_test['toxic'])):
        if dPredictionProba_test[category][i][1] >= 0.00:
            item.append(1)
        else:
            item.append(0)

In [None]:
listItem = []

for i in range(len(dPredictionProba_test['toxic'])):
    listItem.append([test_toxic[i],
                    test_severe_toxic[i],
                    test_obscene[i],
                    test_threat[i],
                    test_insult[i],
                    test_identity_hate[i]])

dfPredProba = pd.DataFrame(columns=['ToxicPred', 'SToxicPred','ObscenePred','ThreatPred','InsultPred','IdHatePred'],
                     data=listItem)

dfResultProba = pd.DataFrame
dfResultProba = pd.concat([testLabel,dfPredProba], axis=1)

In [None]:
dataToxicProba = dfResultProba[(dfResult['toxic'] != 0) | (dfResultProba['severe_toxic'] != 0) | (dfResultProba['obscene'] != 0)
             | (dfResultProba['threat'] != 0) | (dfResultProba['insult'] != 0) | (dfResultProba['identity_hate'] != 0)]

tebakanToxicTrueProba = dfResultProba[((dfResult['ToxicPred'] == dfResultProba['toxic']) & (dfResult['SToxicPred'] == dfResultProba['severe_toxic']) & (dfResultProba['ObscenePred'] == dfResultProba['obscene'])
         & (dfResultProba['ThreatPred'] == dfResultProba['threat']) & (dfResult['InsultPred'] == dfResultProba['insult']) & (dfResult['IdHatePred'] == dfResultProba['identity_hate'])) & 
             ((dfResultProba['toxic'] != 0) | (dfResultProba['severe_toxic'] != 0) | (dfResultProba['obscene'] != 0)
         | (dfResultProba['threat'] != 0) | (dfResultProba['insult'] != 0) | (dfResultProba['identity_hate'] != 0))]

tebakanCleanTrueProba = dfResultProba[((dfResult['ToxicPred'] == dfResultProba['toxic']) & (dfResultProba['SToxicPred'] == dfResultProba['severe_toxic']) & (dfResultProba['ObscenePred'] == dfResultProba['obscene'])
         & (dfResultProba['ThreatPred'] == dfResultProba['threat']) & (dfResultProba['InsultPred'] == dfResultProba['insult']) & (dfResultProba['IdHatePred'] == dfResultProba['identity_hate'])) & 
             ((dfResultProba['toxic'] == 0) & (dfResultProba['severe_toxic'] == 0) & (dfResultProba['obscene']== 0)
         & (dfResultProba['threat'] == 0) & (dfResultProba['insult'] == 0) & (dfResultProba['identity_hate'] == 0))]

tebakanTrueProba = dfResultProba[(dfResultProba['ToxicPred'] == dfResultProba['toxic']) & (dfResultProba['SToxicPred'] == dfResultProba['severe_toxic']) & (dfResultProba['ObscenePred'] == dfResultProba['obscene'])
         & (dfResultProba['ThreatPred'] == dfResultProba['threat']) & (dfResultProba['InsultPred'] == dfResultProba['insult']) & (dfResultProba['IdHatePred'] == dfResultProba['identity_hate'])]

In [None]:
print('Persentasi Tebakan Benar = ', round(len(tebakanTrueProba)/len(dfResultProba)*100, 2),'%')
print('Persentasi Tebakan Toxic Benar = ', round(len(tebakanToxicTrueProba)/len(dataToxicProba)*100, 2),'%')