# Insult Detection Model

### Project by 
#### Chirag Khurana, Pallavi S. Rawat, Shubham Goyal

In [None]:
import os
import re
import pickle

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import VotingClassifier
import matplotlib.pyplot as plt
import itertools

from sklearn.metrics import auc, roc_auc_score, roc_curve

import pandas as pd
import numpy as np
import spacy

## Importing Data Set 

In [None]:
# Alternative input: the raw (unprocessed) CSVs. Uncomment these three lines
# and comment out the "Processed Data" loads below to use them instead.
# full_df = pd.read_csv('../data/train.csv')
# verify_df = pd.read_csv('../data/impermium_verification_labels.csv')
# pdata_df = pd.concat([full_df, verify_df])

# Processed Data
full_df = pd.read_csv('../data/processed/train.csv')
verify_df = pd.read_csv('../data/processed/impermium_verification_labels.csv')
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat stacks the rows the same way and, like append, keeps each
# frame's original row labels (so the combined index contains duplicates).
pdata_df = pd.concat([full_df, verify_df])

# pdata is a second name bound to the same DataFrame object, not a copy.
pdata = pdata_df

In [None]:
# Preview the first rows of the combined frame right after loading.
pdata_df.head()

#### Trying stop-word removal

In [None]:
nlp = spacy.load('en_core_web_sm')
def sanitize_wo_stopwords(sentence):
    """Return ``sentence`` re-joined from its spaCy tokens, single-spaced.

    Tokens whose part-of-speech tag is ``SPACE`` and tokens flagged as stop
    words are left out; the surviving token texts are joined with one space.
    """
    kept_texts = [
        tok.text
        for tok in nlp(sentence)
        if not (str(tok.pos_) == 'SPACE' or tok.is_stop)
    ]
    return ' '.join(kept_texts)

def sanitize_with_stopwords(sentence):
    """Return ``sentence`` re-joined from its spaCy tokens, single-spaced.

    Only tokens tagged ``SPACE`` are dropped; stop words are kept.
    """
    return ' '.join(
        tok.text for tok in nlp(sentence) if str(tok.pos_) != 'SPACE'
    )


def sanitize_with_lemma(sentence):
    """Return the lemmas of ``sentence``'s spaCy tokens joined by one space.

    Tokens tagged ``SPACE`` are skipped; every other token contributes its
    ``lemma_`` rather than its surface text.
    """
    lemmas = []
    for tok in nlp(sentence):
        if str(tok.pos_) == 'SPACE':
            continue
        lemmas.append(tok.lemma_)
    return ' '.join(lemmas)

In [None]:
def preprocessing(sentence, bad_word_dict):
    """Normalise one raw comment string with regex/string rewrites.

    Steps, in order: lower-case; blank out escaped whitespace / byte
    sequences that appear as literal backslash text; collapse characters
    repeated three or more times; strip URL-like tokens and HTML tags;
    expand common contractions and chat shorthand; blank out a chunk that
    is repeated three or more times in a row; replace four-symbol runs
    with ``-TOKEN-``; apply the bad-word mapping; replace ``@``/``#``
    tokens with ``-PRON-``.

    Args:
        sentence: The comment text.
        bad_word_dict: Mapping of a spelling to its replacement word. A key
            is only replaced where it directly follows a space, and the
            replacement is followed by an extra space.

    Returns:
        The cleaned string.
    """
    # All patterns are raw strings. In a normal string literal "\b" is a
    # backspace character, which previously stopped `bare_url` from ever
    # matching; as a raw string it is the intended regex word boundary.
    html_tag = r'<.*?>'
    bare_url = r"[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
    full_url = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})"
    spaced_url = r"(www | http: | https:)+[ ^\s]+[\w]"

    s0 = sentence.lower()

    # Escaped whitespace / byte sequences that occur as literal text.
    for escaped in ("\\\\n", "\\n", "\\t", "\\\\xc2", "\\\\xa0"):
        s0 = s0.replace(escaped, " ")
    # Any remaining backslash escape is dropped together with its payload.
    s0 = re.sub(r"\\[a-zA-Z0-9.]*", "", s0)

    # Collapse a character repeated 3+ times to a single one: "sooo" -> "so".
    s0 = re.sub(r"([a-zA-Z0-9.?!#*])\1\1+", r"\1", s0)

    # URL removal. The scheme-based patterns run before the bare-domain one
    # so a full URL is removed whole instead of leaving an "http://" stub.
    # NOTE(review): bare_url also matches "word.word" where a space is
    # missing after a full stop -- confirm this is acceptable for the model.
    s0 = re.sub(spaced_url, "", s0)
    s0 = re.sub(full_url, "", s0)
    s0 = re.sub(bare_url, "", s0)
    s0 = re.sub(html_tag, "", s0)  # html tags
    s0 = s0.strip()

    # Literal contraction / shorthand expansions; order is significant.
    # (The text is already lower-case, so no upper-case variants are needed.)
    for old, new in (
        (" wont ", " will not "),
        (" won't ", " will not "),
        (" don't ", " do not "),
        (" dont ", " do not "),
        (" dnt ", " do not "),
        (" didn't ", " did not "),
        (" didnt ", " did not "),
        (" i'll", " I will"),
        (" cant", " can not"),
        (" can't", " can not"),
        (" shouldn't", " should not"),
        (" shouldnt", " should not"),
        (" im ", " i am "),
        ("ain't", "is not"),
    ):
        s0 = s0.replace(old, new)
    # Word-bounded so that "paint" / "maintain" are left alone.
    s0 = re.sub(r"\baint\b", "is not", s0)
    s0 = s0.replace("'ll", " will")
    # Generic "n't" expansion. The irregular forms go first so they do not
    # degrade to "wo not" / "ca not" at the start of a sentence.
    s0 = re.sub(r"\bwon't\b", "will not", s0)
    s0 = re.sub(r"\bcan't\b", "can not", s0)
    s0 = re.sub(r"n't\b", " not", s0)
    for old, new in (
        (" r ", " are "),
        (" m ", " am "),
        (" ur ", " your "),
        (" u'r ", " you are "),
        ("'ve", " have"),
        ("'s", " is"),
        ("'re", " are"),
        ("'d", " would"),
    ):
        s0 = s0.replace(old, new)

    # A chunk repeated 3+ times in a row ("lololol") is replaced by a space.
    s0 = re.sub(r"([a-zA-Z0-9.]+)\1\1+", " ", s0)
    # Four consecutive symbols (censored swearing such as "&*$!").
    s0 = re.sub("[&*?!#^%`~$@]{4}", "-TOKEN-", s0)
    s0 = s0.strip()

    for key, value in bad_word_dict.items():
        s0 = s0.replace(" " + key, " " + value + " ")

    # @mentions and #tags become a pronoun placeholder.
    s0 = re.sub(r"(@|#)[\w.]*", "-PRON-", s0)
    return s0

bad_dict = None

def build_badword_dict(path='../data/misc/ConvertedBadWords.txt'):
    """Load the bad-word mapping from ``path`` into the global ``bad_dict``.

    Each line is split on commas. Only lines with exactly two fields are
    used: the first field becomes the key (unchanged) and the second field,
    with surrounding whitespace stripped, becomes the value.

    Args:
        path: Location of the mapping file. Defaults to the project file
            that was previously hard-coded.

    Returns:
        The mapping that was stored in ``bad_dict``.
    """
    global bad_dict
    mapping = {}
    # `with` guarantees the handle is closed; it used to be left open.
    with open(path) as badfile:
        for line in badfile:
            fields = line.split(',')
            if len(fields) == 2:
                mapping[fields[0]] = fields[1].strip()
    bad_dict = mapping
    return bad_dict

def sanitize_bw_regx(text):
    """Clean ``text`` with ``preprocessing`` using the shared bad-word map.

    The map is loaded through ``build_badword_dict`` whenever the global
    ``bad_dict`` is still unset or empty.
    """
    global bad_dict
    if bad_dict:
        return preprocessing(text, bad_dict)
    build_badword_dict()
    return preprocessing(text, bad_dict)

In [None]:
# Drop the first and last character of every comment (presumably the quote
# marks wrapping each comment in the CSV -- TODO confirm against the data).
pdata.Comment = [x[1: -1] for x in pdata_df.Comment]

# Active cleaning variant: spaCy tokenisation with stop words removed.
pdata.Comment = [sanitize_wo_stopwords(x) for x in pdata.Comment]
# Alternative cleaning variants, currently disabled:
# pdata.Comment = [sanitize_bw_regx(x) for x in pdata.Comment]
# pdata.Comment = [sanitize_with_stopwords(x) for x in pdata.Comment]
# pdata.Comment = [sanitize_with_lemma(x) for x in pdata.Comment]

## Understanding Data 

In [None]:
# Summary statistics of the cleaned data set.
pdata.describe()

In [None]:
# First rows after cleaning (pdata_df and pdata name the same DataFrame).
pdata_df.head()

In [None]:
# Same preview through the `pdata` name.
pdata.head()

In [None]:
# Partition the rows by label; the cell displays the shape of each part.
pdata_ni = pdata[pdata.Insult == 0]
pdata_i = pdata[pdata.Insult == 1]
pdata_ni.shape, pdata_i.shape

In [None]:
# Split each class separately. 60% of the non-insult rows are held out for
# testing, versus 20% of the insult rows (presumably to reduce the class
# imbalance in the training set -- TODO confirm).
# NOTE(review): no random_state is passed, so every run draws a new split.
ptrain_ni, ptest_ni = train_test_split(pdata_ni, test_size=0.6)
print(ptrain_ni.shape, ptest_ni.shape)

ptrain_i, ptest_i = train_test_split(pdata_i, test_size=0.2)
print(ptrain_i.shape, ptest_i.shape)

In [None]:
# Alternative split for training the final model: holds out less data, so more rows are used for training

# ptrain_ni, ptest_ni = train_test_split(pdata_ni, test_size=0.5)
# print(ptrain_ni.shape, ptest_ni.shape)

# ptrain_i, ptest_i = train_test_split(pdata_i, test_size=0.05)
# print(ptrain_i.shape, ptest_i.shape)

### Splitting Dataset

In [None]:
# Earlier approach, a single split over all rows (kept for reference):
# train, test = train_test_split(data, test_size=0.2)
# ptrain, ptest = train_test_split(pdata, test_size=0.2)

# Recombine the per-class splits. DataFrame.append was removed in pandas 2.0;
# pd.concat stacks the rows in the same order (insult rows first).
ptrain = pd.concat([ptrain_i, ptrain_ni])
ptest = pd.concat([ptest_i, ptest_ni])

## Feature Extraction

#### TFIDF as feature

In [None]:
# Two vectorisers over the same text: word 1- to 3-grams (at most 50,000
# features) and character 3- to 10-grams (at most 100,000 features).
# use_idf=False switches off the inverse-document-frequency weighting.
tfidf_w = TfidfVectorizer(ngram_range=(1, 3), analyzer='word', use_idf=False, max_features=50000) 
tfidf_c = TfidfVectorizer(ngram_range=(3, 10), analyzer='char', use_idf=False, max_features=100000)

In [None]:
# Fit both vectorisers on the training comments only and build the training
# feature matrices; the test comments are transformed later with the fitted
# vectorisers.
ptrain_data_w = tfidf_w.fit_transform(ptrain.Comment)
ptrain_data_c = tfidf_c.fit_transform(ptrain.Comment)

# Cell output: the shapes of the word and character feature matrices.
ptrain_data_w.shape, ptrain_data_c.shape

In [None]:
# Save the vectorizer object

# pickle.dump(tfidf_w, open("insult_tfidf_w.vectorizer", "wb" ))
# pickle.dump(tfidf_c, open("insult_tfidf_c.vectorizer", "wb" ))

## Classification of Insult

#### Helper Functions

In [None]:
def normalize_mat(mat):
    """Scale every row of ``mat`` so that its entries sum to 1.

    Args:
        mat: Iterable of numeric rows (for example a confusion matrix).

    Returns:
        A NumPy array of floats with the same layout as ``mat``. A row
        whose sum is 0 is returned as all zeros instead of failing on a
        division by zero.
    """
    normalized = []
    for row in mat:
        total = sum(row)  # computed once per row, not once per element
        if total == 0:
            normalized.append([0.0] * len(row))
        else:
            normalized.append([float(x / total) for x in row])
    return np.array(normalized)

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw ``cm`` as a colour-mapped grid on the current matplotlib figure and
    print which mode (raw or normalized) was used.

    With ``normalize=True`` each row is first divided by its row sum and the
    cells are labelled with two decimals; otherwise cells are labelled as
    integers. ``classes`` supplies the tick labels for both axes.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=45)
    plt.yticks(tick_positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so the label stays
    # readable on the darker background.
    midpoint = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row, col in itertools.product(range(n_rows), range(n_cols)):
        text_color = "white" if cm[row, col] > midpoint else "black"
        plt.text(col, row, format(cm[row, col], cell_format),
                 horizontalalignment="center",
                 color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

### Multinomial NaiveBayes

In [None]:
# One Multinomial Naive Bayes model per feature set (word / char n-grams).
insult_nb_w = MultinomialNB(alpha=0.01)
insult_nb_w.fit(ptrain_data_w, ptrain.Insult)

insult_nb_c = MultinomialNB(alpha=0.01)
insult_nb_c.fit(ptrain_data_c, ptrain.Insult)

# Vectorise the held-out comments with the already-fitted vectorisers.
ptest_data_w = tfidf_w.transform(ptest.Comment)
ptest_data_c = tfidf_c.transform(ptest.Comment)

predicted_nb_w = insult_nb_w.predict(ptest_data_w)
predicted_nb_c = insult_nb_c.predict(ptest_data_c)
# Accuracy: fraction of test rows whose prediction equals the label.
print(np.mean(predicted_nb_w == ptest.Insult), np.mean(predicted_nb_c == ptest.Insult))
# Class probabilities, kept for later use.
predicted_nb_w_prob = insult_nb_w.predict_proba(ptest_data_w)
predicted_nb_c_prob = insult_nb_c.predict_proba(ptest_data_c)

#### Confusion Matrix for NB Classifier on Word - N-grams

In [None]:
# Compute confusion matrix (true labels vs. NB word n-gram predictions)
cnf_matrix_w = confusion_matrix(ptest.Insult, predicted_nb_w)
# Print NumPy floats with two decimals from here on.
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
# plt.figure()
# plot_confusion_matrix(cnf_matrix_w, classes=['Not Insult', 'Insult'],
#                       title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix_w, classes=['Not Insult', 'Insult'], normalize=True,
                      title='Normalized confusion matrix')

plt.show()

#### Confusion Matrix for NB Classifier on Character - N-grams

In [None]:
# Compute confusion matrix (true labels vs. NB character n-gram predictions)
cnf_matrix_c = confusion_matrix(ptest.Insult, predicted_nb_c)
# Print NumPy floats with two decimals from here on.
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
# plt.figure()
# plot_confusion_matrix(cnf_matrix_c, classes=['Not Insult', 'Insult'],
#                       title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix_c, classes=['Not Insult', 'Insult'], normalize=True,
                      title='Normalized confusion matrix')

plt.show()

### LinearSVC

In [None]:
# One LinearSVC (default settings) per feature set (word / char n-grams).
insult_svm_w = LinearSVC()
insult_svm_w.fit(ptrain_data_w, ptrain.Insult)

insult_svm_c = LinearSVC()
insult_svm_c.fit(ptrain_data_c, ptrain.Insult)

# Vectorise the held-out comments with the already-fitted vectorisers.
ptest_data_w = tfidf_w.transform(ptest.Comment)
ptest_data_c = tfidf_c.transform(ptest.Comment)

predicted_svm_w = insult_svm_w.predict(ptest_data_w)
predicted_svm_c = insult_svm_c.predict(ptest_data_c)
# Cell output: accuracy of the word model and of the character model.
np.mean(predicted_svm_w == ptest.Insult), np.mean(predicted_svm_c == ptest.Insult)

In [None]:
# Save the SVM trained model

# pickle.dump(insult_svm_w, open("insult_classifier_svm_w.model", "wb" ))
# pickle.dump(insult_svm_c, open("insult_classifier_svm_c.model", "wb" ))

#### Confusion Matrix for SVM Classifier on Word - N-grams

In [None]:
# Compute confusion matrix (true labels vs. SVM word n-gram predictions);
# this rebinds cnf_matrix_w from the earlier cell.
cnf_matrix_w = confusion_matrix(ptest.Insult, predicted_svm_w)
# Print NumPy floats with two decimals from here on.
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
# plt.figure()
# plot_confusion_matrix(cnf_matrix_w, classes=['Not Insult', 'Insult'],
#                       title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix_w, classes=['Not Insult', 'Insult'], normalize=True,
                      title='Normalized confusion matrix')

plt.show()

#### Confusion Matrix for SVM Classifier on Character - N-grams

In [None]:
# Compute confusion matrix (true labels vs. SVM character n-gram
# predictions); this rebinds cnf_matrix_c from the earlier cell.
cnf_matrix_c = confusion_matrix(ptest.Insult, predicted_svm_c)
# Print NumPy floats with two decimals from here on.
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
# plt.figure()
# plot_confusion_matrix(cnf_matrix_c, classes=['Not Insult', 'Insult'],
#                       title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix_c, classes=['Not Insult', 'Insult'], normalize=True,
                      title='Normalized confusion matrix')

plt.show()

### Logistic Regression

In [None]:
# One LogisticRegression (default settings) per feature set.
insult_lr_w = LogisticRegression()
insult_lr_w.fit(ptrain_data_w, ptrain.Insult)

insult_lr_c = LogisticRegression()
insult_lr_c.fit(ptrain_data_c, ptrain.Insult)

# Vectorise the held-out comments with the already-fitted vectorisers.
ptest_data_w = tfidf_w.transform(ptest.Comment)
ptest_data_c = tfidf_c.transform(ptest.Comment)

predicted_lr_w = insult_lr_w.predict(ptest_data_w)
predicted_lr_c = insult_lr_c.predict(ptest_data_c)
# Accuracy: fraction of test rows whose prediction equals the label.
print(np.mean(predicted_lr_w == ptest.Insult), np.mean(predicted_lr_c == ptest.Insult))
# Class probabilities, used by the ROC / AUC cells below.
predicted_lr_w_prob = insult_lr_w.predict_proba(ptest_data_w)
predicted_lr_c_prob = insult_lr_c.predict_proba(ptest_data_c)

#### Confusion Matrix for Logistic Regression Classifier on Word - N-grams

In [None]:
# Compute confusion matrix (true labels vs. logistic-regression word n-gram
# predictions); this rebinds cnf_matrix_w from the earlier cell.
cnf_matrix_w = confusion_matrix(ptest.Insult, predicted_lr_w)
# Print NumPy floats with two decimals from here on.
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
# plt.figure()
# plot_confusion_matrix(cnf_matrix_w, classes=['Not Insult', 'Insult'],
#                       title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix_w, classes=['Not Insult', 'Insult'], normalize=True,
                      title='Normalized confusion matrix')

plt.show()

#### Confusion Matrix for Logistic Regression Classifier on Character - N-grams

In [None]:
# Compute confusion matrix (true labels vs. logistic-regression character
# n-gram predictions); this rebinds cnf_matrix_c from the earlier cell.
cnf_matrix_c = confusion_matrix(ptest.Insult, predicted_lr_c)
# Print NumPy floats with two decimals from here on.
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
# plt.figure()
# plot_confusion_matrix(cnf_matrix_c, classes=['Not Insult', 'Insult'],
#                       title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix_c, classes=['Not Insult', 'Insult'], normalize=True,
                      title='Normalized confusion matrix')

plt.show()

In [None]:
# ROC curve for the character n-gram logistic-regression model.
# `[:, 1:]` keeps every probability column from index 1 on as a 2-D slice;
# presumably that is the single "insult" (label 1) column -- TODO confirm
# against insult_lr_c.classes_.
fpr, tpr, thresholds = roc_curve(ptest.Insult, predicted_lr_c_prob[:, 1:])
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for insult classifier')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.grid(True)
plt.show()

In [None]:
# ROC AUC of the logistic-regression models: character n-grams first, then
# word n-grams.
print(roc_auc_score(ptest.Insult, predicted_lr_c_prob[:, 1:]), roc_auc_score(ptest.Insult, predicted_lr_w_prob[:, 1:]))