In [0]:
!pip install -U -q PyDrive

# Code to read a CSV file from Google Drive into Colaboratory.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
import nltk
# Fetch the NLTK 'stopwords' corpus so stopwords.words('english') can be read later.
nltk.download('stopwords')

In [0]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import auc, roc_curve, f1_score, confusion_matrix
from nltk.corpus import stopwords
# English stopwords held as a set; remove_stopword tests tokens against it.
stopWords = set(stopwords.words('english'))

from keras.models import Sequential
from keras.layers import Dense

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
# Seed NumPy's global random generator.
np.random.seed(123)

In [0]:
# Download the dataset from Google Drive and load it into a DataFrame.
# The Drive file id is kept in `dataset_file_id` (not `id`) so the builtin
# `id()` is not shadowed for the rest of the notebook.
dataset_file_id = "1h2MguzOoQwN9Libp7YNzD-GmEV0j8jP5"
downloaded = drive.CreateFile({'id': dataset_file_id})
downloaded.GetContentFile('offense_dataset.csv')
df = pd.read_csv('offense_dataset.csv')

In [0]:
# Preview the first rows of the loaded dataset.
df.head()

In [0]:
# 70/30 split: sample 70% of the rows for training, and keep every row whose
# index label was not sampled as the test set.
train_set = df.sample(frac=0.7)
in_train = df.index.isin(train_set.index)
test_set = df[~in_train]

In [0]:
# Labels: the "insult" column of each split.
train_Y = train_set["insult"]
test_Y = test_set["insult"]

# Features: binary bag-of-words. The vocabulary is fitted on the training
# comments only and then reused to transform the test comments.
count_vec = CountVectorizer(binary=True)
train_X = count_vec.fit_transform(train_set["comment_text"])
test_X = count_vec.transform(test_set["comment_text"])

In [0]:
# Linear classifier with logistic loss, trained by SGD; fit() returns the
# estimator itself, so construction and training are chained.
model = SGDClassifier(loss='log').fit(train_X, train_Y)
pred_Y = model.predict(test_X)

# AUC reporting is left disabled for this model:
#proba_Y = model.predict_proba(test_X)[:,1]
#fpr, tpr, thresholds = roc_curve(test_Y, proba_Y)
#roc_auc = auc(fpr, tpr)
#print(roc_auc)
print("f1 score", f1_score(test_Y, pred_Y))

In [0]:
# Logistic regression baseline: ROC AUC is computed from column 1 of
# predict_proba, F1 from the hard predictions.
model = LogisticRegression().fit(train_X, train_Y)
proba_Y = model.predict_proba(test_X)[:,1]
pred_Y = model.predict(test_X)

fpr, tpr, thresholds = roc_curve(test_Y, proba_Y)
roc_auc = auc(fpr, tpr)

print("auc", roc_auc)
print("f1 score", f1_score(test_Y, pred_Y))

# Removal of Stopwords

In this section, we demonstrate the effect of removing stopwords from the feature space.


In [0]:
def remove_stopword(sentence, stop_words=None):
  """Return `sentence` with stopwords removed and edge punctuation stripped.

  The sentence is split on whitespace. Each token has the characters
  ``.',!?-<>`` stripped from both ends *before* it is compared with the
  stopword set, and the comparison is case-insensitive, so tokens such as
  "The" or "the," are removed as well. Tokens that become empty after
  stripping are dropped.

  Args:
    sentence: Text to filter.
    stop_words: Optional collection of lowercase stopwords. Defaults to the
      notebook-level `stopWords` set.

  Returns:
    The remaining tokens (original casing kept) joined by single spaces.
  """
  if stop_words is None:
    stop_words = stopWords
  stripped = (token.strip(".',!?-<>") for token in sentence.split())
  return " ".join(tok for tok in stripped if tok and tok.lower() not in stop_words)

# Strip stopwords from the comments of both splits before vectorizing.
train_text = train_set["comment_text"].apply(remove_stopword)
test_text = test_set["comment_text"].apply(remove_stopword)

# Labels: the "insult" column of each split.
train_Y = train_set["insult"]
test_Y = test_set["insult"]

# Binary bag-of-words fitted on the filtered training text only.
count_vec = CountVectorizer(binary=True)
train_X = count_vec.fit_transform(train_text)
test_X = count_vec.transform(test_text)

In [0]:
# --- SGD classifier (logistic loss): F1 only ---
print("SGD Classifier")
model = SGDClassifier(loss='log').fit(train_X, train_Y)
pred_Y = model.predict(test_X)

# AUC reporting is left disabled for this model:
#proba_Y = model.predict_proba(test_X)[:,1]
#fpr, tpr, thresholds = roc_curve(test_Y, proba_Y)
#roc_auc = auc(fpr, tpr)
#print(roc_auc)
print("f1 score", f1_score(test_Y, pred_Y))


# --- Logistic regression: ROC AUC and F1 ---
print("Logistic Regression")
model = LogisticRegression().fit(train_X, train_Y)
proba_Y = model.predict_proba(test_X)[:,1]
pred_Y = model.predict(test_X)

fpr, tpr, thresholds = roc_curve(test_Y, proba_Y)
roc_auc = auc(fpr, tpr)

print("auc", roc_auc)
print("f1 score", f1_score(test_Y, pred_Y))

# Changing the Vectorizer to Tf-Idf

In [0]:
def remove_stopword(sentence, stop_words=None):
  """Return `sentence` with stopwords removed and edge punctuation stripped.

  The sentence is split on whitespace. Each token has the characters
  ``.',!?-<>`` stripped from both ends *before* it is compared with the
  stopword set, and the comparison is case-insensitive, so tokens such as
  "The" or "the," are removed as well. Tokens that become empty after
  stripping are dropped.

  Args:
    sentence: Text to filter.
    stop_words: Optional collection of lowercase stopwords. Defaults to the
      notebook-level `stopWords` set.

  Returns:
    The remaining tokens (original casing kept) joined by single spaces.
  """
  if stop_words is None:
    stop_words = stopWords
  stripped = (token.strip(".',!?-<>") for token in sentence.split())
  return " ".join(tok for tok in stripped if tok and tok.lower() not in stop_words)

# Tf-idf features over the raw comment text (remove_stopword is not applied
# here). binary=True makes the term-frequency part 0/1 before idf weighting.
tfidf_vec = TfidfVectorizer(binary=True, smooth_idf=True, use_idf=True, sublinear_tf=False)

# Labels: the "insult" column of each split.
train_Y = train_set["insult"]
test_Y = test_set["insult"]

# Fit the vocabulary and idf weights on the training comments only.
train_X = tfidf_vec.fit_transform(train_set["comment_text"])
test_X = tfidf_vec.transform(test_set["comment_text"])

In [0]:
# --- SGD classifier (logistic loss): F1 only ---
print("SGD Classifier")
model = SGDClassifier(loss='log').fit(train_X, train_Y)
pred_Y = model.predict(test_X)

# AUC reporting is left disabled for this model:
#proba_Y = model.predict_proba(test_X)[:,1]
#fpr, tpr, thresholds = roc_curve(test_Y, proba_Y)
#roc_auc = auc(fpr, tpr)
#print(roc_auc)
print("f1 score", f1_score(test_Y, pred_Y))


# --- L2-penalised logistic regression with balanced class weights ---
print("Logistic Regression")
model = LogisticRegression(penalty='l2', class_weight='balanced').fit(train_X, train_Y)
proba_Y = model.predict_proba(test_X)[:,1]
pred_Y = model.predict(test_X)

fpr, tpr, thresholds = roc_curve(test_Y, proba_Y)
roc_auc = auc(fpr, tpr)

print("auc", roc_auc)
print("f1 score", f1_score(test_Y, pred_Y))

# Trying Out Neural Network

In [0]:
from keras import backend as K

def f1(y_true, y_pred):
    """F1 score built from Keras backend ops, for use as a Keras metric.

    Counts are obtained by clipping a tensor to [0, 1], rounding it and
    summing all entries:

    * true positives come from the element-wise product ``y_true * y_pred``;
    * possible positives come from ``y_true``;
    * predicted positives come from ``y_pred``.

    Precision and recall are the true-positive count divided by the
    predicted / possible positive counts, with ``K.epsilon()`` added to each
    denominator to avoid division by zero. The value is computed over
    whatever tensors are passed in, i.e. per batch when used as a metric.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predictions, same shape as ``y_true``.

    Returns:
        Scalar tensor ``2 * precision * recall / (precision + recall + eps)``.
    """
    def _rounded_count(tensor):
        # Sum of the entries after clipping to [0, 1] and rounding.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _rounded_count(y_true * y_pred)
    precision_value = hits / (_rounded_count(y_pred) + K.epsilon())
    recall_value = hits / (_rounded_count(y_true) + K.epsilon())
    return 2*((precision_value*recall_value)/(precision_value+recall_value+K.epsilon()))

In [0]:
def remove_stopword(sentence, stop_words=None):
  """Return `sentence` with stopwords removed and edge punctuation stripped.

  The sentence is split on whitespace. Each token has the characters
  ``.',!?-<>`` stripped from both ends *before* it is compared with the
  stopword set, and the comparison is case-insensitive, so tokens such as
  "The" or "the," are removed as well. Tokens that become empty after
  stripping are dropped.

  Args:
    sentence: Text to filter.
    stop_words: Optional collection of lowercase stopwords. Defaults to the
      notebook-level `stopWords` set.

  Returns:
    The remaining tokens (original casing kept) joined by single spaces.
  """
  if stop_words is None:
    stop_words = stopWords
  stripped = (token.strip(".',!?-<>") for token in sentence.split())
  return " ".join(tok for tok in stripped if tok and tok.lower() not in stop_words)

# Alternative tf-idf vectorizer, left disabled:
# count_vec = TfidfVectorizer(lowercase=True,
#                             max_features=5000,
#                             norm=None
#                            )

# Strip stopwords from the comments of both splits before vectorizing.
train_text = train_set["comment_text"].apply(remove_stopword)
test_text = test_set["comment_text"].apply(remove_stopword)

# Labels: the "insult" column of each split.
train_Y = train_set["insult"]
test_Y = test_set["insult"]

# Binary bag-of-words fitted on the filtered training text only.
count_vec = CountVectorizer(binary=True)
train_X = count_vec.fit_transform(train_text)
test_X = count_vec.transform(test_text)

In [0]:
# Feed-forward network over the bag-of-words features: two ReLU hidden layers
# (512 and 256 units) and a single sigmoid output, trained with binary
# cross-entropy and monitored with the custom f1 metric.
input_dim = test_X.shape[1]
model = Sequential([
    Dense(512, input_dim=input_dim, activation='relu'),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1])

In [0]:
# Train for 20 epochs with batches of 2000 samples.
model.fit(train_X, train_Y, epochs=20, batch_size = 2000)

In [0]:
# Turn the first (only used) output value of every prediction row into a hard
# label with Python's round().
raw_outputs = model.predict(test_X)
pred_Y = [round(row[0]) for row in raw_outputs]

# AUC reporting is left disabled for this model:
#proba_Y = model.predict_proba(test_X)[:,1]
#fpr, tpr, thresholds = roc_curve(test_Y, proba_Y)
#roc_auc = auc(fpr, tpr)
#print("auc", roc_auc)
print("f1 score", f1_score(test_Y, pred_Y))

# The confusion matrix is flattened to one row before printing.
flat_confusion = confusion_matrix(test_Y, pred_Y).ravel()
print("conf. matrix", flat_confusion)

# Inserting Embeddings to our Model

In [0]:
# Download the pre-trained embedding file from Google Drive and save it
# locally as glove.6B.50d.txt.
# The Drive file id is kept in `glove_file_id` (not `id`) so the builtin
# `id()` is not shadowed.
glove_file_id = "1XeMRO9CTTqcSLRQK17mm6GRum1-ZSdb6"
downloaded = drive.CreateFile({'id': glove_file_id})
downloaded.GetContentFile('glove.6B.50d.txt')

In [0]:
# Parse the embedding text file into {word: vector}.
# Each non-empty line is "<word> <v1> <v2> ...". Compared with reading the
# whole file and calling splitlines():
#   * the file is streamed line by line, so it is never held in memory twice;
#   * the encoding is explicit instead of depending on the platform default;
#   * components are converted to float32 once here instead of being stored
#     as lists of strings;
#   * blank lines are skipped instead of raising IndexError.
word_vectors = {}
with open('glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        tokens = line.split()
        if not tokens:
            continue
        word_vectors[tokens[0]] = np.asarray(tokens[1:], dtype='float32')

## Build our vocab from training data

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten, Lambda, Dropout
from keras.layers import LSTM
import keras.backend as K

In [0]:
# Raw comment text as inputs, and the three label columns cast to an integer
# matrix as targets, for both splits.
label_columns = ['identity_hate', 'insult', 'obscene']

x_train = train_set['comment_text'].values
x_test = test_set['comment_text'].values

y_train = train_set[label_columns].astype(int).values
y_test = test_set[label_columns].astype(int).values

In [0]:
# Keras Tokenizer configured with num_words=25000 and '<unk>' as the
# out-of-vocabulary token; it is fitted on the training comments only.
tokenizer = Tokenizer(num_words=25000, oov_token='<unk>')
tokenizer.fit_on_texts(x_train)

In [0]:
# Keep the words whose tokenizer index is at most 25000, then number them
# 1..len(vocab) in the order they appear in tokenizer.index_word.
vocab = [word for index, word in tokenizer.index_word.items() if index <= 25000]
wordToIndex = {word: position for position, word in enumerate(vocab, start=1)}

## Find the best maxlen for our input

In [0]:
# Length, in tokens, of every training comment after tokenization.
train_seq = tokenizer.texts_to_sequences(x_train)
sent_length = list(map(len, train_seq))

In [0]:
# Horizontal box plot of the training comment lengths.
fig, ax = plt.subplots()
ax.boxplot(sent_length, vert=False)
plt.show()

In [0]:
# Histogram (100 bins) of the training comment lengths.
fig, ax = plt.subplots()
ax.hist(sent_length, bins=100)
plt.show()

In [0]:
# Sequence length used when padding the inputs and as the Embedding input_length.
# NOTE(review): presumably chosen from the length plots — confirm.
maxlen = 200

## Prepare the embedding matrix

In [0]:
# One row per vocabulary index plus two extra rows, 50 columns each.
# Rows stay all-zero for words that have no entry in word_vectors.
embedding_matrix = np.zeros((len(wordToIndex)+2, 50))

for word, idx in wordToIndex.items():
    vector = word_vectors.get(word)
    if vector is not None:
        embedding_matrix[idx] = vector

In [0]:
# Display the (rows, columns) shape of the embedding matrix.
embedding_matrix.shape

In [0]:
# Row count and column count of the matrix; passed to the Embedding layer as
# input_dim and output_dim.
n_vocab, embed_dim = embedding_matrix.shape

## Let's put the pre-trained embedding matrix in our model

In [0]:
# Deep Averaging Network: look up the pre-trained embedding of every token,
# average the embeddings along the sequence axis, then classify the averaged
# vector with a small feed-forward network (three sigmoid outputs).
model = Sequential()
model.add(Embedding(input_dim=n_vocab, output_dim=embed_dim, input_length=maxlen, weights=[embedding_matrix]))
model.add(Lambda(lambda x: K.mean(x, axis=1)))
# Hidden layers need a non-linearity: Dense defaults to a linear activation,
# and stacked linear layers collapse into a single linear map.
model.add(Dense(512, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dense(3, activation='sigmoid'))
model.compile(loss='binary_crossentropy', metrics=['acc', f1], optimizer='adam')
model.summary()

## Transform the dataset into sequences of integers instead of lists of strings

In [0]:
# Convert every comment to a sequence of token ids, then pad to `maxlen`.
# x_train / x_test are rebound here from raw text to padded integer arrays, so
# re-running this cell would feed those arrays back into the tokenizer.
train_ids = tokenizer.texts_to_sequences(x_train)
test_ids = tokenizer.texts_to_sequences(x_test)

x_train = pad_sequences(train_ids, maxlen=maxlen)
x_test = pad_sequences(test_ids, maxlen=maxlen)

print('x_train: ', x_train.shape)
print('x_test:', x_test.shape)

## Train the model

In [0]:
# Train for 5 epochs with batches of 128 samples.
model.fit(x_train, y_train, epochs=5, batch_size=128)

## Test the model

In [0]:
# evaluate() returns the loss followed by the compiled metrics, unpacked here
# as loss, accuracy and F1.
loss, ac, f = model.evaluate(x_test, y_test, batch_size=128)
for label, value in (('Loss: ', loss), ('Accuracy: ', ac), ('F1: ', f)):
    print(label, value)

# Multi-Class Classification

In [0]:
# Encode each 3-tuple of binary labels as one class id by reading the tuple as
# a 3-bit binary number (first element is the most significant bit):
# (0,0,0) -> 0, (0,0,1) -> 1, ..., (1,1,1) -> 7.
mapping = {
    (first, second, third): 4*first + 2*second + third
    for first in (0, 1)
    for second in (0, 1)
    for third in (0, 1)
}

In [0]:
# Feature vectorizer: binary bag-of-words fitted on the training comments only.
count_vec = CountVectorizer(binary=True)
train_X = count_vec.fit_transform(train_set["comment_text"])

# Label extraction: one target series per label.
train_Y1 = train_set["insult"]
train_Y2 = train_set["obscene"]
train_Y3 = train_set["identity_hate"]

# Test dataset. All three label series must come from test_set: reading
# "obscene" / "identity_hate" from train_set pairs test insult labels with
# unrelated training rows, so the ground-truth class ids are wrong.
test_X = count_vec.transform(test_set["comment_text"])
test_Y1 = test_set["insult"]
test_Y2 = test_set["obscene"]
test_Y3 = test_set["identity_hate"]

# Ground-truth class id per test row; tuple order is (insult, obscene, identity_hate).
test_Y = [mapping[x] for x in zip(test_Y1, test_Y2, test_Y3)]

In [0]:
# One independent logistic-loss SGD classifier per label series; fit() returns
# the estimator, so each model is built and trained in a single statement.
model_1 = SGDClassifier(loss='log').fit(train_X, train_Y1)
model_2 = SGDClassifier(loss='log').fit(train_X, train_Y2)
model_3 = SGDClassifier(loss='log').fit(train_X, train_Y3)

pred_Y1 = model_1.predict(test_X)
pred_Y2 = model_2.predict(test_X)
pred_Y3 = model_3.predict(test_X)

# Combine the three per-label predictions of each row into one class id.
pred_Y = [mapping[labels] for labels in zip(pred_Y1, pred_Y2, pred_Y3)]

In [0]:
# F1 over the combined class ids: macro average first, then micro average.
for averaging in ('macro', 'micro'):
    print("f1 score", f1_score(test_Y, pred_Y, average=averaging))


In [0]:
from sklearn.metrics import classification_report
# Text report of the per-class metrics for the combined class ids.
print(classification_report(test_Y, pred_Y))