In [None]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from sklearn.metrics import classification_report


In [None]:
# wall-clock start; read again in the last cell to report the total runtime
start = time.time()

# the six binary target labels
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# tunable split settings, kept together so a reader can change them in one place
SEED = 42          # fixed seed: the train/test partition is identical on every run
TEST_SIZE = 0.25   # fraction of rows held out for testing (0.20 and 0.10 are alternatives)

# read the data
data = pd.read_csv('dataset.csv')

# X: the first two columns (id and comment)
cols = [0, 1]
X = data[data.columns[cols]]

# Y: columns 2..7, the six class-label columns
cols1 = [2, 3, 4, 5, 6, 7]
Y = data[data.columns[cols1]]

# split the rows into train and test partitions; random_state makes the shuffle reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=SEED
)


In [None]:
# keep only the comment text of each partition; the other column of X is not carried forward
train_comment, test_comment = X_train["comment_text"], X_test["comment_text"]

In [None]:
# vocabulary size handed to the tokenizer (num_words) and, later, to the embedding layer
max_features = 20000

tokenizer = Tokenizer(num_words=max_features)

# build the word index from the training comments only
tokenizer.fit_on_texts(train_comment.tolist())

# convert each comment of both partitions into its sequence of integer word ids
train_tokenized = tokenizer.texts_to_sequences(train_comment)
test_tokenized = tokenizer.texts_to_sequences(test_comment)


In [None]:
# Fixed input length for the network. More than 97% of the comments are at most
# 200 words long, so every comment is padded (or cut) to this length.
max_length = 200

X_train_new, X_test_new = (
    pad_sequences(token_ids, maxlen=max_length)
    for token_ids in (train_tokenized, test_tokenized)
)

In [None]:
# building lstm model using different layers

def my_classifier(seq_len=None, vocab_size=None, embed_size=128,
                  lstm_units=60, dropout_rate=0.25, n_labels=6):
    """Build and compile the LSTM network for multi-label comment classification.

    Layer stack: Input -> Embedding -> LSTM (returning the full sequence)
    -> GlobalMaxPool1D -> Dropout -> Dense with sigmoid activation.

    Called with no arguments it builds exactly the original network; every
    hyper-parameter can be overridden by keyword.

    Args:
        seq_len: length of each padded input sequence. When None, the
            notebook-level ``max_length`` is used.
        vocab_size: input dimension of the embedding layer. When None, the
            notebook-level ``max_features`` is used.
        embed_size: output dimension of the embedding layer.
        lstm_units: number of units in the LSTM layer.
        dropout_rate: rate passed to the Dropout layer.
        n_labels: number of units in the final sigmoid layer (one per label).

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the Adam
        optimizer and the accuracy metric.
    """
    # fall back to the notebook globals at call time, as the original did
    if seq_len is None:
        seq_len = max_length
    if vocab_size is None:
        vocab_size = max_features

    inp = Input(shape=(seq_len, ))

    # embedding layer to convert 2D input to 3D
    x = Embedding(vocab_size, embed_size)(inp)

    # lstm layer; return_sequences=True keeps one output per time step for the pooling below
    x = LSTM(lstm_units, return_sequences=True, name='lstm_layer')(x)

    # max-pool over the time dimension
    x = GlobalMaxPool1D()(x)

    # dropout layer
    x = Dropout(dropout_rate)(x)

    # dense output layer with sigmoid activation: one independent score per label
    x = Dense(n_labels, activation="sigmoid")(x)

    # generating and compiling the model
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

# instantiate the neural network
model = my_classifier()

# train for 5 epochs with a batch size of 32, then score the test comments
train_labels = y_train.values
model.fit(X_train_new, train_labels, batch_size=32, epochs=5)
y_pred = model.predict(X_test_new)


In [None]:
# print the layer-by-layer summary of the model built above
model.summary()

In [None]:
# generating binary values based on a threshold value

# the threshold value was obtained by elbow method
THRESHOLD = 0.527

# Vectorised over the whole prediction matrix, whatever its number of label
# columns: scores below the threshold become 0, all others become 1.
# Converted to nested Python lists because later cells compare rows with `==`.
y_pred1 = np.where(np.asarray(y_pred) < THRESHOLD, 0, 1).tolist()


In [None]:
# display names for the six labels (assumed to follow the column order of y_test)
label_cols = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]

# per-label report comparing the true test labels with the thresholded predictions
report = classification_report(y_test.values, y_pred1, target_names=label_cols)
print(report)

In [None]:
# Overall (exact-match) accuracy: a test row counts as correct only when its
# entire list of predicted labels equals the given list of labels.
true_rows = y_test.values.tolist()
pred_rows = y_pred1
n_rows = len(y_test)

count = sum(1 for idx in range(n_rows) if true_rows[idx] == pred_rows[idx])
print("Accuracy: ", count/n_rows*100)


In [None]:
# seconds elapsed since `start` was recorded in the data-loading cell
end = time.time()
elapsed = end - start

print("Time: ", elapsed)
