In [1]:
import numpy as np
import regex as re
import pandas as pd

import nltk
nltk.download("stopwords")
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
en_stop = set(nltk.corpus.stopwords.words('english'))
from sklearn.metrics import confusion_matrix

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM, Bidirectional, TimeDistributed, SpatialDropout1D
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers.merge import Concatenate

import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kt973e\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kt973e\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kt973e\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
#load data
comments = pd.read_csv('C:/Projects/NLP/Data/toxic-comment-classification/train.csv', encoding="utf8")

In [3]:
comments.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
filter = comments["comment_text"] != ""
comments = comments[filter]
comments = comments.dropna()

In [5]:
toxic_comments_labels = comments[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]

In [6]:
#Clean Data
stemmer = WordNetLemmatizer()

def preprocess_text(document):
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(document))

        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        tokens = document.split()
        tokens = [stemmer.lemmatize(word) for word in tokens]
        tokens = [word for word in tokens if word not in en_stop]
        tokens = [word for word in tokens if len(word) > 3]

        preprocessed_text = ' '.join(tokens)

        return preprocessed_text

In [7]:
comments['comment_text_cleaned'] = comments["comment_text"].apply(preprocess_text)

In [8]:
X = comments['comment_text_cleaned']

In [9]:
y = toxic_comments_labels.values

In [10]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [11]:
#convert text data into token vectors, 
vocabulary_size = 20000

tokenizer = Tokenizer(num_words = vocabulary_size)

#train data
tokenizer.fit_on_texts(x_train)

vocab_size = len(tokenizer.word_index) + 1

train_sequences = tokenizer.texts_to_sequences(x_train)

#apply a padding method to add zeros and set the fixed size into each vector.
train_data = pad_sequences(train_sequences, maxlen=100)


#test data
tokenizer.fit_on_texts(x_test)
test_sequences = tokenizer.texts_to_sequences(x_test)

#apply a padding method to add zeros and set the fixed size into each vector.
test_data = pad_sequences(test_sequences, maxlen=100)

In [12]:
#embeddings
embeddings_index = dict()
f = open('C:/Projects/NLP/Home Work/Glove/glove.6B.100d.txt', encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [13]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    if index > vocabulary_size - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

In [14]:
# define LSTM
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=100, weights=[embedding_matrix], trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(6, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          14593000  
                                                                 
 spatial_dropout1d (SpatialD  (None, 100, 100)         0         
 ropout1D)                                                       
                                                                 
 bidirectional (Bidirectiona  (None, 128)              84480     
 l)                                                              
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 6)                 3

In [15]:
model.fit(train_data, y_train, batch_size=256, epochs=3, verbose=1, validation_split=0.2)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1ec75dbd220>

In [16]:
loss, accuracy = model.evaluate(test_data, y_test, verbose=1)

print("oss:", loss)
print("Accuracy:", accuracy)

oss: 0.14308710396289825
Accuracy: 0.9942660331726074


# Prediction

In [17]:
test_comments = pd.read_csv('C:/Projects/NLP/Data/toxic-comment-classification/test.csv', encoding="utf8")

In [18]:
filter = test_comments["comment_text"] != ""
test_comments = test_comments[filter]
test_comments = test_comments.dropna()

In [19]:
test_comments_clean = test_comments["comment_text"][:1].apply(preprocess_text)

In [20]:
tokenizer.fit_on_texts(test_comments_clean)
test_comments_sequences = tokenizer.texts_to_sequences(test_comments_clean)

#apply a padding method to add zeros and set the fixed size into each vector.
comment_ = pad_sequences(test_comments_sequences, maxlen=100)

In [21]:
#predict test data
comment_prediction = model.predict(comment_)

In [22]:
toxic_labels_ = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

In [23]:
type_of_comment = pd.DataFrame(comment_prediction, columns=toxic_labels_).T.rename_axis('Type of Comment').reset_index().rename(columns={0:'Percentage'})

In [24]:
type_of_comment

Unnamed: 0,Type of Comment,Percentage
0,toxic,0.019429
1,severe_toxic,4e-06
2,obscene,0.002406
3,threat,2e-06
4,insult,0.002653
5,identity_hate,2e-05
