# Toxic Comments

In [41]:
import keras
import numpy as np
import pandas as pd
from keras.layers import *
from keras.optimizers import * 
from keras.models import *
from keras.preprocessing import text, sequence
from sklearn.model_selection import train_test_split

In [2]:
path = '../ToxicComments/'
train = pd.read_csv(path + 'toxic_train.csv')
test = pd.read_csv(path + 'toxic_test.csv')

In [29]:
x_train = train['comment_text'].str.lower()
y_train = train[['toxic','severe_toxic','obscene','threat','insult','identity_hate']].values

x_test = test['comment_text'].str.lower()
num_classes =6

## We will use GloVe embeddings first

#### Have to get this ready - https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html i got the code from here

In [4]:
vocabulary=100000

maxlen=150
embedding_size=300 #int(100000**0.25)+1 if its not pretrained
#embedding vector dimension should be the 4th root of the vocabulary according to Google
x_train_original = x_train
y_train_original = y_train
x_test_original = x_test
token = text.Tokenizer(num_words=vocabulary,lower=True)
token.fit_on_texts(list(x_train)+list(x_test))
x_train = token.texts_to_sequences(x_train)
x_test=token.texts_to_sequences(x_test)
x_train=sequence.pad_sequences(x_train,maxlen=maxlen)
x_test=sequence.pad_sequences(x_test,maxlen=maxlen)

In [5]:
embeddings_index = {}
with open(path+'glove.840B.300d.txt',encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [6]:
word_index = token.word_index
num_words = min(vocabulary, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_size))
for word, i in word_index.items():
    if i >= vocabulary:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [10]:
x_train_original_split, x_val_original_split, y_train_original_split,y_val_original_split=train_test_split(x_train, 
                                       y_train, 
                                       train_size=0.9, 
                                       random_state=3)



## Simple LSTM

In [20]:
model = Sequential()
model.add(Embedding(vocabulary,
                    embedding_size,
                    weights=[embedding_matrix],
                    input_length=maxlen,
                    trainable=False))
model.add(LSTM(256,return_sequences=True,dropout=0.5))
model.add(Flatten())
model.add(Dense(num_classes,activation='sigmoid'))
model.compile(loss='binary_crossentropy',
             optimizer=SGD(lr=1e-3),#Adam, Adagrad 
             metrics = ['accuracy'])
#binary - because we have multilabeled samples


In [9]:
x_train_split, x_val_split, y_train_split, y_val_split = train_test_split(x_train, 
                                                                          y_train, 
                                                                          train_size=0.9, 
                                                                          random_state=3)



In [23]:
model.fit(x_train_split, y_train_split, validation_data=(x_val_split, y_val_split),
          epochs=2, batch_size=128)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x9628a64518>

## LSTM with ConvLayer

In [26]:
model = Sequential()
model.add(Embedding(vocabulary,
                    embedding_size,
                    weights=[embedding_matrix],
                    input_length=maxlen,
                    trainable=False))
model.add(LSTM(256,return_sequences=True,dropout=0.5))
model.add(Conv1D(128,5,activation='relu'))
model.add(MaxPooling1D(pool_size=5))
model.add(Flatten())
model.add(Dense(num_classes,activation='sigmoid'))
model.compile(loss='binary_crossentropy',
             optimizer=SGD(lr=1e-3),#Adam, Adagrad 
             metrics = ['accuracy'])

In [27]:
model.fit(x_train_split, y_train_split, validation_data=(x_val_split, y_val_split),
          epochs=2, batch_size=128)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x9695cf9588>

## MLP(multilayer perceptron)

In [11]:
model = Sequential()

model.add(Dense(64, input_dim=maxlen, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(6, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(x_train_original_split, y_train_original_split,
          epochs=20,
          batch_size=128)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x3d36ba8c18>

## Stacked LSTM

In [56]:
model = Sequential()
model.add(Embedding(vocabulary,
                    embedding_size,
                    weights=[embedding_matrix],
                    input_length=maxlen,
                    trainable=False))
model.add(LSTM(32, return_sequences=True,))  
model.add(LSTM(32, return_sequences=True))  
model.add(LSTM(32)) 
model.add(Dense(num_classes, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer=Adagrad(lr=1e-3),
              metrics=['accuracy'])

In [57]:
model.fit(x_train_split, y_train_split,
          epochs=2,
          batch_size=128)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x96f734dba8>

In [59]:
model = Sequential()
model.add(Embedding(vocabulary,
                    embedding_size,
                    weights=[embedding_matrix],
                    input_length=maxlen,
                    trainable=False))
model.add(Conv1D(128,5,activation='relu'))
model.add(Conv1D(128,5,activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(Conv1D(256,5,activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes,activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer=Adagrad(lr=1e-3),
              metrics=['accuracy'])

In [63]:
model.fit(x_train_split, y_train_split,
          validation_data=(x_val_split,y_val_split),
          epochs=10,
          batch_size=128)

Train on 143613 samples, validate on 15958 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x9754333898>

### Without pretrained embeddings

In [66]:
embedding_size_normal = int(vocabulary**0.25) +1
model = Sequential()
model.add(Embedding(vocabulary,
                    embedding_size_normal,
                    #weights=[embedding_matrix],
                    input_length=maxlen,
                    trainable=False))
model.add(Conv1D(128,5,activation='relu'))
model.add(Conv1D(128,5,activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(Conv1D(256,5,activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes,activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer=Adagrad(lr=1e-3),
              metrics=['accuracy'])

In [67]:
model.fit(x_train_split, y_train_split,
          validation_data=(x_val_split,y_val_split),
          epochs=10,
          batch_size=128)

Train on 143613 samples, validate on 15958 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x9764c4bf60>

## LSTM - Basic Attention

In [62]:
embedding_size_normal = int(vocabulary**0.25)+1
model_lstm = Sequential()
model_lstm.add(Embedding(vocabulary,
                    embedding_size_normal,
                    #weights=[embedding_matrix],
                    input_length=maxlen,
                    trainable=False))
model_lstm.add(LSTM(128,return_sequences=True))
model_lstm.summary()
model_attention=model_lstm
model_attention.add(TimeDistributed(Dense(1)))
model_attention.add(Flatten())
model_attention.add(Activation('sigmoid'))
model_attention.add(RepeatVector(128))
model_attention.add(Permute((2,1)))

#model_merged.add(Concatenate([model_lstm,model_attention]))#Merge('concat') is now Concatenate([])
model_merged = merge([model_attention,model_lstm],mode='mul')
model_merged.add(Flatten())
model_merged.add(Dense(num_classes,activation='sigmoid'))

model_merged.compile(loss='binary_crossentropy',
                    optimizer=Adam(lr=1e-3),
                    metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_36 (Embedding)     (None, 150, 18)           1800000   
_________________________________________________________________
lstm_36 (LSTM)               (None, 150, 128)          75264     
Total params: 1,875,264
Trainable params: 75,264
Non-trainable params: 1,800,000
_________________________________________________________________


TypeError: 'module' object is not callable

In [None]:
model.fit(x_train_split, y_train_split,
          validation_data=(x_val_split,y_val_split),
          epochs=10,
          batch_size=128)

In [None]:
|