# Imports

In [0]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, MaxPooling1D, Concatenate, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential

from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping

In [0]:
from google.colab import drive
drive.mount('/gdrive')

# Reading Train and Test Data

In [0]:
base_path = "/gdrive/My Drive/MS/AML/data/" #as per individual folder paths
# base_path = "C:/Users/ckaidab/Documents/AML/data/"
training_file = "train.csv"
testing_file = "test.csv"

glove_embeddings = "glove.6B.100d.txt"

test_labels = "test_labels.csv"

In [0]:
trainDf1 = pd.read_csv(base_path + training_file)
testDf1 = pd.read_csv(base_path + testing_file)

In [0]:
trainDf = trainDf1.head(trainDf1.shape[0]//2)
testDf = testDf1.head(10000)
trainDf.shape, testDf.shape, trainDf1.shape, testDf1.shape

In [0]:
trainDf.head()

# Flagging the comments as inappropriate

In [11]:
def isToxic(row):
    """Comments which are toxic, obscene, a threat, an insult and has identity hate are marked as inappropriate"""
    if row["toxic"] == 1 or row["severe_toxic"] == 1 or row["obscene"] == 1 or row["threat"] == 1 or row["insult"] ==1 or row["identity_hate"] == 1:
        return 1
    return 0
trainDf["inappropriate"] = trainDf.apply(lambda x: isToxic(x), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [0]:
columns = ["id", "comment_text", "inappropriate"]

In [0]:
trainDf = trainDf[columns]

In [14]:
trainDf.shape

(79785, 3)

# Tokenizing the text and converting to sequences

In [0]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(trainDf["comment_text"])
encoded = tokenizer.texts_to_sequences(trainDf["comment_text"])

In [0]:
avg_len = 0;
for i in range(0, len(encoded)):
    avg_len += len(encoded[i])

avg_len = avg_len // len(encoded)
print("Avg length of sequences is: {}".format(avg_len))
vocab_size = len(tokenizer.word_index) + 1

print("Vocab size is:{}".format(vocab_size))

In [0]:
padded_docs = pad_sequences(encoded, maxlen=avg_len, padding='post')
X_train, X_val, Y_train, Y_val = train_test_split( padded_docs, trainDf["inappropriate"], test_size=0.3, random_state=42)

In [0]:
X_train.shape, X_val.shape, Y_train.shape, Y_val.shape

# Getting Glove Embeddings

In [0]:
word_to_embedding = dict()
f = open(base_path + glove_embeddings, "rb")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_to_embedding[word] = coefs
f.close()

In [0]:
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = word_to_embedding.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [0]:
earlyStopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=1, mode='auto')

# CNN Baseline Model

In [0]:
def getCNN():
  input_seq = Input(shape=(avg_len,), dtype='int32')
  embedding = Embedding(vocab_size, 100, weights=[embedding_matrix], 
                            input_length=avg_len, trainable=True)(input_seq)

  conv = Conv1D(64, kernel_size=2, padding='same', activation='relu', strides=1)(embedding)
  pool = MaxPooling1D(pool_size=3)(conv)
  drop1 = Dropout(0.5)(pool)
  
  conv = Conv1D(256, kernel_size=4, padding='same', activation='relu', strides=1)(embedding)
  pool = MaxPooling1D(pool_size=3)(conv)
  drop2 = Dropout(0.5)(pool)

  merged = Concatenate()([drop1, drop2])
  flatten = Flatten()(merged)
  drop = Dropout(0.5)(flatten)
  outp = Dense(1, activation='sigmoid')(drop)

  cnnModel = Model(inputs=input_seq, outputs=outp)
  cnnModel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  return cnnModel

In [0]:
cnnModel = getCNN()
print(cnnModel.summary())

In [0]:
fittedModel_cnn = cnnModel.fit(X_train, Y_train, validation_split=0.2, epochs=10, batch_size=256, verbose=True, callbacks=[earlyStopping])

In [0]:
# Plot training & validation accuracy values
plt.plot(fittedModel_cnn.history['accuracy'])
plt.plot(fittedModel_cnn.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch') 
plt.legend(['Train', 'val'], loc='lower right')
plt.show()

# Plot training & validation loss values
plt.plot(fittedModel_cnn.history['loss'])
plt.plot(fittedModel_cnn.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train'], loc='upper right')
plt.show()

# LSTM Model

In [0]:
def get_LSTM():
    input_seq = Input(shape=(avg_len,))
    embedding = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=avg_len, trainable=True)(input_seq)
    x = LSTM(64, return_sequences=True)(embedding)
    x = GlobalMaxPool1D()(x)
    x = Dense(50, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(input_seq, x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [0]:
lstm_model = get_LSTM()
lstm_model.summary()

In [0]:
history_lstm = lstm_model.fit(X_train, Y_train, batch_size=512, epochs=10, verbose=1, validation_data=(X_val, Y_val))

In [0]:
# Plot training & validation accuracy values
plt.plot(history_lstm.history['accuracy'])
plt.plot(history_lstm.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch') 
plt.legend(['Train', 'val'], loc='lower right')
plt.show()

# Plot training & validation loss values
plt.plot(history_lstm.history['loss'])
plt.plot(history_lstm.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train'], loc='upper right')
plt.show()

In [0]:
Y_test = pd.read_csv(base_path + test_labels)

encoded_test = tokenizer.texts_to_sequences(testDf["comment_text"])
padded_docs_test = pad_sequences(encoded_test, maxlen=avg_len, padding='post')

In [0]:
predictions = cnnModel.predict(padded_docs_test[:1000], verbose=1)

In [0]:
for i in range(len(predictions)):
    if predictions[i]>0.5:
        predictions[i] = 1
    else:
        predictions[i] = 0

In [0]:
from sklearn.metrics import confusion_matrix, classification_report

In [0]:
Y_test["inappropriate"] = Y_test.apply(lambda x: isToxic(x), axis=1)
Y_test = Y_test[["id", "inappropriate"]]

In [0]:
Y_test.head()

In [0]:
Y_test = [[int(i)] for i in Y_test["inappropriate"].tolist()[:1000]]

In [0]:
print(classification_report(Y_test, predictions))

In [0]:
pd.DataFrame(
   confusion_matrix(Y_test, predictions),
   index = [['Actual', 'Actual'], ['Appropriate', 'Inappropriate']],
   columns = [['Predicted', 'Predicted'], ['Appropiate', 'Inappropriate']])