# Download data using the Kaggle API

In [6]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU,  SimpleRNN
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
import os

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

In [1]:
!pip install kaggle



In [2]:
# Upload your kaggle api key
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 66 bytes


In [22]:
!rm -r data/
!mkdir data/

In [24]:
!kaggle datasets download -d rtatman/glove-global-vectors-for-word-representation

Downloading glove-global-vectors-for-word-representation.zip to /content
 96% 439M/458M [00:02<00:00, 204MB/s]
100% 458M/458M [00:02<00:00, 180MB/s]


In [25]:
for dirname, _, filenames in os.walk('./'):
  for filename in filenames:
    if '.zip' in filename:
      !unzip $filename
      !rm $filename

Archive:  glove-global-vectors-for-word-representation.zip
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.50d.txt        


In [26]:
!mv glove* data/

In [27]:
%%capture
!kaggle competitions download jigsaw-multilingual-toxic-comment-classification

In [28]:
for dirname, _, filenames in os.walk('./'):
  for filename in filenames:
    if '.zip' in filename:
      data_file = filename.strip(".zip")
      !unzip $filename
      !mv $data_file data/
      !rm $filename
      print(f"\n{filename} unziped... {data_file} extracted and moved ... {filename} removed\n")

Archive:  test.csv.zip
  inflating: test.csv                

test.csv.zip unziped... test.csv extracted and moved ... test.csv.zip removed

Archive:  jigsaw-toxic-comment-train.csv.zip
  inflating: jigsaw-toxic-comment-train.csv  

jigsaw-toxic-comment-train.csv.zip unziped... jigsaw-toxic-comment-train.csv extracted and moved ... jigsaw-toxic-comment-train.csv.zip removed

Archive:  jigsaw-toxic-comment-train-processed-seqlen128.csv.zip
  inflating: jigsaw-toxic-comment-train-processed-seqlen128.csv  

jigsaw-toxic-comment-train-processed-seqlen128.csv.zip unziped... jigsaw-toxic-comment-train-processed-seqlen128.csv extracted and moved ... jigsaw-toxic-comment-train-processed-seqlen128.csv.zip removed

Archive:  jigsaw-unintended-bias-train.csv.zip
  inflating: jigsaw-unintended-bias-train.csv  

jigsaw-unintended-bias-train.csv.zip unziped... jigsaw-unintended-bias-train.csv extracted and moved ... jigsaw-unintended-bias-train.csv.zip removed

Archive:  jigsaw-unintended-bias-train

In [29]:
# Collect the path of every file under ./data, print each one, then the total.
data_files = [
    os.path.join(root, name)
    for root, _, names in os.walk('./data')
    for name in names
]
for path in data_files:
  print(path)
print(len(data_files), "files")

./data/sample_submission.csv
./data/validation.csv
./data/validation-processed-seqlen128.csv
./data/test-processed-seqlen128.csv
./data/jigsaw-toxic-comment-train.csv
./data/test.csv
./data/jigsaw-unintended-bias-train-processed-seqlen128.csv
./data/glove.6B.100d.txt
./data/jigsaw-unintended-bias-train.csv
./data/jigsaw-toxic-comment-train-processed-seqlen128.csv
./data/glove.6B.200d.txt
./data/glove.6B.50d.txt
12 files


# Recurrent Neural Network Notebook

Credit to [tanulsingh077 on Kaggle](https://www.kaggle.com/tanulsingh077/deep-learning-for-nlp-zero-to-transformers-bert/notebook)


# Configure Hardware





In [74]:
# Detect and use hardware
def hardware_strategy(use_tpu=False):
  """Return the tf.distribute strategy that models should be built under.

  Args:
    use_tpu: when True, first try to resolve and initialise a TPU.

  Returns:
    A TPUStrategy if `use_tpu` is True and a TPU could be resolved;
    otherwise the strategy returned by tf.distribute.get_strategy().
  """
  if use_tpu:
    try:
      tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
      print(f"Running on TPU: {tpu.master()}")
    except ValueError:
      # The resolver could not be created; fall through to the default strategy.
      tpu = None
      print("Error: No TPU available")

    if tpu:
      tf.config.experimental_connect_to_cluster(tpu)
      tf.tpu.experimental.initialize_tpu_system(tpu)
      print("Using TPU")
      return tf.distribute.experimental.TPUStrategy(tpu)

  # Default, works on cpu and gpu. Report the device that is actually
  # present instead of always claiming "CPU".
  if tf.config.list_physical_devices('GPU'):
    print("Using GPU")
  else:
    print("Using CPU")
  return tf.distribute.get_strategy()


# Prepare Data

In [31]:
# retrieve data: the train, test and validation CSVs, in that order
train, test, validation = map(
    pd.read_csv,
    (
        './data/jigsaw-toxic-comment-train.csv',
        './data/test.csv',
        './data/validation.csv',
    ),
)

We will only classify comments as toxic / non-toxic, so we can drop the other label columns for now.

In [32]:
# Remove the five fine-grained label columns listed below.
# Reassigning instead of inplace=True, together with errors='ignore', makes
# this cell safe to re-run: a second run no longer raises KeyError for
# columns that are already gone.
train = train.drop(
    columns=['severe_toxic','obscene','threat','insult','identity_hate'],
    errors='ignore',
)

Use a subset of the data to train faster

In [33]:
# NOTE: .loc slices by label and includes the end label, so this keeps
# labels 0 through 12000.
train = train.loc[0:12000]
# Only the last expression of a cell is displayed, so the shape has to be
# printed explicitly to be visible.
print(train.shape)
train.head()

Unnamed: 0,id,comment_text,toxic
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0


Find the size of the largest comment (for padding later)

In [34]:
def _word_count(comment):
  """Number of whitespace-separated tokens in `comment` (coerced to str)."""
  return len(str(comment).split())

# Longest comment, measured in whitespace-separated words
largest_comment = train["comment_text"].map(_word_count).max()
largest_comment

1403

Train / validation split (80% / 20%, stratified on the `toxic` label)

In [35]:
# Hold out 20% of the comments for validation, stratified on the label so
# both splits keep the same toxic / non-toxic ratio.
comments, labels = train.comment_text.values, train.toxic.values
x_train, x_validation, y_train, y_validation = train_test_split(
    comments,
    labels,
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=labels,
)

In [36]:
def roc_auc(predictions, target):
  """Area under the ROC curve for a binary classifier.

  The Receiver Operating Characteristic (ROC) curve plots the false positive
  rate (x) against the true positive rate (y) at various decision
  thresholds. The Area Under the Curve (AUC) summarises that curve in a
  single number measuring how well the classifier separates the two classes.
  Mainly used for binary classification problems.

  Args:
    predictions: predicted scores, one per sample.
    target: true binary labels, one per sample.

  Returns:
    The area under the ROC curve computed from `target` and `predictions`.
  """
  false_pos_rate, true_pos_rate, _ = metrics.roc_curve(target, predictions)
  return metrics.auc(false_pos_rate, true_pos_rate)

# Simple RNN

Tokenize the data

In [37]:
max_len = 150  # fixed sequence length fed to the models
token = text.Tokenizer()

# update internal vocab from both the train and the validation comments
token.fit_on_texts([*x_train, *x_validation])
word_index = token.word_index

# tokenize the train and validation set (text -> list of word indices)
x_train_seq, x_validation_seq = (
    token.texts_to_sequences(texts) for texts in (x_train, x_validation)
)

# zero pad the sequences to max_len
x_train_pad, x_validation_pad = (
    sequence.pad_sequences(seq, maxlen=max_len)
    for seq in (x_train_seq, x_validation_seq)
)

In [38]:
x_train[0]

'"\n\n Guess who? \n\nI have dark wings, a dark/purplish dress, and I\'m from Rozen Maiden. can you guess who it is?   \nSuigintou? \'\'\'\'\'\' Talk/Cont "'

In [39]:
x_train_seq[0]

[664,
 65,
 7,
 19,
 2262,
 14102,
 5,
 2262,
 20439,
 6071,
 4,
 71,
 32,
 20440,
 6620,
 39,
 6,
 664,
 65,
 11,
 8,
 20441,
 1502,
 38,
 6072]

In [40]:
list(word_index.items())[:5]

[('the', 1), ('to', 2), ('of', 3), ('and', 4), ('a', 5)]

In [41]:
list(word_index.items())[-5:]

[('publicise', 43492),
 ('gables', 43493),
 ('plagarize', 43494),
 ('tibor', 43495),
 ('unaccurate', 43496)]

In [42]:
%%time
strategy = hardware_strategy()
with strategy.scope():
  # Baseline: 300-d embeddings with no pretrained weights -> SimpleRNN(100)
  # -> a single sigmoid output unit
  model = Sequential(
      [
          Embedding(len(word_index)+1, 300, input_length=max_len),
          SimpleRNN(100),
          Dense(1, activation='sigmoid'),
      ],
      name='simple_rnn',
  )
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Using CPU
Model: "simple_rnn"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 150, 300)          13049100  
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 100)               40100     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 13,089,301
Trainable params: 13,089,301
Non-trainable params: 0
_________________________________________________________________
CPU times: user 329 ms, sys: 166 ms, total: 495 ms
Wall time: 767 ms


In [43]:
%%time
model.fit(x_train_pad, y_train, epochs=5, batch_size=64*strategy.num_replicas_in_sync)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 5min 39s, sys: 12.6 s, total: 5min 52s
Wall time: 3min 13s


<tensorflow.python.keras.callbacks.History at 0x7fba1e177610>

In [44]:
scores = model.predict(x_validation_pad)
# roc_auc returns a fraction in [0, 1], not a percentage, so it is printed
# without a trailing "%" (previously shown as e.g. "0.86%").
print("Auc: %.2f" % (roc_auc(scores,y_validation)))

Auc: 0.86%


In [46]:
scores_model = []
scores_model.append({'Model': 'SimpleRNN','AUC_Score': roc_auc(scores,y_validation)})

# LSTM

In [47]:
# load the pretrained GloVe vectors from disk into a {word: vector} dict
embeddings_index = {}
with open ('./data/glove.6B.200d.txt', 'r', encoding='utf-8') as glove_file:
  for line in tqdm(glove_file):
    # each line is "<word> <v1> <v2> ..." separated by single spaces
    word, *coefficients = line.split(' ')
    embeddings_index[word] = np.asarray(list(map(float, coefficients)))

print(f"Found {len(embeddings_index)} word vectors")

400000it [00:29, 13599.26it/s]

Found 400000 word vectors





In [48]:
# create embedding matrix: row i holds the 200-d GloVe vector of the word
# whose tokenizer index is i
embedding_matrix = np.zeros((len(word_index)+1, 200))
for word, i in tqdm(word_index.items()):
  vector = embeddings_index.get(word)
  if vector is None:
    continue  # words missing from GloVe keep their all-zero row
  embedding_matrix[i] = vector

100%|██████████| 43496/43496 [00:00<00:00, 329549.16it/s]


In [49]:
%%time
strategy = hardware_strategy()
with strategy.scope():
  # Frozen GloVe embeddings -> LSTM(100) with input and recurrent dropout
  # -> a single sigmoid output unit
  model = Sequential(
      [
          Embedding(len(word_index)+1,
                    200,
                    weights=[embedding_matrix],
                    input_length=max_len,
                    trainable=False),
          LSTM(100, dropout=0.3, recurrent_dropout=0.3),
          Dense(1, activation='sigmoid'),
      ],
      name='lstm_rnn',
  )
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Using CPU
Model: "lstm_rnn"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 200)          8699400   
_________________________________________________________________
lstm (LSTM)                  (None, 100)               120400    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 8,819,901
Trainable params: 120,501
Non-trainable params: 8,699,400
_________________________________________________________________
CPU times: user 311 ms, sys: 230 ms, total: 542 ms
Wall time: 382 ms


In [50]:
model.fit(x_train_pad, y_train, epochs=5, batch_size=64 * strategy.num_replicas_in_sync)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fba137c3410>

In [51]:
scores = model.predict(x_validation_pad)
# roc_auc returns a fraction in [0, 1], not a percentage, so it is printed
# without a trailing "%" (previously shown as e.g. "0.97%").
print("Auc: %.2f" % (roc_auc(scores,y_validation)))

Auc: 0.97%


In [52]:
scores_model.append({'Model': 'LSTM','AUC_Score': roc_auc(scores,y_validation)})


# GRU

In [54]:
%%time
strategy = hardware_strategy()
with strategy.scope():
  # Frozen GloVe embeddings -> SpatialDropout1D(0.3) -> GRU(200)
  # -> a single sigmoid output unit (one dense layer)
  model = Sequential(
      [
          Embedding(len(word_index)+1,
                    200,
                    weights=[embedding_matrix],
                    input_length=max_len,
                    trainable=False),
          SpatialDropout1D(0.3),
          GRU(200),
          Dense(1, activation='sigmoid'),
      ],
      name="gru_rnn",
  )
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Using CPU
Model: "gru_rnn"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 150, 200)          8699400   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 150, 200)          0         
_________________________________________________________________
gru_1 (GRU)                  (None, 200)               241200    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 201       
Total params: 8,940,801
Trainable params: 241,401
Non-trainable params: 8,699,400
_________________________________________________________________
CPU times: user 418 ms, sys: 62 ms, total: 480 ms
Wall time: 405 ms


In [56]:
model.fit(x_train_pad, y_train, epochs=5, batch_size=64 * strategy.num_replicas_in_sync)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fba1230cad0>

In [57]:
scores = model.predict(x_validation_pad)
# roc_auc returns a fraction in [0, 1], not a percentage, so it is printed
# without a trailing "%" (previously shown as e.g. "0.97%").
print("Auc: %.2f" % (roc_auc(scores,y_validation)))

Auc: 0.97%


In [66]:
scores_model.append({'Model': 'GRU','AUC_Score': roc_auc(scores,y_validation)})

Compare the ROC AUC scores between models

In [67]:
scores_model

[{'AUC_Score': 0.8592577882787772, 'Model': 'SimpleRNN'},
 {'AUC_Score': 0.9687334092539382, 'Model': 'LSTM'},
 {'AUC_Score': 0.9697384791832996, 'Model': 'GRU'}]

# Bi-Directional RNN

In [69]:
%%time
strategy = hardware_strategy()
with strategy.scope():
  # Frozen GloVe embeddings -> bidirectional LSTM(200) with input and
  # recurrent dropout -> a single sigmoid output unit
  model = Sequential(
      [
          Embedding(len(word_index)+1,
                    200,
                    weights=[embedding_matrix],
                    input_length=max_len,
                    trainable=False),
          Bidirectional(LSTM(200, dropout=0.3, recurrent_dropout=0.3)),
          Dense(1, activation='sigmoid'),
      ],
      name="bidirectional",
  )
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Using CPU
Model: "bidirectional"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 150, 200)          8699400   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 400)               641600    
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 401       
Total params: 9,341,401
Trainable params: 642,001
Non-trainable params: 8,699,400
_________________________________________________________________
CPU times: user 482 ms, sys: 91.2 ms, total: 573 ms
Wall time: 488 ms


In [70]:
model.fit(x_train_pad, y_train, epochs=5, batch_size=64 * strategy.num_replicas_in_sync)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fba10b7e890>

In [71]:
scores = model.predict(x_validation_pad)
# roc_auc returns a fraction in [0, 1], not a percentage, so it is printed
# without a trailing "%" (previously shown as e.g. "0.97%").
print("Auc: %.2f" % (roc_auc(scores,y_validation)))

Auc: 0.97%


In [72]:
scores_model.append({'Model': 'Bi-Directional LSTM','AUC_Score': roc_auc(scores,y_validation)})

In [73]:
scores_model

[{'AUC_Score': 0.8592577882787772, 'Model': 'SimpleRNN'},
 {'AUC_Score': 0.9687334092539382, 'Model': 'LSTM'},
 {'AUC_Score': 0.9697384791832996, 'Model': 'GRU'},
 {'AUC_Score': 0.9705733356568822, 'Model': 'Bi-Directional LSTM'}]

# Seq2Seq Models

# Attention Models

# Transformers