# Data Prep

## Import packages

In [None]:
import nltk
# Stop-word corpus consumed later by clean_comments.
nltk.download('stopwords')

# Download every package that the NLTK index files under the 'tokenizers'
# sub-directory (word_tokenize needs its tokenizer models to be present).
dwlr = nltk.downloader.Downloader()
for pkg in (p for p in dwlr.packages() if p.subdir == 'tokenizers'):
  dwlr.download(pkg.id)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import numpy as np
import pandas as pd
import json
import joblib
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

## Initialize Variables

In [None]:
# Directories (both end with '/', because paths below are built with plain
# string concatenation).
BASE_DIR = '/content/drive/My Drive/Machine Learning/Data/Toxic comments/'
EMBEDDING_DIR = '/content/drive/My Drive/Machine Learning/Data/pre-trained embeddings/'
# Input files: raw CSVs live in BASE_DIR, the embedding file in EMBEDDING_DIR.
TRAIN_DATA = 'toxic_train.csv'
TEST_DATA = 'toxic_test.csv'
GLOVE_FILE = 'glove.6B.300d.txt'
# Output artifacts, all written to BASE_DIR by the "Save files" cell.
TOKENIZER_FILE = 'toxic_tokenizer.pkl'
WORD_EMBEDDING_MATRIX = 'word_embedding_matrix.npy'
NB_WORDS_FILE = 'nb_words.json'
PROCESSED_TRAIN_DATA = 'train.npy'
PROCESSED_TEST_DATA = 'test.npy'
TARGET_DATA = 'target.npy'
# Upper bound on the vocabulary size covered by the embedding matrix.
MAX_NB_WORDS = 200000
# Every tokenized comment is padded/truncated to this many tokens.
MAX_SEQUENCE_LENGTH = 150
# Width of each embedding row; must match the vectors stored in GLOVE_FILE.
EMBEDDING_DIM = 300

## Download and clean data

In [None]:
# Load the labelled training set from disk.
train = pd.read_csv(BASE_DIR + TRAIN_DATA)

# Model input is the raw comment text; targets are the six binary label columns.
labels = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
comments = train['comment_text']
y = train[labels].to_numpy()

# Sanity check: how many rows have no comment text at all?
missing_values = comments.isnull().sum()

print(f"# of comments: {len(comments)}")
print(f"# of missing values: {missing_values}")

# of comments: 159571
# of missing values: 0


In [None]:
# functions to clean comments
# Stop-word sets keyed by language, filled lazily so the corpus file is read
# once instead of once per comment.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language = 'english'):
  """Return the NLTK stop words for `language` as a frozenset, loading them once."""
  if language not in _STOP_WORDS_CACHE:
    _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
  return _STOP_WORDS_CACHE[language]


def clean_comments(comment):
  """Tokenize one comment and drop stop words and non-alphabetic tokens.

  Args:
    comment: raw comment text (str).

  Returns:
    str: the surviving tokens, in their original casing and order, joined by
    single spaces (an empty string when nothing survives).
  """
  stop_words = _get_stop_words()
  # The NLTK stop-word list is lower-case, so compare on the lower-cased token;
  # otherwise capitalized stop words ("The", "You") slip through the filter.
  tokens = [w for w in word_tokenize(comment)
            if w.isalpha() and w.lower() not in stop_words]
  return ' '.join(tokens)


def process_comments(comment):
  """Clean every entry of an iterable of comments.

  Args:
    comment: iterable of raw comments; each entry is coerced to str first.

  Returns:
    list of cleaned comment strings, in input order.
  """
  return [clean_comments(str(raw)) for raw in comment]

# Clean every training comment; the result is a list of strings in the same order as `comments`.
comments_clean = process_comments(comments)

## Tokenize words and prepare data

In [None]:
# Cap the vocabulary at MAX_NB_WORDS so that no index emitted by
# texts_to_sequences can fall outside the embedding matrix built further down
# (get_weight_matrix skips every word whose index is >= MAX_NB_WORDS).
tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(comments_clean)

# Integer-encode the comments, then pad (at the end) or truncate every
# sequence to exactly MAX_SEQUENCE_LENGTH tokens.
comments_word_sequences = tokenizer.texts_to_sequences(comments_clean)
comments_data = pad_sequences(comments_word_sequences, maxlen = MAX_SEQUENCE_LENGTH, padding = 'post')

# word_index always holds the full fitted vocabulary, regardless of num_words.
word_index = tokenizer.word_index
print(f"# of words in index: {len(word_index)}")
print(f"max length of a sequence: {max(len(s.split()) for s in comments_clean)}")

# of words in index: 157448
max length of a sequence: 1383


## Process embedding

In [None]:
# define functions to process embedding
# nb_words is the vocabulary size capped at MAX_NB_WORDS; the embedding matrix
# built by get_weight_matrix has nb_words + 1 rows.
nb_words = min(MAX_NB_WORDS, len(word_index))

def load_embedding(filepath):
  """Parse a text embedding file into a word -> vector dictionary.

  Each line is expected to hold a token followed by its space-separated
  float components. Blank lines (and lines with no components) are skipped.

  Args:
    filepath: path to the UTF-8 encoded embedding text file.

  Returns:
    dict mapping each word (str) to a 1-D float32 numpy array.
  """
  embeddings = {}
  # The context manager closes the file even if a line fails to parse.
  with open(filepath, 'r', encoding = 'utf-8') as file:
    for line in file:
      # Strip the line terminator so it does not end up in the last component.
      values = line.rstrip().split(' ')
      if len(values) < 2:
        continue
      embeddings[values[0]] = np.asarray(values[1:], dtype = 'float32')
  return embeddings


def get_weight_matrix(embedding):
  """Build the embedding matrix whose rows line up with `word_index`.

  Reads the module-level `nb_words`, `word_index`, `MAX_NB_WORDS` and
  `EMBEDDING_DIM`.

  Args:
    embedding: dict-like mapping word -> vector of length EMBEDDING_DIM.

  Returns:
    numpy array of shape (nb_words + 1, EMBEDDING_DIM). Row i holds the vector
    of the word whose index is i; rows stay all-zero for words that are absent
    from `embedding` or whose index is >= MAX_NB_WORDS.
  """
  matrix = np.zeros((nb_words+1, EMBEDDING_DIM))
  for token, row in word_index.items():
    if row < MAX_NB_WORDS:
      vector = embedding.get(token)
      # Words in our data may be missing from the pre-trained vocabulary.
      if vector is not None:
        matrix[row] = vector
  return matrix

# Load the pre-trained vectors from GLOVE_FILE, then build the matrix aligned with word_index.
raw_embedding = load_embedding(EMBEDDING_DIR + GLOVE_FILE)
word_embedding_matrix = get_weight_matrix(raw_embedding)

# Expected shape: (nb_words + 1, EMBEDDING_DIM).
print(f"embedding shape: {word_embedding_matrix.shape}")

embedding shape: (157449, 300)


## Process test data

In [None]:
# Load the unlabelled test set and pull out its comment column.
test = pd.read_csv(BASE_DIR + TEST_DATA)
test_comments = test['comment_text']

# Sanity check: how many test rows have no comment text?
test_missing_values = test_comments.isna().sum()

print(f"# of test comments: {len(test_comments)}")
print(f"# missing values: {test_missing_values}")

# Apply the same cleaning as for the training comments.
test_clean = process_comments(test_comments)
# Encode with the tokenizer fitted on the training data, then pad/truncate
# to the same fixed length.
test_word_sequences = tokenizer.texts_to_sequences(test_clean)
test_data = pad_sequences(test_word_sequences, maxlen = MAX_SEQUENCE_LENGTH, padding = 'post')

# of test comments: 153164
# missing values: 0


## Save files

In [None]:
# Persist every artifact the model-building section reloads from BASE_DIR.
arrays_to_save = {
  PROCESSED_TRAIN_DATA: comments_data,
  WORD_EMBEDDING_MATRIX: word_embedding_matrix,
  TARGET_DATA: y,
  PROCESSED_TEST_DATA: test_data,
}
for filename, array in arrays_to_save.items():
  # Open each file in a context manager so the handle is flushed and closed
  # as soon as the array is written (a bare open() would leak it).
  with open(BASE_DIR + filename, 'wb') as f:
    np.save(f, array)

joblib.dump(tokenizer, BASE_DIR + TOKENIZER_FILE)
with open(BASE_DIR + NB_WORDS_FILE, 'w') as f:
  json.dump({'nb_words':nb_words}, f)

# Build Model

## Import packages

In [None]:
from keras.models import Model, load_model
from keras.layers import Input, Dense, Dropout, BatchNormalization, LSTM, Bidirectional
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import time
import keras.backend as K

## Initialize variables

In [None]:
# File name (inside BASE_DIR) where ModelCheckpoint stores the best model.
MODEL_PATH = 'lstm_toxic_comment_model.h5'
# Re-declared with the same values as in the data-prep section; they must
# match the shapes of the saved sequences and embedding matrix.
MAX_SEQUENCE_LENGTH = 150
EMBEDDING_DIM = 300
# Fraction of the training data passed to fit() as validation_split.
VALIDATION_SPLIT = 0.2
# Upper bound on training epochs (early stopping may end training sooner).
EPOCHS = 15
# Rate used by the Dropout layer in the model.
DROPOUT = 0.2
BATCH_SIZE = 32

## Load data

In [None]:
def _load_npy(filename):
  """Load one .npy array from BASE_DIR, closing the file handle afterwards."""
  with open(BASE_DIR + filename, 'rb') as f:
    return np.load(f)

# NOTE: `comments` and `test` are rebound here; from this point on they hold
# the padded integer sequences, not the pandas objects from the data-prep cells.
comments = _load_npy(PROCESSED_TRAIN_DATA)
target = _load_npy(TARGET_DATA)
test = _load_npy(PROCESSED_TEST_DATA)
word_embedding_matrix = _load_npy(WORD_EMBEDDING_MATRIX)
with open(BASE_DIR + NB_WORDS_FILE, 'r') as f:
  nb_words = json.load(f)['nb_words']

## Build model

In [None]:
# Start from a clean Keras graph/session so re-running this cell does not
# accumulate layers from earlier runs.
K.clear_session()

# Named `inputs` rather than `input` to avoid shadowing the Python builtin.
inputs = Input(shape = (MAX_SEQUENCE_LENGTH,))

# Frozen embedding layer initialised from the pre-computed matrix
# (nb_words + 1 rows of EMBEDDING_DIM values).
x = Embedding(nb_words+1,
              EMBEDDING_DIM,
              input_length = MAX_SEQUENCE_LENGTH,
              weights = [word_embedding_matrix],
              trainable = False)(inputs)
x = Bidirectional(LSTM(128, return_sequences = False))(x)
x = Dense(16,activation = 'relu')(x)
x = Dropout(DROPOUT)(x)
x = BatchNormalization()(x)
# One independent sigmoid per label: this is a multi-label problem.
x = Dense(6, activation = 'sigmoid')(x)

model = Model(inputs = inputs, outputs = x)
# Pass the configured optimizer object to compile(); previously it was created
# but the string 'adam' was used instead, so changing the learning rate here
# had no effect.
opt = Adam(lr = 0.001)
# With a 6-unit output the generic 'accuracy' alias can resolve to categorical
# (argmax) accuracy in tf.keras, which is not meaningful for multi-label
# targets; request element-wise binary accuracy explicitly.
model.compile(loss = 'binary_crossentropy', optimizer = opt, metrics = ['binary_accuracy'])
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 150)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 150, 300)          47234700  
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               439296    
_________________________________________________________________
dense (Dense)                (None, 16)                4112      
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
batch_normalization (BatchNo (None, 16)                64        
_________________________________________________________________
dense_1 (Dense)              (None, 6)                

In [None]:
# Wall-clock timer around callback setup and training.
t0 = time.time()

# Stop once val_loss has gone 3 epochs without improving.
es = EarlyStopping(monitor = 'val_loss', patience = 3, mode = 'auto', verbose=0)
# Keep on disk only the model with the best val_loss seen so far.
cp = ModelCheckpoint(BASE_DIR + MODEL_PATH, monitor = 'val_loss', verbose = 0, save_best_only = True)

fit_options = dict(verbose = 2,
                   batch_size = BATCH_SIZE,
                   epochs = EPOCHS,
                   callbacks = [es, cp],
                   validation_split = VALIDATION_SPLIT)
history = model.fit(comments, target, **fit_options)
t1 = time.time()

elapsed_minutes = (t1-t0)/60
print(f"Total training time elapsed: {np.round(elapsed_minutes,2)} minutes")

Epoch 1/15
3990/3990 - 73s - loss: 0.1081 - accuracy: 0.9140 - val_loss: 0.0565 - val_accuracy: 0.9940
Epoch 2/15
3990/3990 - 70s - loss: 0.0648 - accuracy: 0.9941 - val_loss: 0.0566 - val_accuracy: 0.9941
Epoch 3/15
3990/3990 - 72s - loss: 0.0590 - accuracy: 0.9939 - val_loss: 0.0537 - val_accuracy: 0.9940
Epoch 4/15
3990/3990 - 70s - loss: 0.0557 - accuracy: 0.9921 - val_loss: 0.0540 - val_accuracy: 0.9935
Epoch 5/15
3990/3990 - 70s - loss: 0.0529 - accuracy: 0.9815 - val_loss: 0.0540 - val_accuracy: 0.9923
Epoch 6/15
3990/3990 - 70s - loss: 0.0504 - accuracy: 0.9375 - val_loss: 0.0541 - val_accuracy: 0.9757
Total training time elapsed: 7.11 minutes
