Code from: https://github.com/lazyprogrammer/machine_learning_examples/blob/e3b4d0bcd14c99e6b938d578802bb919018cd298/nlp_class3/cnn_toxic.py


CNN = convolutional neural network

In [1]:
# https://deeplearningcourses.com/c/deep-learning-advanced-nlp
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future

import zipfile
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences # to have rectangular output
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from sklearn.metrics import roc_auc_score # area under curve for binary classificaiton

In [2]:
# Tunable settings shared by the cells below
MAX_SEQUENCE_LENGTH = 100  # passed to pad_sequences as maxlen
MAX_VOCAB_SIZE = 20_000    # passed to the Tokenizer as num_words
EMBEDDING_DIM = 100        # size of each word vector; also selects the GloVe file
VALIDATION_SPLIT = 0.2     # passed to model.fit as validation_split
BATCH_SIZE = 128
EPOCHS = 10

# Data
Kaggle Toxic Comment Classification Challenge
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data


In [3]:
# Root folder (relative to the notebook) that holds the toxic-comment CSVs
# and the GloVe vectors read by the cells below.
DATA_DIR = "../data"

# # 1. Kaggle api installation: 
# # https://github.com/Kaggle/kaggle-api

# # 2. Download the data
# if not os.path.exists(f"{DATA_DIR}/jigsaw-toxic-comment-classification-challenge.zip"):
#     os.system(f'cd {DATA_DIR}; kaggle competitions download -c jigsaw-toxic-comment-classification-challenge')

# # 3. unzip the data
# path_to_zip_file = f"{DATA_DIR}/jigsaw-toxic-comment-classification-challenge.zip"
# directory_to_extract_to = f'{DATA_DIR}/toxic-comment/'
# with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
#     zip_ref.extractall(directory_to_extract_to)

# # 4. list the files in the directory and unzip them
# file_list = os.listdir(f"{DATA_DIR}/toxic-comment/")
# for file in file_list:
#     path_to_zip_file = (f"{DATA_DIR}/toxic-comment/{file}")
#     with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
#         zip_ref.extractall(directory_to_extract_to)

# # 5. remove the zip file
# os.system(f"rm {DATA_DIR}/jigsaw-toxic-comment-classification-challenge.zip")
# os.system(f"rm {DATA_DIR}/toxic-comment/*.zip")


In [4]:
# # get glove embeddings

# # 1. download the embeddings
# if not os.path.exists(f"{DATA_DIR}/glove.6B.zip"):
#     os.system(f'wget -N -P {DATA_DIR} https://nlp.stanford.edu/data/glove.6B.zip')

# # 2. unzip the embeddings
# if not os.path.exists(f"{DATA_DIR}/glove.6B"):
#     path_to_zip_file = f"{DATA_DIR}/glove.6B.zip"
#     directory_to_extract_to = f"{DATA_DIR}/glove.6B/"
#     with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
#         zip_ref.extractall(directory_to_extract_to)

# # 3. remove the zip file
# os.system(f"rm {DATA_DIR}/glove.6B.zip")

# Load pre-trained word vectors

In [5]:
# Load the pre-trained GloVe word vectors into a dict: word -> float32 vector.
print('Loading word vectors...')
word2vec = {}
glove_path = os.path.join(DATA_DIR, 'glove.6B', f'glove.6B.{EMBEDDING_DIM}d.txt')
# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding: the vocabulary contains non-ASCII tokens, and a non-UTF-8 default
# (e.g. on Windows) would raise UnicodeDecodeError or mis-decode words.
with open(glove_path, encoding='utf-8') as f:
  # the file is a space-separated text file in the format:
  # word vec[0] vec[1] vec[2] ...
  for line in f:
    values = line.split()
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

Loading word vectors...
Found 400000 word vectors.


In [6]:
# Read the training CSV and split it into raw comment strings and label matrix
print('Loading in comments...')

train = pd.read_csv(f"{DATA_DIR}/toxic-comment/train.csv")
possible_labels = [
  "toxic",
  "severe_toxic",
  "obscene",
  "threat",
  "insult",
  "identity_hate",
]
# any missing comment text would be replaced by a placeholder string
sentences = train["comment_text"].fillna(value="DUMMY_VALUE").values
# one column per entry of possible_labels
targets = train.loc[:, possible_labels].values

Loading in comments...


In [7]:
# peek at the label matrix built from the `possible_labels` columns
targets

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [8]:
# Turn each comment string into a list of integer word ids.
# The tokenizer is capped via num_words=MAX_VOCAB_SIZE.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(texts=sentences)
sequences = tokenizer.texts_to_sequences(texts=sentences)

In [9]:
# Quick summary of the tokenised comment lengths
print("max sequence length:", max(map(len, sequences)))
print("min sequence length:", min(map(len, sequences)))
s = sorted(map(len, sequences))
print("median sequence length:", s[len(s) // 2])

# empty sequences are skipped because max() of an empty list would raise
print("max word index:", max(max(seq) for seq in sequences if seq))

max sequence length: 1400
min sequence length: 0
median sequence length: 35
max word index: 19999


In [10]:
# the fitted tokenizer's vocabulary: word -> integer index
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % (len(word2idx),))

Found 210337 unique tokens.


In [11]:
# Pad / truncate every sequence to MAX_SEQUENCE_LENGTH so the result is a
# rectangular N x T matrix. 'pre' is the Keras default for both options and
# is only spelled out here for readability.
data = pad_sequences(
  sequences,
  maxlen=MAX_SEQUENCE_LENGTH,
  padding='pre',
  truncating='pre',
)
print('Shape of data tensor:', data.shape)

Shape of data tensor: (159571, 100)


In [12]:
# Build the V x D matrix of pre-trained vectors, one row per word index
print('Filling pre-trained embeddings...')
# +1 because index 0 is reserved for padding
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx.items():
  if i >= MAX_VOCAB_SIZE:
    # index falls outside the vocabulary cap
    continue
  # .get() returns None for unknown words instead of raising KeyError
  embedding_vector = word2vec.get(word)
  if embedding_vector is None:
    # no pre-trained vector available: the row stays all zeros
    continue
  embedding_matrix[i] = embedding_vector

Filling pre-trained embeddings...


N: No. of samples

T: Sequence Length

data (the comments) -- N x T 

targets -- N x 6

embedding -- V x D (V: Vocabulary size, D: embedding dimension)

# Build the model

In [13]:
# Wrap the pre-trained matrix in an Embedding layer.
# trainable=False keeps the embedding weights fixed during training.
embedding_layer = Embedding(
  input_dim=num_words,
  output_dim=EMBEDDING_DIM,
  weights=[embedding_matrix],
  input_length=MAX_SEQUENCE_LENGTH,
  trainable=False,
)

In [14]:
print('Building model...')

# 1D convnet: three conv layers, local max pooling after the first two and a
# global max pool after the third, followed by a dense classifier head
input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(input_)
x = Conv1D(filters=128, kernel_size=3, activation='relu')(x)
x = MaxPooling1D(pool_size=3)(x)
x = Conv1D(filters=128, kernel_size=3, activation='relu')(x)
x = MaxPooling1D(pool_size=3)(x)
x = Conv1D(filters=128, kernel_size=3, activation='relu')(x)
# Global max pooling collapses the time axis: T x M (M: number of features)
# becomes M by keeping the maximum of each feature over all time steps,
# i.e. the point in time that mattered most for that feature.
x = GlobalMaxPooling1D()(x)
x = Dense(units=128, activation='relu')(x)
# one sigmoid unit per label
output = Dense(units=len(possible_labels), activation='sigmoid')(x)

model = Model(inputs=input_, outputs=output)
model.compile(
  optimizer='rmsprop',
  loss='binary_crossentropy',
  metrics=['accuracy'],
)

Building model...


In [15]:
print('Training model...')
# `r` keeps the returned history object, which the plotting cells read
r = model.fit(
  x=data,
  y=targets,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=VALIDATION_SPLIT,
)

Training model...
Epoch 1/10


UnknownError:  Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[node model/conv1d/conv1d (defined at <ipython-input-15-90ebc4d2e692>:7) ]] [Op:__inference_train_function_1300]

Function call stack:
train_function


In [None]:
# Training vs. validation loss, one point per epoch.
# Axis labels and a title are set so the figure can be read on its own.
plt.plot(r.history['loss'], label='loss')
plt.plot(r.history['val_loss'], label='val_loss')
plt.xlabel('epoch')
plt.ylabel('binary cross-entropy loss')
plt.title('Loss per epoch')
plt.legend()
plt.show()

In [None]:
# Training vs. validation accuracy, one point per epoch.
# Axis labels and a title are set so the figure can be read on its own.
plt.plot(r.history['accuracy'], label='acc')
plt.plot(r.history['val_accuracy'], label='val_acc')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.title('Accuracy per epoch')
plt.legend()
plt.show()


In [None]:
# Print the mean ROC AUC over all label columns.
# NOTE: `data` is the full set that was passed to fit(), so this score also
# covers samples the model was trained on; it is not a held-out estimate.
p = model.predict(data)
# take the number of labels from the target matrix instead of hard-coding 6
aucs = [
    roc_auc_score(targets[:, j], p[:, j])
    for j in range(targets.shape[1])
]
print(np.mean(aucs))