In [None]:
from google.colab import drive
drive.mount('/gdrive')

root = '/gdrive/My Drive/ml_project'

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
from tqdm.auto import tqdm
tqdm.pandas()

Mounted at /gdrive


  from pandas import Panel


Import training data, test data

In [None]:
train_df = pd.read_csv(root + "/input/train.csv")
test_df = pd.read_csv(root + "/input/test.csv")
print("Train shape : ",train_df.shape)
print("Test shape : ",test_df.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


Load and improve embedding

In [None]:
import zipfile
from gensim.models import KeyedVectors
import numpy as np

embeddings_path = root + "/input/embeddings/embeddings.zip"
glove = 'glove.840B.300d/glove.840B.300d.txt'
wiki = 'wiki-news-300d-1M/wiki-news-300d-1M.vec'
google = 'GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
para = 'paragram_300_sl999/paragram_300_sl999.txt'


def load_embedding(embedding_name):

    def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')

    with zipfile.ZipFile(embeddings_path) as embeddings_zip:
        print("Found embeddings as a zip file")

        if embedding_name == google:
            return KeyedVectors.load_word2vec_format(embeddings_zip.open(google), binary=True)

        else:
            embedding = []
            for o in embeddings_zip.open(embedding_name):
                try:
                    if len(o.decode('utf-8')) > 100:
                        embedding.append(get_coefs(*o.decode('utf-8').split(" ")))
                except:
                    pass
        
        return dict(embedding)

In [None]:
n_splits = 5
seed = 31
puncts = '´‘’“”…!#$%&()*+,-./:;<=>?@[\]^_`{|}~"' + "'"

In [None]:
from keras.preprocessing.text import Tokenizer

to_exclude = ''
to_tokenize = puncts

tokenizer = Tokenizer(filters=to_exclude, lower=False)

Split the data into train, validation, test

In [None]:
from sklearn.model_selection import train_test_split, KFold

kfold = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

train_df, val_df = train_test_split(train_df, test_size=0.1)

train_idx, val_idx = list(kfold.split(train_df))[0]
train_df, val_df = train_df.iloc[train_idx], train_df.iloc[val_idx]

train_text = train_df["question_text"].fillna("_na_")
val_text = val_df["question_text"].fillna("_na_")
test_text = test_df["question_text"].fillna("_na_")

train_y = train_df['target'].values
val_y = val_df['target'].values

In [None]:
import re

train_text = train_text.progress_apply(lambda x: re.sub(r'(['+to_tokenize+'])', r' \1', x)).values
val_text = val_text.progress_apply(lambda x: re.sub(r'(['+to_tokenize+'])', r' \1', x)).values
test_text = test_text.progress_apply(lambda x: re.sub(r'(['+to_tokenize+'])', r' \1', x)).values

HBox(children=(FloatProgress(value=0.0, max=940407.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=235102.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=375806.0), HTML(value='')))




In [None]:
text = list(train_text) + list(val_text) + list(test_text)

Tokenize the text

In [None]:
%%time
tokenizer.fit_on_texts(text)

CPU times: user 19.5 s, sys: 128 ms, total: 19.7 s
Wall time: 19.7 s


Encoding the word to integer

In [None]:
word_index = tokenizer.word_index

Encoding the text to interger sequence

In [None]:
%%time
train_X = tokenizer.texts_to_sequences(train_text)
val_X = tokenizer.texts_to_sequences(val_text)
test_X = tokenizer.texts_to_sequences(test_text)

maxlen = max({len(seq) for seq in train_X} | {len(seq) for seq in val_X} | {len(seq) for seq in test_X})

CPU times: user 16.3 s, sys: 244 ms, total: 16.6 s
Wall time: 16.6 s


Pad the sequence to equalize the length

In [None]:
from keras.preprocessing.sequence import pad_sequences

train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

In [None]:
glove_emb_mean, glove_emb_std = -0.005838499, 0.48782197

In [None]:
embed_size = 300 # how big is each word vector
max_features = len(word_index) # how many unique words to use (i.e num rows in embedding vector)

Load glove embedding

In [None]:
%%time
glove_embedding = load_embedding(glove)

Found embeddings as a zip file
CPU times: user 3min 21s, sys: 5.05 s, total: 3min 26s
Wall time: 3min 30s


Improve the embedding matrix

In [None]:
import nltk
nltk.download('wordnet')

from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
s = PorterStemmer()
l = LancasterStemmer()
n = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
def is_in_emb(word, i, embedding, emb_matrix):

    if word in embedding:
        emb_matrix[i] = embedding[word]
        return True
    
    tmp = word.lower()
    if tmp in embedding:
        emb_matrix[i] = embedding[tmp]
        return True
    
    tmp = word.upper()
    if tmp in embedding:
        emb_matrix[i] = embedding[tmp]
        return True

    tmp = word.capitalize()
    if tmp in embedding:
        emb_matrix[i] = embedding[tmp]
        return True

    tmp = s.stem(word)
    if tmp in embedding:
        emb_matrix[i] = embedding[tmp]
        return True
    
    tmp = l.stem(word)
    if tmp in embedding:
        emb_matrix[i] = embedding[tmp]
        return True
    
    tmp = n.lemmatize(word)
    if tmp in embedding:
        emb_matrix[i] = embedding[tmp]
        return True
    
    return False

In [None]:
glove_emb_matrix = np.random.normal(glove_emb_mean, glove_emb_std, (max_features+1, embed_size))
glove_oov = {}

for word, i in tqdm(word_index.items()):
    i -= 1
    assert i >= 0

    if is_in_emb(word, i, glove_embedding, glove_emb_matrix): continue

    tmp = word
    for punct in puncts:
        tmp = tmp.replace(punct, '')
    if is_in_emb(tmp, i, glove_embedding, glove_emb_matrix): continue

    for num in '0123456789':
        tmp = tmp.replace(num, '')
    if is_in_emb(tmp, i, glove_embedding, glove_emb_matrix): continue

    else:
        glove_oov[word] = i

print('percentage of oov of glove: {:.2f}%'.format(len(glove_oov) / max_features * 100))

HBox(children=(FloatProgress(value=0.0, max=330221.0), HTML(value='')))


percentage of oov of glove: 14.94%


In [None]:
glove_emb_matrix.shape

(330222, 300)

Use 1D CNN

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense, Input, Flatten, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
filter_sizes=[2,3,5]
num_filters = 512
drop = 0.5

In [None]:
model_input = Input(shape=(maxlen,))
z = Embedding(len(word_index)+1,
              glove_emb_matrix.shape[1],
              weights=[glove_emb_matrix],
              input_length=maxlen,
              trainable=False)(model_input)

conv_blocks = []

for sz in filter_sizes:
  conv = Conv1D(filters = num_filters,
                kernel_size = sz,
                padding = "valid",
                activation = "relu",
                strides = 1)(z)
  conv = GlobalMaxPooling1D()(conv)
  conv = Flatten()(conv)
  conv_blocks.append(conv)

z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
z = Dropout(drop)(z)
model_output = Dense(1, activation='sigmoid')(z)

model = Model(model_input, model_output)

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 449)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 449, 300)     99066600    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 448, 512)     307712      embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 447, 512)     461312      embedding[0][0]                  
______________________________________________________________________________________________

In [None]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('CNN_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

history = model.fit(train_X, train_y,
                    batch_size=64,
                    epochs=10,
                    validation_data=(val_X, val_y),
                    verbose=2,
                    callbacks=[es, mc])

Epoch 1/10
14694/14694 - 1114s - loss: 0.1617 - acc: 0.9441 - val_loss: 0.1435 - val_acc: 0.9479

Epoch 00001: val_acc improved from -inf to 0.94789, saving model to CNN_model.h5
Epoch 2/10
14694/14694 - 1080s - loss: 0.1467 - acc: 0.9481 - val_loss: 0.1364 - val_acc: 0.9505

Epoch 00002: val_acc improved from 0.94789 to 0.95051, saving model to CNN_model.h5
Epoch 3/10
14694/14694 - 1077s - loss: 0.1385 - acc: 0.9505 - val_loss: 0.1363 - val_acc: 0.9500

Epoch 00003: val_acc did not improve from 0.95051
Epoch 4/10
14694/14694 - 1073s - loss: 0.1314 - acc: 0.9529 - val_loss: 0.1355 - val_acc: 0.9506

Epoch 00004: val_acc improved from 0.95051 to 0.95059, saving model to CNN_model.h5
Epoch 5/10
14694/14694 - 1072s - loss: 0.1244 - acc: 0.9551 - val_loss: 0.1412 - val_acc: 0.9503

Epoch 00005: val_acc did not improve from 0.95059
Epoch 6/10
14694/14694 - 1072s - loss: 0.1191 - acc: 0.9569 - val_loss: 0.1436 - val_acc: 0.9510

Epoch 00006: val_acc improved from 0.95059 to 0.95100, saving m

In [26]:
loaded_model = load_model('CNN_model.h5')