In [1]:

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
from tqdm.auto import tqdm
tqdm.pandas()

Import training data, test data

In [2]:
# Read the labelled training questions and the unlabelled test questions.
train_df = pd.read_csv("../input/quora-insincere-questions-classification/train.csv")
test_df = pd.read_csv("../input/quora-insincere-questions-classification/test.csv")

# Print (rows, columns) of each frame as a quick sanity check.
for shape_label, frame in (("Train shape : ", train_df), ("Test shape : ", test_df)):
    print(shape_label, frame.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


Load and improve embedding

In [3]:
import zipfile
from gensim.models import KeyedVectors
import numpy as np

# Zip archive that load_embedding() opens.
embeddings_path = "../input/quora-insincere-questions-classification/embeddings.zip"

# Member paths inside the archive; pass one of these to load_embedding().
glove = "glove.840B.300d/glove.840B.300d.txt"
wiki = "wiki-news-300d-1M/wiki-news-300d-1M.vec"
google = "GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin"
para = "paragram_300_sl999/paragram_300_sl999.txt"


def load_embedding(embedding_name):
    """Load one pretrained embedding from the zip archive at ``embeddings_path``.

    Parameters
    ----------
    embedding_name : str
        Path of the embedding file inside the archive (``glove``, ``wiki``,
        ``para`` or ``google``).

    Returns
    -------
    gensim ``KeyedVectors`` when ``embedding_name == google`` (binary word2vec
    format); otherwise a ``dict`` mapping each word to a float32 numpy vector.
    """

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    with zipfile.ZipFile(embeddings_path) as embeddings_zip:
        print("Found embeddings as a zip file")

        if embedding_name == google:
            return KeyedVectors.load_word2vec_format(embeddings_zip.open(google), binary=True)

        embedding = {}
        with embeddings_zip.open(embedding_name) as member:
            for raw_line in member:
                try:
                    line = raw_line.decode('utf-8')
                    # Short lines (e.g. a "<count> <dim>" header) carry no vector.
                    if len(line) > 100:
                        word, vector = get_coefs(*line.split(" "))
                        embedding[word] = vector
                except (UnicodeDecodeError, ValueError):
                    # Best effort: skip lines that cannot be decoded or whose
                    # fields are not all numeric. Anything else (KeyboardInterrupt,
                    # MemoryError, ...) is allowed to propagate.
                    continue

        return embedding

In [4]:
# Number of folds handed to KFold.
n_splits = 5
# random_state used for the KFold shuffle.
seed = 31
# Punctuation characters that get split off from words before tokenizing.
# The backslash is written as "\\" so the literal holds no invalid "\]" escape;
# the resulting string value is unchanged.
puncts = '´‘’“”…!#$%&()*+,-./:;<=>?@[\\]^_`{|}~"' + "'"

In [5]:
# Import Tokenizer from tensorflow.keras, matching the notebook's first cell,
# instead of shadowing it with the standalone `keras` package.
from tensorflow.keras.preprocessing.text import Tokenizer

# No characters are filtered out by the tokenizer itself.
to_exclude = ''
# Punctuation that is separated from words by a later preprocessing cell.
to_tokenize = puncts

# lower=False keeps the original casing of every token.
tokenizer = Tokenizer(filters=to_exclude, lower=False)

Split the data into train, validation, test

In [6]:
from sklearn.model_selection import train_test_split, KFold

kfold = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

# Set aside 10% of the labelled data. The split is seeded so it is reproducible,
# and the held-out part gets its own name instead of being bound to `val_df`
# and immediately overwritten. `holdout_df` is available for a final check.
train_df, holdout_df = train_test_split(train_df, test_size=0.1, random_state=seed)

# Use the first KFold fold of the remaining data as the train/validation split.
train_idx, val_idx = next(iter(kfold.split(train_df)))
train_df, val_df = train_df.iloc[train_idx], train_df.iloc[val_idx]

# Missing questions are replaced by the "_na_" placeholder.
train_text = train_df["question_text"].fillna("_na_")
val_text = val_df["question_text"].fillna("_na_")
test_text = test_df["question_text"].fillna("_na_")

train_y = train_df['target'].values
val_y = val_df['target'].values

In [7]:
import re

# One compiled pattern matching any single character listed in `to_tokenize`.
punct_pattern = re.compile(r'(['+to_tokenize+'])')


def _space_before_punct(question):
    """Insert a space in front of every punctuation character of `question`."""
    return punct_pattern.sub(r' \1', question)


train_text = train_text.progress_apply(_space_before_punct).values
val_text = val_text.progress_apply(_space_before_punct).values
test_text = test_text.progress_apply(_space_before_punct).values

  0%|          | 0/940407 [00:00<?, ?it/s]

  0%|          | 0/235102 [00:00<?, ?it/s]

  0%|          | 0/375806 [00:00<?, ?it/s]

In [8]:
# Every question (train, then validation, then test) in one flat list.
text = [*train_text, *val_text, *test_text]

Tokenize the text

In [9]:
%%time
# Fit the tokenizer on the combined train, validation and test questions.
tokenizer.fit_on_texts(text)

CPU times: user 22.1 s, sys: 91.8 ms, total: 22.2 s
Wall time: 22.2 s


Map each word to an integer index

In [10]:
# Word -> integer index mapping held by the fitted tokenizer; iterated later
# as (word, i) pairs to fill the embedding matrix.
word_index = tokenizer.word_index

Encode the text as integer sequences

In [11]:
%%time
# Turn each question into its sequence of integer token ids.
train_X, val_X, test_X = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_text, val_text, test_text)
)

# Longest sequence found in any of the three splits.
maxlen = max(
    len(seq)
    for sequences in (train_X, val_X, test_X)
    for seq in sequences
)

CPU times: user 18.6 s, sys: 198 ms, total: 18.8 s
Wall time: 18.8 s


Pad the sequence to equalize the length

In [12]:
# Import pad_sequences from tensorflow.keras, matching the notebook's first cell,
# instead of shadowing it with the standalone `keras` package.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every sequence to exactly `maxlen` entries.
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

In [13]:
# Mean and standard deviation of the normal distribution used to initialise
# the embedding matrix rows.
google_emb_mean = -0.003527845
google_emb_std = 0.13315111

In [14]:
# Dimensionality of each word vector.
embed_size = 300
# Number of distinct words known to the tokenizer; the embedding matrix gets
# one row per word plus one extra row.
max_features = len(word_index)

Load the Google News (word2vec) embedding

In [15]:
%%time
# Load the GoogleNews vectors; for this name load_embedding returns a gensim
# KeyedVectors object rather than a dict.
google_embedding = load_embedding(google)

Found embeddings as a zip file
CPU times: user 56.1 s, sys: 2.87 s, total: 59 s
Wall time: 1min 18s


Improve the embedding matrix

In [16]:
import nltk
from nltk.stem import LancasterStemmer, PorterStemmer, WordNetLemmatizer

# Fetch the WordNet corpus.
nltk.download('wordnet')

# Short module-level handles used by is_in_emb() when looking for a known
# variant of a word.
s, l, n = PorterStemmer(), LancasterStemmer(), WordNetLemmatizer()

[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


In [17]:
def is_in_emb(word, i, embedding, emb_matrix):
    """Copy the vector of `word` (or of a close variant) into row `i`.

    Variants are tried in this order, stopping at the first one present in
    `embedding`: the word itself, lower-case, upper-case, capitalized,
    Porter stem, Lancaster stem, WordNet lemma.

    Returns True when a vector was written to ``emb_matrix[i]``, else False
    (in which case `emb_matrix` is left untouched).
    """

    def variants():
        # Generated lazily so the stemmers / lemmatizer only run when all
        # cheaper spellings have already missed.
        yield word
        yield word.lower()
        yield word.upper()
        yield word.capitalize()
        yield s.stem(word)
        yield l.stem(word)
        yield n.lemmatize(word)

    for candidate in variants():
        if candidate in embedding:
            emb_matrix[i] = embedding[candidate]
            return True

    return False

In [18]:
# Start from random normal rows so words without a pretrained vector still get
# an embedding. A dedicated generator seeded with `seed` makes those rows
# reproducible between runs and leaves numpy's global random state untouched.
emb_rng = np.random.RandomState(seed)
google_emb_matrix = emb_rng.normal(google_emb_mean, google_emb_std, (max_features+1, embed_size))

# word -> index for every word that could not be matched to a pretrained vector.
google_oov = {}
for word, i in tqdm(word_index.items()):
    assert i >= 0

    # 1) The word itself or a case / stem / lemma variant of it.
    if is_in_emb(word, i, google_embedding, google_emb_matrix):
        continue

    # 2) Same lookup with runs of digits replaced by '#' placeholders
    #    (longest runs first so shorter patterns do not split them).
    tmp = re.sub('[0-9]{5,}', '#####', word)
    tmp = re.sub('[0-9]{4}', '####', tmp)
    tmp = re.sub('[0-9]{3}', '###', tmp)
    tmp = re.sub('[0-9]{2}', '##', tmp)
    if is_in_emb(tmp, i, google_embedding, google_emb_matrix):
        continue

    # 3) Same lookup after additionally stripping every punctuation character.
    for punct in puncts:
        tmp = tmp.replace(punct, '')
    if is_in_emb(tmp, i, google_embedding, google_emb_matrix):
        continue

    # Nothing matched: the row keeps its random initialisation.
    google_oov[word] = i

# The pretrained vectors are no longer needed once the matrix is filled.
del google_embedding
print('percentage of oov of google: {:.2f}%'.format(len(google_oov) / max_features * 100))

  0%|          | 0/329968 [00:00<?, ?it/s]

percentage of oov of google: 22.39%


In [19]:
# Display the matrix dimensions: (max_features + 1, embed_size).
google_emb_matrix.shape

(329969, 300)

Use 1D CNN

In [20]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense, Input, Flatten, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
# Kernel widths of the parallel Conv1D branches.
filter_sizes = [3, 4, 5]
# Number of filters in each Conv1D branch.
num_filters = 512
# Dropout rate applied to the concatenated branch outputs.
drop = 0.5

In [21]:
def _conv_branch(embedded, kernel_size):
    """One convolution branch: Conv1D -> global max pooling -> flatten."""
    features = Conv1D(
        filters=num_filters,
        kernel_size=kernel_size,
        padding="valid",
        activation="relu",
        strides=1,
    )(embedded)
    features = GlobalMaxPooling1D()(features)
    return Flatten()(features)


model_input = Input(shape=(maxlen,))

# Frozen embedding layer initialised from the prepared embedding matrix.
z = Embedding(
    len(word_index)+1,
    google_emb_matrix.shape[1],
    weights=[google_emb_matrix],
    input_length=maxlen,
    trainable=False,
)(model_input)

# One branch per kernel width, all reading the same embedded sequence.
conv_blocks = [_conv_branch(z, sz) for sz in filter_sizes]

if len(conv_blocks) > 1:
    z = Concatenate()(conv_blocks)
else:
    z = conv_blocks[0]
z = Dropout(drop)(z)
z = Dense(128, activation="relu")(z)
model_output = Dense(1, activation='sigmoid')(z)

model = Model(model_input, model_output)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 449)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 449, 300)     98990700    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 447, 512)     461312      embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 446, 512)     614912      embedding[0][0]                  
______________________________________________________________________________________________

In [22]:
# Train for a fixed 5 epochs, evaluating on the validation split after each
# epoch. No early-stopping or checkpoint callbacks are attached.
history = model.fit(
    x=train_X,
    y=train_y,
    validation_data=(val_X, val_y),
    epochs=5,
    batch_size=64,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [23]:
# The metrics are computed on the validation split, so they are labelled as
# validation figures rather than test figures.
# score[0] is the loss, score[1] the 'acc' metric configured in compile().
score = model.evaluate(val_X, val_y)
print('Validation loss:', score[0])
print('Validation accuracy:', score[1])

Test score: 0.10258807986974716
Test accuracy: 0.9595112204551697


In [24]:
# Sigmoid outputs of the model for every test question.
pred = model.predict(test_X, batch_size = 64, verbose=1)



In [25]:
# Label a question 0 when its predicted score is below 0.3, otherwise 1.
res = np.where(pred.ravel() < 0.3, 0, 1).tolist()

In [26]:
# Pair each test question id with its predicted label and write the file.
submission_columns = ['qid', 'prediction']
submission_data = {'qid': test_df['qid'][:len(res)], 'prediction': res}
submission = pd.DataFrame(submission_data, columns=submission_columns)
submission.to_csv('./submission.csv', index=False)