In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras import Model

Using TensorFlow backend.


In [0]:
pd.options.display.max_rows=15
%matplotlib inline

In [3]:
# Colab-only helper used to expose Google Drive inside the runtime.
from google.colab import drive

# Mounting is interactive: it asks for an authorization code before
# the Drive contents appear under /content/drive.
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Absolute locations of the Quora CSVs on the mounted Drive.
# The leading slash matters: without it the paths are resolved relative to the
# current working directory instead of the Drive mount point.
data_dir = '/content/drive/My Drive/Colab Notebooks/quora'
train_file_path = os.path.join(data_dir, 'train.csv')
test_file_path = os.path.join(data_dir, 'test.csv')

In [0]:
# Read the labelled training set and the unlabelled test set from the mounted Drive.
df_train = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/quora/train.csv'
)
df_test = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/quora/test.csv'
)

In [0]:
# Hold out 10% of the labelled rows for validation; the fixed seed keeps the split repeatable.
train_df, val_df = train_test_split(
    df_train,
    test_size=0.1,
    random_state=2018,
)

In [0]:
# Hyper-parameters shared by the tokenizer and the models below.
maxlen = 100          # sequence length handed to pad_sequences / Input
max_features = 50000  # vocabulary cap handed to Tokenizer / Embedding
embed_size = 300      # width of each embedding vector

In [0]:
# Pull the raw question text out of each split, replacing missing entries with
# the placeholder token '_na_'. All three are converted to plain NumPy arrays so
# the splits have the same type (previously only train_x was an array).
train_x = train_df['question_text'].fillna('_na_').values
val_x = val_df['question_text'].fillna('_na_').values
test_x = df_test['question_text'].fillna('_na_').values

In [0]:
# Word-level tokenizer whose vocabulary is capped by `max_features`.
tokenizer = Tokenizer(
    num_words=max_features,
)

In [0]:
# Build the vocabulary from the training questions only.
tokenizer.fit_on_texts([*train_x])

In [0]:
# Replace each question with its list of word indices (train, validation, test in turn).
train_x, val_x, test_x = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_x, val_x, test_x)
)

In [0]:
# Pad/truncate every index sequence to exactly `maxlen` entries.
train_x, val_x, test_x = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (train_x, val_x, test_x)
)

In [0]:
# Label arrays for the training and validation splits.
train_y, val_y = (
    frame['target'].values
    for frame in (train_df, val_df)
)

In [0]:
# Baseline classifier with embeddings learned from scratch:
# embedding -> bidirectional GRU -> max-pool over time -> small dense head -> sigmoid.
inp = Input(shape=(maxlen,))
x = Embedding(input_dim=max_features, output_dim=embed_size)(inp)
x = Bidirectional(CuDNNGRU(units=64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(units=16, activation='relu')(x)
x = Dropout(rate=0.1)(x)
x = Dense(units=1, activation='sigmoid')(x)

model = Model(inputs=inp, outputs=x)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [15]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total para

In [16]:
# Train for two epochs, scoring the held-out split after each one.
model.fit(
    train_x,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_x, val_y),
)

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7ffa683636a0>

In [17]:
# Predicted probabilities on the validation split (model without pretrained embeddings).
pred_noemb_val_y = model.predict(
    [val_x],
    batch_size=1024,
    verbose=1,
)



In [18]:
# Sweep the decision threshold from 0.10 to 0.50 in steps of 0.01 and report
# the validation F1 obtained at each cut-off.
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    val_labels = (pred_noemb_val_y > thresh).astype(int)
    score = metrics.f1_score(val_y, val_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, score))

F1 score at threshold 0.1 is 0.5792811839323467
F1 score at threshold 0.11 is 0.5876970427983197
F1 score at threshold 0.12 is 0.5970581965465785
F1 score at threshold 0.13 is 0.6042582836243305
F1 score at threshold 0.14 is 0.6100248844649839
F1 score at threshold 0.15 is 0.6177350621089853
F1 score at threshold 0.16 is 0.6238523644752019
F1 score at threshold 0.17 is 0.6284532982522082
F1 score at threshold 0.18 is 0.6339200306102927
F1 score at threshold 0.19 is 0.6386285270263706
F1 score at threshold 0.2 is 0.642413487133984
F1 score at threshold 0.21 is 0.6458999999999999
F1 score at threshold 0.22 is 0.6491610483094238
F1 score at threshold 0.23 is 0.6515377111464805
F1 score at threshold 0.24 is 0.6531121626540483
F1 score at threshold 0.25 is 0.6550291813449708
F1 score at threshold 0.26 is 0.655154091392136
F1 score at threshold 0.27 is 0.6571674373588559
F1 score at threshold 0.28 is 0.6590587979567438
F1 score at threshold 0.29 is 0.6596421907584238
F1 score at threshold 0.

In [19]:
# Predicted probabilities on the test set (model without pretrained embeddings).
pred_noemb_test_y = model.predict(
    [test_x],
    batch_size=1024,
    verbose=1,
)



In [0]:
# Drop the references to the first model and its graph tensors.
del model
del inp
del x

In [0]:
import gc

In [22]:
# Force a garbage-collection pass; the cell displays the value collect() returns.
gc.collect()

150

In [23]:
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip

--2019-01-23 06:51:51--  http://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2019-01-23 06:51:51--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip]
Saving to: ‘glove.840B.300d.zip’


2019-01-23 06:53:36 (19.8 MB/s) - ‘glove.840B.300d.zip’ saved [2176768927/2176768927]



In [24]:
!unzip 'glove.840B.300d.zip'

Archive:  glove.840B.300d.zip
  inflating: glove.840B.300d.txt     


In [0]:
# Name of the text file extracted from the GloVe archive.
EMBEDDING_FILE = "glove.840B.300d.txt"

In [0]:
def get_coefs(word, *arr):
    """Pair a word with its embedding vector.

    Args:
        word: The token, returned unchanged.
        *arr: The vector components (numbers or numeric strings).

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector

In [0]:
# Parse the GloVe text file into a {word: float32 vector} dictionary.
# Each line is "<word> <v1> <v2> ...", separated by single spaces.
# The `with` block closes the file as soon as parsing finishes, and the explicit
# encoding keeps decoding independent of the platform default.
with open(EMBEDDING_FILE, encoding='utf8') as embedding_file:
    embeddings_index = dict(get_coefs(*line.split(' ')) for line in embedding_file)

In [0]:
# Stack every pretrained vector into one 2-D array so global statistics can be
# computed. np.stack expects a real sequence (list/tuple); a dict view is not
# one, so it is materialised as a list first.
all_embs = np.stack(list(embeddings_index.values()))

# Mean/std over all components, used to initialise rows for words without a
# pretrained vector.
emb_mean, emb_std = all_embs.mean(), all_embs.std()

# NOTE: this overwrites the earlier `embed_size` with the width of the loaded vectors.
embed_size = all_embs.shape[1]

In [0]:
# Build the initial embedding weights: one row per word index kept by the tokenizer.
word_index = tokenizer.word_index

# Tokenizer indices start at 1 (0 is reserved), so a vocabulary of N words needs
# N + 1 rows; the count is still capped at `max_features`.
nb_words = min(max_features, len(word_index) + 1)

# Start from random vectors drawn with the same mean/std as the pretrained ones,
# so words missing from the pretrained index still get a plausible vector.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

for word, i in word_index.items():
    # Guard on the actual matrix height so a small vocabulary can never index
    # past the last row.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [42]:
# Second classifier: same head as before, but the embedding layer is initialised
# from `embedding_matrix` and the recurrent layer is a bidirectional LSTM.
inp = Input(shape=(maxlen,))
# Take the vocabulary size from the weight matrix itself so the layer shape and
# the supplied weights can never disagree.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(250, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would append a stray "None" line).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 100, 500)          1102000   
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 500)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 16)                8016      
_________________________________________________________________
dropout_3 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 17        
Total para

In [0]:
# Train the GloVe-initialised model for two epochs with per-epoch validation.
model.fit(
    train_x,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_x, val_y),
)

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2
   8704/1175509 [..............................] - ETA: 27:17 - loss: 0.0961 - acc: 0.9608

In [32]:
# Validation probabilities from the GloVe-initialised model.
pred_glove_val_y = model.predict(
    [val_x],
    batch_size=1024,
    verbose=1,
)

# Report validation F1 for every threshold from 0.10 to 0.50 (step 0.01).
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    val_labels = (pred_glove_val_y > thresh).astype(int)
    score = metrics.f1_score(val_y, val_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, score))

F1 score at threshold 0.1 is 0.5884073276737479
F1 score at threshold 0.11 is 0.5989509616185164
F1 score at threshold 0.12 is 0.6087252897068848
F1 score at threshold 0.13 is 0.6166753608068163
F1 score at threshold 0.14 is 0.6234053862508858
F1 score at threshold 0.15 is 0.6299531700288185
F1 score at threshold 0.16 is 0.6350338270250503
F1 score at threshold 0.17 is 0.6404671424599129
F1 score at threshold 0.18 is 0.6448400582460425
F1 score at threshold 0.19 is 0.6487052551408986
F1 score at threshold 0.2 is 0.6520838352204288
F1 score at threshold 0.21 is 0.6551270174069921
F1 score at threshold 0.22 is 0.6586052021124327
F1 score at threshold 0.23 is 0.6614181075290249
F1 score at threshold 0.24 is 0.6627421383647799
F1 score at threshold 0.25 is 0.6646341463414634
F1 score at threshold 0.26 is 0.666393400625096
F1 score at threshold 0.27 is 0.6675974764711966
F1 score at threshold 0.28 is 0.670214986432895
F1 score at threshold 0.29 is 0.6723583662714098
F1 score at threshold 0.

In [33]:
# Test-set probabilities from the GloVe-initialised model.
pred_glove_test_y = model.predict(
    [test_x],
    batch_size=1024,
    verbose=1,
)



In [0]:
# Drop the references to the GloVe lookup structures and the second model.
del word_index
del embeddings_index
del all_embs
del embedding_matrix
del model
del inp
del x

In [35]:
# Run the garbage collector after the deletions; the cell shows collect()'s return value.
gc.collect()

164

In [0]:
!wget http://nlp.stanford.edu/data/