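This notebook reproduces [this workflow](https://github.com/jyu-theartofml/kaggle_quora/blob/master/02_LSTM_2Dense_layers.ipynb): a shared-weight LSTM model that predicts whether two Quora questions are duplicates.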

# Set Up

In [1]:
import os
import sys
#set custom path
# NOTE(review): this puts one specific environment's site-packages directory
# at the front of sys.path. The path is hard-coded and machine-specific, so
# the notebook will not resolve the same packages on another machine --
# consider making it configurable or selecting the kernel's environment instead.
sys.path.insert(0, os.path.abspath(r'\users\fynn\documents\anaconda\envs\tf_keras_gpu_test\lib\site-packages'))

import tensorflow as tf
# Enable memory growth on every physical GPU that TensorFlow can see, then
# report how many physical / logical GPU devices are available.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        # The error is only printed, so the notebook keeps running without
        # memory growth enabled.
        print(e)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file

In [3]:
# Read the training CSV and keep only the rows without any missing value.
# (No copy of an existing frame is made here; `data` is created fresh.)
data = pd.read_csv('../big_files/train.csv').dropna()

In [4]:
# Pull the label column out of the frame, then drop it from the features.
# Not idempotent: a second run fails because 'is_duplicate' is already gone.
target = data['is_duplicate']
data = data.drop(columns=['is_duplicate'])

# Raw question strings as plain Python lists, one entry per remaining row.
question1 = data['question1'].tolist()
question2 = data['question2'].tolist()

# Both lists come from the same frame, so the two counts should match.
print(len(question1))
print(len(question2))
question1[:5]

404287
404287


['What is the step by step guide to invest in share market in india?',
 'What is the story of Kohinoor (Koh-i-Noor) Diamond?',
 'How can I increase the speed of my internet connection while using a VPN?',
 'Why am I mentally very lonely? How can I solve it?',
 'Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?']

In [5]:
#fit tokenizer
# A single tokenizer is fit on the concatenation of both question lists so
# that question1 and question2 share one vocabulary.
# num_words=200000 is the limit handed to the Tokenizer; the index size
# printed further down (95595) is below it.
tokenizer = Tokenizer(num_words=200000)
tokenizer.fit_on_texts(question1+question2)

In [6]:
#transform
# Convert every question string into a list of integer word ids using the
# tokenizer fitted above.
question1_word_sequences = tokenizer.texts_to_sequences(question1)
question2_word_sequences = tokenizer.texts_to_sequences(question2)
word_index = tokenizer.word_index #word -> integer id mapping learned from question1 + question2 of train.csv only

print("Words in index: %d" % len(word_index))

Words in index: 95595


In [7]:
#pad out sentences
# Bring every id sequence to a fixed length of 25. This value has to agree
# with max_sentence_len used for the model's Input layers.
q1_data = pad_sequences(question1_word_sequences, maxlen=25)
q2_data = pad_sequences(question2_word_sequences, maxlen=25)

#ensure target is int
labels = np.array(target, dtype=int)
#check shapes
# All three arrays must have the same first dimension (one row per pair).
print('Shape of question1 data tensor:', q1_data.shape)
print('Shape of question2 data tensor:', q2_data.shape)
print('Shape of label tensor:', labels.shape)

Shape of question1 data tensor: (404287, 25)
Shape of question2 data tensor: (404287, 25)
Shape of label tensor: (404287,)


In [8]:
# Stack the two padded question arrays along a new axis 1 so that one row of X
# holds a whole pair: X[:, 0] is question1 and X[:, 1] is question2. Keeping
# the pair in one array lets train_test_split shuffle both questions together.
X = np.stack((q1_data, q2_data), axis=1)

# Hold out 20% of the pairs as a test set, stratified on the label; the
# remaining 80% is split again into train / validation later on.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, labels, test_size=0.20, random_state=126, stratify=labels)

## Embeddings

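Download the pre-trained GloVe vectors (`glove.6B.300d.txt`) from the [GloVe project page](https://nlp.stanford.edu/projects/glove/) and place the file in `../big_files/`.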

In [9]:
# Parse the GloVe text file into a dict: word -> float32 vector.
# Each line of the file is "<word> <coef_1> ... <coef_n>", whitespace-separated.
embeddings_index = {}
# The context manager guarantees the file is closed even if parsing a line
# raises (the previous open()/close() pair leaked the handle in that case).
with open('../big_files/glove.6B.300d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [10]:
# One 300-dimensional row per tokenizer word id, plus row 0 which no word in
# word_index maps to here. Rows start as zeros and are overwritten only for
# words that have a pre-trained vector, so unknown words stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

## Model

In [None]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,GlobalAveragePooling1D,Lambda,Bidirectional, BatchNormalization
from keras.models import Model
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam, RMSprop
from keras import backend as K
from keras.layers.embeddings import Embedding

import keras
keras.__version__

In [12]:
#split training/ validation into train and validation
# 25% of the train/validation pool goes to validation. The pool is what is
# left after the earlier 20% test hold-out, so the overall proportions are
# roughly 60% train / 20% validation / 20% test. Stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=126, stratify=y_train_val)
#split out questions
# Index 0 on axis 1 is question1 and index 1 is question2 (the order in which
# the two arrays were stacked into X).
Q1_train = X_train[:,0]
Q2_train = X_train[:,1]
Q1_val = X_val[:,0]
Q2_val = X_val[:,1]

In [13]:
def vec_distance(vects):
    """Return the squared Euclidean distance between two batches of vectors.

    `vects` is a pair (x, y) of backend tensors. The squared element-wise
    difference is summed over axis 1 with keepdims=True, so the reduced axis
    is kept with size 1.
    """
    left, right = vects
    squared_diff = K.square(left - right)
    return K.sum(squared_diff, axis=1, keepdims=True)
# Don't use the square root of the sum: it doesn't give a good range to feed to the dense layer.

def vec_output_shape(shapes):
    """Return the output shape of the distance layer: (batch_size, 1).

    `shapes` must contain exactly two input shapes; the leading (batch)
    dimension is taken from the first one.
    """
    first_shape, _second_shape = shapes
    batch_dim = first_shape[0]
    return (batch_dim, 1)


In [14]:
#set nb words to relevant dimensions
# Take the vocabulary size from the embedding matrix itself instead of
# hard-coding the count printed by an earlier run: the layer's input
# dimension must equal the number of rows of the weights it is given, and
# that number changes whenever the training text changes.
nb_words = embedding_matrix.shape[0]
# Must match the maxlen used when padding the question sequences.
max_sentence_len=25
# Embedding layer initialised with the pre-trained vectors and frozen
# (trainable=False) so the vectors are not updated during training.
embedding_layer = Embedding(nb_words,300,
        weights=[embedding_matrix],
        input_length=max_sentence_len,trainable=False)
#don't train this layer!

In [15]:
#build model
# Two-input network: the SAME embedding layer and the SAME LSTM layer object
# are applied to both questions, so both branches share their weights.
lstm_layer =LSTM(128)

# Branch 1: question1 word ids -> embeddings -> LSTM output
sequence_1_input = Input(shape=(max_sentence_len,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

# Branch 2: question2 word ids through the same two layers
sequence_2_input = Input(shape=(max_sentence_len,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

# Reduce the two LSTM outputs to one squared-distance value per pair, then
# pass it through a small dense head.
# NOTE(review): the Lambda wraps a plain Python function; this affects how the
# model can be serialized and reloaded from JSON -- confirm before relying on
# to_json()/model_from_json().
distance=Lambda(vec_distance, output_shape=vec_output_shape)([x1, y1])
dense1=Dense(16, activation='sigmoid')(distance)
dense1 = Dropout(0.3)(dense1)

bn2 = BatchNormalization()(dense1)
# Single sigmoid unit: the model's output for the binary is_duplicate label.
prediction=Dense(1, activation='sigmoid')(bn2)

model = Model([sequence_1_input, sequence_2_input], prediction)

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 25)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 25)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 25, 300)      28678800    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 128)          219648      embedding[0][0]              

In [16]:
#compile model, set early stop
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# under the metric name 'acc'.
model.compile(loss='binary_crossentropy',
        optimizer='adam',
        metrics=['acc'])

# Stop training once val_loss has not improved for 3 consecutive epochs.
# NOTE(review): restore_best_weights is not set, so presumably the model keeps
# the weights of the last epoch run rather than the best one -- confirm
# whether that is intended before saving the weights.
early_stopping =EarlyStopping(monitor='val_loss', patience=3)

In [17]:
# Train on the (question1, question2) training pairs and validate on the
# validation pairs after every epoch. At most 10 epochs are run; the
# early-stopping callback may end training sooner (the recorded log stops at
# epoch 8). `hist` keeps the object returned by fit().
hist=model.fit([Q1_train, Q2_train], y_train, validation_data=([Q1_val, Q2_val], y_val), verbose=1, 
          epochs=10, batch_size=256, shuffle=True,class_weight=None, callbacks=[early_stopping])
#takes long time to initiate
#using dense() layer and sigmoid activation

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


### Evaluation

In [18]:
from keras.models import model_from_json

In [19]:
#save model and weights
# The architecture and the weights are written to two separate files under
# ../big_files/; both are needed to restore the model later.
# export model to JSON
model_json = model.to_json()
with open("../big_files/lstm_model_distance_128.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("../big_files/lstm_model_weights.h5")
print("Saved model to disk")

Saved model to disk


In [20]:
# load json and create model
# Read the architecture description with a context manager so the file handle
# is always closed (the previous bare open() never closed it).
with open('../big_files/lstm_model_distance_128.json', 'r') as json_file:
    loaded_model_json = json_file.read()
# NOTE(review): the recorded output of this cell is `SystemError: unknown
# opcode`. The architecture contains a Lambda layer around a Python function;
# presumably the JSON was written by a different Python version than the one
# loading it -- confirm, and consider rebuilding the model in code and calling
# only load_weights() instead of model_from_json().
model = model_from_json(loaded_model_json)
# load weights into new model
model.load_weights("../big_files/lstm_model_weights.h5")
print("Loaded model from disk")

SystemError: unknown opcode

In [None]:
# Predict on the held-out test split. `test1_data` / `test2_data` are not
# defined anywhere in this notebook; the only held-out data is X_test from the
# first train_test_split, where X_test[:, 0] is question1 and X_test[:, 1] is
# question2 (matching labels are in y_test).
pred = model.predict([X_test[:, 0], X_test[:, 1]], verbose=1)