# Semantic-Question-Matching-Keras

You can download the dataset from: http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv

Dataset info: https://data.quora.com/First-Quora-Dataset-Release-Question-Pairs 

Blog post about Quora model: https://engineering.quora.com/Semantic-Question-Matching-with-Deep-Learning

In [46]:
import pandas as pd
from nltk import sent_tokenize, word_tokenize
import numpy as np
import os
from pathlib import Path
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.utils import shuffle
import sqmutils.data_utils as du
import time

%load_ext autoreload
%autoreload 2
%matplotlib inline 
# Fixed random seed, intended to make runs reproducible.
# NOTE(review): `seed` is not referenced by any other visible cell — confirm
# it is actually handed to the splitting/shuffling code.
seed = 7

# Machine-specific absolute paths; adjust them to the local file locations.
# Tab-separated Quora question-pairs file.
pathToDataset = "/home/elkhand/datasets/Quora/data/quora_duplicate_questions.tsv"
# GloVe Twitter 100-dimensional word vectors (plain-text format).
embedding_path = "/home/elkhand/datasets/glove-vectors/glove.twitter.27B.100d.txt"
# The train/dev/test splits are all carved out of this single file.
train_dataset_path = pathToDataset


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data exploration

In [6]:
# Load the tab-separated question-pairs file into a DataFrame.
df = pd.read_csv(pathToDataset, sep='\t', encoding='utf-8')
# Last expression of the cell: shows the first rows for a quick visual check.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


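## Create train/dev/test sets
- train ≈ 98%
- dev (validation) ≈ 1%
- test 1%

The test set is 1% of the full dataset; the dev set is then 1% of the remaining 99%.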

In [40]:
# Step 1: hold out 1% of the full dataset as the test set (99% stays in train).
config = du.get_config(train_dataset_path, test_size=0.01)
trainDataset, testDataset = du.create_train_test_split(config)

print("\n")

# Step 2: split the remaining train portion again with the same config to
# obtain the validation set (train ends up at ~98%, val at ~1%).
trainDataset, valDataset = du.create_train_test_split_from_df(trainDataset, config)

# Report the per-class row counts of every split.
for split_title, split_df in (
    ("trainDataset Label distribution: ", trainDataset),
    ("valDataset Label distribution: ", valDataset),
    ("testDataset Label distribution: ", testDataset),
):
    label_counts = split_df.groupby('is_duplicate').is_duplicate.count()
    print("\n", split_title, label_counts, "\n")

config
 {'train_dataset_path': '/home/elkhand/datasets/Quora/data/quora_duplicate_questions.tsv', 'val_dataset_path': None, 'test_size': 0.01, 'seed': 7, 'is_debug_on': False} 




 trainDataset Label distribution:  is_duplicate
0    249949
1    146292
Name: is_duplicate, dtype: int64 


 valDataset Label distribution:  is_duplicate
0    2525
1    1478
Name: is_duplicate, dtype: int64 


 testDataset Label distribution:  is_duplicate
0    2550
1    1493
Name: is_duplicate, dtype: int64 



## Load embeddings

We will use the 100-dimensional GloVe Twitter word vectors (`glove.twitter.27B.100d.txt`).

In [47]:
print("word vectors path", embedding_path)
# Measure how long the embedding load takes (wall-clock seconds).
start = time.time()
# NOTE(review): presumably a word -> vector mapping; later code calls
# len(w2v), w2v['apple'].shape and w2v.keys() on it — confirm in du.load_embedding.
w2v = du.load_embedding(embedding_path)
end = time.time()
print("Total time passed: ", (end-start))

word vectors path /home/elkhand/datasets/glove-vectors/glove.twitter.27B.100d.txt
embedding size : 1193514
embedding dimension : (100,)
Total time passed:  55.98992872238159


## Train and evaluate model

In [49]:
def _build_model(num_of_classes, config):
    """Assemble and compile the stacked bidirectional-GRU classifier.

    Layer stack, in order:
      1. Masking (mask value 0.) over inputs shaped
         (None, config['embedding_dimension']).
      2. Bidirectional GRU that returns the full sequence.
      3. Bidirectional GRU that returns only its final output.
      4. Dense softmax layer with ``num_of_classes`` units.

    Both GRU layers use config['hidden_layer_dim'] units together with
    config['dropout'] / config['recurrent_dropout'], and their two directions
    are merged with 'concat'.

    Args:
        num_of_classes: number of units of the softmax output layer.
        config: mapping providing 'embedding_dimension', 'hidden_layer_dim',
            'dropout' and 'recurrent_dropout'.

    Returns:
        The model, compiled with categorical cross-entropy, the 'adam'
        optimizer and the accuracy metric.
    """
    layers = [
        Masking(mask_value=0.,
                input_shape=(None, config['embedding_dimension'])),
        Bidirectional(
            GRU(config['hidden_layer_dim'],
                return_sequences=True,
                dropout=config['dropout'],
                recurrent_dropout=config['recurrent_dropout']),
            merge_mode='concat'),
        Bidirectional(
            GRU(config['hidden_layer_dim'],
                dropout=config['dropout'],
                recurrent_dropout=config['recurrent_dropout']),
            merge_mode='concat'),
        Dense(num_of_classes, activation='softmax'),
    ]

    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
       
def train_model(X_train, y_train, X_val, y_val, num_of_classes, config):
    """Train the model, keep the best checkpoint, and return the fit history.

    The model from ``_build_model`` is fitted on the training data and
    evaluated on the validation data after every epoch. The checkpoint with
    the highest 'val_acc' is written to a temporary file and, once training
    ends, renamed to a file name produced by ``dt.generate_model_name``.

    Args:
        X_train, y_train: training inputs and targets (must expose ``.shape``).
        X_val, y_val: validation inputs and targets (must expose ``.shape``).
        num_of_classes: number of output classes.
        config: mapping providing 'batch_size' and 'nb_epochs' plus the keys
            read by ``_build_model``.

    Returns:
        The history object returned by ``model.fit``.

    NOTE(review): ``dt`` is not imported in the visible cells — confirm which
    module provides ``generate_model_name``.
    NOTE(review): the 'val_acc' metric key depends on the Keras version
    (newer releases report 'val_accuracy') — confirm against the installed one.
    """
    print('X_train shape : %s' % (X_train.shape,))
    print('y_train shape : %s' % (y_train.shape,))
    print('X_val shape : %s' % (X_val.shape,))
    print('y_val shape : %s' % (y_val.shape,))
    print('number of classes : %d' % num_of_classes)
    model = _build_model(num_of_classes, config)

    # min_lr has to be below the optimizer's starting learning rate; the
    # previous value (0.001) equals Adam's default, which clamped every
    # reduction back to the starting rate and made this callback a no-op.
    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.2,
                                  patience=5,
                                  min_lr=1e-5)

    # Temporary checkpoint file; renamed below once the best score is known.
    filepath = "model/adidas-may-29.h5"
    # Make sure the target directory exists before the first checkpoint write.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=0,
                                 save_best_only=True, mode='max')

    # Early stopping is intentionally not part of the callback list.
    callback_list = [reduce_lr, checkpoint]
    history = model.fit(x=X_train,
                        y=y_train,
                        batch_size=config['batch_size'],
                        epochs=config['nb_epochs'],
                        verbose=1,
                        validation_data=(X_val, y_val),
                        shuffle=True,
                        callbacks=callback_list)

    # Encode the best validation accuracy in the final model file name.
    best_val_acc = max(history.history['val_acc'])
    filename = "model/" + dt.generate_model_name("intent", best_val_acc) + ".h5"
    os.rename(filepath, filename)
    return history

def train_helper(dfTrain, dfVal, config, shouldShuffleTrainDataset=True):
    """Vectorise the train/validation frames, train a model, report the best score.

    Args:
        dfTrain: training DataFrame; must contain a 'label' column.
        dfVal: validation DataFrame; must contain a 'label' column.
        config: configuration mapping forwarded to the dataset loader and to
            ``train_model``.
        shouldShuffleTrainDataset: when True (default) the training frame is
            shuffled before it is vectorised; when False its row order is kept.

    Returns:
        A two-element list ``[history, best_val_acc]``: the history object
        returned by ``train_model`` and the maximum of its 'val_acc' series.

    NOTE(review): ``dt`` is not imported in the visible cells — confirm which
    module provides ``load_dataset_StratifiedKFold`` and
    ``convert_index_to_one_hot``.
    """
    class_to_index = {}
    index_to_class = {}

    # Honour the caller's choice; previously the flag was ignored and the
    # training frame was shuffled unconditionally.
    if shouldShuffleTrainDataset:
        dfTrain = shuffle(dfTrain)

    X_train, y_train_index, num_of_classes, class_to_index, index_to_class = \
        dt.load_dataset_StratifiedKFold(dfTrain, w2v, config, class_to_index, index_to_class)
    y_train = dt.convert_index_to_one_hot(y_train_index, num_of_classes)

    print("dfTrain.head(10) \n", dfTrain.head(10))
    print("\n","Train label distribution: \n",dfTrain.groupby('label').label.count())

    print("dfVal.head(10)",dfVal.head(10))

    # Reuse the class/index mappings returned for the training data so both
    # splits are encoded with the same mapping objects.
    X_val, y_val_index, _, _, _ = dt.load_dataset_StratifiedKFold(
        dfVal, w2v, config, class_to_index, index_to_class)
    y_val = dt.convert_index_to_one_hot(y_val_index, num_of_classes)
    print("\n","Val label distribution: ",dfVal.groupby('label').label.count())

    # Train model
    history = train_model(X_train, y_train, X_val, y_val, num_of_classes, config)
    best_val_acc = max(history.history['val_acc'])
    return [history, best_val_acc]


def predict(w2v):
    """Load a saved model and print its class scores for one sample sentence.

    The sample sentence is hard-coded. It is split on single spaces, embedded
    with ``get_sequence_embedding`` and fed to the model as a batch of one.

    Args:
        w2v: embedding lookup passed through to ``get_sequence_embedding``.

    NOTE(review): relies on ``keras``, ``get_sequence_embedding``,
    ``max_seq_len`` and ``index_to_class`` existing at module level; none of
    them is defined in the visible cells.
    """
    model = keras.models.load_model('model/intent_model_ricodataset.h5')
    sentence = "www.google.com"

    # Batch containing the single embedded sentence.
    tokens = sentence.split(" ")
    batch = np.array([get_sequence_embedding(tokens, w2v, max_seq_len)])
    print(batch.shape)

    scores = model.predict(batch)
    print(scores)
    print(index_to_class)

    # One line per class: its name and the score as a percentage.
    for class_idx in index_to_class:
        print("%s : %.3f%%" % (index_to_class[class_idx], scores[0][class_idx] * 100))

def text_pre_processing(document):
    """Split *document* into word tokens, print them, and return them.

    Tokenisation is delegated to ``text.text_to_word_sequence`` with an
    explicit filter set (punctuation, tab, newline and the apostrophe).

    Args:
        document: the raw text to tokenise.

    Returns:
        The token sequence produced by ``text.text_to_word_sequence``.
    """
    tokens = text.text_to_word_sequence(
        document,
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
    )
    print(tokens)
    return tokens


def get_config():
    """Build, print and return the model/training hyper-parameters.

    Returns:
        A new dict with the keys 'embedding_dimension', 'max_seq_len',
        'dropout', 'recurrent_dropout', 'hidden_layer_dim', 'batch_size'
        and 'nb_epochs'.
    """
    conf = {
        "embedding_dimension": 100,  # must match the loaded word vectors
        "max_seq_len": 40,
        "dropout": 0.3,
        "recurrent_dropout": 0.3,
        "hidden_layer_dim": 40,
        "batch_size": 40,
        "nb_epochs": 100,
    }
    print("config\n",conf,"\n")
    return conf
    

def main():
    """Print the configuration and a short summary of the loaded embeddings.

    Reads the module-level ``w2v``; when it is None only the configuration
    is printed. Training and prediction are currently disabled (see the
    commented-out calls at the end of the body).
    """
    config = get_config()
    if w2v is None:
        return

    print('embedding size : %d' % len(w2v))
    print('embedding dimension : %s' % (w2v['apple'].shape,))
    vocabulary = list(w2v.keys())
    print("Sample words from word2vec: ", vocabulary[:10], vocabulary[-10:])

    # Disabled for now; re-enable to train and plot:
    #     history, best_val_acc = train_helper(dfTrain, dfVal, config)
    #     pt.plot_model_accuracy(history,"model/", isF1Enabled)
    #     print("best_val_acc",best_val_acc)
    # predict()
    # text_pre_processing("hello&nbsp;hi")


if __name__ == "__main__":
    # Runs when executed as a script; also true inside a notebook cell.
    main()

config
 {'embedding_dimension': 100, 'max_seq_len': 40, 'dropout': 0.3, 'recurrent_dropout': 0.3, 'hidden_layer_dim': 40, 'batch_size': 40, 'nb_epochs': 100} 

embedding size : 1193514
embedding dimension : (100,)
Sample words from word2vec:  ['<user>', '.', ':', 'rt', ',', '<repeat>', '<hashtag>', '<number>', '<url>', '!'] ['ｶﾞﾘｶﾞﾘ', 'ｷｲ', 'ｹﾞｼｯ', 'ﾃﾍﾍﾟﾛｯ', 'ﾃﾞﾓ', 'ﾊﾞｲﾊﾞｰｲ', 'ﾊﾟﾝﾁ', 'ﾔﾒﾀﾏｴ', 'ﾖｲｼｮｯ', 'ﾟﾟﾟｵﾔｽﾐｰ']
