In [2]:
%matplotlib inline
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist
from matplotlib import gridspec
import os
import time
from preprocessing import PreProcessing
from model import SiameseNetwork
import pandas as pd
from tensorflow.contrib import learn
import fasttext
#import faiss
import re

In [3]:
model_path = './model_siamese_network/'

In [4]:
question_pairs = pd.read_csv('./data_repository/questions.csv')

In [5]:
question_pairs.columns

Index(['question1', 'is_duplicate', 'question2'], dtype='object')

In [6]:
question_pairs.fillna("",inplace=True)

In [7]:
selected_cols = ['question1','question2','is_duplicate']
question_pairs = question_pairs[selected_cols]

In [8]:
def preprocess(x):
    try:
        tk_x = x.lower()

        # list of characters which needs to be replaced with space
        space_replace_chars = ['?', ':', ',', '"', '[', ']', '~', '*', ';', '!', '?', '(', ')', '{', '}', '@', '$',
                               '#', '.', '-', '/']
        tk_x = tk_x.translate({ord(x): ' ' for x in space_replace_chars})

        non_space_replace_chars = ["'"]
        tk_x = tk_x.translate({ord(x): '' for x in non_space_replace_chars})

        # remove non-ASCII chars
        tk_x = ''.join([c if ord(c) < 128 else '' for c in tk_x])

        # replace all consecutive spaces with one space
        tk_x = re.sub('\s+', ' ', tk_x).strip()

        # find all consecutive numbers present in the word, first converted numbers to * to prevent conflicts while replacing with numbers
        regex = re.compile(r'([\d])')
        tk_x = regex.sub('*', tk_x)
        nos = re.findall(r'([\*]+)', tk_x)
        # replace the numbers with the corresponding count like 123 by 3
        for no in nos:
            tk_x = tk_x.replace(no, "<NUMBER>", 1)

        return tk_x.strip().lower()
    except:
        return ""

In [9]:
question_pairs['question1'] = question_pairs['question1'].apply(preprocess)
question_pairs['question2'] = question_pairs['question2'].apply(preprocess)

In [10]:
question_pairs = question_pairs.apply(lambda x: x.astype(str).str.lower())

In [11]:
question_pairs = question_pairs.drop_duplicates('question2')

In [12]:
question_pairs.head(10)

Unnamed: 0,question1,question2,is_duplicate
0,i have a question about cosentyx we had a powe...,effects of sacubitril valsartan versus valsart...,0
1,dr requested copies of both paradigm hf and pi...,angiotensin neprilysin inhibition versus enala...,1
2,dr trimmer initiated and asked for jacinda bla...,cosentyx use in patients with a medical histor...,0
3,received <number> <number> information on givi...,package insert myfortic,0
4,hcp request for full text of the following two...,medical literature request,1
5,customer has requested copies of the following...,angiotensin neprilysin inhibition in heart fai...,1
6,looking for information on safety of taking co...,cosentyx overdose,1
7,were there any subsets of patients in the para...,entresto subgroup analysis for the primary end...,1
8,me has a <number> yo caucasian female who repo...,cosentyx relevant medical history of neoplasm ...,1
9,dr requested a face to face discussion regardi...,request for a medical science liaison msl,1


In [13]:
# load vocab_processor
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(model_path+'vocab')
item_db = np.asarray(list(vocab_processor.fit_transform(list(question_pairs['question2']))))

Instructions for updating:
Please use tensorflow/transform or tf.data.


In [14]:
embeddings_model = fasttext.load_model(model_path+"ft_skipgram_ws5_dim64.bin")
embeddings_lookup = []
for word in list(vocab_processor.vocabulary_._mapping):
    try:
        embeddings_lookup.append(embeddings_model[str(word)])
    except:
        pass
embeddings_lookup_ = np.asarray(embeddings_lookup)



In [15]:
print('# of items to be indexed: \t',item_db.shape[0])
print('Embeddings dimension: \t\t',item_db.shape[1])

# of items to be indexed: 	 848
Embeddings dimension: 		 16


## Siamese Network

In [16]:
# Model Hyperparameters
embedding_dim = 64

In [17]:
def model_output(feed_data):
    checkpoint_file = tf.train.latest_checkpoint(model_path)
    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session()
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            anchor_input = graph.get_operation_by_name("left_input").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name("output/output_embedding").outputs[0]

            # Collect the predictions here
            all_predictions = []

            batch_predictions = sess.run(predictions, {anchor_input: feed_data})
    return batch_predictions

In [18]:
similar_pairs_list = list(question_pairs['question2'])

## Find k nearest neighbour using cosine similarity

In [19]:
# Compute Vector representation for each training images and normalize those
def generate_db_normed_vectors():
    train_vectors = model_output(item_db)
    normalized_train_vectors = train_vectors/np.linalg.norm(train_vectors,axis=1).reshape(-1,1)
    return normalized_train_vectors

In [20]:
# Find k nearest neighbour using cosine similarity
def find_k_nn(normalized_train_vectors,vec,k):
    dist_arr = np.matmul(normalized_train_vectors, vec.T)
    print(-1*np.sort(-dist_arr.flatten())[:k])
    print(max(dist_arr.flatten()))
    return np.argsort(-dist_arr.flatten())[:k]

In [21]:
normalized_training_vectors = generate_db_normed_vectors()

OSError: File None.meta does not exist.

In [21]:
# Building faiss index
db_index = faiss.IndexFlatIP(embedding_dim)  # 
db_index.add(normalized_training_vectors)  # add vectors to the index
print('DB indexing done...')

DB indexing done...


In [22]:
def get_top_k_item(query, k=1):
    query = [query,'milk']
    stime = time.time()
    query = [query[0].lower(),query[1].lower()]
    input_queries = np.asarray(list(vocab_processor.fit_transform(query)))
    search_vector = model_output([input_queries[0]])
    normalized_search_vec = search_vector/np.linalg.norm(search_vector)
    s_time = time.time()
    # candidate_index_i = find_k_nn(normalized_training_vectors, normalized_search_vec, k)
    _, candidate_index = db_index.search(normalized_search_vec,k)
    candidate_index = candidate_index[0]
    print('Total time to find nn: {:0.2f} ms'.format((time.time()-s_time)*1000))    
    print('------------------------------------------------------')
    print('Query: ',query[0])
    print('------------------------------------------------------')
    return candidate_index

In [23]:
def get_pair_wise_similarity(_left,_right):
    similarity_score = []
    left_queries = np.asarray(list(vocab_processor.fit_transform(_left)))
    right_queries = np.asarray(list(vocab_processor.fit_transform(_right)))
    left_vectors = model_output(left_queries)
    right_vectors = model_output(right_queries)
    
    normalized_left_vectors = left_vectors/np.linalg.norm(left_vectors,axis=1).reshape(-1,1)
    normalized_right_vectors = right_vectors/np.linalg.norm(right_vectors,axis=1).reshape(-1,1)
    for i in range(0,len(normalized_left_vectors)):
        similarity_score.append(np.matmul(normalized_left_vectors[i],normalized_right_vectors[i].T))
    return similarity_score

In [24]:
question1 = "How could Quora attract initial users"
question2 = "When did Quora start and how did it attract users"
similarity_score = get_pair_wise_similarity([question1.lower()],[question2.lower()])
print('Similarity Score: ', similarity_score)

INFO:tensorflow:Restoring parameters from ./model_siamese_network/model.ckpt
INFO:tensorflow:Restoring parameters from ./model_siamese_network/model.ckpt
Similarity Score:  [0.85194063]


# Query Search [Most similar k questions]

In [25]:
query = "Is it healthy to eat egg whites every day"
candidate_index = get_top_k_item(query.lower(), 10)
for index in candidate_index:
    print(similar_pairs_list[index])

INFO:tensorflow:Restoring parameters from ./model_siamese_network/model.ckpt
Total time to find nn: 5.83 ms
------------------------------------------------------
Query:  is it healthy to eat egg whites every day
------------------------------------------------------
is it bad for health to eat eggs every day
is it healthy to eat once a day
is it unhealthy to eat bananas every day
is it healthy to eat bread every day
is it healthy to eat fish every day
what high protein foods are good for breakfast
how do you drink more water every day
what will happen if i drink a gallon of milk every day
is it healthy to eat one chicken every day
is it healthy to eat a whole avocado every day


                           ---------------------- *** --------------------