## Test the Universal Sentence Encoder (USE) with a baseline set of questions
Manually curate a list of baseline questions whose wording ranges from minor to larger changes relative to the original questions.

### Import the necessary libraries

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import os
import pandas as pd
import re
import json

Using TensorFlow backend.


### Set the paths and files

In [8]:
# Directory that holds the input CSV files.
use_path = "dataset/"
# File names, relative to use_path: the curated baseline questions and the
# Quora question pairs.
baseline_file, quora_file = "baseline_questions.csv", "quora_questions.csv"

### Download the USE module

In [1]:
# TF-Hub handle of the Universal Sentence Encoder ("large" variant, version 3).
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3" 
# Instantiate the hub module; `embed` is later applied to the string placeholders.
embed = hub.Module(module_url)

### Create an interactive session to use later when running the model

In [4]:
# Session that get_scores() uses to evaluate the similarity graph.
session = tf.InteractiveSession()
# Run the variable and table initialisers in this session before the model is evaluated.
session.run(tf.global_variables_initializer())
session.run(tf.tables_initializer())

### Get the list of baseline questions to test

In [9]:
def get_baseline():
    """Load the manually curated baseline questions.

    Reads the CSV named by the module-level ``baseline_file`` from the
    ``use_path`` directory.

    Returns:
        list: The values of the ``Question`` column, in file order.
    """
    # os.path.join works whether or not ``use_path`` ends with a path
    # separator, which plain string concatenation does not.
    qs_path = os.path.join(use_path, baseline_file)
    base_qs = pd.read_csv(qs_path)
    return base_qs['Question'].tolist()

### Create a DataFrame of Quora question pairs and answer labels

In [10]:
def get_quora_qs():
    """Load the Quora question pairs into a DataFrame.

    Reads the CSV named by the module-level ``quora_file`` from the
    ``use_path`` directory.

    Returns:
        pandas.DataFrame: The file contents exactly as parsed by
        ``pd.read_csv``.
    """
    # os.path.join works whether or not ``use_path`` ends with a path
    # separator, which plain string concatenation does not.
    quora_path = os.path.join(use_path, quora_file)
    return pd.read_csv(quora_path)

### Get the cosine similarity and use placeholders to feed the queries to the USE

In [20]:
# String placeholders for the two sides of a comparison. The shape is left
# unspecified; get_scores() feeds each one a list of sentences.
sts_input1 = tf.placeholder(tf.string, shape=(None))
sts_input2 = tf.placeholder(tf.string, shape=(None))

# For evaluation we use exactly normalized rather than
# approximately normalized embeddings.
sts_encode1 = tf.nn.l2_normalize(embed(sts_input1), axis=1)
sts_encode2 = tf.nn.l2_normalize(embed(sts_input2), axis=1)

# Row-wise dot product of the unit-length embeddings, i.e. their cosine similarity.
cosine_similarities = tf.reduce_sum(tf.multiply(sts_encode1, sts_encode2), axis=1)
# Negative cosines are clipped to 0, so the angular score below never drops under 0.5.
clip_cosine_similarities = tf.clip_by_value(cosine_similarities, 0.0, 1.0)
# Angular similarity: 1 - angle / pi. np.pi (not a truncated literal) keeps the
# scale exact, so orthogonal embeddings score exactly 0.5.
sim_scores = 1.0 - tf.divide(tf.acos(clip_cosine_similarities), np.pi)

def get_scores(session, text_a, text_b):
    """Evaluate both sentence encodings and their similarity in a single run.

    Args:
        session: Object whose ``run(fetches, feed_dict=...)`` evaluates the graph.
        text_a: Value fed to the ``sts_input1`` placeholder.
        text_b: Value fed to the ``sts_input2`` placeholder.

    Returns:
        tuple: The evaluated ``sts_encode1``, ``sts_encode2`` and
        ``sim_scores``, in that order.
    """
    fetches = [sts_encode1, sts_encode2, sim_scores]
    feeds = {sts_input1: text_a, sts_input2: text_b}
    first_emb, second_emb, similarity = session.run(fetches, feed_dict=feeds)
    return first_emb, second_emb, similarity

### Find the best match using cosine similarity
Go through each of the baseline questions and find the best match

In [21]:
# Load the Quora question pairs once; the matching step reads its
# question1, question2 and is_duplicate columns.
quora_df = get_quora_qs()
# Preview the first rows to sanity-check the columns.
quora_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
1,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
2,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1
3,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
4,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1


In [22]:
results = []

queries = get_baseline()
# Extract the candidate columns once as plain lists so the loop can index them
# by position instead of writing a per-query score column into quora_df.
examples = quora_df['question2'].tolist()
originals = quora_df['question1'].tolist()
duplicate_flags = quora_df['is_duplicate'].tolist()
for i, q in enumerate(queries):
    # One query against every candidate: `scores` holds one similarity per
    # entry of `examples`. Note that every call re-embeds all candidates.
    emba, embb, scores = get_scores(session, [q], examples)
    # Position of the highest-scoring candidate; no full sort is needed for a top-1 lookup.
    best_idx = int(np.argmax(scores))
    results.append([q, examples[best_idx], originals[best_idx], duplicate_flags[best_idx],
                    np.round(float(scores[best_idx]), 3)])
    if i % 10 == 0:
        print(f"Processed {i} queries")
# One row per baseline query, in query order.
df = pd.DataFrame(results, columns=['Query', 'Best Match', 'Original', 'Is Duplicate?', 'Similarity'])

Processed 0 queries
Processed 10 queries
Processed 20 queries
Processed 30 queries


In [1]:
# Debug check of the type of `scores`; the name only exists after the
# matching loop has been run in the current kernel.
type(scores)

NameError: name 'scores' is not defined

In [30]:
# Show the results ordered from the highest to the lowest similarity score.
display(df.sort_values('Similarity', ascending=False))

Unnamed: 0,Query,Best Match,Original,Is Duplicate?,Similarity
1,What is meaning of life?,What's are the meaning of life?,What the meaning of this all life?,1,0.944
0,What is purpose of life?,What's the purpose of life? What is life actua...,What is purpose of life?,1,0.929
22,How many months does it take to gain knowledge...,How much time does it take to learn Android ap...,How many months does it take to gain knowledge...,1,0.912
12,How do I use Twitter as a business tool?,How can I use Twitter for business?,How do I use Twitter as a business source?,1,0.905
10,How do I use Twitter as a business source?,How can I use Twitter for business?,How do I use Twitter as a business source?,1,0.903
5,is What purpose of life?,What's the purpose of life? What is life actua...,What is purpose of life?,1,0.897
25,How many months does it take to gain knowledge...,How much time does it take to learn Android ap...,How many months does it take to gain knowledge...,1,0.896
17,How do you make a screenshot on a Mac laptop?,How do I take a screenshot on my MacBook Pro? ...,How do you take a screenshot on a Mac laptop?,1,0.888
24,How much time does it take to create Android a...,How much time does it take to learn Android ap...,How many months does it take to gain knowledge...,1,0.886
23,How many months does it take to develop Androi...,How much time does it take to learn Android ap...,How many months does it take to gain knowledge...,1,0.881


### Write the output to a CSV file

In [29]:
# Save the results (in query order) to the working directory; with pandas'
# default settings the DataFrame index is written as the first column.
df.to_csv('baseline_quora_test.csv')