## Install Dependencies

In [None]:
# Uncomment to install the third-party packages imported by this notebook.
# !pip install rake-nltk
# !pip install sentence-transformers
# !pip install pandasql

## Import Libraries

In [1]:
import math
import numpy as np
import pandas as pd
from pandasql import sqldf
from rake_nltk import Rake
from sentence_transformers import SentenceTransformer

import nltk
# Fetch the NLTK 'stopwords' and 'punkt' resources.
# Presumably these are what rake_nltk's Rake relies on — TODO confirm.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/faishal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/faishal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Initialize the Sentence Transformers (SBERT) model and the `mysql` helper (pandasql) for data lookup

In [2]:
# Sentence-BERT encoder shared by the helper functions defined later on.
model = SentenceTransformer('bert-base-nli-mean-tokens')


def mysql(q):
    """Run the SQL string ``q`` with pandasql against the notebook's global DataFrames."""
    return sqldf(q, globals())

## Helper Functions

In [3]:
def extract_key(text, w, r):
    """Join the top-ranked key phrases of ``text`` into a single string.

    Args:
        text: Text to extract key phrases from.
        w: Maximum number of ranked phrases to keep.
        r: Extractor exposing ``extract_keywords_from_text`` and
            ``get_ranked_phrases`` (a ``rake_nltk.Rake`` instance in this
            notebook).

    Returns:
        The kept phrases in ranked order, each preceded by one space (so a
        non-empty result starts with a space); ``""`` when nothing is kept.
    """
    r.extract_keywords_from_text(text)
    ranked_phrases = r.get_ranked_phrases()
    limit = min(w, len(ranked_phrases))
    # Only positions below the limit contribute; a non-positive limit keeps nothing.
    return "".join(
        " " + phrase
        for position, phrase in enumerate(ranked_phrases)
        if position < limit
    )

def scoring_bert(e1, e2, steepness=100, threshold=0.95):
    """Score two embeddings by passing their cosine similarity through a sigmoid.

    The score is ``1 / (1 + exp(-steepness * (cosine - threshold)))``, so it is
    0.5 when the cosine similarity equals ``threshold`` and approaches 1 above it.

    Args:
        e1: First embedding vector (1-D array-like).
        e2: Second embedding vector, same length as ``e1``.
        steepness: Slope of the sigmoid. Defaults to the original constant 100.
        threshold: Cosine similarity mapped to a score of 0.5. Defaults to the
            original constant 0.95.

    Returns:
        A float in [0, 1]. ``0.0`` is returned when either vector has zero norm.
    """
    norm_product = np.linalg.norm(e1) * np.linalg.norm(e2)
    if norm_product == 0:
        # Cosine similarity is undefined for a zero vector. Dividing would yield
        # NaN, which sorts last in an ascending argsort and would therefore land
        # at the top of a "take the last k" ranking.
        return 0.0

    cosine = np.dot(e1, e2) / norm_product
    try:
        return 1. / (1 + math.exp(-steepness * (cosine - threshold)))
    except OverflowError:
        # exp() overflowed: the similarity is far below the threshold.
        return 0.0

def filter_bert(res, query, w, num, num_bert):
    """Re-rank candidate passages by SBERT similarity to the query.

    Each passage is reduced to its top key phrases with ``extract_key``,
    embedded with the notebook-level ``model``, and scored against the query
    embedding with ``scoring_bert``.

    Args:
        res: DataFrame of candidates; must contain a ``text`` column.
        query: The query; converted with ``str()`` before encoding.
        w: Maximum number of key phrases kept per passage (forwarded to
            ``extract_key``).
        num: Unused; kept so existing callers keep working.
        num_bert: Number of top-scoring rows to return.

    Returns:
        DataFrame with at most ``num_bert`` rows of ``res``, best score first,
        keeping the original index labels, columns and dtypes.
    """
    # Nothing to rank, or nothing requested: return an empty frame that still
    # has the input columns. Without this guard ``[-0:]`` selects every row.
    if num_bert <= 0 or len(res) == 0:
        return res.iloc[0:0]

    rake = Rake()
    text_corpus = [extract_key(text, w, rake) for text in res['text']]

    text_emb = np.array(model.encode(text_corpus))
    query_emb = np.array(model.encode([str(query)]))[0]

    score_arr = np.array([scoring_bert(query_emb, emb) for emb in text_emb])

    # argsort is ascending: take the last ``num_bert`` positions, then reverse.
    max_ind = score_arr.argsort()[-num_bert:][::-1]

    # Positional selection keeps the column dtypes; rebuilding the frame from a
    # list of row Series can upcast mixed-type rows to object.
    return res.iloc[max_ind]

def add_rank(result):
    """Build the ranking table for one query's re-ranked results.

    Args:
        result: DataFrame with ``queries_id`` and ``text_id`` columns, already
            ordered best-first.

    Returns:
        DataFrame with columns ``queries_id``, ``text_id`` and ``rank`` on the
        same index as ``result``. ``rank`` is the 1-based row position; a row
        repeating an earlier ``text_id`` gets that first occurrence's rank.
    """
    ranked_result = result[['queries_id', 'text_id']].copy()

    # Record where each text_id first appears in a single pass, instead of
    # filtering the whole frame once per row (quadratic in the row count).
    first_position = {}
    for position, text_id in enumerate(result['text_id'], start=1):
        first_position.setdefault(text_id, position)

    ranked_result['rank'] = [first_position[text_id] for text_id in result['text_id']]

    return ranked_result

## Import query data, passage data, and DeepCT initial ranking

For this research we use the MS MARCO dev data, which contains 6,980 queries, each with 1,000 candidate passages.

In [4]:
# Queries: one (id, query) pair per line.
queries = pd.read_csv('data/queries/queries.dev.tsv', header=None, sep='\t',
                      names=["id", "query"])

# The passage collection is read from two files.
#5000000 (presumably the row count of the first file)
texts1 = pd.read_csv('data/collection.tsv.1', header=None, sep='\t', names=["id", "text"])
#3841823 (presumably the row count of the second file)
texts2 = pd.read_csv('data/collection.tsv.2', header=None, sep='\t', names=["id", "text"])
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
texts = pd.concat([texts1, texts2], ignore_index=True)

# Initial ranking: (queries_id, text_id, rank) rows.
initial_record = pd.read_csv('data/deepct.dev.small.top1k.tsv', header=None, sep='\t',
                             names=["queries_id", "text_id", "rank"])

# Distinct query ids, in order of first appearance in the initial ranking.
queries_id = initial_record.queries_id.unique()

In [5]:
# Preview the loaded queries table.
queries

Unnamed: 0,id,query
0,1048578,cost of endless pools/swim spa
1,1048579,what is pcnt
2,1048580,what is pcb waste
3,1048581,what is pbis?
4,1048582,what is paysky
...,...,...
101088,480594,"price of copper by ounce, pound"
101089,524271,trazodone for dogs side effects
101090,1048565,who plays sebastian michaelis
101091,1048570,what is pearls before swine?


In [6]:
# Preview the combined passage collection.
texts

Unnamed: 0,id,text
0,0,The presence of communication amid scientific ...
1,1,The Manhattan Project and its atomic bomb help...
2,2,Essay on The Manhattan Project - The Manhattan...
3,3,The Manhattan Project was the name for a proje...
4,4,versions of each volume as well as complementa...
...,...,...
8841818,8841818,When metal salts emit short wavelengths of vis...
8841819,8841819,Thousands of people across the United States w...
8841820,8841820,"The recipe that creates blue, for example, inc..."
8841821,8841821,"On Independence Days of yore, old-timey crowds..."


In [7]:
# Preview the initial ranking records.
initial_record

Unnamed: 0,queries_id,text_id,rank
0,1048585,7187157,1
1,1048585,7187155,2
2,1048585,7187156,3
3,1048585,7187160,4
4,1048585,7617404,5
...,...,...,...
6973625,1048565,3250398,996
6973626,1048565,7123172,997
6973627,1048565,1833829,998
6973628,1048565,2358006,999


## Experiments

We run the experiment on all 6,980 queries, and also on a 1,000-query sample.

In [9]:
# Subsets of the query ids, for processing only the first 1, 100 or 1000 queries
queries_id_1 = queries_id[:1]
queries_id_100 = queries_id[:100]
queries_id_1000 = queries_id[:1000]

# Arguments handed to filter_bert by the experiment loop:
minw_bert = 100      # passed as `w`: cap on the number of key phrases kept per passage
num_res_bm25 = 180   # passed as `num`, which filter_bert does not use
num_res_bert = 100   # passed as `num_bert`: number of top-scoring passages returned

In [None]:
# One ranked frame per query, concatenated once after the loop. Growing a
# DataFrame inside the loop is quadratic, and DataFrame.append was removed in
# pandas 2.0.
ranked_parts = []

#Use only if process 1000 query for DeepCT MRR@10 evaluation
# deepct_parts = []

#Process all 6980 queries
num_of_queries = queries_id

#Process only 1000 queries
# num_of_queries = queries_id_1000

# Index the passage text by id once, so each query does keyed lookups instead
# of scanning the whole collection for every candidate row. Keeping the first
# row per id matches the previous ``.values[0]`` behaviour.
text_lookup = texts.drop_duplicates(subset='id').set_index('id')['text']

for progress, query_id in enumerate(num_of_queries, start=1):
    #Get records per query
    text_by_query = initial_record.loc[initial_record['queries_id'] == query_id]

    #Use only if process 1000 query for DeepCT MRR@10 evaluation
    # deepct_parts.append(text_by_query)

    #Get top 100 for bert refining; copy so the new column is added to an
    #independent frame rather than to a slice of initial_record
    text_by_query = text_by_query.head(100).copy()

    #Add corresponding text to dataframe (an unknown text_id raises KeyError)
    text_by_query['text'] = text_lookup.loc[text_by_query['text_id'].to_numpy()].to_numpy()

    #Get query string
    get_query_sql = "SELECT query FROM queries WHERE id == "+str(query_id)+";"
    query = mysql(get_query_sql)['query'][0]

    #Refine initial results with bert
    res_bert_bm = filter_bert(text_by_query, query, minw_bert, num_res_bm25, num_res_bert)
    ranked_parts.append(add_rank(res_bert_bm))

    print('progress: '+ str(progress) + '/' + str(len(num_of_queries)))

# pd.concat rejects an empty list, so fall back to an empty frame.
all_result = pd.concat(ranked_parts) if ranked_parts else pd.DataFrame()

#Use only if process 1000 query for DeepCT MRR@10 evaluation
# deepct_res = pd.concat(deepct_parts) if deepct_parts else pd.DataFrame()

## Save result for evaluation

In [None]:
# Write the re-ranked results as a tab-separated file, without the index column
# and without a header row.
all_result.to_csv("data/bert100_all_res.tsv", sep="\t", index=False, header=None)
# Optional export of the DeepCT baseline (requires deepct_res to have been built).
# deepct_res.to_csv("data/deepct_q100_res.tsv", sep="\t", index=False, header=None)