In [29]:
import sys
import lucene

from java.io import File
from org.apache.lucene.analysis.en import EnglishAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, StringField, FieldType
from org.apache.lucene.search import IndexSearcher, BooleanClause
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexOptions, IndexReader, DirectoryReader
from org.apache.lucene.queryparser.classic import MultiFieldQueryParser, QueryParserBase
from org.apache.lucene.store import SimpleFSDirectory, FSDirectory
from org.apache.lucene.util import Version
from org.apache.lucene.search.similarities import ClassicSimilarity, BM25Similarity, LMDirichletSimilarity, BooleanSimilarity
import numpy as np
import pandas as pd
import string
import time
import re


In [30]:
# Load the ground-truth table (relevance judgements) from CSV into a DataFrame.
pd_grdtrth = pd.read_csv('data/cleaned_grnd_truth.csv')

In [31]:
# Keep only the three judgement columns and convert them to rows of plain Python values.
grd_trth_list = pd_grdtrth.loc[:, ['qid1', 'qid2', 'label']].values.tolist()

In [32]:
# Turn every [qid1, qid2, label] row into a (qid1, qid2, label) tuple.
grd_tup_list = list(map(tuple, grd_trth_list))


In [33]:
#indexer function
def indexing_docs(file_path, similarity, index_pth_str="index/", overwrite=False):
    """Build a Lucene index from the rows of a CSV file.

    Every CSV row becomes one Lucene document:

    * ``qid``, ``qn_title`` and ``qn_link`` are stored but not indexed;
    * ``qns_title_processed``, ``qns_body_processed`` and
      ``ans_body_processed`` are tokenized and indexed (documents and term
      frequencies only) but not stored.

    Parameters
    ----------
    file_path : str
        CSV file read with ``pandas.read_csv``; it must contain the six
        columns listed above.
        NOTE(review): the cell values are handed to ``Field`` unchanged, so
        they are assumed to be strings - confirm against the CSV.
    similarity :
        Lucene similarity object set on the ``IndexWriterConfig``.
    index_pth_str : str
        Directory in which the index is written.
    overwrite : bool
        When True the writer is opened in ``CREATE`` mode and replaces any
        index already present at ``index_pth_str``. The default (False) keeps
        the writer's default open mode, which appends to an existing index,
        so calling this function twice on the same path adds every row twice.

    On success prints "Indexing Successful". If reading the CSV fails, the
    index is never opened; if adding a document fails, the uncommitted changes
    are rolled back, the writer and directory are released, and the exception
    is re-raised.
    """
    lucene.initVM()

    # Read the input first so that a bad path fails before the index is touched.
    df_file = pd.read_csv(file_path)

    # Stored-only fields: returned with search hits, never searched.
    stored_only = FieldType()
    stored_only.setStored(True)
    stored_only.setTokenized(False)
    stored_only.setIndexOptions(IndexOptions.NONE)

    # Searchable text fields: analyzed and indexed, not retrievable.
    indexed_text = FieldType()
    indexed_text.setStored(False)
    indexed_text.setTokenized(True)
    indexed_text.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    stored_columns = ("qid", "qn_title", "qn_link")
    text_columns = ("qns_title_processed", "qns_body_processed", "ans_body_processed")

    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writerConfig.setSimilarity(similarity)
    if overwrite:
        writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

    indexDir = FSDirectory.open(File(index_pth_str).toPath())
    try:
        writer = IndexWriter(indexDir, writerConfig)
        try:
            for _, row in df_file.iterrows():
                doc = Document()
                for column in stored_columns:
                    doc.add(Field(column, row[column], stored_only))
                for column in text_columns:
                    doc.add(Field(column, row[column], indexed_text))
                writer.addDocument(doc)
        except Exception:
            # Drop the partial batch and release the write lock before re-raising.
            writer.rollback()
            raise
        else:
            # close() commits the added documents.
            writer.close()
    finally:
        indexDir.close()

    # Report success only once the documents have actually been committed.
    print("Indexing Successful")

In [34]:
#retriever function
def retriever_fn(search_string, similarity, index_pth_str="index/", max_hits=50):
    """Search the index for ``search_string`` and return the ranked hits.

    The query is parsed with ``StandardAnalyzer`` against the three indexed
    text fields (``qns_title_processed``, ``qns_body_processed``,
    ``ans_body_processed``), each as an optional (SHOULD) clause.

    Parameters
    ----------
    search_string : str
        Query text handed to the Lucene query parser.
    similarity :
        Lucene similarity object set on the ``IndexSearcher``.
    index_pth_str : str
        Directory containing the index.
    max_hits : int
        Maximum number of hits requested from the searcher (default 50).

    Returns
    -------
    (results_list, retrieval_time)
        ``results_list`` is a list of ``(qid, score)`` tuples in the order
        returned by the searcher, with the stored ``qid`` converted to ``int``
        and the score to ``float``. ``retrieval_time`` is the time in seconds
        spent inside ``searcher.search`` only.

    The reader and the directory are closed before returning, also when the
    search raises.
    """
    analyzer = StandardAnalyzer()
    fields = ['qns_title_processed', 'qns_body_processed', 'ans_body_processed']
    flags = [BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD]

    # Parse before opening the index so a malformed query leaves nothing open.
    query_parser = MultiFieldQueryParser(fields, analyzer)
    query = query_parser.parse(search_string, fields, flags, analyzer)

    indexDir = FSDirectory.open(File(index_pth_str).toPath())
    try:
        reader = DirectoryReader.open(indexDir)
        try:
            searcher = IndexSearcher(reader)
            searcher.setSimilarity(similarity)

            # perf_counter is the clock intended for measuring short intervals.
            start_time = time.perf_counter()
            hits = searcher.search(query, max_hits)
            retrieval_time = time.perf_counter() - start_time

            # Stored fields must be read while the reader is still open.
            results_list = [
                (int(searcher.doc(hit.doc).get("qid")), float(hit.score))
                for hit in hits.scoreDocs
            ]
        finally:
            reader.close()
    finally:
        indexDir.close()

    return results_list, retrieval_time



In [35]:
def precision_at_k(doc_ranking, query_id, qrels, k=5):
    """Precision over the judged documents among the first k ranked entries.

    doc_ranking: sequence of (doc_id, score) pairs; only the first k are used.
    query_id:    value compared with the first element of every qrel row.
    qrels:       rows whose first element is a query id, second element a
                 document id and last element a label (1 relevant, 0 not).
    k:           ranking cutoff.

    Returns (TP, FP, precision). Retrieved documents without a judgement for
    this query count as neither TP nor FP; precision is 0 when TP + FP is 0.
    """
    top_k_ids = {entry[0] for entry in doc_ranking[:k]}

    # Split this query's judged documents by label in a single pass.
    relevant_ids = set()
    non_relevant_ids = set()
    for judgement in qrels:
        if judgement[0] != query_id:
            continue
        label = judgement[-1]
        if label == 1:
            relevant_ids.add(judgement[1])
        elif label == 0:
            non_relevant_ids.add(judgement[1])

    TP = len(top_k_ids & relevant_ids)       # retrieved and judged relevant
    FP = len(top_k_ids & non_relevant_ids)   # retrieved and judged non-relevant

    judged_retrieved = TP + FP
    precision = TP / judged_retrieved if judged_retrieved > 0 else 0
    return TP, FP, precision

In [36]:
def f1_score_at_k(doc_ranking, query_id, qrels, k=5):
    """F1 and recall of the first k ranked entries against the judgements.

    doc_ranking: sequence of (doc_id, score) pairs; only the first k are used.
    query_id:    value compared with the first element of every qrel row.
    qrels:       rows whose first element is a query id, second element a
                 document id and last element a label (1 relevant, 0 not).
    k:           ranking cutoff.

    Returns (f1, recall). Precision is computed over judged retrieved
    documents only; each of precision, recall and f1 falls back to 0 when its
    denominator is 0.
    """
    top_k_ids = set(entry[0] for entry in doc_ranking[:k])

    # (doc_id, label) pairs judged for this query.
    judged = [(row[1], row[-1]) for row in qrels if row[0] == query_id]
    relevant_ids = {doc_id for doc_id, label in judged if label == 1}
    non_relevant_ids = {doc_id for doc_id, label in judged if label == 0}

    TP = len(top_k_ids & relevant_ids)       # retrieved and judged relevant
    FP = len(top_k_ids & non_relevant_ids)   # retrieved and judged non-relevant
    FN = len(relevant_ids - top_k_ids)       # judged relevant but not retrieved

    precision = TP / (TP + FP) if TP + FP > 0 else 0
    recall = TP / (TP + FN) if TP + FN > 0 else 0

    if precision + recall > 0:
        return 2 * precision * recall / (precision + recall), recall
    return 0, recall

In [37]:
# Load the test set from CSV into a DataFrame.
pd_test = pd.read_csv('data/test_data.csv')

In [38]:
# Keep only the id and query text columns, as [qid, query] rows of plain Python values.
test_list = pd_test.loc[:, ['qid', 'query']].values.tolist()

In [39]:
#Test - 1: build the index in index7/ using ClassicSimilarity

lucene.initVM()
index_pth = "index7/"
sim_measure = ClassicSimilarity()
indexing_docs(
    file_path="data/cleaned_records.csv",
    similarity=sim_measure,
    index_pth_str=index_pth,
)

Indexing Successful


In [40]:
# Evaluate the current sim_measure / index_pth on every test query at cutoff k.
k = 5
# Translation table that deletes every character in string.punctuation.
table = str.maketrans(dict.fromkeys(string.punctuation))
pr_list, rec_list, f1_list, rt_time = [], [], [], []

for query_id, query_text in test_list:
    cleaned_query = query_text.translate(table)
    retreived_docs, retrieval_time = retriever_fn(cleaned_query, sim_measure, index_pth)
    rt_time.append(retrieval_time)

    # Score the same ranking with both metric helpers.
    tp, fp, precision = precision_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    pr_list.append(precision)
    f1_score, recall = f1_score_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    f1_list.append(f1_score)
    rec_list.append(recall)

# Average each per-query value over all test queries.
print('Average Precision at {} = {}'.format(k, sum(pr_list) / len(pr_list)))
print('Average Recall at {} = {}'.format(k, sum(rec_list) / len(rec_list)))
print('Average F-1 Score at {} = {}'.format(k, sum(f1_list) / len(f1_list)))
print('Average Retrieval Time = ', sum(rt_time) / len(rt_time))
    

Average Precision at 5 = 0.5307017543859649
Average Recall at 5 = 0.6052631578947368
Average F-1 Score at 5 = 0.5451127819548872
Average Retrieval Time =  0.0011686776813707854


In [41]:
# Evaluate the current sim_measure / index_pth on every test query at cutoff k.
k = 10
# Translation table that deletes every character in string.punctuation.
table = str.maketrans(dict.fromkeys(string.punctuation))
pr_list, rec_list, f1_list, rt_time = [], [], [], []

for query_id, query_text in test_list:
    cleaned_query = query_text.translate(table)
    retreived_docs, retrieval_time = retriever_fn(cleaned_query, sim_measure, index_pth)
    rt_time.append(retrieval_time)

    # Score the same ranking with both metric helpers.
    tp, fp, precision = precision_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    pr_list.append(precision)
    f1_score, recall = f1_score_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    f1_list.append(f1_score)
    rec_list.append(recall)

# Average each per-query value over all test queries.
print('Average Precision at {} = {}'.format(k, sum(pr_list) / len(pr_list)))
print('Average Recall at {} = {}'.format(k, sum(rec_list) / len(rec_list)))
print('Average F-1 Score at {} = {}'.format(k, sum(f1_list) / len(f1_list)))
print('Average Retrieval Time = ', sum(rt_time) / len(rt_time))

Average Precision at 10 = 0.5175438596491228
Average Recall at 10 = 0.6052631578947368
Average F-1 Score at 10 = 0.5350877192982457
Average Retrieval Time =  0.0010557802099930612


In [42]:
#Test - 2: build the index in index8/ using BooleanSimilarity

lucene.initVM()
index_pth = "index8/"
sim_measure = BooleanSimilarity()
indexing_docs(
    file_path="data/cleaned_records.csv",
    similarity=sim_measure,
    index_pth_str=index_pth,
)

Indexing Successful


In [43]:
# Evaluate the current sim_measure / index_pth on every test query at cutoff k.
k = 5
# Translation table that deletes every character in string.punctuation.
table = str.maketrans(dict.fromkeys(string.punctuation))
pr_list, rec_list, f1_list, rt_time = [], [], [], []

for query_id, query_text in test_list:
    cleaned_query = query_text.translate(table)
    retreived_docs, retrieval_time = retriever_fn(cleaned_query, sim_measure, index_pth)
    rt_time.append(retrieval_time)

    # Score the same ranking with both metric helpers.
    tp, fp, precision = precision_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    pr_list.append(precision)
    f1_score, recall = f1_score_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    f1_list.append(f1_score)
    rec_list.append(recall)

# Average each per-query value over all test queries.
print('Average Precision at {} = {}'.format(k, sum(pr_list) / len(pr_list)))
print('Average Recall at {} = {}'.format(k, sum(rec_list) / len(rec_list)))
print('Average F-1 Score at {} = {}'.format(k, sum(f1_list) / len(f1_list)))
print('Average Retrieval Time = ', sum(rt_time) / len(rt_time))

Average Precision at 5 = 0.5263157894736842
Average Recall at 5 = 0.48245614035087714
Average F-1 Score at 5 = 0.4982456140350877
Average Retrieval Time =  0.0004878671545731394


In [44]:
# Evaluate the current sim_measure / index_pth on every test query at cutoff k.
k = 10
# Translation table that deletes every character in string.punctuation.
table = str.maketrans(dict.fromkeys(string.punctuation))
pr_list, rec_list, f1_list, rt_time = [], [], [], []

for query_id, query_text in test_list:
    cleaned_query = query_text.translate(table)
    retreived_docs, retrieval_time = retriever_fn(cleaned_query, sim_measure, index_pth)
    rt_time.append(retrieval_time)

    # Score the same ranking with both metric helpers.
    tp, fp, precision = precision_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    pr_list.append(precision)
    f1_score, recall = f1_score_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    f1_list.append(f1_score)
    rec_list.append(recall)

# Average each per-query value over all test queries.
print('Average Precision at {} = {}'.format(k, sum(pr_list) / len(pr_list)))
print('Average Recall at {} = {}'.format(k, sum(rec_list) / len(rec_list)))
print('Average F-1 Score at {} = {}'.format(k, sum(f1_list) / len(f1_list)))
print('Average Retrieval Time = ', sum(rt_time) / len(rt_time))

Average Precision at 10 = 0.5131578947368421
Average Recall at 10 = 0.5
Average F-1 Score at 10 = 0.5012531328320802
Average Retrieval Time =  0.0005195642772473787


In [45]:
#Test - 3: build the index in index9/ using LMDirichletSimilarity

lucene.initVM()
index_pth = "index9/"
sim_measure = LMDirichletSimilarity()
indexing_docs(
    file_path="data/cleaned_records.csv",
    similarity=sim_measure,
    index_pth_str=index_pth,
)

Indexing Successful


In [46]:
# Evaluate the current sim_measure / index_pth on every test query at cutoff k.
k = 5
# Translation table that deletes every character in string.punctuation.
table = str.maketrans(dict.fromkeys(string.punctuation))
pr_list, rec_list, f1_list, rt_time = [], [], [], []

for query_id, query_text in test_list:
    cleaned_query = query_text.translate(table)
    retreived_docs, retrieval_time = retriever_fn(cleaned_query, sim_measure, index_pth)
    rt_time.append(retrieval_time)

    # Score the same ranking with both metric helpers.
    tp, fp, precision = precision_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    pr_list.append(precision)
    f1_score, recall = f1_score_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    f1_list.append(f1_score)
    rec_list.append(recall)

# Average each per-query value over all test queries.
print('Average Precision at {} = {}'.format(k, sum(pr_list) / len(pr_list)))
print('Average Recall at {} = {}'.format(k, sum(rec_list) / len(rec_list)))
print('Average F-1 Score at {} = {}'.format(k, sum(f1_list) / len(f1_list)))
print('Average Retrieval Time = ', sum(rt_time) / len(rt_time))

Average Precision at 5 = 0.3815789473684211
Average Recall at 5 = 0.42105263157894735
Average F-1 Score at 5 = 0.39598997493734334
Average Retrieval Time =  0.0010156882436651933


In [47]:
# Evaluate the current sim_measure / index_pth on every test query at cutoff k.
k = 10
# Translation table that deletes every character in string.punctuation.
table = str.maketrans(dict.fromkeys(string.punctuation))
pr_list, rec_list, f1_list, rt_time = [], [], [], []

for query_id, query_text in test_list:
    cleaned_query = query_text.translate(table)
    retreived_docs, retrieval_time = retriever_fn(cleaned_query, sim_measure, index_pth)
    rt_time.append(retrieval_time)

    # Score the same ranking with both metric helpers.
    tp, fp, precision = precision_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    pr_list.append(precision)
    f1_score, recall = f1_score_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    f1_list.append(f1_score)
    rec_list.append(recall)

# Average each per-query value over all test queries.
print('Average Precision at {} = {}'.format(k, sum(pr_list) / len(pr_list)))
print('Average Recall at {} = {}'.format(k, sum(rec_list) / len(rec_list)))
print('Average F-1 Score at {} = {}'.format(k, sum(f1_list) / len(f1_list)))
print('Average Retrieval Time = ', sum(rt_time) / len(rt_time))

Average Precision at 10 = 0.42105263157894735
Average Recall at 10 = 0.4473684210526316
Average F-1 Score at 10 = 0.42105263157894735
Average Retrieval Time =  0.0018329620361328125


In [48]:
#Test - 4: build the index in index10/ using BM25Similarity with its default arguments

lucene.initVM()
index_pth = "index10/"
sim_measure = BM25Similarity()
indexing_docs(
    file_path="data/cleaned_records.csv",
    similarity=sim_measure,
    index_pth_str=index_pth,
)

Indexing Successful


In [49]:
# Evaluate the current sim_measure / index_pth on every test query at cutoff k.
k = 5
# Translation table that deletes every character in string.punctuation.
table = str.maketrans(dict.fromkeys(string.punctuation))
pr_list, rec_list, f1_list, rt_time = [], [], [], []

for query_id, query_text in test_list:
    cleaned_query = query_text.translate(table)
    retreived_docs, retrieval_time = retriever_fn(cleaned_query, sim_measure, index_pth)
    rt_time.append(retrieval_time)

    # Score the same ranking with both metric helpers.
    tp, fp, precision = precision_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    pr_list.append(precision)
    f1_score, recall = f1_score_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    f1_list.append(f1_score)
    rec_list.append(recall)

# Average each per-query value over all test queries.
print('Average Precision at {} = {}'.format(k, sum(pr_list) / len(pr_list)))
print('Average Recall at {} = {}'.format(k, sum(rec_list) / len(rec_list)))
print('Average F-1 Score at {} = {}'.format(k, sum(f1_list) / len(f1_list)))
print('Average Retrieval Time = ', sum(rt_time) / len(rt_time))

Average Precision at 5 = 0.4868421052631579
Average Recall at 5 = 0.5526315789473685
Average F-1 Score at 5 = 0.5012531328320802
Average Retrieval Time =  0.0018689381448846114


In [50]:
# Evaluate the current sim_measure / index_pth on every test query at cutoff k.
k = 10
# Translation table that deletes every character in string.punctuation.
table = str.maketrans(dict.fromkeys(string.punctuation))
pr_list, rec_list, f1_list, rt_time = [], [], [], []

for query_id, query_text in test_list:
    cleaned_query = query_text.translate(table)
    retreived_docs, retrieval_time = retriever_fn(cleaned_query, sim_measure, index_pth)
    rt_time.append(retrieval_time)

    # Score the same ranking with both metric helpers.
    tp, fp, precision = precision_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    pr_list.append(precision)
    f1_score, recall = f1_score_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    f1_list.append(f1_score)
    rec_list.append(recall)

# Average each per-query value over all test queries.
print('Average Precision at {} = {}'.format(k, sum(pr_list) / len(pr_list)))
print('Average Recall at {} = {}'.format(k, sum(rec_list) / len(rec_list)))
print('Average F-1 Score at {} = {}'.format(k, sum(f1_list) / len(f1_list)))
print('Average Retrieval Time = ', sum(rt_time) / len(rt_time))

Average Precision at 10 = 0.5175438596491228
Average Recall at 10 = 0.6052631578947368
Average F-1 Score at 10 = 0.5350877192982457
Average Retrieval Time =  0.0007659008628443667


In [51]:
#Test - 5: build the index in index11/ using BM25Similarity(1.0, 0.5)

lucene.initVM()
index_pth = "index11/"
sim_measure = BM25Similarity(1.0, 0.5)
indexing_docs(
    file_path="data/cleaned_records.csv",
    similarity=sim_measure,
    index_pth_str=index_pth,
)

Indexing Successful


In [52]:
# Evaluate the current sim_measure / index_pth on every test query at cutoff k.
k = 5
# Translation table that deletes every character in string.punctuation.
table = str.maketrans(dict.fromkeys(string.punctuation))
pr_list, rec_list, f1_list, rt_time = [], [], [], []

for query_id, query_text in test_list:
    cleaned_query = query_text.translate(table)
    retreived_docs, retrieval_time = retriever_fn(cleaned_query, sim_measure, index_pth)
    rt_time.append(retrieval_time)

    # Score the same ranking with both metric helpers.
    tp, fp, precision = precision_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    pr_list.append(precision)
    f1_score, recall = f1_score_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    f1_list.append(f1_score)
    rec_list.append(recall)

# Average each per-query value over all test queries.
print('Average Precision at {} = {}'.format(k, sum(pr_list) / len(pr_list)))
print('Average Recall at {} = {}'.format(k, sum(rec_list) / len(rec_list)))
print('Average F-1 Score at {} = {}'.format(k, sum(f1_list) / len(f1_list)))
print('Average Retrieval Time = ', sum(rt_time) / len(rt_time))

Average Precision at 5 = 0.5131578947368421
Average Recall at 5 = 0.5526315789473685
Average F-1 Score at 5 = 0.518796992481203
Average Retrieval Time =  0.00046836702447188526


In [53]:
# Evaluate the current sim_measure / index_pth on every test query at cutoff k.
k = 10
# Translation table that deletes every character in string.punctuation.
table = str.maketrans(dict.fromkeys(string.punctuation))
pr_list, rec_list, f1_list, rt_time = [], [], [], []

for query_id, query_text in test_list:
    cleaned_query = query_text.translate(table)
    retreived_docs, retrieval_time = retriever_fn(cleaned_query, sim_measure, index_pth)
    rt_time.append(retrieval_time)

    # Score the same ranking with both metric helpers.
    tp, fp, precision = precision_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    pr_list.append(precision)
    f1_score, recall = f1_score_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    f1_list.append(f1_score)
    rec_list.append(recall)

# Average each per-query value over all test queries.
print('Average Precision at {} = {}'.format(k, sum(pr_list) / len(pr_list)))
print('Average Recall at {} = {}'.format(k, sum(rec_list) / len(rec_list)))
print('Average F-1 Score at {} = {}'.format(k, sum(f1_list) / len(f1_list)))
print('Average Retrieval Time = ', sum(rt_time) / len(rt_time))

Average Precision at 10 = 0.5175438596491228
Average Recall at 10 = 0.6052631578947368
Average F-1 Score at 10 = 0.5350877192982457
Average Retrieval Time =  0.0009924738030684622


In [54]:
#Test - 6: build the index in index12/ using BM25Similarity(6.0, 0.5)

lucene.initVM()
index_pth = "index12/"
sim_measure = BM25Similarity(6.0, 0.5)
indexing_docs(
    file_path="data/cleaned_records.csv",
    similarity=sim_measure,
    index_pth_str=index_pth,
)

Indexing Successful


In [55]:
# Evaluate the current sim_measure / index_pth on every test query at cutoff k.
k = 5
# Translation table that deletes every character in string.punctuation.
table = str.maketrans(dict.fromkeys(string.punctuation))
pr_list, rec_list, f1_list, rt_time = [], [], [], []

for query_id, query_text in test_list:
    cleaned_query = query_text.translate(table)
    retreived_docs, retrieval_time = retriever_fn(cleaned_query, sim_measure, index_pth)
    rt_time.append(retrieval_time)

    # Score the same ranking with both metric helpers.
    tp, fp, precision = precision_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    pr_list.append(precision)
    f1_score, recall = f1_score_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    f1_list.append(f1_score)
    rec_list.append(recall)

# Average each per-query value over all test queries.
print('Average Precision at {} = {}'.format(k, sum(pr_list) / len(pr_list)))
print('Average Recall at {} = {}'.format(k, sum(rec_list) / len(rec_list)))
print('Average F-1 Score at {} = {}'.format(k, sum(f1_list) / len(f1_list)))
print('Average Retrieval Time = ', sum(rt_time) / len(rt_time))

Average Precision at 5 = 0.4605263157894737
Average Recall at 5 = 0.5
Average F-1 Score at 5 = 0.4661654135338346
Average Retrieval Time =  0.0008442276402523643


In [56]:
# Evaluate the current sim_measure / index_pth on every test query at cutoff k.
k = 10
# Translation table that deletes every character in string.punctuation.
table = str.maketrans(dict.fromkeys(string.punctuation))
pr_list, rec_list, f1_list, rt_time = [], [], [], []

for query_id, query_text in test_list:
    cleaned_query = query_text.translate(table)
    retreived_docs, retrieval_time = retriever_fn(cleaned_query, sim_measure, index_pth)
    rt_time.append(retrieval_time)

    # Score the same ranking with both metric helpers.
    tp, fp, precision = precision_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    pr_list.append(precision)
    f1_score, recall = f1_score_at_k(retreived_docs, query_id, grd_tup_list, k=k)
    f1_list.append(f1_score)
    rec_list.append(recall)

# Average each per-query value over all test queries.
print('Average Precision at {} = {}'.format(k, sum(pr_list) / len(pr_list)))
print('Average Recall at {} = {}'.format(k, sum(rec_list) / len(rec_list)))
print('Average F-1 Score at {} = {}'.format(k, sum(f1_list) / len(f1_list)))
print('Average Retrieval Time = ', sum(rt_time) / len(rt_time))

Average Precision at 10 = 0.46491228070175433
Average Recall at 10 = 0.5526315789473685
Average F-1 Score at 10 = 0.48245614035087714
Average Retrieval Time =  0.0006001497569837069
