# Installation

In [None]:
!pip install beir rank_bm25 tensorflow-text wikipedia python-docx

In [None]:
import nltk, pickle, ast, time, spacy
from os import listdir
from os.path import isfile, join
from docx import *
import pandas as pd
import numpy as np

path='./'

In [None]:
from tqdm import tqdm
import wikipedia as wp
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords, words
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from transformers import T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-base')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nlp = spacy.load("en_core_web_sm")

In [None]:
def stemlemma(text):
  """Lower-case ``text``, tokenize it, then lemmatize and stem each token.

  Returns the processed tokens joined by single spaces.
  """
  processed = []
  for token in word_tokenize(text.lower()):
    lemma = wordnet_lemmatizer.lemmatize(token)
    processed.append(stemmer.stem(lemma))
  return ' '.join(processed)
def saveObj(ob,filename):
    """Pickle ``ob`` into ``filename`` (binary mode, overwriting the file).

    The file is opened in a ``with`` block so the handle is closed even when
    ``pickle.dump`` raises.
    """
    with open(filename, 'wb') as filehandler:
        pickle.dump(ob, filehandler)
def loadObj(filename):
    """Return the object unpickled from ``filename``.

    The file is opened in a ``with`` block so the handle is closed even when
    ``pickle.load`` raises.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that come
    # from a trusted source.
    with open(filename, 'rb') as filehandler:
        return pickle.load(filehandler)
def openFiles(files,path):
  """Read every file named in ``files`` (each prefixed with ``path``) as text.

  Returns a list with one string per file, with newline characters removed.
  """
  def _read_flat(name):
    with open(path+name,"r") as handle:
      return handle.read().replace('\n', '')
  return [_read_flat(name) for name in files]

In [None]:
# Load the pickled ground-truth table. Later cells use it as a pandas
# DataFrame and read its Doc, Domain, Q, C, CS and P columns.
GT=loadObj(path+"GTv6.obj")

In [None]:
# For every document, copy the candidate set (CS) of its first non-domain row
# onto all of that document's non-domain rows.
for doc in GT.Doc.unique():
  doc_rows=GT[(GT.Domain==False)&(GT.Doc==doc)].index
  cs=GT.CS[doc_rows[0]]
  for i in doc_rows:
    GT.at[i,'CS']=cs

In [None]:
# Normalise every row: strip whitespace from the answer context (C) and from
# each candidate in CS, de-duplicate the candidates, and make sure the answer
# context itself is among them.
for i in GT.index:
  C=str(GT.C[i]).strip()
  # Write through .at: the chained form GT['C'][i] = C assigns via an
  # intermediate Series and is not guaranteed to update GT itself.
  GT.at[i,'C']=C
  CS=GT.CS[i]
  cs=list(set(str(c).strip() for c in CS))
  if C not in cs:
    cs.append(C)
  GT.at[i,'CS']=cs

In [None]:
# Split the ground truth on the Domain flag: GT1 keeps the rows where it is
# False, GT2 the rows where it is True.
GT1=GT[GT.Domain==False]
GT2=GT[GT.Domain==True]

In [None]:
# Domain name -> names of the requirement documents assigned to that domain.
# NOTE(review): 'Keepass_Reqs' here does not match the 'KeePass' key used in
# `domains` and `rss` below — confirm which spelling the data actually uses.
dset={'aerospace':['LunarRover','RosettaSystem'],
 'security':['Keepass_Reqs','EvidenceManagementOPENCOSS'],
 'defence':['HalifaxCombatSystems','DataItemDescriptions']}
 
# Document name -> domain name.
domains={'DataItemDescriptions': 'defence',
 'EvidenceManagementOPENCOSS': 'security',
 'HalifaxCombatSystems': 'defence',
 'KeePass': 'security',
 'LunarRover': 'aerospace',
 'RosettaSystem': 'aerospace',
 'RosettaSystem1': 'aerospace',
 'RosettaSystem2': 'aerospace'}
 
# Short requirement-set id -> document name.
rss={'RS1':'LunarRover',
 'RS2':'RosettaSystem',
 'RS3':'DataItemDescriptions',
 'RS4':'HalifaxCombatSystems',
 'RS5':'EvidenceManagementOPENCOSS',
 'RS6':'KeePass'
 }

In [None]:
def getTopK(di,K=50):
  """Return the ``K`` keys of ``di`` with the highest values, best first.

  ``di`` maps each key to a single score; the scores are loaded into a
  one-column DataFrame and sorted in descending order.
  """
  scores=pd.DataFrame.from_dict(di,columns=["tfidf"], orient='index')
  ranked=scores.sort_values(by=['tfidf'],ascending=False)
  return list(ranked.index[:K])
def getFileId(filename,dset=dset,domains=domains):
  """Return the position of ``filename``'s domain among the keys of ``dset``.

  The text before the first '.' of ``filename`` is looked up in ``domains``.
  """
  domain_order=list(dset.keys())
  stem=filename.split('.')[0]
  return domain_order.index(domains[stem])

In [None]:
def buildTFIDFvector(docs,use_ngrams=True,ngrams=4):
  """Fit a TF-IDF model on ``docs`` and return the weights as a DataFrame.

  Args:
    docs: iterable of raw text documents.
    use_ngrams: when True, features are word n-grams of length 1..``ngrams``;
      otherwise only single words are used.
    ngrams: upper bound of the n-gram range (ignored when ``use_ngrams`` is
      False).

  Returns:
    A DataFrame with one row per document, in input order, and one column per
    feature name reported by the vectorizer.
  """
  ngram_range=(1,ngrams) if use_ngrams else (1,1)
  # min_df=1 keeps every term that occurs in at least one document, i.e. the
  # same vocabulary as the former min_df=0, and is also accepted by
  # scikit-learn releases that require an integer min_df to be >= 1.
  vectorizer = TfidfVectorizer(ngram_range=ngram_range,min_df=1,stop_words=stopwords.words('english'))
  vectors = vectorizer.fit_transform(docs)
  return pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names_out())
def buildTFIDF(files,domains,docs_path,use_ngrams=True,ngrams=3):
  """Build one TF-IDF row per domain from that domain's concatenated documents.

  ``domains`` maps a domain to document base names; each "<name>.txt" is read
  from ``docs_path``, the texts are joined with spaces and passed through
  ``stemlemma``. ``files`` is accepted but not used.

  NOTE: a later cell of this file defines another ``buildTFIDF(GT)``, which
  replaces this function once that cell has run.
  """
  domain_texts=[]
  for d in domains:
    raw=openFiles([name+".txt" for name in domains[d]],docs_path)
    domain_texts.append(stemlemma(' '.join(raw)))
  return buildTFIDFvector(domain_texts,use_ngrams=use_ngrams,ngrams=ngrams)

In [None]:
# Load the pickled "dex" object; a later cell indexes it by document name and
# reads the `Page` column of each entry.
dex=loadObj(path+'dexV2.obj')

In [None]:
def ContainsOrContainedString(s1,s2):
  """Return True if either string contains the other once spaces are removed."""
  a=''.join(s1.split(' '))
  b=''.join(s2.split(' '))
  if a in b:
    return True
  return b in a

In [None]:
def ContainsOrContainedList(s1,l):
  """Return True if ``s1`` contains, or is contained in, any string of ``l``.

  The comparison is delegated to ``ContainsOrContainedString``.
  """
  return any(ContainsOrContainedString(s1,candidate) for candidate in l)

In [None]:
def alpha(s):
  """Return ``s`` with every non-alphabetic character removed."""
  letters=[ch for ch in s if ch.isalpha()]
  return ''.join(letters)

In [None]:
def contains_sublist(lst, sublst):
    """Return True if ``sublst`` occurs as a contiguous slice of ``lst``."""
    width = len(sublst)
    for start in range(len(lst) - width + 1):
        if sublst == lst[start:start + width]:
            return True
    return False

# TFIDF

In [None]:
def getData(doc,GT):
  """Assemble BEIR-style retrieval inputs for one document.

  Returns ``(corpus, queries, qrels, invcorpus)``:
    corpus    -- candidate id -> {'text': candidate, 'title': ''}, built from
                 the CS list of the document's first row
    queries   -- query id -> query text, one per unique Q of the document
    qrels     -- query id -> {id of the query's answer context C: 1}
    invcorpus -- candidate text -> id of its first occurrence
  All ids are running integers converted to strings.
  """
  doc_rows=GT[GT.Doc==doc]
  corpus={}
  invcorpus={}
  for pos,text in enumerate(list(doc_rows.CS)[0]):
    key=str(pos)
    corpus[key]={'text':text, 'title':''}
    invcorpus.setdefault(text,key)
  queries={}
  qrels={}
  for pos,q in enumerate(doc_rows.Q.unique()):
    key=str(pos)
    queries[key]=q
    answer=doc_rows[doc_rows.Q==q]['C'].unique()[0]
    qrels[key]={invcorpus[answer]:1}
  return corpus,queries,qrels,invcorpus

## DocQ

In [None]:
def getCS(CS):
  """Return the set of all items found in the iterables contained in ``CS``."""
  merged=set()
  for cs in CS:
    merged.update(cs)
  return merged

In [None]:
def getTFIDFscore(q,doc,icontext,tfidf):
  """Sum the TF-IDF weights of the query's tokens for one candidate.

  ``tfidf[doc][q]`` is used as a table with one column per term; each
  whitespace-separated token of ``q`` that is a column contributes its value
  at row label ``icontext``. Tokens without a column contribute nothing.
  """
  tokens=q.split()
  if not tokens:
    # Nothing to score; the table is not looked up at all in this case.
    return 0
  table=tfidf[doc][q]
  return sum(table[t][icontext] for t in tokens if t in table.columns)

In [None]:
def getIR(GT,doc,tfidf):
  """Rank one document's candidates with TF-IDF and evaluate the ranking.

  Returns ``(ndcg, map, recall, precision, 1)``; the first four values come
  from ``retriever.evaluate`` and the last element is the constant 1.
  """
  _,queries,qrels,invcorpus=getData(doc,GT)
  candidate_ids=list(invcorpus.values())
  results={}
  for qid,q in queries.items():
    results[qid]={cid:getTFIDFscore(q,doc,int(cid),tfidf) for cid in candidate_ids}
  print(doc,"===============")
  metrics = retriever.evaluate(qrels, results, retriever.k_values)
  ndcg, _map, recall, precision = metrics
  return ndcg, _map, recall, precision, 1

In [None]:
def uniq(lst):
    """Yield the items of ``lst``, skipping any item equal to the one yielded just before it."""
    previous = object()  # sentinel that equals no real item
    for current in lst:
        if not current == previous:
            yield current
            previous = current
def avg(lst):
    """Return the arithmetic mean of ``lst`` (ZeroDivisionError if it is empty)."""
    total = sum(lst)
    count = len(lst)
    return total / count
def sort_and_deduplicate(l):
    """Return ``l`` sorted in descending order with adjacent equal values collapsed."""
    descending = sorted(l, reverse=True)
    return list(uniq(descending))

In [None]:
from beir.retrieval.evaluation import EvaluateRetrieval
retriever = EvaluateRetrieval()

In [None]:
results={}
results['tfidf']={}
results['tfidf']['doc']={}

In [None]:
def buildTFIDF(GT):
  """Build a unigram TF-IDF table for every row of ``GT``.

  Returns ``tfidf[doc][q]``, the table fitted on that row's CS list. When
  several rows share the same (doc, q), the last one wins.
  """
  tfidf={doc:{} for doc in GT.Doc.unique()}
  for doc,q,cs in zip(GT.Doc,GT.Q,GT.CS):
    tfidf[doc][q]=buildTFIDFvector(cs,use_ngrams=False)
  return tfidf

In [None]:
def buildTFIDFvFast(GT):
  """Build one unigram TF-IDF table per document and share it across queries.

  The table is fitted once on the CS list of the document's first row and
  stored under every unique query of that document, so ``tfidf[doc][q]`` is
  the same object for all ``q`` of a document.
  """
  tfidf={}
  for doc in GT.Doc.unique():
    doc_rows=GT[GT.Doc==doc]
    shared=buildTFIDFvector(list(doc_rows.CS)[0],use_ngrams=False)
    tfidf[doc]={q:shared for q in doc_rows.Q.unique()}
  return tfidf

In [None]:
# Document-level TF-IDF run: evaluate each document of GT1 and record its
# Recall, NDCG and elapsed wall-clock time.
tfidf=buildTFIDFvFast(GT1)
for doc in GT1.Doc.unique():
  start_time = time.time()
  ndcg,_,recall,_,_=getIR(GT1,doc,tfidf)
  entry={'Recall':recall,'NDCG':ndcg}
  results['tfidf']['doc'][doc]=entry
  extime = time.time() - start_time
  entry['time']=extime



## DomQ

In [None]:
results['tfidf']['dom']={}

In [None]:
def getData_p(doc,dl=GT2):
  """Build one retrieval task per unique answer context ``C`` of ``doc``.

  For each C the corpus comes from the CS list of the first row with that
  (doc, C); every row with that (doc, C) contributes one query whose only
  relevant candidate is C. Returns a list of
  ``(corpus, queries, qrels, invcorpus, C)`` tuples.
  """
  tasks=[]
  doc_rows=dl[dl.Doc==doc]
  for C in doc_rows.C.unique():
    group=doc_rows[doc_rows.C==C]
    corpus={}
    invcorpus={}
    for pos,text in enumerate(list(group.CS)[0]):
      corpus[str(pos)]={'text':text, 'title':''}
      invcorpus.setdefault(text,str(pos))
    queries={}
    qrels={}
    for pos,iq in enumerate(group.index):
      queries[str(pos)]=dl.Q[iq]
      qrels[str(pos)]={invcorpus[C]:1}
    tasks.append((corpus,queries,qrels,invcorpus,C))
  return tasks

In [None]:
def getIR_p(doc,tfidf,dl=GT2):
  """Evaluate TF-IDF ranking on every per-context task of ``doc``.

  Each task from ``getData_p`` is scored and evaluated on its own; the
  MAP/NDCG/P/Recall values at cutoffs 1, 3, 5 and 10 are then averaged over
  the tasks. The @100 and @1000 entries are never filled and stay empty lists.
  """
  metric_names=['MAP','NDCG','P','Recall']
  reported=['1','3','5','10']
  scores={m:{m+'@'+k:[] for k in ['1','10','100','1000','3','5']} for m in metric_names}
  for corpus,queries,qrels,invcorpus,C in getData_p(doc,dl):
    results={}
    for n,q in queries.items():
      results[n]={cid:getTFIDFscore(q,doc,int(cid),tfidf) for cid in invcorpus.values()}
    ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
    score_values={'NDCG':ndcg,'MAP':_map,'Recall':recall,'P':precision}
    for m in metric_names:
      for k in reported:
        scores[m][m+'@'+k].append(score_values[m][m+'@'+k])
  print(doc,"===============")
  for m in metric_names:
    for k in reported:
      scores[m][m+'@'+k] = avg(scores[m][m+'@'+k])
  return scores

In [None]:
tfidf_p=buildTFIDF(GT2)

In [None]:
# Domain-question TF-IDF run: store the averaged Recall and NDCG of each
# document of GT2 together with the elapsed wall-clock time.
for doc in GT2.Doc.unique():
  start_time = time.time()
  entry={}
  results['tfidf']['dom'][doc]=entry
  scores=getIR_p(doc,tfidf_p,GT2)
  entry.update({'Recall':scores['Recall'],'NDCG':scores['NDCG']})
  extime = time.time() - start_time
  entry['time']=extime



## DR

In [None]:
# Reload the "dex" object; the same dexV2.obj file is also loaded by an
# earlier cell of this notebook.
dex=loadObj(path+"dexV2.obj")

In [None]:
def saveCorpus(docs,parent_dir,folder='Corpus'):
  """Write each document to ``<parent_dir>/<folder>/doc<i>.txt``.

  Args:
    docs: sequence of strings; element ``i`` is written to ``doc<i>.txt``.
    parent_dir: directory under which ``folder`` is created.
    folder: name of the output sub-directory.

  Nothing is created when ``docs`` is empty.
  """
  # Imported locally: before this point the file only imports individual
  # names from os / os.path, not the os module itself.
  import os
  if len(docs)==0:
    return
  target_dir = os.path.join(parent_dir, folder)
  # Create the directory once (including missing parents) instead of
  # re-checking it for every document.
  os.makedirs(target_dir, exist_ok=True)
  for i in range(len(docs)):
    filepath = os.path.join(target_dir, 'doc'+str(i)+'.txt')
    # The with-block closes the file even if the write fails.
    with open(filepath, "w") as text_file:
      text_file.write(docs[i])

In [None]:
# Build the retrieval corpus of each domain: the unique `Page` values of every
# document listed for the domain, plus the `P` values GT holds for those
# documents.
articles={}
for dom in dset:
  docs=[]
  for doc in dset[dom]:
    docs.extend([str(p) for p in dex[doc].Page.unique()])
    for article in GT[GT.Doc==doc].P.unique():
      # The membership test uses the raw value while the list holds strings;
      # any duplicate that slips through is removed by the set() below.
      if article not in docs:
        docs.append(str(article))
  # De-duplicate. NOTE(review): set iteration order is arbitrary, so the
  # position of an article in the list may differ between runs — confirm that
  # nothing persists these positions.
  docs=list(set(docs))
  articles[dom]=docs

In [None]:
for dom in dset:
  print(dom,len(articles[dom]))

aerospace 1158
security 50
defence 781


In [None]:
def getData_dr(dom,GT=GT2,articles=articles):
  """Assemble BEIR-style retrieval inputs for one domain.

  Args:
    dom: domain name; selects rows through ``GT.dom`` and the list
      ``articles[dom]``.
    GT: ground-truth table; its ``dom``, ``P`` and ``Q`` columns are read.
    articles: domain -> list of article texts. Pages referenced by ``GT``
      that are missing from ``articles[dom]`` are appended to that list in
      place (this side effect is kept from the original implementation).

  Returns:
    ``(corpus, queries, qrels, invcorpus)`` where corpus maps the position of
    each article in ``articles[dom]`` (as a string) to
    ``{'text': article, 'title': ''}``, queries maps a running id to each
    unique query, qrels maps a query id to ``{id of its page: 1}`` and
    invcorpus maps article text to the id of its first occurrence.
  """
  corpus={}
  invcorpus={}
  queries={}
  qrels={}
  dom_rows=GT[GT.dom==dom]
  for page in dom_rows.P.unique():
    if page not in articles[dom]:
      articles[dom].append(page)
  # Key every article by its own position. The counter used here was never
  # advanced before, so all articles collapsed onto id '0'.
  for ic,c in enumerate(articles[dom]):
    corpus[str(ic)]={'text':c, 'title':''}
    if c not in invcorpus:
      invcorpus[c]=str(ic)
  for i,q in enumerate(dom_rows.Q.unique()):
    queries[str(i)]=q
    c=dom_rows[dom_rows.Q==q].P.unique()[0]
    qrels[str(i)]={invcorpus[c]:1}
  return corpus,queries,qrels,invcorpus

In [None]:
def getTFIDFscoreV0(q,doc,icontext,tfidf):
  """Sum the TF-IDF weights of the query's tokens for one candidate.

  ``tfidf[doc]`` is used as a table with one column per term; each
  whitespace-separated token of ``q`` that is a column contributes its value
  at row label ``icontext``. Tokens without a column contribute nothing.
  """
  tokens=q.split()
  if not tokens:
    # Nothing to score; the table is not looked up at all in this case.
    return 0
  table=tfidf[doc]
  return sum(table[t][icontext] for t in tokens if t in table.columns)

In [None]:
def getIR_dr(dom,tfidf,GT=GT2):
  """Rank one domain's articles with TF-IDF and evaluate the ranking.

  Returns ``(ndcg, map, recall, precision, 1)``; the first four values come
  from ``retriever.evaluate`` and the last element is the constant 1.
  """
  _,queries,qrels,invcorpus=getData_dr(dom,GT)
  article_ids=list(invcorpus.values())
  results={}
  for qid,q in queries.items():
    results[qid]={aid:getTFIDFscoreV0(q,dom,int(aid),tfidf) for aid in article_ids}
  print(dom,"===============")
  metrics = retriever.evaluate(qrels, results, retriever.k_values)
  ndcg, _map, recall, precision = metrics
  return ndcg, _map, recall, precision, 1

In [None]:
# Fit one unigram TF-IDF table per domain on that domain's article list.
tfidf_dr={}
for dom in dset.keys():
  dom_articles=articles[dom]
  print(dom,len(dom_articles))
  tfidf_dr[dom]=buildTFIDFvector(dom_articles,use_ngrams=False)

aerospace 1158
security 50
defence 781


In [None]:
# Article-retrieval TF-IDF run: evaluate each domain and record its Recall,
# NDCG and elapsed wall-clock time.
results['tfidf']['dr']={}
for dom in dset.keys():
  start_time = time.time()
  ndcg,_,recall,_,_=getIR_dr(dom,tfidf_dr,GT2)
  entry={'Recall':recall,'NDCG':ndcg}
  results['tfidf']['dr'][dom]=entry
  extime = time.time() - start_time
  entry['time']=extime



# BM25

## DocQ

In [None]:
results['bm25']={}
results['bm25']['doc']={}

In [None]:
from beir import util
import pathlib, os, random
from rank_bm25 import BM25Okapi

In [None]:
def getIRBM(doc,GT):
  """Rank one document's candidates with BM25 and store the evaluation.

  Candidates and queries are tokenized by splitting on single spaces. Recall,
  NDCG and the elapsed time are written to ``results['bm25']['doc'][doc]``;
  nothing is returned.
  """
  start_time = time.time()
  _,queries,qrels,invcorpus=getData(doc,GT)
  passages=list(invcorpus.keys())
  passage_ids=list(invcorpus.values())
  bm25 = BM25Okapi([passage.split(" ") for passage in passages])
  results_={}
  for n,q in queries.items():
    doc_scores = bm25.get_scores(q.split(" "))
    results_[n]=dict(zip(passage_ids,doc_scores))
  print(doc,"===============")
  ndcg, _, recall, _ = retriever.evaluate(qrels, results_, retriever.k_values)
  entry={'Recall':recall,'NDCG':ndcg}
  results['bm25']['doc'][doc]=entry
  extime = time.time() - start_time
  entry['time']=extime

In [None]:
# Run the BM25 evaluation for every document of GT1 and print the total
# elapsed wall-clock time in seconds.
start_time = time.time()
for doc in GT1.Doc.unique():
  getIRBM(doc,GT1)
print(time.time() - start_time)

0.4793243408203125


## DomQ

In [None]:
results['bm25']['dom']={}

In [None]:
def getIRBM_p(doc,GT):
  """Evaluate BM25 ranking on every per-context task of ``doc``.

  Each task from ``getData_p`` gets its own BM25 index (texts split on single
  spaces); the MAP/NDCG/P/Recall values at cutoffs 1, 3, 5 and 10 are averaged
  over the tasks. The @100 and @1000 entries are never filled and stay empty
  lists.
  """
  metric_names=['MAP','NDCG','P','Recall']
  reported=['1','3','5','10']
  scores={m:{m+'@'+k:[] for k in ['1','10','100','1000','3','5']} for m in metric_names}
  for corpus,queries,qrels,invcorpus,C in getData_p(doc,GT):
    passage_ids=list(invcorpus.values())
    bm25 = BM25Okapi([passage.split(" ") for passage in invcorpus.keys()])
    results={}
    for n,q in queries.items():
      doc_scores = bm25.get_scores(q.split(" "))
      results[n]=dict(zip(passage_ids,doc_scores))
    ndcg, _map, recall, precision=retriever.evaluate(qrels, results, retriever.k_values)
    score_values={'NDCG':ndcg,'MAP':_map,'Recall':recall,'P':precision}
    for m in metric_names:
      for k in reported:
        scores[m][m+'@'+k].append(score_values[m][m+'@'+k])
  print(doc,"===============")
  for m in metric_names:
    for k in reported:
      scores[m][m+'@'+k] = avg(scores[m][m+'@'+k])
  return scores

In [None]:
# Domain-question BM25 run: store the averaged Recall and NDCG of each
# document of GT2 together with the elapsed wall-clock time.
for doc in GT2.Doc.unique():
  start_time = time.time()
  entry={}
  results['bm25']['dom'][doc]=entry
  scores=getIRBM_p(doc,GT2)
  entry.update({'Recall':scores['Recall'],'NDCG':scores['NDCG']})
  extime = time.time() - start_time
  entry['time']=extime



## DR

In [None]:
results['bm25']['dr']={}

In [None]:
def getIRBM_dr(dom,GT=GT2):
  """Rank one domain's articles with BM25 and store the evaluation.

  Articles and queries are tokenized by splitting on single spaces. Recall,
  NDCG and the elapsed time are written to ``results['bm25']['dr'][dom]``;
  nothing is returned.
  """
  start_time = time.time()
  _,queries,qrels,invcorpus=getData_dr(dom,GT)
  article_texts=list(invcorpus.keys())
  article_ids=list(invcorpus.values())
  bm25 = BM25Okapi([text.split(" ") for text in article_texts])
  results_={}
  for n,q in queries.items():
    doc_scores = bm25.get_scores(q.split(" "))
    results_[n]=dict(zip(article_ids,doc_scores))
  print(dom,"===============")
  ndcg,_,recall,_=retriever.evaluate(qrels, results_, retriever.k_values)
  entry={'Recall':recall,'NDCG':ndcg}
  results['bm25']['dr'][dom]=entry
  extime = time.time() - start_time
  entry['time']=extime

In [None]:
# Run the BM25 article-retrieval evaluation for every domain and print the
# total elapsed wall-clock time in seconds.
start_time = time.time()
for dom in dset.keys():
  getIRBM_dr(dom,GT2)
print(time.time() - start_time)

3.9744150638580322


# BiEncoder

## DocQ

In [None]:
# Load the dense retriever and print how long the setup took (seconds).
start_time = time.time()
#!pip install tensorflow-text
from beir.retrieval import models
from beir.retrieval.search.sparse import SparseSearch
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

# NOTE: this rebinds `models` from the imported beir module to a dict mapping
# a retriever name to its search wrapper; later cells index it by name.
models={'MSdistildot':DRES(models.SentenceBERT("msmarco-distilbert-base-tas-b"), batch_size=128)}
print(time.time() - start_time)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/547 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
# Retriever name -> score function name handed to EvaluateRetrieval through
# the rank helpers. Only 'MSdistildot' has a matching entry in `models`.
scoring={'Sparta':"cos_sim",
        'multiMPNET':"cos_sim",
        'allMPNET':"cos_sim",
        'MSdistilbert':"cos_sim",
        'MSroberta':"cos_sim",
        'MSdistildot':"dot"}

In [None]:
def rank(doc,model,scoreing="cos_sim",GT=GT1):
  """Retrieve and evaluate one document's queries with a dense model.

  ``scoreing`` is forwarded as ``score_function`` ("cos_sim", or "dot" for
  dot-product). Returns ``(ndcg, map, recall, precision, 1)``.
  """
  evaluator = EvaluateRetrieval(model, score_function=scoreing)
  corpus,queries,qrels,_=getData(doc,GT)
  retrieved = evaluator.retrieve(corpus, queries)
  metrics = evaluator.evaluate(qrels, retrieved, evaluator.k_values)
  ndcg, _map, recall, precision = metrics
  return ndcg, _map, recall, precision, 1

In [None]:
results['distilbert']={}
results['distilbert']['doc']={}

In [None]:
# Document-level dense-retrieval run with the 'MSdistildot' model: record
# Recall, NDCG and elapsed wall-clock time for each document of GT1.
for doc in GT1.Doc.unique():
  start_time = time.time()
  model='MSdistildot'
  print(doc, model)
  # Unused outputs go to `_`. One of them used to be bound to the name `map`,
  # which replaced the builtin for the rest of the session.
  ndcg,_,recall,_,_=rank(doc,models[model],scoring[model])
  results['distilbert']['doc'][doc]={}
  results['distilbert']['doc'][doc]['Recall']=recall
  results['distilbert']['doc'][doc]['NDCG']=ndcg
  extime = time.time() - start_time
  results['distilbert']['doc'][doc]['time']=extime


DataItemDescriptions MSdistildot


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

EvidenceManagementOPENCOSS MSdistildot


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

LunarRover MSdistildot


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RosettaSystem MSdistildot


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

HalifaxCombatSystems MSdistildot


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

KeePass MSdistildot


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

## DomQ



In [None]:
def rank_p(doc,model,scoreing="cos_sim",GT=GT2):
  """Evaluate dense retrieval on every per-context task of ``doc``.

  ``scoreing`` is forwarded as ``score_function`` ("cos_sim", or "dot" for
  dot-product). The MAP/NDCG/P/Recall values at cutoffs 1, 3, 5 and 10 are
  averaged over the tasks from ``getData_p``. The @100 and @1000 entries are
  never filled and stay empty lists.
  """
  evaluator = EvaluateRetrieval(model, score_function=scoreing)
  metric_names=['MAP','NDCG','P','Recall']
  reported=['1','3','5','10']
  tasks=getData_p(doc,GT)
  scores={m:{m+'@'+k:[] for k in ['1','10','100','1000','3','5']} for m in metric_names}
  for corpus,queries,qrels,invcorpus,C in tasks:
    retrieved = evaluator.retrieve(corpus, queries)
    ndcg, _map, recall, precision=evaluator.evaluate(qrels, retrieved, evaluator.k_values)
    score_values={'NDCG':ndcg,'MAP':_map,'Recall':recall,'P':precision}
    for m in metric_names:
      for k in reported:
        scores[m][m+'@'+k].append(score_values[m][m+'@'+k])
  print(doc,"===============")
  for m in metric_names:
    for k in reported:
      scores[m][m+'@'+k] = avg(scores[m][m+'@'+k])
  return scores

In [None]:
# Domain-question dense-retrieval run: store the full averaged score dict of
# each document of GT2 and add the elapsed wall-clock time to it.
results['distilbert']['dom']={}
for doc in GT2.Doc.unique():
  start_time = time.time()
  print(doc, 'MSdistildot')
  dom_results=results['distilbert']['dom']
  dom_results[doc]={}
  scores=rank_p(doc,models['MSdistildot'],scoring['MSdistildot'])
  dom_results[doc]=scores
  extime = time.time() - start_time
  scores['time']=extime

DataItemDescriptions MSdistildot


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

EvidenceManagementOPENCOSS MSdistildot


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

LunarRover MSdistildot


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RosettaSystem MSdistildot


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

IndexError: ignored

## DR

In [None]:
def rank_dr(dom,model,scoreing="cos_sim",GT=GT2):
  """Retrieve and evaluate one domain's queries with a dense model.

  ``scoreing`` is forwarded as ``score_function`` ("cos_sim", or "dot" for
  dot-product). Returns ``(ndcg, map, recall, precision, 1)``.
  """
  evaluator = EvaluateRetrieval(model, score_function=scoreing)
  corpus,queries,qrels,_=getData_dr(dom,GT)
  retrieved = evaluator.retrieve(corpus, queries)
  metrics = evaluator.evaluate(qrels, retrieved, evaluator.k_values)
  ndcg, _map, recall, precision = metrics
  return ndcg, _map, recall, precision, 1

In [None]:
# Article-retrieval dense run with the 'MSdistildot' model: record Recall,
# NDCG and elapsed wall-clock time for each domain.
results['distilbert']['dr']={}
for dom in dset.keys():
  start_time = time.time()
  model='MSdistildot'
  print(dom, model)
  ndcg, _, recall, _, _=rank_dr(dom,models[model],scoring[model],GT=GT2)
  entry={'Recall':recall,'NDCG':ndcg}
  results['distilbert']['dr'][dom]=entry
  extime = time.time() - start_time
  entry['time']=extime

# Rerank

## DocQ

In [None]:
from beir.reranking.models import CrossEncoder
cross_encoder_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

In [None]:
from beir.reranking import Rerank
reranker = Rerank(cross_encoder_model, batch_size=128)

In [None]:
def getBM25(doc,GT):
  """Score one document's candidates with BM25 for every query.

  Texts are tokenized by splitting on single spaces. Returns
  ``(corpus, queries, qrels, results)`` where ``results`` maps a query id to
  ``{candidate id: BM25 score}``.
  """
  corpus,queries,qrels,invcorpus=getData(doc,GT)
  passage_ids=list(invcorpus.values())
  bm25 = BM25Okapi([passage.split(" ") for passage in invcorpus.keys()])
  results={}
  for n,q in queries.items():
    doc_scores = bm25.get_scores(q.split(" "))
    results[n]=dict(zip(passage_ids,doc_scores))
  return corpus,queries,qrels,results

In [None]:
results['ce']={}
results['ce']['doc']={}

In [None]:
def rerankBMCE(doc,GT=GT1,k=10):
  """Rerank one document's BM25 results with the cross-encoder and store the scores.

  The top ``k`` BM25 hits per query are passed to ``reranker``. Recall, NDCG
  and the elapsed time are written to ``results['ce']['doc'][doc]``; nothing
  is returned.
  """
  start_time = time.time()
  corpus,queries,qrels,bm25_hits=getBM25(doc,GT)
  rerank_results = reranker.rerank(corpus, queries, bm25_hits, top_k=k)
  ndcg, _, recall, _ = EvaluateRetrieval.evaluate(qrels, rerank_results, retriever.k_values)
  entry={'Recall':recall,'NDCG':ndcg}
  results['ce']['doc'][doc]=entry
  extime = time.time() - start_time
  entry['time']=extime

In [None]:
# Run the BM25 + cross-encoder reranking for every document of GT1 and print
# the total elapsed wall-clock time in seconds.
start_time = time.time()
for doc in GT1.Doc.unique():
  print(doc)
  rerankBMCE(doc)
print(time.time() - start_time)

## DomQ

In [None]:
def getBM25_p(corpus,queries,qrels,invcorpus):
  """Score the candidates of one task with BM25 for every query.

  Texts are tokenized by splitting on single spaces. Only ``queries`` and
  ``invcorpus`` are read; ``corpus`` and ``qrels`` are accepted but unused.
  Returns a dict mapping a query id to ``{candidate id: BM25 score}``.
  """
  passage_ids=list(invcorpus.values())
  bm25 = BM25Okapi([passage.split(" ") for passage in invcorpus.keys()])
  results={}
  for n,q in queries.items():
    doc_scores = bm25.get_scores(q.split(" "))
    results[n]=dict(zip(passage_ids,doc_scores))
  return results

In [None]:
def rerankBMCE_p(doc,GT=GT2):
  """Evaluate BM25 + cross-encoder reranking on every per-context task of ``doc``.

  For each task from ``getData_p`` the top 10 BM25 hits per query are
  reranked; the MAP/NDCG/P/Recall values at cutoffs 1, 3, 5 and 10 are
  averaged over the tasks. The @100 and @1000 entries are never filled and
  stay empty lists.
  """
  metric_names=['MAP','NDCG','P','Recall']
  reported=['1','3','5','10']
  tasks=getData_p(doc,GT)
  scores={m:{m+'@'+k:[] for k in ['1','10','100','1000','3','5']} for m in metric_names}
  for corpus,queries,qrels,invcorpus,C in tasks:
    bm25_hits=getBM25_p(corpus,queries,qrels,invcorpus)
    rerank_results = reranker.rerank(corpus, queries, bm25_hits, top_k=10)
    ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(qrels, rerank_results, retriever.k_values)
    score_values={'NDCG':ndcg,'MAP':_map,'Recall':recall,'P':precision}
    for m in metric_names:
      for k in reported:
        scores[m][m+'@'+k].append(score_values[m][m+'@'+k])
  print(doc,"===============")
  for m in metric_names:
    for k in reported:
      scores[m][m+'@'+k] = avg(scores[m][m+'@'+k])
  return scores

In [None]:
# Domain-question reranking run: store the full averaged score dict of each
# document of GT2 and add the elapsed wall-clock time to it.
results['ce']['dom']={}
for doc in GT2.Doc.unique():
  start_time = time.time()
  print(doc)
  dom_results=results['ce']['dom']
  dom_results[doc]={}
  scores=rerankBMCE_p(doc)
  dom_results[doc]=scores
  extime = time.time() - start_time
  scores['time']=extime

## DR

In [None]:
def getBM25_dr(dom,GT):
  """Score one domain's articles with BM25 for every query.

  Texts are tokenized by splitting on single spaces. Returns
  ``(corpus, queries, qrels, results)`` where ``results`` maps a query id to
  ``{article id: BM25 score}``.
  """
  corpus,queries,qrels,invcorpus=getData_dr(dom,GT)
  article_ids=list(invcorpus.values())
  bm25 = BM25Okapi([text.split(" ") for text in invcorpus.keys()])
  results={}
  for n,q in queries.items():
    doc_scores = bm25.get_scores(q.split(" "))
    results[n]=dict(zip(article_ids,doc_scores))
  return corpus,queries,qrels,results

In [None]:
results['ce']['dr']={}

In [None]:
def rerankBMCE_dr(dom,GT=GT2,k=10):
  """Rerank one domain's BM25 results with the cross-encoder and store the scores.

  The top ``k`` BM25 hits per query are passed to ``reranker``. Recall, NDCG
  and the elapsed time are written to ``results['ce']['dr'][dom]``; nothing
  is returned.
  """
  start_time = time.time()
  corpus,queries,qrels,bm25_hits=getBM25_dr(dom,GT)
  rerank_results = reranker.rerank(corpus, queries, bm25_hits, top_k=k)
  ndcg, _, recall, _ = EvaluateRetrieval.evaluate(qrels, rerank_results, retriever.k_values)
  entry={'Recall':recall,'NDCG':ndcg}
  results['ce']['dr'][dom]=entry
  extime = time.time() - start_time
  entry['time']=extime

In [None]:
# Run the BM25 + cross-encoder reranking for every domain and print the total
# elapsed wall-clock time in seconds.
start_time = time.time()
for dom in dset.keys():
  print(dom)
  rerankBMCE_dr(dom)
print(time.time() - start_time)

# Results

In [None]:
results

In [None]:
def export_results(results,k=None,Q=['doc','dom']):
  """Flatten the nested ``results`` dict into a DataFrame.

  One row is produced per (document, query type, method, cutoff). Documents
  are the keys of ``results['tfidf'][Q[0]]``; cutoffs are the first ``k`` of
  1, 3, 5, 10 (all four when ``k`` is None). The 'KeePass' document is
  skipped for the 'dom' query type. Recall, NDCG and time are stored as
  strings.
  """
  cols=['Document','Q','Method','K','Recall','NDCG','execution time']
  methods=['tfidf','bm25', 'distilbert', 'ce']
  cutoffs=[1,3,5,10][:k]
  rows=[]
  for doc in results['tfidf'][Q[0]].keys():
    for q in Q:
      if doc=='KeePass' and q=='dom':
        continue
      for t in methods:
        for i in cutoffs:
          entry=results[t][q][doc]
          rows.append([
            doc,q,t,i,
            str(entry['Recall']['Recall@'+str(i)]),
            str(entry['NDCG']['NDCG@'+str(i)]),
            str(entry['time']),
          ])
  return pd.DataFrame(rows,columns=cols)

In [None]:
export_results(results)

In [None]:
export_results(results,Q=['dr'])