## Load topics

In [19]:
import pandas as pd

path = "G:/python/anserini/src/main/resources/topics-and-qrels/"
topics_file = "topics.dl19-doc.txt"

topics = pd.read_csv(path+topics_file,sep="\t",names=["ID","Question"])
topics["ID"]= topics["ID"].astype(str)

topics.head()

Unnamed: 0,ID,Question
0,156493,do goldfish grow
1,1110199,what is wifi vs bluetooth
2,1063750,why did the us volunterilay enter ww1
3,130510,definition declaratory judgment
4,489204,right pelvic pain causes


## Experiments
Results file format from: http://www.rafaelglater.com/en/post/learn-how-to-use-trec_eval-to-evaluate-your-information-retrieval-system

In [60]:
from pyserini.search import SimpleSearcher

#searcher = SimpleSearcher('G:/python/anserini/lucene-index.robust04.pos+docvectors+rawdocs')
searcher = SimpleSearcher('C:/msmarco-doc/lucene-index.msmarco-doc.pos+docvectors+rawdocs')

### BM25

In [65]:
searcher.set_bm25(0.9, 0.4)

f = open("runs/bm25.txt", "w") 
for (topicid,question) in zip(topics["ID"], topics["Question"]):
    hits = searcher.search(question)
    for i in range(0, 10):
        f.write(f'{topicid}\tQ0\t{hits[i].docid}\t{i+1:2}\t{hits[i].score:.5f}\tSTANDARD\n')   
f.close()

### BM25+RM3

In [21]:
searcher.set_bm25(0.9, 0.4)
searcher.set_rm3(10, 10, 0.5,rm3_output_query=True)

f = open("runs/bm25+rm3.txt", "w") 
for (topicid,question) in zip(topics["ID"], topics["Question"]):
    hits = searcher.search(question)
    for i in range(0, 10):
        f.write(f'{topicid}\tQ0\t{hits[i].docid}\t{i+1:2}\t{hits[i].score:.5f}\tSTANDARD\n')
f.close()

searcher.unset_rm3()

### Word2Vec QE

In [69]:
import gensim 
from gensim.models import Word2Vec
import gensim.downloader as api
import spacy
from pyserini.analysis import Analyzer, get_lucene_analyzer

path = "G:/data/glove.6B.300d.w2v.txt"
model = gensim.models.KeyedVectors.load_word2vec_format(path, binary = False)
word_vectors = model.wv

nlp = spacy.load("en_core_web_sm")
analyzer = Analyzer(get_lucene_analyzer())

  word_vectors = model.wv


In [75]:
from pyserini.search import querybuilder as qb

f = open("runs/word2vec.txt", "w") 
for (topicid,question) in zip(topics["ID"], topics["Question"]):
    # Build the query
    builder = qb.get_boolean_query_builder()
    should = qb.JBooleanClauseOccur['should'].value # should occur
    print("'"+question+"' expansions:")
    for token in question.split(" "):
        if len(analyzer.analyze(token))>0:
            # add question token
            builder.add(qb.get_boost_query(qb.get_term_query(token), 1),should)
            
            # word2vec expansion
            if token in word_vectors.vocab and not token in nlp.Defaults.stop_words:
                token_lemma = nlp(token)[0].lemma_
                syns = model.most_similar(token)
                synonyms = [syn for (syn,score) in syns if nlp(syn)[0].lemma_ != token_lemma][:1]
                for synonym in synonyms:
                    if len(analyzer.analyze(synonym))>0:
                        # add query term
                        builder.add(qb.get_boost_query(qb.get_term_query(token), 0.1),should)
                        print("    "+token+" -> "+synonym)
            
    question = builder.build()
    print("\n")

    hits = searcher.search(question)
    for i in range(0, 10):
        f.write(f'{topicid}\tQ0\t{hits[i].docid}\t{i+1:2}\t{hits[i].score:.5f}\tSTANDARD\n')
f.close()

'do goldfish grow' expansions:
    goldfish -> koi
    grow -> growth


'what is wifi vs bluetooth' expansions:
    wifi -> wi-fi
    vs -> vs.
    bluetooth -> wifi


'why did the us volunterilay enter ww1' expansions:
    enter -> entry
    ww1 -> ww2


'definition declaratory judgment' expansions:
    definition -> defined
    declaratory -> injunctive
    judgment -> judgement


'right pelvic pain causes' expansions:
    right -> left
    pelvic -> abdominal
    pain -> discomfort
    causes -> disease


'what are the social determinants of health' expansions:
    social -> welfare
    determinants -> predictors
    health -> care


'how is the weather in jamaica' expansions:
    weather -> inclement
    jamaica -> barbados


'types of dysarthria from cerebral palsy' expansions:
    types -> kinds
    dysarthria -> ataxic
    cerebral -> palsy
    palsy -> cerebral


'who is robert gray' expansions:
    robert -> william
    gray -> grey


'what types of food can you cook sous vide