## Load topics

In [1]:
import pandas as pd

# Directory holding the Anserini topic and qrel files (machine-specific location).
# Alternative checkout location: "G:/python/anserini/src/main/resources/topics-and-qrels/"
path = "C:/Users/Thijs-Jan-Luttikholt/Documents/Study/Master/Year 1/Information retrieval/anserini/src/main/resources/topics-and-qrels/"
topics_file = "topics.dl19-doc.txt"

# The topics file is tab-separated without a header row: <topic id>\t<question text>.
# IDs are converted to strings so they are written verbatim into the run files.
topics = (
    pd.read_csv(path + topics_file, sep="\t", names=["ID", "Question"])
    .assign(ID=lambda frame: frame["ID"].astype(str))
)

topics.head()

Unnamed: 0,ID,Question
0,156493,do goldfish grow
1,1110199,what is wifi vs bluetooth
2,1063750,why did the us volunterilay enter ww1
3,130510,definition declaratory judgment
4,489204,right pelvic pain causes


## Experiments
The run files written below use the `trec_eval` results format (`query-id Q0 doc-id rank score run-tag`), as described at: http://www.rafaelglater.com/en/post/learn-how-to-use-trec_eval-to-evaluate-your-information-retrieval-system

In [12]:
from pyserini.search import SimpleSearcher

# Open the Lucene index that every experiment below searches through `searcher`.
# The commented-out lines are alternative index locations (a Robust04 index and
# another copy of the MS MARCO document index); the paths are machine-specific.
#searcher = SimpleSearcher('G:/python/anserini/lucene-index.robust04.pos+docvectors+rawdocs')
#searcher = SimpleSearcher('C:/msmarco-doc/lucene-index.msmarco-doc.pos+docvectors+rawdocs')
searcher = SimpleSearcher('C:/Users/Thijs-Jan-Luttikholt/Documents/Study/Master/Year 1/Information retrieval/Project/lucene-index.msmarco-doc.pos+docvectors+rawdocs/lucene-index.msmarco-doc.pos+docvectors+rawdocs')

### BM25

In [None]:
# Baseline run: plain BM25 with the parameters (0.9, 0.4) passed to set_bm25.
searcher.set_bm25(0.9, 0.4)

# The context manager guarantees the run file is flushed and closed even when a
# search raises part-way through the topic list.
with open("runs/bm25.txt", "w") as f:
    for (topicid, question) in zip(topics["ID"], topics["Question"]):
        hits = searcher.search(question)
        # Write at most the first 10 hits; a query may return fewer than 10,
        # in which case indexing hits[i] up to 9 would raise IndexError.
        for i in range(min(10, len(hits))):
            # One line per hit: topic id, literal Q0, doc id, rank, score, run tag.
            f.write(f'{topicid}\tQ0\t{hits[i].docid}\t{i+1:2}\t{hits[i].score:.5f}\tSTANDARD\n')

### BM25+RM3

In [21]:
# BM25 + RM3 run: same BM25 parameters as the baseline, with RM3 enabled on the
# shared searcher for the duration of this cell only.
searcher.set_bm25(0.9, 0.4)
# NOTE(review): positional arguments are presumably (fb_terms, fb_docs,
# original_query_weight) — confirm against the installed pyserini version.
searcher.set_rm3(10, 10, 0.5,rm3_output_query=True)

try:
    # The context manager closes the run file even when a search raises.
    with open("runs/bm25+rm3.txt", "w") as f:
        for (topicid, question) in zip(topics["ID"], topics["Question"]):
            hits = searcher.search(question)
            # Write at most the first 10 hits; a query may return fewer than 10.
            for i in range(min(10, len(hits))):
                f.write(f'{topicid}\tQ0\t{hits[i].docid}\t{i+1:2}\t{hits[i].score:.5f}\tSTANDARD\n')
finally:
    # Always switch RM3 off again so later experiments that reuse `searcher`
    # are not silently run with RM3 after a failure in this cell.
    searcher.unset_rm3()

### Word2Vec QE

In [53]:
import gensim 
from gensim.models import Word2Vec
import gensim.downloader as api
import spacy
from pyserini.analysis import Analyzer, get_lucene_analyzer

# Location of the GloVe vectors in word2vec text format (machine-specific).
# Note: this rebinds `path`, which the topic-loading cell also uses.
path = "G:/data/glove.6B.300d.w2v.txt"
model = gensim.models.KeyedVectors.load_word2vec_format(path, binary = False)
# load_word2vec_format already returns the KeyedVectors object. Its `.wv`
# attribute was only a deprecated alias for the object itself and no longer
# exists in gensim 4, so use the object directly.
word_vectors = model

# spaCy pipeline (lemmas and stop-word list) and Lucene analyzer used by the
# query-expansion cells.
nlp = spacy.load("en_core_web_sm")
analyzer = Analyzer(get_lucene_analyzer())

ModuleNotFoundError: No module named 'spacy'

In [75]:
from pyserini.search import querybuilder as qb

# Word2Vec query expansion run.
# For every topic a boolean SHOULD-query is built from the original tokens
# (boost 1) plus, per non-stop-word token, its nearest embedding neighbour with
# a different lemma (boost 0.1). The top hits are written as a run file.
should = qb.JBooleanClauseOccur['should'].value # should occur
stop_words = nlp.Defaults.stop_words

# The context manager closes the run file even when a search raises.
with open("runs/word2vec.txt", "w") as f:
    for (topicid, question) in zip(topics["ID"], topics["Question"]):
        # Build the query
        builder = qb.get_boolean_query_builder()
        print("'"+question+"' expansions:")
        for token in question.split(" "):
            # Skip tokens the Lucene analyzer reduces to nothing.
            if len(analyzer.analyze(token)) == 0:
                continue

            # add question token
            builder.add(qb.get_boost_query(qb.get_term_query(token), 1), should)

            # word2vec expansion
            if token not in word_vectors.vocab or token in stop_words:
                continue
            token_lemma = nlp(token)[0].lemma_
            syns = model.most_similar(token)
            # Keep only the closest neighbour that is not an inflection of the token.
            synonyms = [syn for (syn, score) in syns if nlp(syn)[0].lemma_ != token_lemma][:1]
            for synonym in synonyms:
                if len(analyzer.analyze(synonym)) > 0:
                    # Add the expansion term itself. The earlier version added
                    # `token` a second time here, so the expansions that were
                    # printed never became part of the query.
                    builder.add(qb.get_boost_query(qb.get_term_query(synonym), 0.1), should)
                    print("    "+token+" -> "+synonym)

        # Use a separate name so `question` keeps referring to the topic text.
        query = builder.build()
        print("\n")

        hits = searcher.search(query)
        # Write at most the first 10 hits; a query may return fewer than 10.
        for i in range(min(10, len(hits))):
            f.write(f'{topicid}\tQ0\t{hits[i].docid}\t{i+1:2}\t{hits[i].score:.5f}\tSTANDARD\n')

'do goldfish grow' expansions:
    goldfish -> koi
    grow -> growth


'what is wifi vs bluetooth' expansions:
    wifi -> wi-fi
    vs -> vs.
    bluetooth -> wifi


'why did the us volunterilay enter ww1' expansions:
    enter -> entry
    ww1 -> ww2


'definition declaratory judgment' expansions:
    definition -> defined
    declaratory -> injunctive
    judgment -> judgement


'right pelvic pain causes' expansions:
    right -> left
    pelvic -> abdominal
    pain -> discomfort
    causes -> disease


'what are the social determinants of health' expansions:
    social -> welfare
    determinants -> predictors
    health -> care


'how is the weather in jamaica' expansions:
    weather -> inclement
    jamaica -> barbados


'types of dysarthria from cerebral palsy' expansions:
    types -> kinds
    dysarthria -> ataxic
    cerebral -> palsy
    palsy -> cerebral


'who is robert gray' expansions:
    robert -> william
    gray -> grey


'what types of food can you cook sous vide

### WordNet QE

In [2]:
import nltk
from nltk.corpus import wordnet as wn 
from nltk.tokenize import sent_tokenize, word_tokenize 
import warnings 
from nltk.corpus import words
from operator import itemgetter
from pyserini.search import querybuilder as qb
from pyserini.analysis import Analyzer, get_lucene_analyzer
import spacy

In [7]:
# Fetch the NLTK "words" corpus, which backs the `words.words()` vocabulary
# check in the WordNet expansion loop.
# NOTE(review): the `wn.synsets(...)` lookups presumably need the "wordnet"
# corpus as well — confirm it is already installed on a fresh machine.
nltk.download('words')

[nltk_data] Downloading package words to C:\Users\Thijs-Jan-
[nltk_data]     Luttikholt\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [3]:
def add_synonyms(myList):
    """Return the set of WordNet lemma names for all synsets of the given words.

    Args:
        myList: iterable of words to look up with ``wn.synsets``.

    Returns:
        A set with the name of every lemma of every synset found for any of the
        words; empty when no word has a synset.
    """
    return {
        lemma.name()
        for term in myList
        for synset in wn.synsets(term)
        for lemma in synset.lemmas()
    }

def add_hypernyms(myList):
    """Return the set of hypernym synset names for all synsets of the given words.

    Unlike the hyponym/meronym/holonym helpers in this notebook, the names are
    kept in full (whatever ``Synset.name()`` returns) and are not cut at the
    first dot.

    Args:
        myList: iterable of words to look up with ``wn.synsets``.

    Returns:
        A set of hypernym names; empty when nothing is found.
    """
    collected = set()
    for term in myList:
        for synset in wn.synsets(term):
            collected.update(parent.name() for parent in synset.hypernyms())
    return collected

def add_hyponyms(myList):
    """Return the set of hyponym words for all synsets of the given words.

    Each hyponym synset name is reduced to the text before its first dot.

    Args:
        myList: iterable of words to look up with ``wn.synsets``.

    Returns:
        A set of hyponym words; empty when nothing is found.
    """
    return {
        child.name().split('.')[0]
        for term in myList
        for synset in wn.synsets(term)
        for child in synset.hyponyms()
    }

def add_meronyms(myList):
    """Return the set of part-meronym words for all synsets of the given words.

    Each meronym synset name is reduced to the text before its first dot.

    Args:
        myList: iterable of words to look up with ``wn.synsets``.

    Returns:
        A set of part-meronym words; empty when nothing is found.
    """
    parts = set()
    for term in myList:
        for synset in wn.synsets(term):
            for part in synset.part_meronyms():
                parts.add(part.name().split('.')[0])
    return parts

def add_holonyms(myList):
    """Return the set of part-holonym words for all synsets of the given words.

    Each holonym synset name is reduced to the text before its first dot.

    Args:
        myList: iterable of words to look up with ``wn.synsets``.

    Returns:
        A set of part-holonym words; empty when nothing is found.
    """
    return {
        whole.name().split('.')[0]
        for term in myList
        for synset in wn.synsets(term)
        for whole in synset.part_holonyms()
    }

In [25]:
def best_synonyms(original, synonyms, max_len, min_sim):
    """Select the candidates most similar to ``original`` by WordNet path similarity.

    The first synset of ``original`` is the reference sense. Each candidate is
    represented by its first synset whose name has the same second
    dot-separated component (the part-of-speech tag in names such as
    ``dog.n.01``) as the reference; candidates without such a synset are
    skipped.

    Args:
        original: word being expanded.
        synonyms: iterable of candidate words.
        max_len: maximum number of candidates retained. ``original`` itself
            competes for a slot when it is among the candidates and is dropped
            only afterwards, so the result can be shorter than ``max_len``.
        min_sim: minimum path similarity a candidate needs to be retained.

    Returns:
        The retained words ordered by ascending similarity, without
        ``original``. Empty when ``original`` has no synsets, when
        ``max_len`` is not positive, or when no candidate qualifies.
    """
    orig_sets = wn.synsets(original)
    # Words unknown to WordNet have nothing to be compared against; indexing
    # [0] on the empty list used to raise IndexError here.
    if not orig_sets or max_len <= 0:
        return []
    orig_set = orig_sets[0]
    set_type = orig_set.name().split('.')[1]

    scored = []
    for word in synonyms:
        # Compare with ==: `is` on strings only tests object identity and
        # worked by accident of the interpreter's string caching.
        word_set = next(
            (item for item in wn.synsets(word) if item.name().split('.')[1] == set_type),
            None,
        )
        if word_set is None:
            continue
        score = orig_set.path_similarity(word_set)
        if score is None or score < min_sim:
            continue
        scored.append((word, score))

    # Keep the max_len highest-scoring candidates. The earlier version
    # overwrote the last slot of an ascending list, i.e. it replaced its best
    # entry and kept the weaker ones. Both sorts are stable, so ties keep
    # their iteration order.
    best = sorted(scored, key=itemgetter(1), reverse=True)[:max_len]
    best.sort(key=itemgetter(1))
    return [word for word, _ in best if word != original]

def filter_words(myList):
    """Return the entries of ``myList`` that contain no underscore, in order.

    Args:
        myList: iterable of strings.

    Returns:
        A new list without the entries that contain ``'_'``.
    """
    return [entry for entry in myList if '_' not in entry]


# Quick sanity check: rank the WordNet synonyms of 'dog' (at most 100 kept,
# minimum path similarity 0.2) and print those that contain no underscore.
best_syns = best_synonyms('dog', add_synonyms(['dog']), 100, 0.2)
print(filter_words(best_syns))

['hound']


In [10]:
# Lucene analyzer (used to drop tokens that analyze to nothing) and spaCy
# pipeline (source of the stop-word list) for the WordNet expansion loop.
# The Word2Vec setup cell creates the same two objects.
analyzer = Analyzer(get_lucene_analyzer())
nlp = spacy.load("en_core_web_sm")

In [26]:
# WordNet query expansion run.
# For every topic a boolean SHOULD-query is built from the original tokens
# (boost 1) plus, per eligible token, its selected WordNet synonyms (boost 0.1).
# The top hits are written as a run file.
should = qb.JBooleanClauseOccur['should'].value # should occur
stop_words = nlp.Defaults.stop_words
# words.words() returns the whole corpus as a list; build the lookup set once
# instead of re-reading and linearly scanning it for every query token.
english_vocab = set(words.words())

# The context manager closes the run file even when a search raises.
with open("runs/wordnet.txt", "w") as f:
    for (topicid, question) in zip(topics["ID"], topics["Question"]):
        # Build the query
        builder = qb.get_boolean_query_builder()
        print("'"+question+"' expansions:")
        for token in question.split(" "):
            # Skip tokens the Lucene analyzer reduces to nothing.
            if len(analyzer.analyze(token)) == 0:
                continue

            # add question token
            builder.add(qb.get_boost_query(qb.get_term_query(token), 1), should)

            # wordnet expansion
            if token in stop_words or token not in english_vocab:
                continue
            # A word from the "words" corpus is not guaranteed to have WordNet
            # synsets, and best_synonyms indexes the first synset of its
            # `original` argument.
            if not wn.synsets(token):
                continue

            wordnet_synonyms = add_synonyms([token])
            synonyms = best_synonyms(token, wordnet_synonyms, 3, 0.2)
            synonyms = filter_words(synonyms)
            for synonym in synonyms:
                if len(analyzer.analyze(synonym)) > 0:
                    # Add the expansion term itself. The earlier version added
                    # `token` a second time here, so the expansions that were
                    # printed never became part of the query.
                    builder.add(qb.get_boost_query(qb.get_term_query(synonym), 0.1), should)
                    print("    "+token+" -> "+synonym)

        # Use a separate name so `question` keeps referring to the topic text.
        query = builder.build()
        print("\n")

        hits = searcher.search(query)
        # Write at most the first 10 hits; a query may return fewer than 10.
        for i in range(min(10, len(hits))):
            f.write(f'{topicid}\tQ0\t{hits[i].docid}\t{i+1:2}\t{hits[i].score:.5f}\tSTANDARD\n')

'do goldfish grow' expansions:
    grow -> originate
    grow -> uprise


'what is wifi vs bluetooth' expansions:


'why did the us volunterilay enter ww1' expansions:
    enter -> figure


'definition declaratory judgment' expansions:
    declaratory -> declarative
    declaratory -> asserting
    judgment -> opinion


'right pelvic pain causes' expansions:
    pain -> hurting


'what are the social determinants of health' expansions:
    social -> sociable
    social -> mixer
    health -> wellness


'how is the weather in jamaica' expansions:


'types of dysarthria from cerebral palsy' expansions:
    palsy -> paralysis


'who is robert gray' expansions:
    gray -> grayness
    gray -> Gray


'what types of food can you cook sous vide' expansions:
    food -> nutrient
    cook -> Cook


'how long is life cycle of flea' expansions:
    long -> hanker
    long -> yearn
    life -> aliveness
    life -> animation


'what can contour plowing reduce' expansions:
    plowing -> ploughing