In [1]:
import pickle

df = pickle.load(open("data/parsed_docs_with_toks.pkl", "rb"))

In [2]:
df_filt = df.query("bd_news_flag != 1")

In [3]:
from rank_bm25 import BM25Okapi

idx = BM25Okapi(df_filt["tokens"].tolist())
print("finished")

finished


In [4]:
import pickle 

pickle.dump(idx, open("saved_indices/bm250kapi_idx.pkl", "wb"))
print("finished")


finished


In [5]:
import spacy 
from spacy.lang.en.stop_words import STOP_WORDS
import re
import swifter
import pandas as pd  
import pickle

In [6]:
import torch
import numpy as np 

def get_rank25_results(query: str, idx, num_results:int = 25):
    scores = idx.get_scores(query)
    sorted_score_idx = np.argsort(scores)[::-1]

    results = {}
    for i in sorted_score_idx[:num_results]:
        docno = df.iloc[i]["docno"]
        score = scores[i].item()
        results[docno] = score
    return results    

In [7]:
from utils import read_qrel_from_file, evaluate_run

In [8]:
qrel = read_qrel_from_file("qrels/2020/cair2020_qrel.txt")

In [9]:
from bs4 import BeautifulSoup

fp = open("qrels/2020/topics_test.txt")
soup = BeautifulSoup(fp, 'xml')
soup.find_all("top")

extracted_topics = []
for topic in soup.find_all("top"):
    number = str(topic.num.text)
    title = topic.title.text.strip()
    narr = topic.narr.text.strip()
    extracted_topics.append({"number": number, "title": title, "narrative": narr})

In [10]:
def extract_topics_from_file(file: str) -> dict:
    soup = BeautifulSoup(open(file, "r"), 'xml')

    extracted_topics = []
    for topic in soup.find_all("top"):
        number = str(topic.num.text)
        title = topic.title.text.strip()
        narr = topic.narr.text.strip()
        extracted_topics.append({"number": number, "title": title, "narrative": narr})    
    return pd.DataFrame(extracted_topics)

topics = extract_topics_from_file("qrels/2020/topics_test.txt")

topics

Unnamed: 0,number,title,narrative
0,1,Assassination of Osama-bin-laden,Relevant document should contain information a...
1,2,Accused Ajmal Kasab,Information on Kasab's confession about 26/11 ...
2,3,Maharashtra CM ashok chavan resignation,Documents about Ashok Chavan's deliberate endi...
3,4,Accused Sanjay Dutt,Relevant document would highlight that actor S...
4,5,Abu Salem accused,Relevants documents will contain evidences on ...
5,6,Babri Masjid demolition case against Advani,A relevant document must include information a...
6,7,CBI searches Dawood Ibrahim,A relevant document must provide information r...
7,8,Court blocks facebook in pakistan,Relevant documents should contain information ...
8,9,Jaswant Singh BJP sacking,Relevant documents should contain information ...
9,10,Kasab's nationality confirmed,A relevant document must provide information o...


In [12]:
import pytrec_eval
metrics: set = {'map', 'ndcg', 'recall'}

evaluator = pytrec_eval.RelevanceEvaluator(qrel, metrics)

In [13]:
import spacy

nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "ner"])

def preprocess_query(text):
    clean_text = re.sub(r'[^\w\s]', '', text.lower())
    
    toks = []
    for tok in nlp(clean_text):
        if tok.text not in STOP_WORDS and tok.text.strip() != "":
            toks.append(tok.lemma_)
    return toks


In [14]:
import tqdm.notebook as tqdm
import numpy as np
title_run = {}
narr_run = {}

for topic in tqdm.tqdm(extracted_topics):
    title_toks = preprocess_query(topic["title"])
    narr_toks = preprocess_query(topic["narrative"])

    title_results = get_rank25_results(title_toks, idx, 50)
    narr_results = get_rank25_results(narr_toks, idx, 50)
    
    title_run[topic["number"]] = title_results
    narr_run[topic["number"]] = narr_results
print("finished")

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


finished


In [15]:
import pytrec_eval
import numpy as np 
import json 

metrics: set = {'map', 'ndcg', 'P_5'}

evaluator = pytrec_eval.RelevanceEvaluator(qrel, metrics)
run_results = evaluator.evaluate(title_run)

map_scores = [v["map"] for k,v in run_results.items()]
p_scores  = [v["P_5"] for k,v in run_results.items()]

display(run_results)

print("Aggregate results")
print("Average MAP: ", np.mean(map_scores))
print("Average P_5: ", np.mean(p_scores))


{'1': {'map': 0.0002173913043478261, 'P_5': 0.0, 'ndcg': 0.00893619808537653},
 '2': {'map': 0.1259321200762225, 'P_5': 0.4, 'ndcg': 0.3328720689537768},
 '3': {'map': 0.22893544699514085, 'P_5': 0.2, 'ndcg': 0.5232275961284083},
 '4': {'map': 0.25274171815412877, 'P_5': 0.4, 'ndcg': 0.4739366864936738},
 '5': {'map': 0.4046342499193968, 'P_5': 0.8, 'ndcg': 0.5790681726821794},
 '6': {'map': 0.19284874240968863, 'P_5': 0.2, 'ndcg': 0.4362583005440685},
 '7': {'map': 0.30226092589700965, 'P_5': 0.8, 'ndcg': 0.5439887541679621},
 '8': {'map': 0.7347527472527473, 'P_5': 1.0, 'ndcg': 0.8495337194516893},
 '9': {'map': 0.16816772173308675, 'P_5': 0.6, 'ndcg': 0.37391337768044675},
 '10': {'map': 0.18606261894305373, 'P_5': 0.4, 'ndcg': 0.4037645750484759},
 '11': {'map': 0.11821294755705763, 'P_5': 0.4, 'ndcg': 0.27826340604050354},
 '12': {'map': 0.19652834072514383, 'P_5': 0.4, 'ndcg': 0.4521563559912647},
 '13': {'map': 0.46021409152712717, 'P_5': 1.0, 'ndcg': 0.6479221997264277},
 '14':

Aggregate results
Average MAP:  0.28879818282597614
Average P_5:  0.53


In [113]:
import pytrec_eval
import numpy as np 
import json 

metrics: set = {'map', 'ndcg', 'P_5'}

evaluator = pytrec_eval.RelevanceEvaluator(qrel, metrics)
run_results = evaluator.evaluate(title_run)

map_scores = [v["map"] for k,v in run_results.items()]
p_scores  = [v["P_5"] for k,v in run_results.items()]

display(run_results)

print("Aggregate results")
print("Average MAP: ", np.mean(map_scores))
print("Average P_5: ", np.mean(p_scores))

# with open("results/bm250kap-baseline/title-results.txt", "w") as f:
#     f.write(f"Average MAP: {np.mean(map_scores)}\n")
#     f.write(f"Average P_5: {np.mean(p_scores)}\n")
#     f.write(f"Individual run results: \n")
#     f.write(json.dumps(run_results, indent=4))

# with open("results/bm250kap-baseline/title-run.txt", "w") as f:
#     f.write(json.dumps(title_run, indent=4))

{'1': {'map': 0.0857992729838509, 'P_5': 0.4, 'ndcg': 0.24541403340349474},
 '2': {'map': 0.09291773146996271, 'P_5': 0.4, 'ndcg': 0.26489068160802665},
 '3': {'map': 0.21966240903604048, 'P_5': 0.2, 'ndcg': 0.5163460600226231},
 '4': {'map': 0.06638540431013955, 'P_5': 0.4, 'ndcg': 0.24807734635625595},
 '5': {'map': 0.4161339977285736, 'P_5': 1.0, 'ndcg': 0.5852790666091732},
 '6': {'map': 0.20315442597801556, 'P_5': 0.2, 'ndcg': 0.451153225278768},
 '7': {'map': 0.2505158547737754, 'P_5': 0.8, 'ndcg': 0.5015713626019195},
 '8': {'map': 0.45906334656334646, 'P_5': 0.4, 'ndcg': 0.7473829770969648},
 '9': {'map': 0.18853064242175385, 'P_5': 0.2, 'ndcg': 0.4168872426997385},
 '10': {'map': 0.08608322197031876, 'P_5': 0.4, 'ndcg': 0.2740905118763117},
 '11': {'map': 0.07576138370159298, 'P_5': 0.4, 'ndcg': 0.24505669724589946},
 '12': {'map': 0.40259024538508315, 'P_5': 0.6, 'ndcg': 0.6454313209816381},
 '13': {'map': 0.2676652326251622, 'P_5': 0.4, 'ndcg': 0.48800951596961595},
 '14': {

Aggregate results
Average MAP:  0.2561636552683072
Average P_5:  0.4999999999999999


In [16]:
run_results = evaluator.evaluate(narr_run)

map_scores = [v["map"] for k,v in run_results.items()]
p_scores  = [v["P_5"] for k,v in run_results.items()]

display(run_results)

print("Aggregate results")
print("Average MAP: ", np.mean(map_scores))
print("Average P_5: ", np.mean(p_scores))

{'1': {'map': 0.05072817534200687, 'P_5': 0.4, 'ndcg': 0.18559757911380276},
 '2': {'map': 0.19134433607952664, 'P_5': 0.6, 'ndcg': 0.3890973191680332},
 '3': {'map': 0.7248846274136617, 'P_5': 1.0, 'ndcg': 0.9081052909699342},
 '4': {'map': 0.3041592311343593, 'P_5': 1.0, 'ndcg': 0.5069687016674774},
 '5': {'map': 0.2932776401822263, 'P_5': 1.0, 'ndcg': 0.473881032391106},
 '6': {'map': 0.25313361055164224, 'P_5': 0.6, 'ndcg': 0.5334326427178282},
 '7': {'map': 0.334377890247673, 'P_5': 0.8, 'ndcg': 0.5678092156120251},
 '8': {'map': 0.017151162790697675, 'P_5': 0.0, 'ndcg': 0.10974526590185513},
 '9': {'map': 0.5130935255369395, 'P_5': 0.6, 'ndcg': 0.7337893640575607},
 '10': {'map': 0.4753910086068267, 'P_5': 1.0, 'ndcg': 0.689388811610196},
 '11': {'map': 0.12167773609351289, 'P_5': 0.6, 'ndcg': 0.3086841437844016},
 '12': {'map': 0.38118848969212715, 'P_5': 0.4, 'ndcg': 0.6317121716627786},
 '13': {'map': 0.47794434367505584, 'P_5': 1.0, 'ndcg': 0.6439862076258248},
 '14': {'map':

Aggregate results
Average MAP:  0.3531421703707428
Average P_5:  0.6799999999999999


In [36]:
run_results = evaluator.evaluate(narr_run)

map_scores = [v["map"] for k,v in run_results.items()]
p_scores  = [v["P_5"] for k,v in run_results.items()]

display(run_results)

print("Aggregate results")
print("Average MAP: ", np.mean(map_scores))
print("Average P_5: ", np.mean(p_scores))

{'1': {'map': 0.04402549433441882, 'P_5': 0.4, 'ndcg': 0.17227327297591483},
 '2': {'map': 0.1868069134882157, 'P_5': 0.6, 'ndcg': 0.3866051581702334},
 '3': {'map': 0.7272447414197384, 'P_5': 1.0, 'ndcg': 0.9088433466673372},
 '4': {'map': 0.31381425763566484, 'P_5': 1.0, 'ndcg': 0.5202545637719598},
 '5': {'map': 0.28162893887628765, 'P_5': 1.0, 'ndcg': 0.46215507858247157},
 '6': {'map': 0.24250418514395317, 'P_5': 0.6, 'ndcg': 0.5136410432069678},
 '7': {'map': 0.268405442651054, 'P_5': 0.8, 'ndcg': 0.49828958478148627},
 '8': {'map': 0.0047619047619047615, 'P_5': 0.0, 'ndcg': 0.04935421935348238},
 '9': {'map': 0.4642077787980886, 'P_5': 0.4, 'ndcg': 0.6835174615774817},
 '10': {'map': 0.44197706381403146, 'P_5': 1.0, 'ndcg': 0.6589136441994532},
 '11': {'map': 0.07705919166748026, 'P_5': 0.6, 'ndcg': 0.24716036726857735},
 '12': {'map': 0.3827854210316024, 'P_5': 0.4, 'ndcg': 0.6323273456684958},
 '13': {'map': 0.27428235399992373, 'P_5': 0.2, 'ndcg': 0.4639665499009099},
 '14': 

Aggregate results
Average MAP:  0.32489416116275116
Average P_5:  0.62


In [114]:
run_results = evaluator.evaluate(narr_run)

map_scores = [v["map"] for k,v in run_results.items()]
p_scores  = [v["P_5"] for k,v in run_results.items()]

display(run_results)

print("Aggregate results")
print("Average MAP: ", np.mean(map_scores))
print("Average P_5: ", np.mean(p_scores))

with open("results/bm250kap-baseline/narrative-results.txt", "w") as f:
    f.write(f"Average MAP: {np.mean(map_scores)}\n")
    f.write(f"Average P_5: {np.mean(p_scores)}\n")
    f.write(f"Individual run results: \n")
    f.write(json.dumps(run_results, indent=4))

with open("results/bm250kap-baseline/narrative-run.txt", "w") as f:
    f.write(json.dumps(title_run, indent=4))

{'1': {'map': 0.24554547962813247, 'P_5': 1.0, 'ndcg': 0.43195244767143376},
 '2': {'map': 0.09316836010136732, 'P_5': 0.2, 'ndcg': 0.26286249092777214},
 '3': {'map': 0.6781836591938896, 'P_5': 1.0, 'ndcg': 0.8419504708828816},
 '4': {'map': 0.20881728355238408, 'P_5': 0.6, 'ndcg': 0.39788740206231976},
 '5': {'map': 0.30906608872874, 'P_5': 1.0, 'ndcg': 0.49363714531501374},
 '6': {'map': 0.21590003848037764, 'P_5': 0.8, 'ndcg': 0.46805865253277174},
 '7': {'map': 0.22486683527503215, 'P_5': 0.8, 'ndcg': 0.4615856394639282},
 '8': {'map': 0.025, 'P_5': 0.2, 'ndcg': 0.09478836436955078},
 '9': {'map': 0.49343012855076335, 'P_5': 0.6, 'ndcg': 0.7221873256895347},
 '10': {'map': 0.46085418124891814, 'P_5': 1.0, 'ndcg': 0.6665286161517171},
 '11': {'map': 0.03805272719690148, 'P_5': 0.0, 'ndcg': 0.15358268625049804},
 '12': {'map': 0.48891263513879235, 'P_5': 0.8, 'ndcg': 0.7262392552464187},
 '13': {'map': 0.24857027763549955, 'P_5': 0.4, 'ndcg': 0.45001064894167886},
 '14': {'map': 0.2

Aggregate results
Average MAP:  0.3285357429896214
Average P_5:  0.6399999999999999
