In [1]:
import compress_pickle

# Load the object stored in the compressed pickle; later cells read its
# "processed_text" column and look up "docno" by row position.
# NOTE(review): this deserializes a pickle, which can execute arbitrary code —
# only load files from a trusted source.
df = compress_pickle.load( "parsed_docs_lemmas.pkl.gz")

In [3]:
from rank_bm25 import BM25Okapi

# Build the BM25Okapi index from every entry of the "processed_text" column.
# NOTE(review): assumes each entry is already a list of tokens (queries are
# tokenized before scoring in later cells) — TODO confirm.
idx = BM25Okapi(df["processed_text"].tolist())

In [23]:
import pickle 

# Persist the BM25 index to disk.
# The context manager guarantees the handle is flushed and closed even if
# pickling raises; the bare open() used before leaked the file object.
with open("saved_indices/bm250kapi_idx.pkl", "wb") as index_file:
    pickle.dump(idx, index_file)


In [5]:
# Ad-hoc sample query as lowercase, whitespace-separated tokens
# (punctuation stays attached to its token, e.g. "a.").
q = "Telecom minister A. Raja resignation".lower().split()

In [103]:
import torch

# Imported in this cell because get_rank25_results needs it; previously numpy
# was only imported in a later cell, so this function failed on a fresh kernel.
import numpy as np

num_results = 100


def get_rank25_results(query: list, idx, num_results: int = 25, corpus=None):
    """Return the highest-scoring documents for a tokenized query.

    Args:
        query: Query tokens, passed unchanged to ``idx.get_scores``.
        idx: Index object exposing ``get_scores(query)``. Assumed to return
            one score per document, in the same order as the rows of
            ``corpus`` — TODO confirm against the index construction.
        num_results: Maximum number of documents to return.
        corpus: Optional DataFrame-like object with a ``"docno"`` column whose
            row positions line up with the score positions. Defaults to the
            notebook-level ``df`` so existing calls keep working.

    Returns:
        dict mapping docno to its score as a Python float, inserted in
        descending score order and holding at most ``num_results`` entries.
    """
    if corpus is None:
        corpus = df

    # asarray lets a plain list of scores work as well as a numpy array.
    scores = np.asarray(idx.get_scores(query))
    # argsort is ascending; reverse it for best-first, then keep the top slice.
    top_positions = np.argsort(scores)[::-1][:num_results]

    # Look the column up once instead of materializing a full row per hit.
    docnos = corpus["docno"]
    return {docnos.iloc[pos]: scores[pos].item() for pos in top_positions}

In [32]:
from utils import read_qrel_from_file, evaluate_run

In [58]:
# Load the relevance judgments; later cells hand this object to
# pytrec_eval.RelevanceEvaluator.
qrel = read_qrel_from_file("qrels/2020/cair2020_qrel.txt")

In [60]:
from bs4 import BeautifulSoup

# Parse the topics file as XML. The markup is read while the soup is being
# constructed, so the handle is closed as soon as the block exits instead of
# staying open for the rest of the kernel session.
with open("qrels/2020/topics_test.txt") as fp:
    soup = BeautifulSoup(fp, 'xml')


In [68]:

def extract_topics_from_file(file: str) -> list:
    """Parse a topics XML file into a list of topic records.

    Args:
        file: Path to an XML file containing ``<top>`` elements, each with
            ``<num>``, ``<title>`` and ``<narr>`` children.

    Returns:
        One dict per ``<top>`` element, in document order, with the keys
        ``"number"`` (str), ``"title"`` and ``"narrative"``.

    Raises:
        AttributeError: If a ``<top>`` element lacks one of the three children.
    """
    # Close the handle deterministically; the bare open() used before leaked it.
    with open(file, "r") as topics_file:
        soup = BeautifulSoup(topics_file, 'xml')

    extracted_topics = []
    for topic in soup.find_all("top"):
        extracted_topics.append(
            {
                "number": str(topic.num.text),
                "title": topic.title.text,
                "narrative": topic.narr.text,
            }
        )

    # Previously the list was built and then dropped, so every caller got None.
    return extracted_topics


In [45]:
import pytrec_eval
# Measures requested from pytrec_eval; the recorded output below shows 'recall'
# reported at several cut-offs (recall_5 ... recall_1000).
metrics: set = {'map', 'ndcg', 'recall'}

# Evaluator bound to the relevance judgments loaded into `qrel`.
evaluator = pytrec_eval.RelevanceEvaluator(qrel, metrics)
# NOTE(review): `run` is not assigned in any visible cell, so this line relies on
# leftover kernel state and will raise NameError after Restart & Run All —
# confirm which run was meant here.
evaluator.evaluate(run)


{'16': {'map': 0.21088205472964247,
  'recall_5': 0.058823529411764705,
  'recall_10': 0.11764705882352941,
  'recall_15': 0.14705882352941177,
  'recall_20': 0.17647058823529413,
  'recall_30': 0.23529411764705882,
  'recall_100': 0.5882352941176471,
  'recall_200': 0.5882352941176471,
  'recall_500': 0.5882352941176471,
  'recall_1000': 0.5882352941176471,
  'ndcg': 0.526916034973694}}

In [78]:
import spacy

# Load the English spaCy pipeline with the tagger, NER and parser disabled;
# preprocess_query only uses tokenization and each token's `lemma_`.
# NOTE(review): the 'en' shortcut name looks spaCy-2-era; newer releases expect a
# full package name such as 'en_core_web_sm' — confirm the pinned spaCy version.
nlp = spacy.load('en', disable=["tagger", "ner", "parser"])

def preprocess_query(text):
    """Turn raw query text into a list of lowercase, purely alphabetic lemmas.

    The text is run through the notebook-level ``nlp`` pipeline; each token's
    lemma is lowercased, and lemmas containing any non-letter character
    (digits, punctuation, empty strings) are dropped. Token order is kept.
    """
    lowered_lemmas = (token.lemma_.lower() for token in nlp(text))
    return [lemma for lemma in lowered_lemmas if lemma.isalpha()]

In [104]:
import tqdm.notebook as tqdm
# Runs keyed by topic number; each value is the {docno: score} dict returned by
# get_rank25_results.
title_run = {}
narr_run = {}

# NOTE(review): `extracted_topics` is not assigned at notebook level in any visible
# cell (it only exists as a local inside extract_topics_from_file) — confirm where
# it is expected to come from.
for topic in tqdm.tqdm(extracted_topics):
    topic_id = topic["number"]
    # Retrieve the top 50 documents separately for the title and narrative text.
    title_run[topic_id] = get_rank25_results(preprocess_query(topic["title"]), idx, 50)
    narr_run[topic_id] = get_rank25_results(preprocess_query(topic["narrative"]), idx, 50)
print("finished")

  0%|          | 0/20 [00:00<?, ?it/s]

finished


In [113]:
import pytrec_eval
import numpy as np 
import json 

# Score the title-query run against the relevance judgments.
metrics: set = {'map', 'ndcg', 'P_5'}

evaluator = pytrec_eval.RelevanceEvaluator(qrel, metrics)
run_results = evaluator.evaluate(title_run)

# Per-topic values of the two measures that are averaged below.
map_scores = [per_topic["map"] for per_topic in run_results.values()]
p_scores = [per_topic["P_5"] for per_topic in run_results.values()]

display(run_results)

print("Aggregate results")
print("Average MAP: ", np.mean(map_scores))
print("Average P_5: ", np.mean(p_scores))

# Summary file: the two averages, then the per-topic results as indented JSON.
summary_text = "".join(
    [
        f"Average MAP: {np.mean(map_scores)}\n",
        f"Average P_5: {np.mean(p_scores)}\n",
        f"Individual run results: \n",
        json.dumps(run_results, indent=4),
    ]
)
with open("results/bm250kap-baseline/title-results.txt", "w") as f:
    f.write(summary_text)

# Raw run (topic number -> {docno: score}) saved next to the summary.
with open("results/bm250kap-baseline/title-run.txt", "w") as f:
    f.write(json.dumps(title_run, indent=4))

{'1': {'map': 0.0857992729838509, 'P_5': 0.4, 'ndcg': 0.24541403340349474},
 '2': {'map': 0.09291773146996271, 'P_5': 0.4, 'ndcg': 0.26489068160802665},
 '3': {'map': 0.21966240903604048, 'P_5': 0.2, 'ndcg': 0.5163460600226231},
 '4': {'map': 0.06638540431013955, 'P_5': 0.4, 'ndcg': 0.24807734635625595},
 '5': {'map': 0.4161339977285736, 'P_5': 1.0, 'ndcg': 0.5852790666091732},
 '6': {'map': 0.20315442597801556, 'P_5': 0.2, 'ndcg': 0.451153225278768},
 '7': {'map': 0.2505158547737754, 'P_5': 0.8, 'ndcg': 0.5015713626019195},
 '8': {'map': 0.45906334656334646, 'P_5': 0.4, 'ndcg': 0.7473829770969648},
 '9': {'map': 0.18853064242175385, 'P_5': 0.2, 'ndcg': 0.4168872426997385},
 '10': {'map': 0.08608322197031876, 'P_5': 0.4, 'ndcg': 0.2740905118763117},
 '11': {'map': 0.07576138370159298, 'P_5': 0.4, 'ndcg': 0.24505669724589946},
 '12': {'map': 0.40259024538508315, 'P_5': 0.6, 'ndcg': 0.6454313209816381},
 '13': {'map': 0.2676652326251622, 'P_5': 0.4, 'ndcg': 0.48800951596961595},
 '14': {

Aggregate results
Average MAP:  0.2561636552683072
Average P_5:  0.4999999999999999


In [110]:
import json

# Pretty-print whatever `run_results` currently holds as indented JSON.
# NOTE(review): `run_results` is reassigned by more than one evaluation cell, so the
# output here depends on which of them ran last.
print(json.dumps(run_results, indent=4))

{
    "1": {
        "map": 0.24554547962813247,
        "P_5": 1.0,
        "ndcg": 0.43195244767143376
    },
    "2": {
        "map": 0.09316836010136732,
        "P_5": 0.2,
        "ndcg": 0.26286249092777214
    },
    "3": {
        "map": 0.6781836591938896,
        "P_5": 1.0,
        "ndcg": 0.8419504708828816
    },
    "4": {
        "map": 0.20881728355238408,
        "P_5": 0.6,
        "ndcg": 0.39788740206231976
    },
    "5": {
        "map": 0.30906608872874,
        "P_5": 1.0,
        "ndcg": 0.49363714531501374
    },
    "6": {
        "map": 0.21590003848037764,
        "P_5": 0.8,
        "ndcg": 0.46805865253277174
    },
    "7": {
        "map": 0.22486683527503215,
        "P_5": 0.8,
        "ndcg": 0.4615856394639282
    },
    "8": {
        "map": 0.025,
        "P_5": 0.2,
        "ndcg": 0.09478836436955078
    },
    "9": {
        "map": 0.49343012855076335,
        "P_5": 0.6,
        "ndcg": 0.7221873256895347
    },
    "10": {
        "map": 0.

In [114]:
# Score the narrative-query run, reusing the `evaluator` built in the
# title-evaluation cell (same qrels and measures).
run_results = evaluator.evaluate(narr_run)

# Per-topic values of the two measures that are averaged below.
map_scores = [v["map"] for k,v in run_results.items()]
p_scores  = [v["P_5"] for k,v in run_results.items()]

display(run_results)

print("Aggregate results")
print("Average MAP: ", np.mean(map_scores))
print("Average P_5: ", np.mean(p_scores))

# Summary file: the two averages, then the per-topic results as indented JSON.
with open("results/bm250kap-baseline/narrative-results.txt", "w") as f:
    f.write(f"Average MAP: {np.mean(map_scores)}\n")
    f.write(f"Average P_5: {np.mean(p_scores)}\n")
    f.write("Individual run results: \n")
    f.write(json.dumps(run_results, indent=4))

# Fixed: this used to dump `title_run`, so narrative-run.txt was silently a copy
# of the title run rather than the narrative run that was just evaluated.
with open("results/bm250kap-baseline/narrative-run.txt", "w") as f:
    f.write(json.dumps(narr_run, indent=4))

{'1': {'map': 0.24554547962813247, 'P_5': 1.0, 'ndcg': 0.43195244767143376},
 '2': {'map': 0.09316836010136732, 'P_5': 0.2, 'ndcg': 0.26286249092777214},
 '3': {'map': 0.6781836591938896, 'P_5': 1.0, 'ndcg': 0.8419504708828816},
 '4': {'map': 0.20881728355238408, 'P_5': 0.6, 'ndcg': 0.39788740206231976},
 '5': {'map': 0.30906608872874, 'P_5': 1.0, 'ndcg': 0.49363714531501374},
 '6': {'map': 0.21590003848037764, 'P_5': 0.8, 'ndcg': 0.46805865253277174},
 '7': {'map': 0.22486683527503215, 'P_5': 0.8, 'ndcg': 0.4615856394639282},
 '8': {'map': 0.025, 'P_5': 0.2, 'ndcg': 0.09478836436955078},
 '9': {'map': 0.49343012855076335, 'P_5': 0.6, 'ndcg': 0.7221873256895347},
 '10': {'map': 0.46085418124891814, 'P_5': 1.0, 'ndcg': 0.6665286161517171},
 '11': {'map': 0.03805272719690148, 'P_5': 0.0, 'ndcg': 0.15358268625049804},
 '12': {'map': 0.48891263513879235, 'P_5': 0.8, 'ndcg': 0.7262392552464187},
 '13': {'map': 0.24857027763549955, 'P_5': 0.4, 'ndcg': 0.45001064894167886},
 '14': {'map': 0.2

Aggregate results
Average MAP:  0.3285357429896214
Average P_5:  0.6399999999999999
