<a href="https://colab.research.google.com/github/distiafajar28/Information-Retrieval/blob/main/Copy_of_bm25_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
!pip install tqdm

import nltk as nltk
# Fetch the NLTK data packages used further down: the stop-word corpus and the
# 'punkt' / 'punkt_tab' tokenizer data (presumably what word_tokenize loads —
# TODO confirm which of the two the installed NLTK version actually needs).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from tqdm import tqdm
from collections import defaultdict, Counter
import numpy as np
import math as math



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


/content/sample_data/corpus.jsonl

In [25]:
import pandas as pd
import numpy as np

# English stop words from NLTK, stored as a set because tokenize() tests
# membership once per token.
stop_words = set(stopwords.words('english'))

# Token lists per document id; filled by the tokenising loop below.
corpus_tokens = {}

# Read the JSON-lines corpus and discard the 'metadata' column, which is not
# used by any later step in this notebook section.
df = pd.read_json('sample_data/corpus.jsonl', lines=True).drop(columns=['metadata'])

def tokenize(text):
    """Turn raw text into the list of index terms.

    The text is lower-cased and split with ``word_tokenize``; a token is kept
    only if it is purely alphanumeric (``str.isalnum``) and is not contained
    in the module-level ``stop_words`` set.

    Args:
        text: the string to tokenise.

    Returns:
        list[str]: the surviving tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Tokenise every document once and key the token list by its '_id'.
# tokenize() already lower-cases, keeps only alphanumeric tokens and removes
# stop words, so its result is stored as-is; running the same filter a second
# time (as before) could not change the list and only cost time.
# Iterating the two needed columns directly avoids building a Series per row.
for doc_id, text in tqdm(zip(df['_id'], df['text']), total=len(df)):
    corpus_tokens[doc_id] = tokenize(text)

100%|██████████| 171332/171332 [03:16<00:00, 872.51it/s]


In [27]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the cells visible here read their inputs from sample_data/,
# not from /content/drive — confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [28]:
# Inverted index: term -> {doc_id: number of occurrences of the term in that
# document}.  A defaultdict creates the per-term posting dict on first write.
inverted_index = defaultdict(dict)
for doc_id, tokens in tqdm(corpus_tokens.items(), desc='Indexing...'):
    term_counts = Counter(tokens)
    for term in term_counts:
        inverted_index[term][doc_id] = term_counts[term]

Indexing...: 100%|██████████| 171332/171332 [00:17<00:00, 10036.06it/s]


In [29]:
# Document length = number of retained tokens for each document id.
# corpus_tokens already holds exactly one token list per id, so the lengths are
# taken from it directly instead of walking the DataFrame rows with iterrows()
# just to look the same ids up again.
docs_len = {
    doc_id: len(tokens)
    for doc_id, tokens in tqdm(corpus_tokens.items(), desc='Calculating doc stats...')
}

Calculating doc stats...: 100%|██████████| 171332/171332 [00:09<00:00, 18394.88it/s]


In [30]:
# Collection statistics consumed by bm25_score().
# N is the number of indexed documents.  It is taken from docs_len (one entry
# per document id) so that it always matches both the document frequencies in
# the inverted index and the sum used for the average length below, even if
# the DataFrame were to contain repeated ids.
N = len(docs_len)
# Average document length in tokens; 0.0 for an empty corpus rather than a
# ZeroDivisionError.
average_dl = sum(docs_len.values()) / N if N else 0.0

def bm25_score(term, doc_id, k1=0.50, b=0.75):
    """Return the BM25 contribution of a single query term for one document.

    Reads the module-level ``inverted_index``, ``docs_len``, ``N`` and
    ``average_dl``.

    Args:
        term: the (already tokenised) query term.
        doc_id: id of the document to score.
        k1: term-frequency saturation parameter.
        b: document-length normalisation parameter.

    Returns:
        float: ``0.0`` when the term is not indexed or does not occur in the
        document; otherwise ``idf * tf * (k1 + 1) / (tf + k1 * norm)`` with
        ``idf = ln((N - n + 0.5) / (n + 0.5))``, where ``n`` is the number of
        documents containing the term.  Note that this idf is negative for a
        term that occurs in more than half of the ``N`` documents.
    """
    postings = inverted_index.get(term)
    if postings is None or doc_id not in postings:
        return 0.0

    term_freq = postings[doc_id]
    doc_freq = len(postings)
    doc_length = docs_len[doc_id]

    idf = math.log((N - doc_freq + 0.5) / (doc_freq + 0.5))
    # Length normalisation: documents longer than average are penalised.
    length_norm = 1 - b + b * doc_length / average_dl
    saturation = term_freq + k1 * length_norm
    return idf * (term_freq * (k1 + 1) / saturation)


In [31]:
query = 'serological tests for coronavirus'
query_tokens = tokenize(query)

# Candidate set: every document that contains at least one query term.
# .get(t, {}) is used instead of inverted_index[t] because the index is a
# defaultdict: plain indexing would insert an empty posting dict for each query
# term that is not in the corpus, silently growing the index on every query.
union_docs = set().union(*(inverted_index.get(t, {}).keys() for t in query_tokens))

# A document's score is the sum of the per-term BM25 contributions.
scores = defaultdict(float)
for doc_id in tqdm(union_docs, desc='Calculating scores...'):
    scores[doc_id] = sum(bm25_score(t, doc_id) for t in query_tokens)

# Keep the 50 highest-scoring (doc_id, score) pairs, best first.
sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:50]
sorted_scores

Calculating scores...: 100%|██████████| 32364/32364 [00:00<00:00, 241672.77it/s]


[('upwn9o2m', 13.262579642878489),
 ('r1yf75bo', 12.93898716833149),
 ('8hrjgcas', 12.933873642886558),
 ('q2b4ig1h', 12.933873642886558),
 ('923jpec0', 12.924054121568695),
 ('3ea1ngo2', 12.73053316260355),
 ('8y0v6d2i', 12.69486123713582),
 ('hkvh2lc9', 12.525952883867507),
 ('1dbeh8q7', 12.434294183914886),
 ('px4fe7mn', 12.434294183914886),
 ('qjma4rsp', 12.368386360263894),
 ('9skvbk8m', 12.27905056986098),
 ('7ayg3typ', 12.204081839954071),
 ('rko7qdqk', 12.201964367381942),
 ('5jtzt8um', 12.161446626393719),
 ('84yjdlab', 12.08466273133778),
 ('82iy2prw', 12.08466273133778),
 ('0yj3xp9s', 12.065366536001829),
 ('0jl6qu0i', 12.065366536001829),
 ('m60w5dnl', 11.995399166047573),
 ('91872v0l', 11.995399166047573),
 ('vijh6x1l', 11.995399166047573),
 ('xw0o5ca7', 11.968254963975365),
 ('bj8wn9dh', 11.885209178111644),
 ('8cg5yj20', 11.868722638743765),
 ('cxt9oq0j', 11.777941772341551),
 ('ovlb53ek', 11.706928200266706),
 ('wf5cozst', 11.612425823688675),
 ('g693adjd', 11.598373773

In [32]:
# Load the relevance judgements (tab-separated; the columns used are
# 'query-id', 'corpus-id' and 'score').
qrel_data = pd.read_csv('sample_data/test.tsv', sep='\t')
# Lookup table: (query id as str, corpus id) -> relevance score.
qrel_dict = dict(zip(zip(qrel_data['query-id'].astype(str), qrel_data['corpus-id']), qrel_data['score']))

# The message previously said "test.tsx", which is not the file that was read.
print("test.tsv loaded!")

test.tsx loaded!


In [33]:
def experiment(query, query_id: "int | str", k1: float, b: float, top_k: int = 50):
    """Rank the corpus for one query with BM25 and attach relevance labels.

    Reads the module-level ``inverted_index`` and ``qrel_dict`` and calls
    ``tokenize`` and ``bm25_score``.

    Args:
        query: free-text query; tokenised with ``tokenize``.
        query_id: id of the query in the qrels; an int or a str is accepted
            because it is converted with ``str()`` before the lookup.
        k1: BM25 term-frequency saturation parameter, passed to ``bm25_score``.
        b: BM25 length-normalisation parameter, passed to ``bm25_score``.
        top_k: number of best-scoring documents to keep (default 50).

    Returns:
        pandas.DataFrame with the columns 'Doc ID', 'BM25 Score' and 'QRel',
        ordered by descending score.  'QRel' is 0 for documents that have no
        entry in ``qrel_dict`` for this query.  The columns are present even
        when no document matches the query.
    """
    query_tokens = tokenize(query)
    # .get(t, {}) instead of inverted_index[t]: the index is a defaultdict, so
    # plain indexing would insert an empty posting dict for every query term
    # that is not in the corpus and thereby mutate the index on each call.
    union_docs = set().union(*(inverted_index.get(t, {}).keys() for t in query_tokens))

    # A document's score is the sum of the per-term BM25 contributions.
    scores = {
        doc_id: sum(bm25_score(t, doc_id, k1=k1, b=b) for t in query_tokens)
        for doc_id in tqdm(union_docs, desc='Calculating scores...')
    }
    top_hits = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]

    qrel_query_id = str(query_id)
    records = [
        {
            'Doc ID': doc_id,
            'BM25 Score': score,
            'QRel': qrel_dict.get((qrel_query_id, doc_id), 0),
        }
        for doc_id, score in top_hits
    ]
    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(records, columns=['Doc ID', 'BM25 Score', 'QRel'])

In [34]:
# Five queries, each scored with its own (k1, b) setting; the query ids are
# the ones used for the lookup in the relevance judgements.
experiment1 = experiment(query="coronavirus response to weather changes", query_id="2", k1=0.25, b=0.75)
experiment2 = experiment(query="coronavirus immunity", query_id="3", k1=0.50, b=1.0)
experiment3 = experiment(query="how do people die from the coronavirus", query_id="4", k1=0.75, b=0.25)
experiment4 = experiment(query="coronavirus test rapid testing", query_id="6", k1=0.15, b=0.45)
experiment5 = experiment(query="coronavirus under reporting", query_id="8", k1=0.90, b=0.50)

# Blank lines separate the progress bars above from the result tables.
print("\n\n")
print(f"experiment1:\n{experiment1}")
print(f"experiment2:\n{experiment2}")
print(f"experiment3:\n{experiment3}")
print(f"experiment4:\n{experiment4}")
print(f"experiment5:\n{experiment5}")

Calculating scores...: 100%|██████████| 44442/44442 [00:00<00:00, 209021.24it/s]
Calculating scores...: 100%|██████████| 31077/31077 [00:00<00:00, 312791.29it/s]
Calculating scores...: 100%|██████████| 33870/33870 [00:00<00:00, 261349.23it/s]
Calculating scores...: 100%|██████████| 42180/42180 [00:00<00:00, 230182.51it/s]
Calculating scores...: 100%|██████████| 30236/30236 [00:00<00:00, 326182.17it/s]





experiment1:
      Doc ID  BM25 Score  QRel
0   a7yq1zu0   12.486632     0
1   flr9nmog   12.462307     0
2   aiwxlxzt   11.905408     2
3   3jth2pu4   11.498623     0
4   zvngy7zz   11.277682     2
5   0mikqjpj   11.273580     0
6   jc1k3fki   10.661111     1
7   9se0wm4t   10.607651     0
8   8ozauxlk   10.563279     0
9   k260c04b   10.557122     1
10  h5ufxzv9   10.557080     0
11  mmhkw2si   10.522428     0
12  g8grcy5j   10.475987     0
13  mja93ena   10.340837     0
14  hra8otj5   10.334557     1
15  cx3lf0dx   10.253515     2
16  39yvniki    9.953140     0
17  526elsrf    9.809370     2
18  kcvtvki1    9.500892     1
19  0p480zhb    9.440024     0
20  nhzamju7    9.427222     0
21  akb96git    9.348668     2
22  b0wd2jd3    9.323470     0
23  3hn0xkst    9.308086     0
24  iukudcbo    9.270041     2
25  weba7mr1    9.245177     1
26  vyfg6c9d    9.222830     0
27  uj8a09t3    9.187589     2
28  94xsyvux    9.124032     0
29  gdv1oip2    9.097921     1
30  26gf4q1v    9.03485