# Citation Recommendation on Scholarly Legal Articles

## BM-25

### Libraries

In [6]:
from rank_bm25 import BM25Okapi
import pickle
import os
from tqdm import tqdm

### Dataset

In [7]:
with open('/Users/dgknrsln/Documents/pythonProject/EXPERIMENTS/LATEST_test_docs.pkl', 'rb') as f:
    docs = pickle.load(f)

with open('/Users/dgknrsln/Documents/pythonProject/EXPERIMENTS/LATEST_test_queries.pkl', 'rb') as f:
    queries = pickle.load(f)

with open('/Users/dgknrsln/Documents/pythonProject/EXPERIMENTS/LATEST_test_data.pkl', 'rb') as f:
    pair = pickle.load(f)

### Train Model

In [8]:
tokenized_corpus = [doc.split() for doc in list(set(docs))]

bm25 = BM25Okapi(tokenized_corpus)

### Evaluate

#### 1. MAP

In [9]:
total_prec = 0
found = 0
for i in tqdm(range(len(queries))):

    sample = queries[i]
    tokenized_query = sample.split()
    results = bm25.get_top_n(tokenized_query, tokenized_corpus, n=10)

    count = 0
    precision = 0
    index = 0
    for m in results:
        if ' '.join(str(e) for e in m) in pair[i][1]:
            count += 1
            precision += count/(index+1)
        index += 1

    if count == 0:
        precision = 0
    else:
        found += 1
        precision /= count

    total_prec += precision

100%|██████████| 2675/2675 [02:23<00:00, 18.66it/s]


In [10]:
MAP = total_prec / len(queries)
print(MAP)

0.2603680462839343


#### 2. Recall

In [11]:
total_prec = 0
found = 0
for i in tqdm(range(len(queries))):

    sample = queries[i]
    tokenized_query = sample.split()
    results = bm25.get_top_n(tokenized_query, tokenized_corpus, n=10)

    count = 0
    for m in results:
        if ' '.join(str(e) for e in m) in pair[i][1]:
            count += 1

    total_prec += (count / len(pair[i][1]))

100%|██████████| 2675/2675 [02:25<00:00, 18.39it/s]


In [12]:
RECALL = total_prec / len(queries)
print(RECALL)

0.43170716510903434


#### 3. MRR

In [13]:
total_prec = 0
found = 0
for i in tqdm(range(len(queries))):

    sample = queries[i]
    tokenized_query = sample.split()
    results = bm25.get_top_n(tokenized_query, tokenized_corpus, n=10)

    index = 1
    for m in results:
        if ' '.join(str(e) for e in m) in pair[i][1]:
            break
        index += 1

    total_prec += (1/index)

100%|██████████| 2675/2675 [02:21<00:00, 18.89it/s]


In [14]:
MRR = total_prec / len(queries)
print(MRR)

0.3121275370527741


# Results

In [15]:
print("MAP@10: " + str(MAP))
print("Recall@10: " + str(RECALL))
print("MRR@10: " + str(MRR))

MAP@10: 0.2603680462839343
Recall@10: 0.43170716510903434
MRR@10: 0.3121275370527741
