# Citation Recommendation on Scholarly Legal Articles

## BM-25 + Fine-tuned SPECTER

### Libraries

In [1]:
from rank_bm25 import BM25L
import pickle
import os
from tqdm import tqdm
from sklearn.metrics import pairwise

### Dataset

In [2]:
with open('/Users/dgknrsln/Documents/pythonProject/EXPERIMENTS/LATEST_test_docs.pkl', 'rb') as f:
    docs = pickle.load(f)

with open('/Users/dgknrsln/Documents/pythonProject/EXPERIMENTS/LATEST_test_queries.pkl', 'rb') as f:
    queries = pickle.load(f)

with open('/Users/dgknrsln/Documents/pythonProject/EXPERIMENTS/LATEST_test_data.pkl', 'rb') as f:
    pair = pickle.load(f)

In [3]:
with open('/Users/dgknrsln/Documents/pythonProject/EXPERIMENTS/LATEST/LATEST_specter_inference_docs.pkl', 'rb') as f:
    parent_embeddings_list = pickle.load(f)

with open('/Users/dgknrsln/Documents/pythonProject/EXPERIMENTS/LATEST/LATEST_specter_inference_queries.pkl', 'rb') as f:
    child_embeddings_list = pickle.load(f)

### Train Model

In [4]:
tokenized_corpus = [doc.split() for doc in list(set(docs))]

bm25 = BM25L(tokenized_corpus)

### Evaluate

#### 1. MAP

In [5]:
total_prec = 0
found = 0
for i in tqdm(range(len(queries))):

    sample = queries[i]
    tokenized_query = sample.split()
    results = bm25.get_top_n(tokenized_query, tokenized_corpus, n=10)

    result_emb = []

    for x in results:
        result_emb.append(parent_embeddings_list[docs.index(' '.join(str(e) for e in x))])

    cos_matrix = pairwise.cosine_similarity(child_embeddings_list[i].reshape(1, -1), result_emb)
    retrieved = list(sorted(enumerate(cos_matrix[0]), key=lambda item: item[1], reverse=True))

    count = 0
    precision = 0
    index = 0
    for m in retrieved:
        if ' '.join(str(e) for e in results[m[0]]) in pair[i][1]:
            count += 1
            precision += count/(index+1)
        index += 1

    if count == 0:
        precision = 0
    else:
        found += 1
        precision /= count

    total_prec += precision

100%|██████████| 2675/2675 [02:28<00:00, 18.03it/s]


In [6]:
MAP = total_prec / len(queries)
print(MAP)

0.19883865895267738


#### 2. Recall

In [7]:
total_prec = 0
found = 0
for i in tqdm(range(len(queries))):

    sample = queries[i]
    tokenized_query = sample.split()
    results = bm25.get_top_n(tokenized_query, tokenized_corpus, n=10)

    result_emb = []

    for x in results:
        result_emb.append(parent_embeddings_list[docs.index(' '.join(str(e) for e in x))])

    cos_matrix = pairwise.cosine_similarity(child_embeddings_list[i].reshape(1, -1), result_emb)
    retrieved = list(sorted(enumerate(cos_matrix[0]), key=lambda item: item[1], reverse=True))

    count = 0
    for m in retrieved:
        if ' '.join(str(e) for e in results[m[0]]) in pair[i][1]:
            count += 1

    total_prec += (count / len(pair[i][1]))

100%|██████████| 2675/2675 [02:26<00:00, 18.30it/s]


In [8]:
RECALL = total_prec / len(queries)
print(RECALL)

0.33382242990654215


#### 3. MRR

In [9]:
total_prec = 0
found = 0
for i in tqdm(range(len(queries))):

    sample = queries[i]
    tokenized_query = sample.split()
    results = bm25.get_top_n(tokenized_query, tokenized_corpus, n=10)

    result_emb = []

    for x in results:
        result_emb.append(parent_embeddings_list[docs.index(' '.join(str(e) for e in x))])

    cos_matrix = pairwise.cosine_similarity(child_embeddings_list[i].reshape(1, -1), result_emb)
    retrieved = list(sorted(enumerate(cos_matrix[0]), key=lambda item: item[1], reverse=True))

    index = 1
    for m in retrieved:
        if ' '.join(str(e) for e in results[m[0]]) in pair[i][1]:
            break
        index += 1

    total_prec += (1/index)

100%|██████████| 2675/2675 [02:25<00:00, 18.38it/s]


In [10]:
MRR = total_prec / len(queries)
print(MRR)

0.2586014349098455


# Results

In [11]:
print("MAP@10: " + str(MAP))
print("Recall@10: " + str(RECALL))
print("MRR@10: " + str(MRR))

MAP@10: 0.19883865895267738
Recall@10: 0.33382242990654215
MRR@10: 0.2586014349098455
