# Citation Recommendation on Scholarly Legal Articles

## Law2Vec

### Libraries

In [7]:
import pickle
import numpy as np
from tqdm import tqdm
from sklearn.metrics import pairwise

### Dataset

In [8]:
with open('/Users/dgknrsln/Documents/pythonProject/EXPERIMENTS/LATEST_test_docs.pkl', 'rb') as f:
    docs = pickle.load(f)

with open('/Users/dgknrsln/Documents/pythonProject/EXPERIMENTS/LATEST_test_queries.pkl', 'rb') as f:
    queries = pickle.load(f)

with open('/Users/dgknrsln/Documents/pythonProject/EXPERIMENTS/LATEST_test_data.pkl', 'rb') as f:
    pair = pickle.load(f)

### Get Word Vectors

##### 1. Load Pretrained Word Vectors

In [9]:
from gensim.models import KeyedVectors

vecs = KeyedVectors.load_word2vec_format('/Users/dgknrsln/Documents/pythonProject/EXPERIMENTS/EMBEDDINGS/Law2Vec/Law2Vec.200d.txt', binary=False)

##### 2. Calculate Text Embeddings for Parent Articles

In [12]:
parent_emb = []
for i in tqdm(range(len(docs))):
    sample = docs[i]
    emb = np.zeros((200,))
    count = 0
    for m in sample.split():
        if m in list(vecs.key_to_index.keys()):
            emb = np.add(emb, vecs.get_vector(m))
            count += 1

    if count == 0:
        print(i)
        break

    emb = emb / count

    parent_emb.append([sample[0], sample[1], emb])

100%|██████████| 3050/3050 [1:26:01<00:00,  1.69s/it]


In [14]:
parent_emb[0]

['=',
 '6',
 array([ 3.97850425e-03,  1.00619585e-01,  1.00869127e-02,  5.02373433e-03,
        -4.57001598e-02,  4.77381499e-02,  1.57776273e-02, -2.68681483e-01,
         6.93469123e-02,  3.88348616e-02, -1.12256512e-01, -1.78832757e-01,
        -1.36765893e-01, -1.11228866e-01,  3.74681109e-02,  2.43146314e-01,
        -1.80955784e-01, -3.54724614e-02,  1.38232906e-01, -2.18817504e-01,
        -4.56883992e-02, -1.71863802e-01,  1.79219684e-01,  5.39135774e-02,
        -1.80302575e-01, -2.02870065e-02,  1.63779228e-01, -6.31798438e-02,
        -1.07657642e-01, -1.71255429e-01,  3.82156787e-01, -8.89885980e-02,
        -1.93573924e-01, -5.12881756e-02,  3.68622367e-02,  1.13818717e-01,
         3.04673927e-02,  1.73152665e-02, -6.68234741e-02, -8.41478789e-02,
        -6.15781723e-02, -1.41522299e-01,  1.37841335e-01,  1.20675659e-01,
         5.46632542e-02, -5.87175812e-02, -9.71874445e-02,  9.17159822e-02,
        -3.18387481e-02,  1.13423856e-01,  3.87840320e-02,  2.27526053e-01,


In [21]:
with open("LATEST_law2vec_parent_embeddings.pkl", 'wb') as f:
    pickle.dump(parent_emb, f)

##### 3. Calculate Text Embeddings for Child Articles

In [22]:
child_emb = []
for i in tqdm(range(len(queries))):
    sample = queries[i]
    emb = np.zeros((200,))
    count = 0
    for m in sample.split():
        if m in list(vecs.key_to_index.keys()):
            emb = np.add(emb, vecs.get_vector(m))
            count += 1

    if count == 0:
        print(i)
        break

    emb = emb / count

    child_emb.append(emb)

100%|██████████| 2675/2675 [2:38:06<00:00,  3.55s/it]     


In [23]:
with open("law2vec_child_embeddings.pkl", 'wb') as f:
    pickle.dump(child_emb, f)

### Evaluate

In [24]:
with open('/Users/dgknrsln/Documents/pythonProject/EXPERIMENTS/parent_text.pkl', 'rb') as f:
    parent = pickle.load(f)

with open('/Users/dgknrsln/Documents/pythonProject/EXPERIMENTS/child_text.pkl', 'rb') as f:
    child = pickle.load(f)

with open('/Users/dgknrsln/Documents/pythonProject/EXPERIMENTS/parent_child_match.pkl', 'rb') as f:
    parent_child_match = pickle.load(f)

In [4]:
with open('/Users/dgknrsln/Documents/pythonProject/EXPERIMENTS/EMBEDDINGS/Law2Vec/law2vec_parent_embeddings.pkl', 'rb') as f:
    parent_embeddings_list = pickle.load(f)

with open('/Users/dgknrsln/Documents/pythonProject/EXPERIMENTS/EMBEDDINGS/Law2Vec/law2vec_child_embeddings.pkl', 'rb') as f:
    child_embeddings_list = pickle.load(f)

In [5]:
parent_embeddings = []
for i in parent_embeddings_list:
    parent_embeddings.append(i[2])

child_embeddings = []
for i in child_embeddings_list:
    child_embeddings.append(i[2])

In [41]:
parent_emd = []
for i in parent_emb:
    parent_emd.append(i[2])

In [43]:
len(parent_emd[0])

200

In [44]:
cos_matrix = pairwise.cosine_similarity(child_emb, parent_emd)

In [45]:
cos_matrix.shape


(2675, 3050)

### 1. MAP

In [46]:
found = 0
total_prec = 0
for i in tqdm(range(len(cos_matrix))):
    sample = cos_matrix[i]
    results = list(sorted(enumerate(sample), key=lambda item: item[1], reverse=True))

    embd_set = set()
    total_list = []
    for r in results:
        if r[1] not in embd_set:
            embd_set.add(r[1])
            total_list.append(r)

    results = total_list[:10]

    count = 0
    precision = 0
    index = 0
    for m in results:
        if docs[m[0]] in pair[i][1]:
            count += 1
            precision += count/(index+1)
        index += 1

    if count == 0:
        precision = 0
    else:
        found += 1
        precision /= count

    total_prec += precision

100%|██████████| 2675/2675 [00:07<00:00, 347.74it/s]


In [47]:
MAP = total_prec / len(docs)

### 2. Recall

In [48]:
found = 0
total_prec = 0
for i in tqdm(range(len(cos_matrix))):
    sample = cos_matrix[i]
    results = list(sorted(enumerate(sample), key=lambda item: item[1], reverse=True))

    embd_set = set()
    total_list = []
    for r in results:
        if r[1] not in embd_set:
            embd_set.add(r[1])
            total_list.append(r)

    results = total_list[:10]

    count = 0
    for m in results:
        if docs[m[0]] in pair[i][1]:
            count += 1

    total_prec += (count / len(pair[i][1]))

100%|██████████| 2675/2675 [00:07<00:00, 349.18it/s]


In [49]:
RECALL = total_prec / len(docs)

### 3. MRR

In [50]:
found = 0
total_prec = 0
for i in tqdm(range(len(cos_matrix))):
    sample = cos_matrix[i]
    results = list(sorted(enumerate(sample), key=lambda item: item[1], reverse=True))

    embd_set = set()
    total_list = []
    for r in results:
        if r[1] not in embd_set:
            embd_set.add(r[1])
            total_list.append(r)

    results = total_list[:10]

    index = 1
    for m in results:
        if docs[m[0]] in pair[i][1]:
            break
        index += 1

    total_prec += (1/index)

100%|██████████| 2675/2675 [00:07<00:00, 349.98it/s]


In [51]:
MRR = total_prec / len(docs)

In [52]:
print("MAP@10: " + str(MAP))
print("Recall@10: " + str(RECALL))
print("MRR@10: " + str(MRR))

MAP@10: 0.10962691907364026
Recall@10: 0.2078469945355191
MRR@10: 0.16964635819553103
