In [3]:
import numpy as np
#sklearn
from lightning.classification import LinearSVC
from sklearn.model_selection import GridSearchCV
import logging
import datasets
import os
from sklearn.metrics.pairwise import euclidean_distances

In [106]:
# Uses SciRepEval code to train a support vector classifier
RANDOM_STATE=42
def classify(x_train: np.ndarray, x_test: np.ndarray, y_train: np.ndarray, cv: int = 3,
             n_jobs: int = 5):
    Cs = np.logspace(-2, 2, 5)
    estimator = LinearSVC(loss="squared_hinge", random_state=RANDOM_STATE)
    svm = GridSearchCV(estimator=estimator, cv=cv, param_grid={'C': Cs}, verbose=1, n_jobs=n_jobs)
    svm.fit(x_train, y_train)
    preds = svm.predict(x_test)
    return preds

In [71]:
# Load corpus ids
qpaper = []
cpaper = []
rpaper = []

with open ('qpaper_to_emb', 'r') as f:
    for line in f:
        qpaper.append(line.strip())

with open ('cpaper_to_emb', 'r') as f:
    for line in f:
        cpaper.append(line.strip())

with open ('rpaper_to_emb', 'r') as f:
    for line in f:
        rpaper.append(line.strip())

In [7]:
# Load SPECTER embeddings
q_emb = np.loadtxt('qpaper.specter')
c_emb = np.loadtxt('cpaper.specter')
r_emb = np.loadtxt('rpaper.specter')

In [15]:
q_emb.dtype, q_emb.shape, c_emb.dtype, c_emb.shape, r_emb.dtype, r_emb.shape

(dtype('float64'),
 (115, 768),
 dtype('float64'),
 (637, 768),
 dtype('float64'),
 (234, 768))

In [96]:
# Set up the qid, cid, label dict
qrel = {}
with open('reduced.test.qrel.cid', 'r') as f:
    for line in f:
        temp = line.split(' ')
        qid = temp[0].strip()
        cid = temp[1].strip()
        label = temp[2].strip()
        if qid not in qrel:
            qrel[qid] = {}
        qrel[qid][cid] = label

In [32]:
# Count the positive labels
positive_counter = 0
qrel = {}
with open('reduced.test.qrel.cid', 'r') as f:
    for line in f:
        temp = line.split(' ')
        label = temp[2].strip()
        if label == "1":
            positive_counter += 1
positive_counter            

78

In [57]:
# Set up corpus id and retrieved id dict
corpus_r = {}
q_counter = set()
c_counter = set()
with open('link-recorder-final-1', 'r') as f:
    for line in f:
        temp = line.split('\t')
        corpus_id = temp[0].strip()
        r_id = temp[1].strip()
        if corpus_id in qpaper:
            q_counter.add(corpus_id)
            if corpus_id not in corpus_r:
                corpus_r[corpus_id] = {r_id}
            else:
                corpus_r[corpus_id].add(r_id)
        
        if corpus_id in cpaper:
            c_counter.add(corpus_id)
            if corpus_id not in corpus_r:
                corpus_r[corpus_id] = {r_id}
            else:
                corpus_r[corpus_id].add(r_id)
           
len(corpus_r)

221

In [50]:
len(q_counter), len(c_counter)

(28, 193)

In [58]:
r_corpus = {}
for k, v in corpus_r.items():
    v = list(v)
    for i in range(len(v)):
        if v[i] not in r_corpus:
            r_corpus[v[i]] = {k}
        else:
            r_corpus[v[i]].add(k)

In [59]:
len(r_corpus)

234

In [36]:
# ratio of positive labels in test set
78 / 655

0.11908396946564885

In [61]:
# an ideal number of negative samples in train set
234 / 0.119

1966.3865546218487

In [64]:
# negative samples needed per each positive sample
1966 / 234

8.401709401709402

In [65]:
# Final total for X_train
234 * 9

2106

In [77]:
import random

def generate_random_numbers(n, range_max, exclude):
    numbers = set(range(range_max)) - {exclude}
    return random.sample(numbers, n)

In [82]:
# Concatenate the test set paper emb and retrieved paper emb to construct X_train and x_test
# Turns out that there are two retrieved papers that each matches with two test set papers. 
X_train = []

for idx in range(len(rpaper)):
    r_id = rpaper[idx]
    retrieved_emb = r_emb[idx]
    t_ids = r_corpus[r_id]
    for t_id in t_ids:
        if t_id in qpaper:
            index = qpaper.index(t_id)
            t_emb = q_emb[index]
        else:
            index = cpaper.index(t_id)
            t_emb = c_emb[index]
        p_train_sample = np.hstack((t_emb, retrieved_emb))
        X_train.append(p_train_sample)
        r_ids = generate_random_numbers(8, len(rpaper), idx)
        for id in r_ids:
            r_id = rpaper[id]
            retrieved_emb = r_emb[id]
            n_train_sample = np.hstack((t_emb, retrieved_emb))
            X_train.append(n_train_sample)

X_train = np.array(X_train, dtype=np.float64)
X_train.shape

since Python 3.9 and will be removed in a subsequent version.
  return random.sample(numbers, n)


(2124, 1536)

In [92]:
# Create y_train where one positive label is followed by 8 negative labels. 
pattern = [1] + [0] * 8
full_repetitions = 2124 // len(pattern)
y_train = (pattern * (full_repetitions))
y_train = np.array(y_train, dtype=np.float64)
y_train.shape

(2124,)

In [101]:
X_test, y_test = [], []
for qid, cdict in qrel.items():
    q = q_emb[qpaper.index(qid)]
    for cid, label in cdict.items():
        c = c_emb[cpaper.index(cid)]
        train_sample = np.hstack((q, c))
        X_test.append(train_sample)
        y_test.append(int(label))
X_test = np.array(X_test, dtype=np.float64)
y_test = np.array(y_test, dtype=np.float64)
X_test.shape, y_test.shape

((622, 1536), (622,))

In [107]:
pred = classify(X_train, X_test, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


In [108]:
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [112]:
num_correct = 0
for i in range(len(pred)):
    if pred[i] == y_test[i]:
        num_correct += 1
accuracy = num_correct  / len(pred)
accuracy

0.8745980707395499