In [2]:
!pip install --upgrade transformers==4.2

[0m

In [3]:
!pip install torch tensorflow flax

[0m

In [4]:
import numpy as np
from collections import defaultdict
from typing import List, Tuple

import pandas as pd

from transformers import AutoTokenizer, AutoModel

# load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
model = AutoModel.from_pretrained('allenai/specter')

Build a vector database

In [5]:
class VectorDatabase:
    def __init__(self):
        self.vectors = defaultdict(np.ndarray)

    def insert(self, key: str, vector: np.ndarray) -> None:
        self.vectors[key] = vector

    def search(self, query_vector: np.ndarray, k: int) -> List[Tuple[str, float]]:
        similarities = [(key, cosine(query_vector, vector)) for key, vector in self.vectors.items()]
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:k]

    def retrieve(self, key: str) -> np.ndarray:
        return self.vectors.get(key, None)

In [6]:
def cosine(v1: np.ndarray, v2: np.ndarray) -> float:
    result = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    return result

In [53]:
# create a vector database
vector_db = VectorDatabase()

Embed the query

In [8]:
metadata_df = pd.read_csv("metadata.csv")

  metadata_df = pd.read_csv("metadata.csv")


In [29]:
keys = metadata_df['cord_uid']
titles = metadata_df['title']
abstracts = metadata_df['abstract']

In [10]:
type(titles)

pandas.core.series.Series

In [11]:
papers = []
for i in range(len(titles)):
    one_paper_dict = {}
    one_paper_dict['title'] = str(titles[i])
    one_paper_dict['abstract'] = str(abstracts[i])
    papers.append(one_paper_dict)

In [12]:
# COMMENTED BECAUSE INEFFICIENT (MAKES WHOLE EMBEDDING AT ONCE SO CONSUMES LOT OF RAM)

# # concatenate title and abstract
# title_abs = [d['title'] + tokenizer.sep_token + (d.get('abstract') or '') for d in papers]
# # preprocess the input
# inputs = tokenizer(title_abs, padding=True, truncation=True, return_tensors="pt", max_length=512)
# result = model(**inputs)
# # take the first token in the batch as the embedding
# embeddings = result.last_hidden_state[:, 0, :]

In [13]:
embeddings = []
for itr, paper in enumerate(papers):
    title_abs = [d['title'] + tokenizer.sep_token + (d.get('abstract') or '') for d in [paper]]
    # preprocess the input
    inputs = tokenizer(title_abs, padding=True, truncation=True, return_tensors="pt", max_length=512)
    result = model(**inputs)
    # take the first token in the batch as the embedding
    embeddings.append(result.last_hidden_state[:, 0, :])
    if itr == 10000:
        break

In [52]:
new_embeddings = []
for i in range(len(embeddings)):
    new_embedding = embeddings[i].detach().numpy()
    new_embedding = np.squeeze(new_embedding)
    new_embeddings.append(new_embedding)

In [50]:
# import pickle
# with open('embeddings_10000.p', 'wb') as f:
#     pickle.dump(new_embeddings, f, protocol=pickle.HIGHEST_PROTOCOL)

In [60]:
len(new_embeddings)

10001

In [62]:
# CURRENTLY IT WILL INSERT INTO THE ALREADY EXISTING DATABASE, BUT IT SHOULDN'T MATTER
# keys = np.random.randint(1000, size=40)
# insert embeddings into the database
for i, el in enumerate(new_embeddings):
    key = keys[i]   
    vector_db.insert(key, el)

In [71]:
query = "Hypertension"
inputs = tokenizer(query, padding=True, truncation=True, return_tensors="pt", max_length=512)
result = model(**inputs)
query_embedding = result.last_hidden_state[:, 0, :]
query_embedding = query_embedding.detach().numpy()

In [72]:
%%time
result = vector_db.search(query_embedding, k=5)
result # result will give the key for the best matched paper

CPU times: user 188 ms, sys: 4.11 ms, total: 192 ms
Wall time: 191 ms


[('ahiekwch', array([0.83187574], dtype=float32)),
 ('g6jae62w', array([0.8142808], dtype=float32)),
 ('yww098fu', array([0.8142808], dtype=float32)),
 ('umnn0kea', array([0.8142808], dtype=float32)),
 ('1fka66ul', array([0.8142808], dtype=float32))]

In [73]:
best_result = result[0][0]

In [76]:
metadata_df[metadata_df['cord_uid'] == best_result]

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
7692,ahiekwch,,PMC,Special assorted cardiovascular topics,10.1016/j.pcad.2020.03.016,PMC7118538,32224111,no-cc,,2020-03-26,"Lavie, Carl J.",Prog Cardiovasc Dis,,,,,document_parses/pmc_json/PMC7118538.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,
