In [None]:
!pip install --upgrade transformers==4.2

In [None]:
!pip install torch tensorflow flax

In [8]:
import numpy as np
from collections import defaultdict
from typing import List, Tuple

import pandas as pd

from transformers import AutoTokenizer, AutoModel

Build a vector database

In [21]:
class VectorDatabase:
    def __init__(self):
        self.vectors = defaultdict(np.ndarray)

    def insert(self, key: str, vector: np.ndarray) -> None:
        self.vectors[key] = vector

    def search(self, query_vector: np.ndarray, k: int) -> List[Tuple[str, float]]:
        similarities = [(key, cosine(query_vector, vector)) for key, vector in self.vectors.items()]
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:k]

    def retrieve(self, key: str) -> np.ndarray:
        return self.vectors.get(key, None)

In [22]:
def cosine(v1: np.ndarray, v2: np.ndarray) -> float:
    result = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    return result

In [23]:
# get embeddings from the file
df = pd.read_csv("cord_19_embeddings_2022-06-02_test.csv", header=None)

In [24]:
# create a vector database
vector_db = VectorDatabase()

In [25]:
# insert embeddings into the database
for idx, row in df.iterrows():
    key = row[0]
    embedding = np.asarray(row[1:].tolist())
    vector_db.insert(key, embedding)

Embed the query

In [26]:
# load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
model = AutoModel.from_pretrained('allenai/specter')

In [27]:
query = "combinations of features predispose cohorts to virus susceptibility"

In [28]:
inputs = tokenizer(query, padding=True, truncation=True, return_tensors="pt", max_length=512)

In [29]:
result = model(**inputs)

In [30]:
query_embedding = result.last_hidden_state[:, 0, :]

In [31]:
query_embedding.shape

torch.Size([1, 768])

In [35]:
query_embedding = query_embedding.detach().numpy()

Query the database with the query embedding

In [51]:
%%time
result = vector_db.search(query_embedding, k=5)

CPU times: user 16.7 ms, sys: 0 ns, total: 16.7 ms
Wall time: 15.7 ms


In [40]:
result

[('navrmhqm', array([0.09786049])),
 ('q26f8pv4', array([0.09138111])),
 ('xjspi65a', array([0.08952262])),
 ('t0y0b0gb', array([0.08932958])),
 ('2p7qrgx0', array([0.08474437]))]

Retrieve the article corresponding to the best result

In [44]:
metadata_df = pd.read_csv("metadata_test.csv")

In [50]:
metadata_df[metadata_df.cord_uid == "navrmhqm"]

Unnamed: 0.1,Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
143,143,navrmhqm,ff19a75ba16c97c96201fc7f4fe6dd56e05167fa,PMC,The intrinsically disordered C‐terminal domain...,10.1110/ps.051411805,PMC2279309,16046624,no-cc,"Measles virus is a negative‐sense, single‐stra...",2009-01-01,"Bourhis, Jean‐Marie; Receveur‐Bréchot, Véroniq...",Protein Sci,,,,document_parses/pdf_json/ff19a75ba16c97c96201f...,document_parses/pmc_json/PMC2279309.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,


In [47]:
doi = metadata_df[metadata_df.cord_uid == "navrmhqm"]["doi"]
title = metadata_df[metadata_df.cord_uid == "navrmhqm"]["title"]

Unnamed: 0.1,Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
1,1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,,,,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
2,2,ejv2xln0,06ced00a5fc04215949aa72528f2eeaae1d58927,PMC,Surfactant protein-D and pulmonary host defense,10.1186/rr19,PMC59549,11667972,no-cc,Surfactant protein-D (SP-D) participates in th...,2000-08-25,"Crouch, Erika C",Respir Res,,,,document_parses/pdf_json/06ced00a5fc04215949aa...,document_parses/pmc_json/PMC59549.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
3,3,2b73a28n,348055649b6b8cf2b9a376498df9bf41f7123605,PMC,Role of endothelin-1 in lung disease,10.1186/rr44,PMC59574,11686871,no-cc,Endothelin-1 (ET-1) is a 21 amino acid peptide...,2001-02-22,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Respir Res,,,,document_parses/pdf_json/348055649b6b8cf2b9a37...,document_parses/pmc_json/PMC59574.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
4,4,9785vg6d,5f48792a5fa08bed9f56016f4981ae2ca6031b32,PMC,Gene expression in epithelial cells in respons...,10.1186/rr61,PMC59580,11686888,no-cc,Respiratory syncytial virus (RSV) and pneumoni...,2001-05-11,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",Respir Res,,,,document_parses/pdf_json/5f48792a5fa08bed9f560...,document_parses/pmc_json/PMC59580.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,pci4rhkn,6cc3a580bd820899b916d05d1ef06b478b367b35,PMC,Automatic Detection and Quantification of Tree...,10.1109/tbme.2012.2190984,PMC3511590,22434795,cc-by,This study presents a novel computer-assisted ...,2012-03-14,"Bagci, Ulas; Yao, Jianhua; Wu, Albert; Caban, ...",IEEE Trans Biomed Eng,,,,document_parses/pdf_json/6cc3a580bd820899b916d...,document_parses/pmc_json/PMC3511590.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
996,996,880nqc0f,ed2de3694f5580ea38f4adf24bd1c8b46862df1f,PMC,Mannose-binding lectin deficiency and acute ex...,10.2147/copd.s33714,PMC3514010,23226013,no-cc,BACKGROUND: Mannose-binding lectin is a collec...,2012-11-23,"Albert, Richard K; Connett, John; Curtis, Jeff...",Int J Chron Obstruct Pulmon Dis,,,,document_parses/pdf_json/ed2de3694f5580ea38f4a...,document_parses/pmc_json/PMC3514010.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
997,997,01b0vnnm,2a7c951e191425fd9fa5ac108f07a1f02eb75872,PMC,The changing phenotype of microglia from homeo...,10.1186/2047-9158-1-9,PMC3514090,23210447,cc-by,It has been nearly a century since the early d...,2012-04-24,"Luo, Xiao-Guang; Chen, Sheng-Di",Transl Neurodegener,,,,document_parses/pdf_json/2a7c951e191425fd9fa5a...,document_parses/pmc_json/PMC3514090.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
998,998,5b29wtim,854e623d1f875e4605b2ffd3f72599d063a56cc0,PMC,Diversity of Salmonella spp. serovars isolated...,10.1186/1746-6148-8-201,PMC3514206,23098237,cc-by,BACKGROUND: Salmonellosis in water buffalo (Bu...,2012-10-25,"Borriello, Giorgia; Lucibelli, Maria G; Pescia...",BMC Vet Res,,,,document_parses/pdf_json/854e623d1f875e4605b2f...,document_parses/pmc_json/PMC3514206.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
