In [2]:
!pip install --upgrade transformers==4.2

[0mCollecting transformers==4.2
  Downloading transformers-4.2.0-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m173.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers==0.9.4
  Downloading tokenizers-0.9.4-cp39-cp39-manylinux2010_x86_64.whl (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m279.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting filelock
  Downloading filelock-3.12.4-py3-none-any.whl (11 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m343.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting regex!=2019.12.17
  Downloading regex-2023.10.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m773.3/773.3 kB[0m [31m341.3 MB/s[0m eta [36m0:00:

In [3]:
!pip install torch tensorflow flax

[0mCollecting torch
  Downloading torch-2.1.0-cp39-cp39-manylinux1_x86_64.whl (670.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m156.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting tensorflow
  Downloading tensorflow-2.14.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (489.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m489.8/489.8 MB[0m [31m277.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting flax
  Downloading flax-0.7.4-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m342.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sympy
  Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m295.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64

In [74]:
!pip install scikit-learn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting scikit-learn
  Downloading scikit_learn-1.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m101.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-1.3.1 threadpoolctl-3.2.0
[0m

In [1]:
import numpy as np
from collections import defaultdict
from typing import List, Tuple

import pandas as pd

from transformers import AutoTokenizer, AutoModel

import pickle

Build a vector database

In [2]:
def cosine(v1: np.ndarray, v2: np.ndarray) -> float:
    result = np.dot(v1, v2) / (np.linalg.norm(v1)*np.linalg.norm(v2))
    return result

In [3]:
class VectorDatabase:
    def __init__(self):
        self.vectors = defaultdict(np.ndarray)

    def insert(self, key: str, vector: np.ndarray) -> None:
        self.vectors[key] = vector

    def search(self, query_vector: np.ndarray, k: int) -> List[Tuple[str, float]]:
        similarities = [(key, cosine(query_vector, vector)) for key, vector in self.vectors.items()]
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:k]

    def retrieve(self, key: str) -> np.ndarray:
        return self.vectors.get(key, None)

In [4]:
%%time

# get embeddings from the file
df = pd.read_csv("cord_19_embeddings_2022-06-02.csv", header=None)

CPU times: user 2min 36s, sys: 13.6 s, total: 2min 50s
Wall time: 2min 50s


In [5]:
# create a vector database
vector_db = VectorDatabase()

In [6]:
%%time

# insert embeddings into the database
for idx, row in df.iterrows():
    key = row[0]
    embedding = np.asarray(row[1:].tolist())
    vector_db.insert(key, embedding)

CPU times: user 3min 25s, sys: 15.3 s, total: 3min 41s
Wall time: 3min 41s


In [145]:
vector_db.retrieve("ug7v899j")

array([-2.93998361e+00, -6.31220055e+00, -1.04590309e+00,  5.16416264e+00,
       -3.25646371e-01, -2.50741339e+00,  1.73560870e+00,  1.93635666e+00,
        6.22501016e-01,  1.56131625e+00,  2.74769139e+00, -6.93097711e-01,
       -3.71040940e-01, -5.88572502e+00, -1.09748483e+00, -4.65481567e+00,
       -2.31646371e+00,  2.87495947e+00,  3.87578082e+00, -2.95981139e-01,
        1.77025485e+00,  4.40059328e+00, -4.05934751e-01, -1.71624398e+00,
       -4.76663411e-01, -2.57071376e+00,  4.59941769e+00,  1.63527846e-01,
       -1.83473134e+00,  2.55786443e+00, -3.79167843e+00, -6.22489691e+00,
       -3.53383869e-01, -1.08378506e+00, -1.04406953e-01, -2.09846401e+00,
       -1.63153899e+00, -5.29200658e-02, -4.06286144e+00, -1.28650916e+00,
       -1.26636600e+00, -1.17296696e+00,  4.73816156e+00, -3.18683219e+00,
       -1.56449735e+00,  7.11834431e-01, -3.77992725e+00,  8.74405861e-01,
        3.15015078e+00, -1.44715142e+00,  1.34749025e-01,  1.89116776e+00,
        2.67670965e+00, -

In [119]:
tmp1.shape

(768,)

In [7]:
# save the vector database to a file
with open("CORD19_vector_database.p", "wb") as f:
    pickle.dump(vector_db, f, pickle.HIGHEST_PROTOCOL)

In [5]:
# load the vector database
with open("CORD19_vector_database.p", "rb") as f:
    vector_db = pickle.load(f)

KeyboardInterrupt: 

Embed the query

In [7]:
# load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
model = AutoModel.from_pretrained('allenai/specter')

In [164]:
paper = {"title": '''Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia''',
        "abstract": '''Objective'''}

In [165]:
query = paper.get('title') + tokenizer.sep_token + (paper.get('abstract') or '')

In [166]:
query

'Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia[SEP]Objective'

In [167]:
inputs = tokenizer(query, padding=True, truncation=True, return_tensors="pt", max_length=512)

In [168]:
result = model(**inputs)

In [169]:
query_embedding = result.last_hidden_state[:, 0, :]

In [170]:
query_embedding = query_embedding.detach().numpy()

In [154]:
query_embedding = query_embedding[0]

In [171]:
query_embedding.shape

(1, 768)

Query the database with the query embedding

In [161]:
%%time
result = vector_db.search(query_embedding, k=100)

CPU times: user 7.74 s, sys: 0 ns, total: 7.74 s
Wall time: 7.73 s


In [158]:
res = result[0][0]

Retrieve the article corresponding to the best result

In [159]:
# metadata_df = pd.read_csv("metadata.csv")

In [160]:
metadata_df[metadata_df.cord_uid == res]

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
573642,1t55jhaa,,WHO,Coherent Stokes and anti-Stokes high-order com...,,,,unk,"In this Letter, we report on experimental effe...",2020,"Shevchenko, M A; Chaikov, L L; Tcherniega, N V",Spectrochim Acta A Mol Biomol Spectrosc,,#32942113,,,,,221787612.0


In [137]:
doi = metadata_df[metadata_df.cord_uid == res]["doi"].to_string(index=False)
title = metadata_df[metadata_df.cord_uid == res]["title"].to_string(index=False)
abstract = metadata_df[metadata_df.cord_uid == res]["abstract"].to_string(index=False)

In [138]:
print(f'''
doi: {doi}
title: {title}
abstract: {abstract}''')


doi: NaN
title: Probing the sensitivity to leptonic $\delta_{CP...
abstract: One of the main neutrino oscillation parameters...
