In [1]:
!pip install --upgrade transformers==4.2

[0mCollecting transformers==4.2
  Downloading transformers-4.2.0-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m93.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers==0.9.4
  Downloading tokenizers-0.9.4-cp39-cp39-manylinux2010_x86_64.whl (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m207.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting filelock
  Downloading filelock-3.12.4-py3-none-any.whl (11 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m348.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting regex!=2019.12.17
  Downloading regex-2023.10.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m773.3/773.3 kB[0m [31m346.5 MB/s[0m eta [36m0:00:0

In [2]:
!pip install torch tensorflow flax

[0mCollecting torch
  Downloading torch-2.1.0-cp39-cp39-manylinux1_x86_64.whl (670.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m296.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting tensorflow
  Downloading tensorflow-2.14.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (489.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m489.8/489.8 MB[0m [31m230.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting flax
  Downloading flax-0.7.4-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m357.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu12==8.9.2.26
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m265.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12

In [3]:
!pip install scikit-learn

[0mCollecting scikit-learn
  Downloading scikit_learn-1.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m254.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-1.3.1 threadpoolctl-3.2.0
[0m

In [144]:
import numpy as np
from collections import defaultdict
from typing import List, Tuple

import pandas as pd

from transformers import AutoTokenizer, AutoModel
from sklearn.random_projection import GaussianRandomProjection

import pickle

Build a vector database

In [145]:
def cosine(v1: np.ndarray, v2: np.ndarray) -> float:
    result = np.dot(v1, v2) / (np.linalg.norm(v1)*np.linalg.norm(v2))
    return result

In [146]:
class VectorDatabase:
    def __init__(self):
        self.vectors = defaultdict(np.ndarray)

    def insert(self, key: str, vector: np.ndarray) -> None:
        self.vectors[key] = vector

    def search(self, query_vector: np.ndarray, k: int) -> List[Tuple[str, float]]:
        similarities = [(key, cosine(query_vector, vector)) for key, vector in self.vectors.items()]
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:k]

    def retrieve(self, key: str) -> np.ndarray:
        return self.vectors.get(key, None)

In [147]:
%%time

# get embeddings from the file
df = pd.read_csv("cord_19_embeddings_2022-06-02.csv", header=None)

CPU times: user 2min 30s, sys: 9.17 s, total: 2min 39s
Wall time: 2min 39s


In [148]:
# create a vector database
vector_db = VectorDatabase()

In [149]:
%%time

# insert embeddings into the database
for idx, row in df.iterrows():
    key = row[0]
    embedding = np.asarray(row[1:].tolist())
        
    # Gaussian random projection    
    embedding = embedding.reshape(1, -1)
    transformer = GaussianRandomProjection(n_components=2)
    embedding_new = transformer.fit_transform(embedding)
    embedding_new = np.squeeze(embedding_new)
    
    vector_db.insert(key, embedding_new)

CPU times: user 11min 3s, sys: 15.7 s, total: 11min 19s
Wall time: 11min 19s


In [150]:
vector_db.retrieve("ug7v899j").shape

(2,)

In [151]:
vector_db.retrieve("ug7v899j")

array([47.87471967, -8.36467207])

In [152]:
# # save the vector database to a file
# with open("CORD19_vector_database.p", "wb") as f:
#     pickle.dump(vector_db, f, pickle.HIGHEST_PROTOCOL)

In [153]:
# # load the vector database
# with open("CORD19_vector_database.p", "rb") as f:
#     vector_db = pickle.load(f)

Embed the query

In [154]:
# load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
model = AutoModel.from_pretrained('allenai/specter')

In [155]:
paper = {"title": '''Electrospray ionisation-cleavable tandem nucleic acid mass tag–peptide nucleic acid conjugates: synthesis and applications to quantitative genomic analysis using electrospray ionisation-MS/MS''',
        "abstract": '''The synthesis and characterization of isotopomer tandem nucleic acid mass tag–peptide nucleic acid (TNT–PNA) conjugates is described along with their use as electrospray ionisation-cleavable (ESI-Cleavable) hybridization probes for the detection and quantification of target DNA sequences by electrospray ionisation tandem mass spectrometry (ESI-MS/MS). ESI-cleavable peptide TNT isotopomers were introduced into PNA oligonucleotide sequences in a total synthesis approach. These conjugates were evaluated as hybridization probes for the detection and quantification of immobilized synthetic target DNAs using ESI-MS/MS. In these experiments, the PNA portion of the conjugate acts as a hybridization probe, whereas the peptide TNT is released in a collision-based process during the ionization of the probe conjugate in the electrospray ion source. The cleaved TNT acts as a uniquely resolvable marker to identify and quantify a unique target DNA sequence. The method should be applicable to a wide variety of assays requiring highly multiplexed, quantitative DNA/RNA analysis, including gene expression monitoring, genetic profiling and the detection of pathogens.'''}

In [156]:
query  = paper.get('title') + tokenizer.sep_token + (paper.get('abstract') or '')
inputs = tokenizer(query, padding=True, truncation=True, return_tensors="pt", max_length=512)
result = model(**inputs)

query_embedding = result.last_hidden_state[:, 0, :]
query_embedding = query_embedding.detach().numpy()
query_embedding = query_embedding[0]
query_embedding.shape

(768,)

In [157]:
# Gaussian random projection
query_embedding = query_embedding.reshape(1, -1)
query_embedding_new = transformer.fit_transform(query_embedding)
query_embedding_new = np.squeeze(query_embedding_new)
query_embedding_new.shape

(2,)

In [158]:
query_embedding_new

array([-18.344776, -23.472614], dtype=float32)

Query the database with the query embedding

In [159]:
%%time
result = vector_db.search(query_embedding_new, k=10)

CPU times: user 6.52 s, sys: 19.9 ms, total: 6.54 s
Wall time: 6.54 s


In [160]:
result

[('vrf0glt3', 0.9999999639101045),
 ('pzeswvki', 0.999999963887779),
 ('r719jgyp', 0.9999999638794606),
 ('5orzi554', 0.9999999638447985),
 ('kat403cq', 0.9999999638437028),
 ('9lzj1w13', 0.9999999638300053),
 ('gutuidls', 0.9999999638278902),
 ('2esmerz6', 0.9999999638171791),
 ('duog6bw3', 0.9999999636941234),
 ('my3j2ox0', 0.9999999634843472)]

In [166]:
# res = result[0][0]
res = "ug7v899j"

Retrieve the article corresponding to the best result

In [162]:
# metadata_df = pd.read_csv("metadata.csv")

  metadata_df = pd.read_csv("metadata.csv")


In [167]:
metadata_df[metadata_df.cord_uid == res]

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,


In [164]:
doi = metadata_df[metadata_df.cord_uid == res]["doi"].to_string(index=False)
title = metadata_df[metadata_df.cord_uid == res]["title"].to_string(index=False)
abstract = metadata_df[metadata_df.cord_uid == res]["abstract"].to_string(index=False)

In [165]:
print(f'''
doi: {doi}
title: {title}
abstract: {abstract}''')


doi: 10.1177/09722629211043299
title: Factors Impacting Resilience of Women Entrepren...
abstract: Although enough research has been carried out o...
