# Create Annoy Index

In [1]:
%load_ext autoreload
%autoreload 2
from cord.core import cord_support_dir, DOCUMENT_VECTOR_LENGTH, DOCUMENT_VECTOR_PATH, lookup_by_sha, SIMILARITY_INDEX_PATH
from cord import ResearchPapers
import pandas as pd
import numpy as np
from pathlib import Path, PurePath

## 1. Load ResearchPapers

In [2]:
# Load the research papers. Per the log printed below, this loads and cleans the
# metadata, applies tags, and builds a BM25 index over the paper abstracts.
research_papers = ResearchPapers.load()

Loading metadata from data\CORD-19-research-challenge
Cleaning metadata
Applying tags to metadata

Indexing research papers
Creating the BM25 index from the abstracts of the papers
Use index="text" if you want to index the texts of the paper instead
Finished Indexing in 35.0 seconds


In [3]:
# Work on a copy of the metadata: the document_vector column set in section 4
# is added to this copy rather than to research_papers.metadata.
metadata = research_papers.metadata.copy()

## 2. Load Document Vectors

In [4]:
# Read the document vectors from the parquet file at DOCUMENT_VECTOR_PATH.
# The following cells use its 'sha', 'pmcid' and 'document_vector' columns.
document_vectors = pd.read_parquet(DOCUMENT_VECTOR_PATH)

## 3. Create a Document Vector Map

In [5]:
# Build two plain dicts keyed by paper identifier (SHA and PMCID respectively),
# each mapping the identifier to that row's document vector.
sha_docvec_map = document_vectors.set_index('sha')['document_vector'].to_dict()
pmc_docvec_map = document_vectors.set_index('pmcid')['document_vector'].to_dict()

## 4. Set Document Vectors

In [6]:
# Placeholder assigned to any paper whose identifier has no entry in the
# document-vector maps: an all-zero float32 vector of the expected length.
EMPTY_VECTOR = np.zeros(DOCUMENT_VECTOR_LENGTH, dtype=np.float32)

# Boolean masks over the metadata rows: which papers carry a SHA / a PMCID.
has_sha = metadata.sha.notnull()
has_pmcid = metadata.pmcid.notnull()

# First pass: look every paper up by its SHA; misses get EMPTY_VECTOR.
metadata['document_vector'] = metadata.sha.apply(
    lambda sha: lookup_by_sha(sha, sha_docvec_map, not_found=EMPTY_VECTOR))

# Second pass: papers with no SHA but with a PMCID are looked up by PMCID instead.
# NOTE(review): rows that do have a SHA whose lookup missed keep EMPTY_VECTOR even
# when they also have a PMCID -- confirm whether those should fall back as well.
remaining_idxs = ~has_sha & has_pmcid
metadata.loc[remaining_idxs, 'document_vector'] = (
    metadata.loc[remaining_idxs, 'pmcid'].apply(
        lambda pmcid: lookup_by_sha(pmcid, pmc_docvec_map, not_found=EMPTY_VECTOR)))

In [7]:
# Inspect the rows selected by remaining_idxs (no SHA, but a PMCID), whose
# document vectors were set from the PMCID lookup in the previous cell.
metadata.loc[remaining_idxs]

Unnamed: 0,cord_uid,sha,source,title,doi,pmcid,pubmed_id,license,abstract,published,...,full_text_file,url,when,covid_related,virus,coronavirus,sars,index_tokens,antivirals,document_vector
16163,le0ogx1s,,PMC,A new recruit for the army of the men of death,10.1186/gb-2003-4-7-113,PMC193621,12844350,no-cc,"The army of the men of death, in John Bunyan's...",2003-06-27,...,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,17 years ago,False,False,False,False,"[army, men, death, john, bunyans, memorable, p...",,"[1.7728125, 1.4531081, 0.8247704, -0.6711002, ..."
16180,kpaqpge5,,PMC,SARS Resources,10.1186/cc2383,PMC374376,,no-cc,SARS Resources,2003-09-29,...,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,16 years ago,False,False,False,True,"[sars, resources]",,"[0.694375, 1.1047568, 0.9975648, -0.41311225, ..."
16203,myolq6r9,,PMC,Structure of a Conserved RNA Element in the SA...,10.1371/journal.pbio.0030029,PMC539063,,cc-by,Structure of a Conserved RNA Element in the SA...,2004-12-28,...,comm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,15 years ago,False,True,False,True,"[structure, conserved, rna, element, sars, vir...",,"[1.28741, 0.97307736, 0.9468804, -0.8082178, 0..."
16223,7336uosi,,PMC,Mass Spectometry–Based SARS Genotyping,10.1371/journal.pmed.0020052,PMC549598,,cc-by,Mass Spectometry–Based SARS Genotyping,2005-02-22,...,comm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,15 years ago,False,False,False,True,"[mass, sars, genotyping]",,"[0.93660665, -0.36359018, 0.26684007, -0.75781..."
16246,hlyn3a9q,,PMC,New Frameshifting Pseudoknot Found in SARS Virus,10.1371/journal.pbio.0030199,PMC1110910,,cc-by,New Frameshifting Pseudoknot Found in SARS Virus,2005-05-17,...,comm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,15 years ago,False,True,False,True,"[new, frameshifting, pseudoknot, found, sars, ...",,"[1.7002391, 0.6633573, 1.4986031, -0.028027477..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47228,jetjxf90,,PMC,Clinical analysis of 10 neonates born to mothe...,10.21037/tp.2020.02.06,PMC7036645,32154135,unk,": The newly identified 2019-nCoV, which appear...",2020-02-01,...,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,2 months ago,True,False,False,False,"[newly, identified, 2019-ncov, appears, origin...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
47233,notil1e2,,PMC,,10.15585/mmwr.mm6906a5,PMC7017960,32053580,cc-by,,2020-02-14,...,comm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,1 month ago,False,False,False,False,[],,"[0.06579252, 0.16770618, 0.5764077, -0.3882277..."
47236,ids74oez,,PMC,The E Protein Is a Multifunctional Membrane Pr...,10.1016/s1672-0229(03)01017-9,PMC5172412,15626343,cc-by-nc-nd,The E (envelope) protein is the smallest struc...,2016-11-28,...,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,3 years ago,False,True,True,True,"[envelope, protein, smallest, structural, prot...",,"[2.7915924, 1.5613698, 2.1324778, -1.397276, 1..."
47257,lku39zrf,,Elsevier,Surface-enhanced Raman scattering and DFT comp...,10.1016/j.molstruc.2007.11.019,PMC7094697,,els-covid,"Here, we report the surface-enhanced Raman sc...",2008-10-15,...,custom_license,https://doi.org/10.1016/j.molstruc.2007.11.019,11 years ago,False,False,False,False,"[report, surface-enhanced, raman, scattering, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## 5. Save Metadata

In [8]:
# Columns written out together with each paper's document vector.
COLS = [
    'cord_uid', 'sha', 'source', 'title', 'doi', 'pmcid', 'pubmed_id',
    'published', 'has_pdf_parse', 'has_pmc_xml_parse', 'full_text_file',
    'url', 'document_vector',
]

# Both output files live in the cord support directory.
METADATA_PATH = PurePath(cord_support_dir() / 'Metadata.pq')
METADATA_PATH_GZ = PurePath(cord_support_dir() / 'Metadata.csv.gz')

# Write the same column subset twice: once as parquet, once as CSV
# (the '.gz' suffix is left for pandas to interpret).
metadata_to_save = metadata[COLS]
metadata_to_save.to_parquet(METADATA_PATH)
metadata_to_save.to_csv(METADATA_PATH_GZ)

## 6. Create Annoy Index

In [9]:
from annoy import AnnoyIndex
import random

# NOTE: this rebinds the SIMILARITY_INDEX_PATH name imported from cord.core in the
# first cell; from here on it refers to 'PaperSimilarityNew.ann' in the support dir.
SIMILARITY_INDEX_PATH = str((Path(cord_support_dir()) / 'PaperSimilarityNew.ann').resolve())

# Number of trees passed to AnnoyIndex.build().
N_TREES = 30

annoy_index = AnnoyIndex(DOCUMENT_VECTOR_LENGTH, 'angular')
# Item ids are row positions (0..n-1) in `metadata`. Enumerating the column keeps
# the ids positional even if the frame's index is not a clean 0..n-1 range, and
# avoids materialising a full row Series per paper as `metadata.loc[i]` would.
for position, vector in enumerate(metadata.document_vector):
    annoy_index.add_item(position, vector)

annoy_index.build(N_TREES)
# Last expression of the cell: save() writes the index to disk and its return
# value is displayed as the cell output.
annoy_index.save(SIMILARITY_INDEX_PATH)

True

In [10]:
# Read back the parquet file written to METADATA_PATH in section 5.
metadata_lookup = pd.read_parquet(METADATA_PATH)