# Create Annoy Index

In [1]:
%load_ext autoreload
%autoreload 2
from cord.core import cord_support_dir, DOCUMENT_VECTOR_LENGTH, DOCUMENT_VECTOR_PATH, lookup_by_sha, SIMILARITY_INDEX_PATH
from cord import ResearchPapers
import pandas as pd
import numpy as np
from pathlib import Path, PurePath

## 1. Load ResearchPapers

In [2]:
# Load the research-paper collection. Per the recorded output below, this step
# loads and cleans the metadata, applies tags, and builds a BM25 index from the
# paper abstracts.
research_papers = ResearchPapers.load()

Loading metadata from ..\data\CORD-19-research-challenge
Cleaning metadata
Applying tags to metadata

Indexing research papers
Creating the BM25 index from the abstracts of the papers
Use index="text" if you want to index the texts of the paper instead
Finished Indexing in 33.0 seconds


In [3]:
# Take a copy of the papers' metadata; later cells add a 'document_vector'
# column to this copy rather than to research_papers.metadata.
# NOTE(review): assumes .metadata is a pandas DataFrame (it is used with .loc below) — confirm.
metadata = research_papers.metadata.copy()

## 2. Load Document Vectors

In [4]:
# Read the document-vector table from the parquet file at DOCUMENT_VECTOR_PATH.
# Later cells use its 'sha', 'pmcid', 'document_vector' and 'document_vector_2d' columns.
document_vectors = pd.read_parquet(DOCUMENT_VECTOR_PATH)

In [7]:
# Spot-check: the 'document_vector_2d' value stored for the row labelled 0.
document_vectors.loc[0, 'document_vector_2d']

15.698082

## 3. Create a Document Vector Map

In [5]:
def _docvec_map(key_column):
    """Return a dict mapping each value of `key_column` to that row's 'document_vector'."""
    keyed = document_vectors.set_index(key_column)
    return keyed['document_vector'].to_dict()


# Two lookup tables over the same vectors: one keyed by SHA, one keyed by PMC id.
sha_docvec_map = _docvec_map('sha')
pmc_docvec_map = _docvec_map('pmcid')

## 4. Set Document Vectors

In [6]:
def _vector_lookup(docvec_map, not_found):
    """Build a one-argument function: key -> lookup_by_sha(key, docvec_map, not_found=not_found)."""
    def _lookup(key):
        return lookup_by_sha(key, docvec_map, not_found=not_found)
    return _lookup


# Zero vector used wherever no document vector can be found.
EMPTY_VECTOR = np.zeros(DOCUMENT_VECTOR_LENGTH, dtype=np.float32)

# Row masks: which papers carry a SHA / a PMC id at all.
has_sha = metadata.sha.notna()
has_pmcid = metadata.pmcid.notna()

# Lookups that yield NaN on a miss. They are not consumed by the assignments
# below; kept because they are notebook-level names.
sha_doc_vectors = metadata.sha.apply(_vector_lookup(sha_docvec_map, np.nan))
pmc_doc_vectors = metadata.pmcid.apply(_vector_lookup(pmc_docvec_map, np.nan))

# First pass: every row gets the vector found through its SHA, or EMPTY_VECTOR.
metadata['document_vector'] = metadata.sha.apply(_vector_lookup(sha_docvec_map, EMPTY_VECTOR))

# Second pass: rows without a SHA but with a PMC id are looked up by PMC id instead.
remaining_idxs = has_pmcid & ~has_sha
pmc_keys = metadata.loc[remaining_idxs, 'pmcid']
metadata.loc[remaining_idxs, 'document_vector'] = pmc_keys.apply(
    _vector_lookup(pmc_docvec_map, EMPTY_VECTOR))

In [16]:
# Preview the assigned vectors; .head() keeps the cell output short.
metadata.document_vector.head()

ValueError: all input arrays must have the same shape

## 5. Save Metadata

In [9]:
# Columns that are persisted alongside each paper's document vector.
COLS = [
    'cord_uid', 'sha', 'source', 'title', 'doi', 'pmcid', 'pubmed_id',
    'published', 'has_pdf_parse',
    'has_pmc_xml_parse', 'full_text_file', 'url', 'document_vector',
]

# Output locations inside the cord support directory.
METADATA_PATH = PurePath(cord_support_dir() / 'Metadata.pq')
METADATA_PATH_GZ = PurePath(cord_support_dir() / 'Metadata.csv.gz')

# Only the CSV is written; the parquet export is currently switched off:
# metadata[COLS].to_parquet(METADATA_PATH)
metadata_to_save = metadata[COLS]
metadata_to_save.to_csv(METADATA_PATH_GZ)

## 6. Create Annoy Index

In [10]:
from annoy import AnnoyIndex
import random

# Number of trees passed to AnnoyIndex.build (the old inline comment said 10,
# but the value actually used is 30).
N_TREES = 30

# Rebinds the SIMILARITY_INDEX_PATH imported from cord.core so that this
# notebook writes to 'PaperSimilarityNew.ann' in the cord support directory.
SIMILARITY_INDEX_PATH = str((Path(cord_support_dir()) / 'PaperSimilarityNew.ann').resolve())

annoy_index = AnnoyIndex(DOCUMENT_VECTOR_LENGTH, 'angular')

# Item ids are ROW POSITIONS in `metadata`. Iterating the column positionally
# (instead of metadata.loc[i]) stays correct when the index is not 0..n-1 and
# avoids materialising a full row Series for every paper.
# NOTE(review): every vector must have DOCUMENT_VECTOR_LENGTH entries; the
# recorded run below failed with "Vector has wrong length (expected 20, got 241)",
# so the 'document_vector' column still needs checking upstream.
for i, v in enumerate(metadata['document_vector']):
    annoy_index.add_item(i, v)

annoy_index.build(N_TREES)
annoy_index.save(SIMILARITY_INDEX_PATH)

IndexError: Vector has wrong length (expected 20, got 241)

In [None]:
# Read back the gzipped CSV written in "5. Save Metadata". The parquet write to
# METADATA_PATH is commented out there, so this notebook does not produce
# Metadata.pq and reading it would fail or pick up a stale file.
# The first CSV column is the DataFrame index that to_csv wrote.
# NOTE(review): after a CSV round trip 'document_vector' is text, not an array —
# parse it before using it as a vector.
metadata_lookup = pd.read_csv(METADATA_PATH_GZ, index_col=0)