# Save JSON Cache File

This notebook processes the JSON files into index tokens and document vectors and saves them to cache files.

In [1]:
%load_ext autoreload
%autoreload 2
from cord import ResearchPapers
from cord.core import find_data_dir, JSON_CATALOGS, cord_cache_dir, parallel
from cord.jsonpaper import load_json_texts, load_tokens_from_file, get_json_paths
from pathlib import Path, PurePath
from cord.text import preprocess
import pandas as pd
import numpy as np

## Steps

## 1. Load ResearchPapers and get metadata

In [2]:
# Load the research papers with the default settings. As the log output of
# this cell shows, the default load builds the BM25 index from the abstracts.
papers = ResearchPapers.load()

Loading metadata from ..\data\CORD-19-research-challenge
Cleaning metadata
Applying tags to metadata

Indexing research papers
Creating the BM25 index from the abstracts of the papers
Use index="text" if you want to index the texts of the paper instead
Finished Indexing in 41.0 seconds


## 1. Create Index Tokens and Save

In [10]:
from cord.jsonpaper import get_token_df
from gensim.corpora import Dictionary

def save_dictionary(dictionary, save_path):
    """Write ``dictionary`` to ``save_path`` through a binary file handle.

    Parameters
    ----------
    dictionary
        Object exposing ``save(file_handle)``; in this notebook it is the
        ``Dictionary`` built by ``token_2ints``.
    save_path
        Path-like object exposing ``open`` (e.g. ``pathlib.Path``) that
        names the destination file.
    """
    print('Saving dictionary to', save_path)
    handle = save_path.open('wb')
    try:
        dictionary.save(handle)
    finally:
        # Always release the handle, even when the save itself fails.
        handle.close()
        
def token_2ints(json_text_df):
    """Integer-encode the ``index_tokens`` column of ``json_text_df``.

    A ``Dictionary`` is built from the token lists, and every token is
    replaced by its id from ``dictionary.token2id``.

    Parameters
    ----------
    json_text_df : pandas.DataFrame
        Frame with an ``index_tokens`` column holding one list of tokens
        per row. The frame is left untouched.

    Returns
    -------
    tuple
        ``(dictionary, encoded_df)`` where ``encoded_df`` is a new frame
        with ``index_tokens`` removed and a ``token_int`` column holding
        the list of token ids for each row.
    """
    dictionary = Dictionary(json_text_df.index_tokens)
    # Hoist the mapping so the per-token lookup does not repeat the
    # attribute access for every row.
    token2id = dictionary.token2id
    token_ints = json_text_df.index_tokens.apply(
        lambda tokens: [token2id[t] for t in tokens])
    # Build the result with drop + assign so the caller's frame does not
    # silently gain a ``token_int`` column as a side effect.
    encoded_df = json_text_df.drop(columns=['index_tokens']).assign(token_int=token_ints)
    return dictionary, encoded_df

def save_json_cache_files(research_papers=None):
    """Tokenize every JSON catalog and write its cache files.

    For each catalog in ``JSON_CATALOGS`` the matching metadata rows are
    passed to ``get_token_df``, the resulting tokens are integer-encoded
    with ``token_2ints``, and two files are written into the ``json-cache``
    directory located next to the data directory:

    * ``jsoncache_<catalog>.dict`` - the dictionary, via ``save_dictionary``
    * ``jsoncache_<catalog>.pq``   - the encoded frame, via ``to_parquet``

    Parameters
    ----------
    research_papers : optional
        Object exposing ``metadata`` and ``data_path``. When omitted, the
        notebook-level ``papers`` variable is used, so existing calls
        without arguments keep working.
    """
    if research_papers is None:
        # Fall back to the notebook global for backward compatibility.
        research_papers = papers

    json_cache_path = Path(find_data_dir()).parent / 'json-cache'
    if json_cache_path.exists():
        print('Json Cache dir exists')
        # Dictionaries and parquet files are written in pairs, so clear
        # both kinds; otherwise a stale dictionary could outlive its data.
        for pattern in ('*.pq', 'jsoncache_*.dict'):
            for cache_file in json_cache_path.glob(pattern):
                print('Removing', cache_file)
                cache_file.unlink()
    else:
        print('Creating directory', json_cache_path)
        json_cache_path.mkdir(exist_ok=True)

    metadata = research_papers.metadata
    for catalog in JSON_CATALOGS:
        print('Creating tokens for', catalog)
        catalog_idx = metadata.full_text_file == catalog
        json_text_df = get_token_df(metadata.loc[catalog_idx], research_papers.data_path)
        dictionary, json_text_df = token_2ints(json_text_df)
        # Save the dictionary first: the parquet file only holds token ids
        # and needs the dictionary to be interpreted.
        save_dictionary(dictionary, json_cache_path / f'jsoncache_{catalog}.dict')
        catalog_save_path = json_cache_path / f'jsoncache_{catalog}.pq'
        print('Saving to', catalog_save_path)
        json_text_df.to_parquet(catalog_save_path)

In [11]:
# Build the caches: writes one .dict and one .pq file per catalog in
# JSON_CATALOGS, removing any existing .pq files in the cache directory first.
save_json_cache_files()

Json Cache dir exists
Creating tokens for comm_use_subset


HBox(children=(FloatProgress(value=0.0, max=9593.0), HTML(value='')))

Saving dictionary to ..\data\json-cache\jsoncache_comm_use_subset.dict
Saving to ..\data\json-cache\jsoncache_comm_use_subset.pq
Creating tokens for biorxiv_medrxiv


HBox(children=(FloatProgress(value=0.0, max=3073.0), HTML(value='')))

Saving dictionary to ..\data\json-cache\jsoncache_biorxiv_medrxiv.dict
Saving to ..\data\json-cache\jsoncache_biorxiv_medrxiv.pq
Creating tokens for noncomm_use_subset


HBox(children=(FloatProgress(value=0.0, max=2365.0), HTML(value='')))

Saving dictionary to ..\data\json-cache\jsoncache_noncomm_use_subset.dict
Saving to ..\data\json-cache\jsoncache_noncomm_use_subset.pq
Creating tokens for custom_license


HBox(children=(FloatProgress(value=0.0, max=35614.0), HTML(value='')))

Saving dictionary to ..\data\json-cache\jsoncache_custom_license.dict
Saving to ..\data\json-cache\jsoncache_custom_license.pq
Creating tokens for arxiv


HBox(children=(FloatProgress(value=0.0, max=781.0), HTML(value='')))

Saving dictionary to ..\data\json-cache\jsoncache_arxiv.dict
Saving to ..\data\json-cache\jsoncache_arxiv.pq


## 2. Load Research Papers

In [12]:
# Reload the papers, this time indexing the text contents. As the log output
# of this cell shows, the json cache files are loaded per catalog and papers
# without text contents are indexed by their abstract instead.
papers = ResearchPapers.load(index='text')

Loading metadata from ..\data\CORD-19-research-challenge
Cleaning metadata
Applying tags to metadata

Indexing research papers
Creating the BM25 index from the text contents of the papers
Loading json cache files for comm_use_subset
Loaded comm_use_subset json cache in 16 seconds
Loading json cache files for biorxiv_medrxiv
Loaded biorxiv_medrxiv json cache in 3 seconds
Loading json cache files for noncomm_use_subset
Loaded noncomm_use_subset json cache in 3 seconds
Loading json cache files for custom_license
Loaded custom_license json cache in 60 seconds
Loading json cache files for arxiv
Loaded arxiv json cache in 1 seconds
There are 12931 papers that will be indexed using the abstract instead of the contents
Finished indexing in 93 seconds


In [15]:
from cord.jsonpaper import load_json_cache

In [16]:
# Load the cache of a single catalog so its contents can be inspected.
json_papers = load_json_cache('biorxiv_medrxiv')

Loading json cache files for biorxiv_medrxiv
Loaded biorxiv_medrxiv json cache in 2 seconds


In [17]:
# Display the loaded cache: one row per paper with its cord_uid and index_tokens.
json_papers

Unnamed: 0,cord_uid,index_tokens
0,vho70jcx,"[many, different, methods, characterize, mixtu..."
1,i9tbix2v,"[term, metapopulation, coined, levins, 1969, d..."
2,62gfisc6,"[antibodies, essential, components, immune, sy..."
3,058r9486,"[metagenomics, defined, analysis, collection, ..."
4,wich35l7,"[schematic, transmission, links, overlaid, phy..."
...,...,...
2610,xfoqd6uk,"[human, pathogenic, potential, zoonotic, infec..."
2611,cne5whf5,"[y4f, restored, following, vaccination, prolin..."
2612,xbfpgczp,"[recent, decades, many, new, flavi-like, virus..."
2613,3l8pm0t5,"[binge, eating, operationally, defined, consum..."


In [22]:
# Left-join the cached tokens onto every metadata cord_uid, so papers that
# have no entry in the loaded cache are kept with missing index_tokens.
meta_df = pd.merge(papers.metadata[['cord_uid']], json_papers, how='left', on='cord_uid')

In [46]:
# Attach the cached tokens to the metadata, one catalog at a time.
# Start from an object-dtype column: it will hold token lists, and papers
# without a cache entry keep NaN. (Starting from a float NaN column would
# force an implicit dtype change on the first assignment.)
papers.metadata['index_tokens'] = pd.Series(np.nan, index=papers.metadata.index, dtype=object)
for catalog in JSON_CATALOGS:
    catalog_idx = papers.metadata.full_text_file == catalog
    # cord_uid -> tokens lookup for this catalog.
    token_lookup = load_json_cache(catalog).set_index('cord_uid')['index_tokens'].to_dict()
    papers.metadata.loc[catalog_idx, 'index_tokens'] = \
            papers.metadata.loc[catalog_idx, 'cord_uid'].apply(lambda c: token_lookup.get(c, np.nan))

Loading json cache files for comm_use_subset
Loaded comm_use_subset json cache in 18 seconds
Loading json cache files for biorxiv_medrxiv
Loaded biorxiv_medrxiv json cache in 2 seconds
Loading json cache files for noncomm_use_subset
Loaded noncomm_use_subset json cache in 3 seconds
Loading json cache files for custom_license
Loaded custom_license json cache in 63 seconds


In [48]:
# Show the papers that still have no index tokens after the cache lookup.
papers.metadata.loc[papers.metadata['index_tokens'].isnull()]

Unnamed: 0,cord_uid,sha,source,title,doi,pmcid,pubmed_id,license,abstract,published,...,has_pmc_xml_parse,full_text_file,url,when,covid_related,virus,coronavirus,sars,antivirals,index_tokens
207,28wrp74k,,PMC,SARS and Population Health Technology,10.2196/jmir.5.2.e14,PMC1550560,12857670,cc-by,The recent global outbreak of SARS (severe acu...,2003-06-30,...,False,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,17 years ago,False,False,False,True,,
208,8f0pn321,,PMC,The Internet as a Vehicle to Communicate Healt...,10.2196/jmir.6.1.e8,PMC1550585,15111274,cc-by,: The recent public health risks arising from ...,2004-03-03,...,False,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,16 years ago,False,False,False,True,,
786,g4puurhk,,PMC,Role of nitric oxide in management of acute re...,10.4103/1817-1737.41914,PMC2700444,19561888,cc-by,The current mortality rate of patients sufferi...,2008-01-01,...,False,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,12 years ago,False,False,False,False,,
787,ke0tkpso,,PMC,Avian influenza: The tip of the iceberg,10.4103/1817-1737.43085,PMC2700449,19561900,cc-by,"For some years now, we have been living with t...",2008-01-01,...,False,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,12 years ago,False,True,False,False,,
820,t579ysgl,,PMC,Reading chest radiographs in the critically il...,10.4103/1817-1737.53349,PMC2714572,19641649,cc-by,This is part II of two series review of readin...,2009-01-01,...,False,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,11 years ago,False,False,False,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57301,xbk1sfe6,,Elsevier,Chloroquine : pas d’efficacité sur le virus Ebola,10.1016/s1773-035x(15)30286-0,PMC7140263,,els-covid,Chloroquine : pas d’efficacité sur le virus Ebola,2015-11-30,...,False,custom_license,https://doi.org/10.1016/s1773-035x(15)30286-0,4 years ago,False,True,False,False,,
57317,j2p3ila5,,Elsevier,Corrigendum to “Geranylated flavonoids display...,10.1016/j.bmc.2013.09.005,PMC7134925,,els-covid,Corrigendum to “Geranylated flavonoids display...,2013-11-15,...,False,custom_license,https://doi.org/10.1016/j.bmc.2013.09.005,6 years ago,False,False,False,True,,
57327,f0imq00c,,Elsevier,WHO declares Beijing to be free of SARS,10.1016/s0140-6736(03)13790-7,PMC7134624,12846245,els-covid,WHO declares Beijing to be free of SARS,2003-06-28,...,False,custom_license,https://doi.org/10.1016/s0140-6736(03)13790-7,17 years ago,False,False,False,True,,
57328,8184gegh,,Elsevier,2 The morphology of virus particles. Classific...,10.1016/b978-0-407-00253-1.50007-4,PMC7151944,,els-covid,This chapter focuses on the studies on morph...,1983-12-31,...,False,custom_license,https://doi.org/10.1016/b978-0-407-00253-1.500...,36 years ago,False,True,True,False,,
