# Save JSON Cache Files

This notebook processes the JSON files of each catalog into index tokens, encodes them as integer-id document vectors using a gensim `Dictionary`, and saves the results to cache files.

In [5]:
%load_ext autoreload
%autoreload 2
from cord import ResearchPapers
from cord.core import find_data_dir, JSON_CATALOGS, cord_cache_dir
from cord.jsonpaper import load_json_texts
from pathlib import Path, PurePath
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Functions

## 1. Create Index Tokens and Save

In [12]:
from gensim.corpora.dictionary import Dictionary
import pickle

def save_dictionary(dictionary, save_path):
    """Persist ``dictionary`` to ``save_path``.

    :param dictionary: object exposing ``save(file_handle)``; this notebook
        passes a gensim ``Dictionary``
    :param save_path: path object providing ``open``; it is opened in binary
        write mode (``'wb'``) and closed once the dictionary has been written
    """
    print('Saving dictionary to', save_path)
    out_handle = save_path.open('wb')
    # The file object is its own context manager, so it is closed on exit
    # even when ``dictionary.save`` raises.
    with out_handle:
        dictionary.save(out_handle)
        
def token_2ints(json_text_df):
    """Encode each row's index tokens as a list of integer token ids.

    A gensim ``Dictionary`` is built over ``json_text_df.index_tokens`` and
    every token is replaced by its id from ``dictionary.token2id``.

    The input frame is not modified: the ``token_int`` column is added to a
    copy, so re-running the calling cell always starts from the same data.

    :param json_text_df: DataFrame with an ``index_tokens`` column holding one
        iterable of tokens per row
    :return: tuple ``(dictionary, encoded_df)`` where ``encoded_df`` is a new
        DataFrame with a ``token_int`` column and without ``index_tokens``
    """
    dictionary = Dictionary(json_text_df.index_tokens)
    # Resolve the mapping once instead of on every row of the apply.
    token2id = dictionary.token2id
    token_ints = json_text_df.index_tokens.apply(
        lambda tokens: [token2id[t] for t in tokens])
    # ``assign`` returns a new frame, leaving the caller's DataFrame untouched.
    encoded_df = json_text_df.assign(token_int=token_ints).drop(columns=['index_tokens'])
    return dictionary, encoded_df


def save_json_cache_files():
    """Rebuild the json cache: one dictionary file and one parquet file per catalog.

    The cache directory is ``json-cache`` next to the directory returned by
    ``find_data_dir()``. If it already exists, every ``*.pq`` file in it is
    removed first; otherwise the directory is created (including any missing
    parents).

    For each catalog in ``JSON_CATALOGS`` the texts are loaded with
    ``tokenize=True``, the tokens are converted to integer ids with
    ``token_2ints``, and two files are written:

    * ``jsoncache_<catalog>.dict`` - the dictionary, via ``save_dictionary``
    * ``jsoncache_<catalog>.pq``   - the encoded DataFrame, via ``to_parquet``
    """
    json_cache_path = Path(find_data_dir()).parent / 'json-cache'
    if json_cache_path.exists():
        print('Json Cache dir exists')
        # Clear parquet files from earlier runs before regenerating them.
        for cache_file in json_cache_path.glob('*.pq'):
            print('Removing', cache_file)
            cache_file.unlink()
    else:
        print('Creating directory', json_cache_path)
        # parents=True so a missing intermediate directory does not raise.
        json_cache_path.mkdir(parents=True, exist_ok=True)

    for catalog in JSON_CATALOGS:
        json_text_df = load_json_texts(catalog, tokenize=True)
        dictionary, token_int_df = token_2ints(json_text_df)
        # The dictionary is written first; it is needed to interpret the
        # integer ids stored in the parquet file.
        save_dictionary(dictionary, json_cache_path / f'jsoncache_{catalog}.dict')
        catalog_save_path = json_cache_path / f'jsoncache_{catalog}.pq'
        print('Saving to', catalog_save_path)
        token_int_df.to_parquet(catalog_save_path)

In [13]:
# Regenerate the per-catalog cache files (dictionary + parquet) in the json-cache directory.
save_json_cache_files()

Json Cache dir exists
Loading json from comm_use_subset


HBox(children=(FloatProgress(value=0.0, max=18746.0), HTML(value='')))

Saving dictionary to ..\data\json-cache\jsoncache_comm_use_subset.dict
Saving to ..\data\json-cache\jsoncache_comm_use_subset.pq
Loading json from biorxiv_medrxiv


HBox(children=(FloatProgress(value=0.0, max=1934.0), HTML(value='')))

Saving dictionary to ..\data\json-cache\jsoncache_biorxiv_medrxiv.dict
Saving to ..\data\json-cache\jsoncache_biorxiv_medrxiv.pq
Loading json from noncomm_use_subset


HBox(children=(FloatProgress(value=0.0, max=4678.0), HTML(value='')))

Saving dictionary to ..\data\json-cache\jsoncache_noncomm_use_subset.dict
Saving to ..\data\json-cache\jsoncache_noncomm_use_subset.pq
Loading json from custom_license


HBox(children=(FloatProgress(value=0.0, max=35238.0), HTML(value='')))

Saving dictionary to ..\data\json-cache\jsoncache_custom_license.dict
Saving to ..\data\json-cache\jsoncache_custom_license.pq


## 2. Load Research Papers

In [2]:
# Load the papers with index='text'; per the output below, this builds the BM25
# index from the paper text contents, reading tokens from the json cache files.
papers = ResearchPapers.load(index='text')

Loading metadata from ..\data\CORD-19-research-challenge
Cleaning metadata
Applying tags to metadata

Indexing research papers
Creating the BM25 index from the text contents of the papers
Loading json cache files for comm_use_subset
Loaded comm_use_subset json cache in 35 seconds
Json document tokens loaded from cache
Loading json cache files for biorxiv_medrxiv
Loaded biorxiv_medrxiv json cache in 2 seconds
Json document tokens loaded from cache
Loading json cache files for noncomm_use_subset
Loaded noncomm_use_subset json cache in 7 seconds
Json document tokens loaded from cache
Loading json cache files for custom_license
Loaded custom_license json cache in 67 seconds
Json document tokens loaded from cache
There are 13374 papers that will be indexed using the abstract instead of the contents
Finished indexing in 121 seconds


In [3]:
# Per-column summary of the loaded metadata (non-null, null, unique, duplicate, most common).
papers.describe()

Unnamed: 0,non-null,null,unique,duplicate,most common
cord_uid,52398,0,52365,33,4fbr8fx8
source,52398,0,6,52392,PMC
title,52398,0,49482,2916,
doi,52398,0,49043,3355,
pmcid,43652,8746,43652,0,PMC7134875
pubmed_id,38058,14340,38031,27,15161626
license,52398,0,13,52385,els-covid
abstract,52398,0,50533,1865,
published,52390,8,6484,45906,2020-01-01 00:00:00
authors,52398,0,47805,4593,


In [7]:
# Disabled cell, kept for reference: it would dump papers.bm25 with joblib
# (gzip, level 9) to 'BM25.gz' under cord_cache_dir().
#import joblib

#bm25_savepath = cord_cache_dir()/ 'BM25.gz'
#with bm25_savepath.open('wb') as f:
#    joblib.dump(papers.bm25, f, compress=('gzip', 9))