# Save JSON Cache File

This notebook processes the JSON files into index tokens and document vectors and saves them to cache files.

In [1]:
%load_ext autoreload
%autoreload 2
from cord import ResearchPapers
from cord.core import find_data_dir, JSON_CATALOGS, cord_cache_dir, parallel
from cord.jsonpaper import load_tokens_from_file, get_json_paths
from pathlib import Path, PurePath
from cord.text import preprocess
import pandas as pd
import numpy as np
pd.options.display.max_columns = 30

## Steps

## 1. Load ResearchPapers and get metadata

In [2]:
# Load the metadata with the default settings; per the log below, this run builds the BM25 index from the abstracts.
papers = ResearchPapers.load()

Loading metadata from ..\data\CORD-19-research-challenge
Cleaning metadata
Applying tags to metadata

Indexing research papers
Creating the BM25 index from the abstracts of the papers
Use index="text" if you want to index the texts of the paper instead
Finished Indexing in 70.0 seconds


## 1. Create Index Tokens and Save

In [11]:
from cord.jsonpaper import get_token_df
from cord.core import split_df
from gensim.corpora import Dictionary

def save_dictionary(dictionary, save_path):
    """Write ``dictionary`` to ``save_path`` through a binary file handle.

    Args:
        dictionary: object exposing ``save(handle)``; in this notebook a
            gensim ``Dictionary``.
        save_path: ``pathlib.Path``-like target, opened with mode ``'wb'``.
    """
    print('Saving dictionary to', save_path)
    with save_path.open('wb') as handle:
        dictionary.save(handle)
        
def token_2ints(json_text_df):
    """Encode the ``index_tokens`` column as integer ids.

    A gensim ``Dictionary`` is built from the token lists in
    ``json_text_df.index_tokens`` and every token is replaced by its id from
    ``dictionary.token2id``.

    Unlike the previous version, the input frame is left untouched: the
    ``token_int`` column is only added to the returned frame, so callers do
    not get a surprise extra column (or a SettingWithCopyWarning when they
    pass in a slice).

    Args:
        json_text_df: DataFrame with an ``index_tokens`` column holding one
            list of string tokens per row.

    Returns:
        tuple: ``(dictionary, encoded_df)`` where ``encoded_df`` has the same
        rows and remaining columns as the input, with ``index_tokens``
        replaced by ``token_int`` (a list of int ids per row).
    """
    dictionary = Dictionary(json_text_df.index_tokens)
    # Resolve the mapping once instead of on every token lookup.
    token2id = dictionary.token2id
    token_ints = json_text_df.index_tokens.apply(
        lambda tokens: [token2id[t] for t in tokens])
    encoded_df = json_text_df.drop(columns=['index_tokens']).assign(token_int=token_ints)
    return dictionary, encoded_df

def save_json_cache_files(research_papers=None, part_size=15000):
    """Rebuild the on-disk JSON token cache from the papers' metadata.

    The cache lives in a ``json-cache`` directory next to the data directory
    returned by ``find_data_dir()``. Existing files in it are removed first;
    then the metadata is split into parts and, for each part ``i``, a
    ``jsoncache_{i}.dict`` (token dictionary) and a ``jsoncache_{i}.pq``
    (parquet frame with integer-encoded tokens) are written.

    Args:
        research_papers: object exposing ``metadata`` and ``data_path``.
            Defaults to the notebook-level ``papers`` so existing
            zero-argument calls keep working.
        part_size: size argument forwarded to ``split_df`` (15000 in the
            original notebook).
    """
    if research_papers is None:
        # Fall back to the notebook global, as the original implementation did.
        research_papers = papers

    json_cache_path = Path(find_data_dir()).parent / 'json-cache'
    if json_cache_path.exists():
        print('Json Cache dir exists')
        # '*.*' already matches both the .pq and the .dict files, so a single
        # pass is enough. Skip anything that is not a regular file so that a
        # dotted sub-directory does not make unlink() raise.
        for cache_file in json_cache_path.glob('*.*'):
            if not cache_file.is_file():
                continue
            print('Removing', cache_file)
            cache_file.unlink()
    else:
        print('Creating directory', json_cache_path)
        json_cache_path.mkdir(parents=True, exist_ok=True)

    metadata_parts = split_df(research_papers.metadata, part_size)
    for i, df in enumerate(metadata_parts):
        json_text_df = get_token_df(df, research_papers.data_path)
        # Store ints plus a per-part dictionary instead of the raw tokens.
        dictionary, json_text_df = token_2ints(json_text_df)
        save_dictionary(dictionary, json_cache_path / f'jsoncache_{i}.dict')
        catalog_save_path = json_cache_path / f'jsoncache_{i}.pq'
        print('Saving to', catalog_save_path)
        json_text_df.to_parquet(catalog_save_path)

In [12]:
# Clear the json-cache directory and write one .dict/.pq pair per metadata part.
save_json_cache_files()

Json Cache dir exists


HBox(children=(FloatProgress(value=0.0, max=15000.0), HTML(value='')))

Saving dictionary to ..\data\json-cache\jsoncache_0.dict
Saving to ..\data\json-cache\jsoncache_0.pq


HBox(children=(FloatProgress(value=0.0, max=15000.0), HTML(value='')))

Saving dictionary to ..\data\json-cache\jsoncache_1.dict
Saving to ..\data\json-cache\jsoncache_1.pq


HBox(children=(FloatProgress(value=0.0, max=15000.0), HTML(value='')))

Saving dictionary to ..\data\json-cache\jsoncache_2.dict
Saving to ..\data\json-cache\jsoncache_2.pq


HBox(children=(FloatProgress(value=0.0, max=15000.0), HTML(value='')))

Saving dictionary to ..\data\json-cache\jsoncache_3.dict
Saving to ..\data\json-cache\jsoncache_3.pq


HBox(children=(FloatProgress(value=0.0, max=3571.0), HTML(value='')))

Saving dictionary to ..\data\json-cache\jsoncache_4.dict
Saving to ..\data\json-cache\jsoncache_4.pq


In [22]:
from cord.jsonpaper import get_json_cache_dir
import time
def index_from_jsoncache(metadata):
    """Fill missing ``index_tokens`` in ``metadata`` from the JSON cache files.

    Every ``jsoncache_*.pq`` file in the cache directory is paired with its
    ``jsoncache_<part>.dict`` dictionary. The stored integer ids are decoded
    back to tokens and copied into the rows of ``metadata`` whose
    ``index_tokens`` is still null, matched by ``cord_uid``.

    Cache parts are processed in sorted path order so the outcome does not
    depend on the order the filesystem happens to list them in (the first
    part containing a given id wins).

    Args:
        metadata: DataFrame with ``cord_uid`` and ``index_tokens`` columns.
            It is updated in place. The cache frames are presumably indexed
            by ``cord_uid`` -- TODO confirm against ``get_token_df``.

    Returns:
        None
    """
    json_cache_dir = get_json_cache_dir()
    cache_paths = sorted(json_cache_dir.glob('jsoncache_*.pq'))
    tick = time.time()
    for cache_path in cache_paths:
        need_tokens = metadata.index_tokens.isnull()
        if not need_tokens.any():
            # Every row already has tokens; reading further parts is pointless.
            break

        print('Loading json cache file', cache_path.stem)
        json_cache = pd.read_parquet(cache_path)
        part_no = cache_path.stem[len('jsoncache_'):]
        dictionary_path = json_cache_dir / f'jsoncache_{part_no}.dict'
        dictionary = Dictionary.load(str(dictionary_path.resolve()))

        # Decode ids -> tokens straight into an {index: tokens} lookup without
        # adding and then dropping a scratch column on the cache frame.
        token_lookup = json_cache.token_int.apply(
            lambda token_ints: [dictionary[ti] for ti in token_ints]).to_dict()

        metadata.loc[need_tokens, 'index_tokens'] = \
            metadata.loc[need_tokens, 'cord_uid'].apply(lambda c: token_lookup.get(c, np.nan))

    tock = time.time()
    print('Loaded json cache in', int(tock - tick), 'seconds')
        
# Back-fill the null index_tokens entries of the loaded metadata from the cache files.
index_from_jsoncache(papers.metadata)

Loading json cache file jsoncache_0
Loading json cache file jsoncache_1
Loading json cache file jsoncache_2
Loading json cache file jsoncache_3
Loading json cache file jsoncache_4
Loaded json cache in 83 seconds


## 2. Load Research Papers

In [40]:
# Reload with index='text'; per the log below, the BM25 index is then built from the paper contents via the json cache.
papers = ResearchPapers.load(index='text')

Loading metadata from ..\data\CORD-19-research-challenge
Cleaning metadata
Applying tags to metadata

Indexing research papers
Creating the BM25 index from the text contents of the papers
Loading json cache file jsoncache_0
Loading json cache file jsoncache_1
Looking for 48777 tokens
Loading json cache file jsoncache_2
Looking for 39333 tokens
Loading json cache file jsoncache_3
Looking for 30190 tokens
Loading json cache file jsoncache_4
Looking for 16918 tokens
There are 13688 papers that will be indexed using the abstract instead of the contents
Finished indexing in 92 seconds
