# Save JSON Cache File

This notebook processes the JSON files into index tokens and document vectors, and saves them to cache files.

In [1]:
%load_ext autoreload
%autoreload 2
from cord import ResearchPapers
import pandas as pd
from cord.core import split_df, DOCUMENT_VECTOR_PATH
from cord.jsonpaper import load_json_texts, list_json_files_in, load_json_cache
from cord.core import JSON_CATALOGS, BIORXIV_MEDRXIV, CUSTOM_LICENSE, \
    COMM_USE_SUBSET, NONCOMM_USE_SUBSET, find_data_dir, cord_support_dir
from cord.text import SIMPLE_STOPWORDS
from pathlib import Path, PurePath
import pandas as pd

# Widen pandas' display limits so long text columns and larger frames render in full.
pd.set_option('display.max_colwidth', 1200)
pd.set_option('display.max_rows', 100)

## Functions

## 1. Load Document Vectors

In [2]:
# Read the parquet file at DOCUMENT_VECTOR_PATH into a DataFrame.
# NOTE(review): `document_vectors` is not referenced again in the visible cells -- confirm it is still needed.
document_vectors = pd.read_parquet(DOCUMENT_VECTOR_PATH)

## 2. Create Index Tokens and Save

In [3]:
from gensim.corpora.dictionary import Dictionary
import pickle

def save_dictionary(dictionary, save_path):
    """Persist ``dictionary`` to the file at ``save_path``.

    Parameters
    ----------
    dictionary
        Any object exposing ``save(file_handle)``; this notebook passes a
        gensim ``Dictionary``.
    save_path
        Object with a ``pathlib.Path``-style ``open(mode)`` method. The file
        is opened in binary write mode, so an existing file is overwritten.

    Returns
    -------
    None
    """
    print('Saving dictionary to', save_path)
    with save_path.open('wb') as dictionary_file:
        # The already-open binary handle (not the path) is handed to the dictionary.
        dictionary.save(dictionary_file)
        
def token_2ints(json_text_df):
    """Encode each row's string tokens as integer ids.

    A ``Dictionary`` is built from the ``index_tokens`` column, and every
    token list is mapped through its ``token2id`` table into a new
    ``token_int`` column.

    Parameters
    ----------
    json_text_df : pandas.DataFrame
        Frame with an ``index_tokens`` column holding one iterable of tokens
        per row. The frame is NOT modified; earlier versions added a
        ``token_int`` column to the caller's frame as a side effect.

    Returns
    -------
    tuple
        ``(dictionary, encoded_df)`` where ``encoded_df`` is a new frame with
        ``token_int`` appended and ``index_tokens`` dropped.
    """
    dictionary = Dictionary(json_text_df.index_tokens)
    # Bind the lookup table once instead of re-resolving the attribute for every token.
    token2id = dictionary.token2id
    token_ints = json_text_df.index_tokens.apply(
        lambda tokens: [token2id[t] for t in tokens]
    )
    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    encoded_df = json_text_df.assign(token_int=token_ints).drop(columns=['index_tokens'])
    return dictionary, encoded_df


def save_json_cache_files():
    """Rebuild the json cache: one dictionary file and one parquet file per catalog.

    The cache directory is ``json-cache``, a sibling of the directory returned
    by ``find_data_dir()``. If it already exists, every ``*.pq`` file in it is
    deleted first so that files from a previous run (for example the
    ``jsoncache_<catalog>_<i>.pq`` parts an earlier split-file layout wrote)
    do not linger; otherwise the directory is created.

    For each catalog in ``JSON_CATALOGS`` the tokenized texts are loaded, the
    tokens are converted to integer ids with ``token_2ints``, and two files
    are written:

    * ``jsoncache_<catalog>.dict`` -- the token dictionary
    * ``jsoncache_<catalog>.pq``   -- the integer-encoded frame

    Returns
    -------
    None
    """
    json_cache_path = Path(find_data_dir()).parent / 'json-cache'
    if json_cache_path.exists():
        print('Json Cache dir exists')
        for cache_file in json_cache_path.glob('*.pq'):
            print('Removing', cache_file)
            cache_file.unlink()
    else:
        print('Creating directory', json_cache_path)
        # parents=True so a missing intermediate directory does not abort the run.
        json_cache_path.mkdir(parents=True, exist_ok=True)

    for catalog in JSON_CATALOGS:
        json_text_df = load_json_texts(catalog, tokenize=True)
        dictionary, json_text_df = token_2ints(json_text_df)
        # NOTE(review): an earlier comment here said the authors column should not be
        # used, but no column was ever dropped -- confirm whether that is still wanted.
        save_dictionary(dictionary, json_cache_path / f'jsoncache_{catalog}.dict')
        catalog_save_path = json_cache_path / f'jsoncache_{catalog}.pq'
        print('Saving to', catalog_save_path)
        json_text_df.to_parquet(catalog_save_path)

In [4]:
# Rebuild the cache for every catalog; existing *.pq files in the cache directory are deleted first.
save_json_cache_files()

Json Cache dir exists
Removing ..\data\json-cache\jsoncache_biorxiv_medrxiv.pq
Removing ..\data\json-cache\jsoncache_comm_use_subset_0.pq
Removing ..\data\json-cache\jsoncache_comm_use_subset_1.pq
Removing ..\data\json-cache\jsoncache_comm_use_subset_2.pq
Removing ..\data\json-cache\jsoncache_comm_use_subset_3.pq
Removing ..\data\json-cache\jsoncache_comm_use_subset_4.pq
Removing ..\data\json-cache\jsoncache_comm_use_subset_5.pq
Removing ..\data\json-cache\jsoncache_comm_use_subset_6.pq
Removing ..\data\json-cache\jsoncache_comm_use_subset_7.pq
Removing ..\data\json-cache\jsoncache_comm_use_subset_8.pq
Removing ..\data\json-cache\jsoncache_comm_use_subset_9.pq
Removing ..\data\json-cache\jsoncache_custom_license_0.pq
Removing ..\data\json-cache\jsoncache_custom_license_1.pq
Removing ..\data\json-cache\jsoncache_custom_license_10.pq
Removing ..\data\json-cache\jsoncache_custom_license_11.pq
Removing ..\data\json-cache\jsoncache_custom_license_12.pq
Removing ..\data\json-cache\jsoncache_

HBox(children=(FloatProgress(value=0.0, max=18360.0), HTML(value='')))

Saving dictionary to ..\data\json-cache\jsoncache_comm_use_subset.dict
Saving to ..\data\json-cache\jsoncache_comm_use_subset.pq
Loading json from biorxiv_medrxiv


HBox(children=(FloatProgress(value=0.0, max=1342.0), HTML(value='')))

Saving dictionary to ..\data\json-cache\jsoncache_biorxiv_medrxiv.dict
Saving to ..\data\json-cache\jsoncache_biorxiv_medrxiv.pq
Loading json from noncomm_use_subset


HBox(children=(FloatProgress(value=0.0, max=4470.0), HTML(value='')))

Saving dictionary to ..\data\json-cache\jsoncache_noncomm_use_subset.dict
Saving to ..\data\json-cache\jsoncache_noncomm_use_subset.pq
Loading json from custom_license


HBox(children=(FloatProgress(value=0.0, max=27925.0), HTML(value='')))

Saving dictionary to ..\data\json-cache\jsoncache_custom_license.dict
Saving to ..\data\json-cache\jsoncache_custom_license.pq


In [21]:
# Load the research papers with index='text'; the recorded output shows this step reading the json cache files.
# NOTE(review): this cell's execution count (21) is out of sequence with the cells above, so its
# recorded output may predate the current cache layout -- re-run from a fresh kernel to confirm.
papers = ResearchPapers.load(index='text')

Loading metadata from ..\data\CORD-19-research-challenge
Cleaning metadata
Applying tags to metadata

Indexing research papers
Creating the BM25 index from the text contents of the papers
Loading json cache files for comm_use_subset


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Loaded json cache in 31.0 seconds
Json document tokens loaded from cache
Loading json cache files for biorxiv_medrxiv
Loaded json cache in 1.0 seconds
Json document tokens loaded from cache
Loading json cache files for noncomm_use_subset
Loaded json cache in 12.0 seconds
Json document tokens loaded from cache
Loading json cache files for custom_license


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))

Loaded json cache in 56.0 seconds
Json document tokens loaded from cache
There are 13015 papers that will be indexed using abstract instead of textx
Finished indexing in 107 seconds
