# Load JSON Cache

In [1]:
from pathlib import Path, PurePath
from cord.core import find_data_dir, parallel, JSON_CATALOGS, BIORXIV_MEDRXIV, COMM_USE_SUBSET, CUSTOM_LICENSE
import pandas as pd
import pickle
from gensim.corpora.dictionary import Dictionary

In [3]:
# Directory that holds the cached parquet files and saved dictionaries;
# it is the 'json_cache' sibling of the directory returned by find_data_dir().
json_cache_dir = Path(find_data_dir()).parent.joinpath('json_cache')

def load_dictionary(catalog):
    """Load the saved gensim ``Dictionary`` for one catalog.

    The file is looked up as ``jsoncache_{catalog}.dict`` inside the
    module-level ``json_cache_dir``.

    Args:
        catalog: Catalog name used to build the dictionary file name.

    Returns:
        The ``Dictionary`` read back by ``Dictionary.load``.
    """
    # Dictionary.load is given a plain string, so resolve and stringify here.
    dict_file = (json_cache_dir / f'jsoncache_{catalog}.dict').resolve()
    return Dictionary.load(str(dict_file))

def load_json_cache(catalog):
    """Load the cached parquet file(s) for one catalog with tokens decoded.

    Every file matching ``jsoncache_{catalog}*.pq`` in the module-level
    ``json_cache_dir`` is read and, when there is more than one, concatenated
    into a single frame. The integer ids in the ``token_int`` column are then
    mapped back through the catalog's gensim ``Dictionary`` into a new
    ``index_tokens`` column.

    Args:
        catalog: Catalog name used to build the file-name pattern.

    Returns:
        DataFrame with ``index_tokens`` added and ``token_int`` dropped.

    Raises:
        FileNotFoundError: If no parquet file matches the catalog.
    """
    print('Loading json cache files for', catalog)
    pattern = f'jsoncache_{catalog}*.pq'
    # Read from the same module-level directory that load_dictionary uses, so
    # the parquet files and the dictionary cannot come from different places.
    # Sorted so the file order handed to the reader does not depend on the
    # filesystem's listing order.
    file_paths = sorted(PurePath(p) for p in json_cache_dir.glob(pattern))
    if not file_paths:
        # Fail early with the pattern and directory instead of an opaque
        # error from concatenating an empty list further down.
        raise FileNotFoundError(
            f'No files matching {pattern!r} in {json_cache_dir}'
        )
    if len(file_paths) == 1:
        json_cache = pd.read_parquet(file_paths[0])
    else:
        dfs = parallel(pd.read_parquet, file_paths)
        json_cache = pd.concat(dfs, ignore_index=True)
    dictionary: Dictionary = load_dictionary(catalog)
    # Translate every list of token ids back into the tokens they stand for.
    json_cache['index_tokens'] = json_cache['token_int'].apply(
        lambda token_ids: [dictionary[token_id] for token_id in token_ids]
    )
    return json_cache.drop(columns=['token_int'])

# Load the decoded cache for the CUSTOM_LICENSE catalog.
json_cache = load_json_cache(catalog=CUSTOM_LICENSE)

Loading json cache files for custom_license


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))

In [47]:
# Collect the cached frame of every catalog (without the decoded tokens) and
# stack them into one frame.
# load_json_cache accepts only the catalog name, so the cache directory is
# not passed in; passing it as an extra positional argument raises TypeError.
sha_dfs = []
for catalog in JSON_CATALOGS:
    print('Loading from', catalog)
    df = load_json_cache(catalog).drop(columns=['index_tokens'])
    sha_dfs.append(df)

sha_df = pd.concat(sha_dfs, ignore_index=True)

Loading from comm_use_subset
Loading json cache files for comm_use_subset


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loading from biorxiv_medrxiv
Loading json cache files for biorxiv_medrxiv
Loading from noncomm_use_subset
Loading json cache files for noncomm_use_subset
Loading from custom_license
Loading json cache files for custom_license


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

In [67]:
# Number of leading characters of each sha kept when checking whether
# shortened shas still identify rows uniquely.
SHA_LEN = 8
# Vectorised prefix slice via the .str accessor instead of a per-row lambda.
# keep=False flags every member of a colliding group, not only the repeats.
duplicated = sha_df['sha'].str[:SHA_LEN].duplicated(keep=False)
sha_df[duplicated]

Unnamed: 0,sha


In [57]:
# Row count of the combined frame.
sha_df.shape[0]

33375

In [68]:
# Deep memory footprint of the full sha column, converted from bytes to KiB.
sha_df['sha'].memory_usage(deep=True) / 1024

3161.6240234375

In [69]:
# Deep memory footprint, in KiB, of the sha column truncated to SHA_LEN
# characters; the prefix is taken with the vectorised .str accessor.
sha_df['sha'].str[:SHA_LEN].memory_usage(deep=True) / 1024

2118.6552734375