In [60]:
%load_ext autoreload
%autoreload 2
from cord.jsonpaper import load_json_texts
from cord import ResearchPapers
from cord.core import NONCOMM_USE_SUBSET, BIORXIV_MEDRXIV, lookup_by_sha
from cord.jsonpaper import json_cache_exists, load_json_cache
from cord.text import preprocess
import numpy as np
from IPython.display import display

In [4]:
# Restore the pickled ResearchPapers collection and keep a handle on its metadata frame
papers = ResearchPapers.from_pickle()
metadata = papers.metadata

# Catalog this notebook works on
catalog = NONCOMM_USE_SUBSET

In [59]:
import time

# Attach `index_tokens` to every metadata row of each catalog:
#   1. use the tokens of the json document matched by sha,
#   2. otherwise the tokens of the json document matched by pmcid,
#   3. otherwise the preprocessed abstract.
# `metadata` is updated in place.

# perf_counter is monotonic, so the elapsed time is not skewed by wall-clock adjustments
tick = time.perf_counter()
for catalog in [NONCOMM_USE_SUBSET]:
    # Boolean mask selecting the metadata rows that belong to this catalog
    catalog_idx = metadata.full_text_file == catalog
    # These Series keep the original metadata index, so results align on assignment below
    catalog_shas = metadata.loc[catalog_idx, 'sha']
    catalog_pmcids = metadata.loc[catalog_idx, 'pmcid']

    # Load the json catalog, and report where the tokens actually came from
    if json_cache_exists():
        json_papers = load_json_cache(catalog)
        print('Json document tokens loaded from cache')
    else:
        json_papers = load_json_texts(json_dirs=catalog, tokenize=True)
        print('Json document tokens loaded from json files')

    # New since April 4th - some json files are in PMCXXXX.xml.json files so we need the PMCID
    pmcid_from_sha = json_papers.sha.str.extract(r'(PMC[0-9]+)\.xml', expand=False)
    if 'pmcid' in json_papers.columns:
        # The loader may already supply pmcid (with an empty sha for those rows);
        # keep those values rather than overwriting them with NaN
        pmcid_from_sha = pmcid_from_sha.fillna(json_papers['pmcid'])
    json_papers['pmcid'] = pmcid_from_sha
    # Rows identified by a PMC id do not carry a real sha
    json_papers.loc[json_papers.pmcid.notnull(), 'sha'] = np.nan

    # sha -> index_tokens, then look up the tokens for every metadata sha.
    # NOTE(review): lookup_by_sha presumably also resolves multi-sha values - confirm in cord.core
    has_sha = json_papers.sha.notnull()
    sha_token_map = json_papers.loc[has_sha].set_index('sha')['index_tokens'].to_dict()
    sha_tokens = catalog_shas.apply(lambda sha:
                                    lookup_by_sha(sha, sha_token_map, not_found=np.nan))

    # pmcid -> index_tokens, looked up the same way using the metadata pmcid
    has_pmcid = json_papers.pmcid.notnull()
    pmc_token_map = json_papers.loc[has_pmcid].set_index('pmcid')['index_tokens'].to_dict()
    pmc_tokens = catalog_pmcids.apply(lambda pmcid:
                                      lookup_by_sha(pmcid, pmc_token_map, not_found=np.nan))

    # Now set the index tokens: sha match first, pmcid match as the fallback
    metadata.loc[catalog_idx, 'index_tokens'] = sha_tokens.fillna(pmc_tokens)

    # If the index tokens are still null .. use the abstracts
    null_tokens = catalog_idx & metadata.index_tokens.isnull()
    print('There are', null_tokens.sum(), 'papers in', catalog,
          'that cannot be indexed by text .. using abstracts')
    metadata.loc[null_tokens, 'index_tokens'] = metadata.loc[null_tokens].abstract.apply(preprocess)

    # Report anything that could not be tokenized even from the abstract
    missing_index_tokens = (catalog_idx & metadata.index_tokens.isnull()).sum()
    if missing_index_tokens > 0:
        print('There are still', missing_index_tokens, 'papers without index tokens')

tock = time.perf_counter()
print('Completed', tock - tick)

Loading json cache files for noncomm_use_subset
Json document tokens loaded from cache
There are 7 papers in noncomm_use_subset that cannot be indexed by text .. using abstracts
Completed 8.89266586303711


In [None]:
# Rebind `papers` to the result of ResearchPapers.load, requesting index='text'.
# NOTE(review): presumably this selects the full-text token index - confirm in cord.ResearchPapers.load
papers = ResearchPapers.load(index='text')

In [67]:
# Load the json documents of the non-commercial-use subset (no tokenize argument is passed here)
# and display the frame to inspect its sha / pmcid / text columns.
df = load_json_texts(NONCOMM_USE_SUBSET)
df

Loading json from noncomm_use_subset


HBox(children=(FloatProgress(value=0.0, max=4470.0), HTML(value='')))

Unnamed: 0,sha,pmcid,text
0,0036b28fddf7e93da0970303672934ea2f9944e7,,\n\nhuman type 1 DM. The aim of this study was...
1,005c43980edf3fcc2a4d12ee7ad630ddb651ce6e,,Introduction\n\nAvian influenza (AI) viruses a...
2,00676f1131e03ca8defe523e79bc1635cc933909,,Introduction\n\nDromedary camels (Camelus drom...
3,006be99e337c84b8758591a54f0362353b24dfde,,\n\nAbstract. The half-time for secretion of t...
4,00a00d0edc750db4a0c299dd1ec0c6871f5a4f24,,Editorial\n\nThis is an Open Access article di...
...,...,...,...
4465,,PMC7093742,Introduction\n\nListeria monocytogenes is a gr...
4466,,PMC7097833,"Introduction\n\nOn January 9 2020, the World H..."
4467,,PMC7097845,"Introduction\n\nAs of Feb 5, 2020, more than 2..."
4468,,PMC7099174,


In [None]:
# Show only the documents that carry a PMC id
df[df['pmcid'].notnull()]