In [11]:
import pandas as pd
import numpy as np
import json
import os
from pprint import pprint

In [2]:
!ls

CORD-19-research-challenge.zip [1m[36mcustom_license[m[m
COVID.DATA.LIC.AGMT.pdf        json_schema.txt
Data Loading.ipynb             metadata.csv
[1m[36mbiorxiv_medrxiv[m[m                metadata.readme
[1m[36mcomm_use_subset[m[m                [1m[36mnoncomm_use_subset[m[m


### Metadata

In [3]:
#read in metadata
root_dir = './CORD_DATA/'
metadata_path = root_dir + 'metadata.csv'
metadata = pd.read_csv(metadata_path)

metadata.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file,url
0,vho70jcx,f056da9c64fbf00a4645ae326e8a4339d015d155,biorxiv,SIANN: Strain Identification by Alignment to N...,10.1101/001727,,,biorxiv,Next-generation sequencing is increasingly bei...,2014-01-10,Samuel Minot; Stephen D Turner; Krista L Ternu...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/001727
1,i9tbix2v,daf32e013d325a6feb80e83d15aabc64a48fae33,biorxiv,Spatial epidemiology of networked metapopulati...,10.1101/003889,,,biorxiv,An emerging disease is one infectious epidemic...,2014-06-04,Lin WANG; Xiang Li,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/003889
2,62gfisc6,f33c6d94b0efaa198f8f3f20e644625fa3fe10d2,biorxiv,Sequencing of the human IG light chain loci fr...,10.1101/006866,,,biorxiv,Germline variation at immunoglobulin gene (IG)...,2014-07-03,Corey T Watson; Karyn Meltz Steinberg; Tina A ...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/006866
3,058r9486,4da8a87e614373d56070ed272487451266dce919,biorxiv,Bayesian mixture analysis for metagenomic comm...,10.1101/007476,,,biorxiv,Deep sequencing of clinical samples is now an ...,2014-07-25,Sofia Morfopoulou; Vincent Plagnol,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/007476
4,wich35l7,eccef80cfbe078235df22398f195d5db462d8000,biorxiv,Mapping a viral phylogeny onto outbreak trees ...,10.1101/010389,,,biorxiv,Developing methods to reconstruct transmission...,2014-11-11,Stephen P Velsko; Jonathan E Allen,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/010389


### Read Data Into Dataframes

In [63]:
'''
    @Desc    : Reads in json article and converts into Pandas Dataframe
    @Params  : filepath (str)
    @Returns : Pandas Dataframe 
'''
def JsonToDataFrame(filepath):
        
    #read json into dict
    with open(filepath) as json_data:
        data = json.load(json_data)
        
        final_data = {
            'paper_id'  : [data['paper_id']],
            'abstract'  : ['\n'.join([section['text'] for section in data['abstract']])],
            'body_text' : ['\n'.join([section['text'] for section in data['body_text']])],                                                  
        }
        
        return pd.DataFrame.from_dict(final_data)
    
        

In [64]:
#DATA DIRECTORIES
        
biorxiv_medrxiv    = root_dir + 'biorxiv_medrxiv/biorxiv_medrxiv/'
comm_use_subset    = root_dir + 'comm_use_subset/comm_use_subset/'
noncomm_use_subset = root_dir + 'noncomm_use_subset/noncomm_use_subset/'

biorxiv_medrxiv_files       = [biorxiv_medrxiv + pos_json for pos_json in os.listdir(biorxiv_medrxiv) if pos_json.endswith('.json')]
comm_use_subset_files       = [comm_use_subset + pos_json for pos_json in os.listdir(comm_use_subset) if pos_json.endswith('.json')]
noncomm_use_subset_files    = [noncomm_use_subset + pos_json for pos_json in os.listdir(noncomm_use_subset) if pos_json.endswith('.json')]

pprint(biorxiv_medrxiv_files[:10])

['./CORD_DATA/biorxiv_medrxiv/biorxiv_medrxiv/f905f78b32f63c6d14a79984dfb33f1b358b8ab4.json',
 './CORD_DATA/biorxiv_medrxiv/biorxiv_medrxiv/abcfffafab399149d4adadd6bb458c4994e2025d.json',
 './CORD_DATA/biorxiv_medrxiv/biorxiv_medrxiv/0cb9c296684ca5e71462d825cab2827854a01544.json',
 './CORD_DATA/biorxiv_medrxiv/biorxiv_medrxiv/9bbfd3d34ee18ea1b9f4669331a6cee9c5992893.json',
 './CORD_DATA/biorxiv_medrxiv/biorxiv_medrxiv/1218f278a4f8d83dac14b23c8f698062812ef9d5.json',
 './CORD_DATA/biorxiv_medrxiv/biorxiv_medrxiv/a59906b732bf4a489e282c3e4f499d4166c622e7.json',
 './CORD_DATA/biorxiv_medrxiv/biorxiv_medrxiv/baabfb35a321ea12028160e0d2c1552a2fda2dd5.json',
 './CORD_DATA/biorxiv_medrxiv/biorxiv_medrxiv/78ab35770cc98632c434523df48c6ba290182d37.json',
 './CORD_DATA/biorxiv_medrxiv/biorxiv_medrxiv/21100aba41a4bfb48d7dc37f1bf5dbb38bf3867a.json',
 './CORD_DATA/biorxiv_medrxiv/biorxiv_medrxiv/7852aafdfb9e59e6af78a47af796325434f8922a.json']


In [None]:
#initialize dfs
biomed_df      = pd.DataFrame()
comm_use_df    = pd.DataFrame()
noncomm_use_df = pd.DataFrame()

#read biomed data
for f in biorxiv_medrxiv_files:
    df = JsonToDataFrame(f)
    biomed_df = biomed_df.append(df, ignore_index=True)

#read commonly used data
for f in comm_use_subset_files:
    df = JsonToDataFrame(f)
    comm_use_df = comm_use_df.append(df, ignore_index=True)

#read non-commonly used data
for f in noncomm_use_subset_files:
    df = JsonToDataFrame(f)
    noncomm_use_df = noncomm_use_df.append(df, ignore_index=True)

    

In [None]:
biomed_df.head()