In [2]:
import os
import time
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
def data_reader(fname):
    with open(fname) as f:
        data = json.load(f)
    
    try:
        abstract = data['abstract'][0]['text'] + '\n\n'
    except:
        abstract = ''
    
    body = ''
    for item in data['body_text']:
        body += item['text']+'\n\n'
        
    text = abstract + body
    
    return data['paper_id'], text

In [4]:
def extract_data(dirs):
    data_df = pd.DataFrame(columns=['paper_id', 'body_text', 'source'])
    for j in dirs:
        print(j)
        flist = os.listdir(f"/Users/sachin/Data/Covid_19/{j}/{j}/")
        folder_name = f"/Users/sachin/Data/Covid_19/{j}/{j}/"
        
        paper_id_list = []
        paper_body_list = []

        for fname in tqdm(flist):
            paper_id, text = data_reader(folder_name+fname)
            paper_id_list.append(paper_id)
            paper_body_list.append(text)
        
        df = pd.DataFrame(list(zip(paper_id_list, paper_body_list)), columns=['paper_id', 'body_text'])
        df['source'] = j
        
        data_df = pd.concat([data_df, df])
        
    return data_df

In [5]:
dirs = ['biorxiv_medrxiv', 'comm_use_subset', 'noncomm_use_subset', 'pmc_custom_license']

In [7]:
data_df = extract_data(dirs)

 17%|█▋        | 137/803 [00:00<00:00, 1369.12it/s]

biorxiv_medrxiv


100%|██████████| 803/803 [00:00<00:00, 1443.36it/s]
  1%|          | 103/9000 [00:00<00:08, 1028.23it/s]

comm_use_subset


100%|██████████| 9000/9000 [00:10<00:00, 895.76it/s] 
  3%|▎         | 65/1973 [00:00<00:02, 647.75it/s]

noncomm_use_subset


100%|██████████| 1973/1973 [00:02<00:00, 709.03it/s] 
 12%|█▏        | 167/1426 [00:00<00:00, 1663.60it/s]

pmc_custom_license


100%|██████████| 1426/1426 [00:01<00:00, 1031.00it/s]


In [13]:
output_csv = '/Users/sachin/Data/Covid_19/data.csv'
data_df.to_csv(output_csv, encoding='utf-8', index=False)

In [14]:
df = pd.read_csv('/Users/sachin/Data/Covid_19/data.csv')

In [15]:
df.head()

Unnamed: 0,paper_id,body_text,source
0,f905f78b32f63c6d14a79984dfb33f1b358b8ab4,New anti-AIDS treatments must be continually d...,biorxiv_medrxiv
1,abcfffafab399149d4adadd6bb458c4994e2025d,Rationale: Several studies have estimated basi...,biorxiv_medrxiv
2,0cb9c296684ca5e71462d825cab2827854a01544,Summary Statement: DUX4 is thought to mediate ...,biorxiv_medrxiv
3,9bbfd3d34ee18ea1b9f4669331a6cee9c5992893,"emerged in late 2019 1,2 . Initial outbreaks i...",biorxiv_medrxiv
4,1218f278a4f8d83dac14b23c8f698062812ef9d5,A novel coronavirus (SARS-CoV-2) first detecte...,biorxiv_medrxiv


In [16]:
df.shape

(13202, 3)