In [1]:
import os
import time
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
def data_reader(fname):
    with open(fname) as f:
        data = json.load(f)
    
    try:
        abstract = data['abstract'][0]['text'] + '\n\n'
    except:
        abstract = ''
    
    body = ''
    for item in data['body_text']:
        body += item['text']+'\n\n'
        
    text = abstract + body
    
    return data['paper_id'], text

In [3]:
def extract_data(dirs):
    data_df = pd.DataFrame(columns=['paper_id', 'body_text', 'source'])
    for j in dirs:
        print(j)
        flist = os.listdir(f"/Users/sachin/Data/Covid_19/{j}/{j}/")
        folder_name = f"/Users/sachin/Data/Covid_19/{j}/{j}/"
        
        paper_id_list = []
        paper_body_list = []

        for fname in tqdm(flist):
            paper_id, text = data_reader(folder_name+fname)
            paper_id_list.append(paper_id)
            paper_body_list.append(text)
        
        df = pd.DataFrame(list(zip(paper_id_list, paper_body_list)), columns=['paper_id', 'body_text'])
        df['source'] = j
        
        data_df = pd.concat([data_df, df])
        
    return data_df

In [4]:
dirs = ['biorxiv_medrxiv', 'comm_use_subset', 'noncomm_use_subset', 'pmc_custom_license', 'custom_license']

In [5]:
%%time
data_df = extract_data(dirs)

  6%|▌         | 55/885 [00:00<00:01, 540.98it/s]

biorxiv_medrxiv


100%|██████████| 885/885 [00:00<00:00, 930.87it/s]
  1%|          | 69/9118 [00:00<00:13, 674.92it/s]

comm_use_subset


100%|██████████| 9118/9118 [00:16<00:00, 540.73it/s]
  3%|▎         | 72/2353 [00:00<00:03, 715.51it/s]

noncomm_use_subset


100%|██████████| 2353/2353 [00:04<00:00, 577.27it/s]
  6%|▌         | 79/1426 [00:00<00:01, 786.59it/s]

pmc_custom_license


100%|██████████| 1426/1426 [00:02<00:00, 679.37it/s]
  0%|          | 54/16959 [00:00<00:31, 537.81it/s]

custom_license


100%|██████████| 16959/16959 [00:32<00:00, 520.46it/s]

CPU times: user 36 s, sys: 7.88 s, total: 43.9 s
Wall time: 56.7 s





In [6]:
output_csv = '/Users/sachin/Data/Covid_19/data.csv'
data_df.to_csv(output_csv, encoding='utf-8', index=False)

In [7]:
df = pd.read_csv('/Users/sachin/Data/Covid_19/data.csv')

In [8]:
df.head()

Unnamed: 0,paper_id,body_text,source
0,f905f78b32f63c6d14a79984dfb33f1b358b8ab4,New anti-AIDS treatments must be continually d...,biorxiv_medrxiv
1,abcfffafab399149d4adadd6bb458c4994e2025d,Rationale: Several studies have estimated basi...,biorxiv_medrxiv
2,0cb9c296684ca5e71462d825cab2827854a01544,Summary Statement: DUX4 is thought to mediate ...,biorxiv_medrxiv
3,9bbfd3d34ee18ea1b9f4669331a6cee9c5992893,"emerged in late 2019 1,2 . Initial outbreaks i...",biorxiv_medrxiv
4,1218f278a4f8d83dac14b23c8f698062812ef9d5,A novel coronavirus (SARS-CoV-2) first detecte...,biorxiv_medrxiv


In [9]:
df.shape

(30741, 3)