# Load modules

In [1]:
import os
import json
from pprint import pprint
import pandas as pd

# Import data
As the dataset is very large, we will begin by exploring the `biorxiv` subset, which is stored in JSON format.

In [2]:
# Directory that holds the JSON files, one per paper.
data_dir = 'Data/pdf_json/'
# os.listdir returns entries in arbitrary order; sort them so that
# `filenames[0]` and any slice of `filenames` refer to the same papers
# on every run and on every machine.
filenames = sorted(os.listdir(data_dir))
print(f'Number of scientific papers in subset: {len(filenames)}')

Number of scientific papers in subset: 1625


# Explore data

To explore the structure of the JSON files, we'll begin by looking at the first paper in the data. Each paper is stored as a dictionary with seven keys, which together hold the content of the scientific paper.

In [3]:
# Load the first paper inside a context manager so the file handle is
# closed as soon as parsing finishes instead of lingering in the kernel.
with open(os.path.join(data_dir, filenames[0]), 'rb') as paper_file:
    example_json = json.load(paper_file)
print(f'Structure of paper: {type(example_json)}')
print(f'Keys of dictionary: {example_json.keys()}')

Structure of paper: <class 'dict'>
Keys of dictionary: dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])


## paper_id
A unique ID made of a combination of alphanumeric characters.

In [4]:
# Display the identifier stored under the `paper_id` key of the example paper.
example_json['paper_id']

'0015023cc06b5362d332b3baf348d11567ca2fbb'

## metadata
The value of the `metadata` key is a nested dictionary with the following keys:

In [5]:
# Report the container type of the metadata and the keys it exposes.
print(
    f"Class of metadata value: {type(example_json['metadata'])}",
    f"Keys of dictionary: {example_json['metadata'].keys()}",
    sep='\n',
)

Class of metadata value: <class 'dict'>
Keys of dictionary: dict_keys(['title', 'authors'])


In [6]:
# Pretty-print the whole metadata dictionary (its keys are 'title' and 'authors').
pprint(example_json['metadata'])

{'authors': [{'affiliation': {},
              'email': '',
              'first': 'Joseph',
              'last': 'Ward',
              'middle': ['C'],
              'suffix': ''},
             {'affiliation': {},
              'email': '',
              'first': 'Lidia',
              'last': 'Lasecka-Dykes',
              'middle': [],
              'suffix': ''},
             {'affiliation': {},
              'email': '',
              'first': 'Chris',
              'last': 'Neil',
              'middle': [],
              'suffix': ''},
             {'affiliation': {},
              'email': '',
              'first': 'Oluwapelumi',
              'last': 'Adeyemi',
              'middle': [],
              'suffix': ''},
             {'affiliation': {},
              'email': '',
              'first': 'Sarah',
              'last': '',
              'middle': [],
              'suffix': ''},
             {'affiliation': {},
              'email': '',
              'first': '',


## abstract
The value of the `abstract` key is a list with the following structure. The length of the list may vary between papers.

In [7]:
# Report the container type of the abstract and how many entries it has.
print(
    f"Class of abstract value: {type(example_json['abstract'])}",
    f"Length of list: {len(example_json['abstract'])}",
    sep='\n',
)

Class of abstract value: <class 'list'>
Length of list: 2


In [8]:
# Pretty-print every entry of the abstract list.
pprint(example_json['abstract'])

[{'cite_spans': [],
  'ref_spans': [],
  'section': 'Abstract',
  'text': 'word count: 194 22 Text word count: 5168 23 24 25 author/funder. '
          'All rights reserved. No reuse allowed without permission. Abstract '
          '27 The positive stranded RNA genomes of picornaviruses comprise a '
          'single large open reading 28 frame flanked by 5′ and 3′ '
          'untranslated regions (UTRs). Foot-and-mouth disease virus (FMDV) 29 '
          'has an unusually large 5′ UTR (1.3 kb) containing five structural '
          'domains. These include the 30 internal ribosome entry site (IRES), '
          'which facilitates initiation of translation, and the cis-acting 31 '
          'replication element (cre). Less well characterised structures are a '
          '5′ terminal 360 nucleotide 32 stem-loop, a variable length '
          'poly-C-tract of approximately 100-200 nucleotides and a series of '
          '33 two to four tandemly repeated pseudoknots (PKs). We investigated

## body_text
The value of the `body_text` key is a list with the following structure. The length of the list may vary between papers.

In [9]:
# Pretty-print only the first entry of the body_text list.
pprint(example_json['body_text'][0])

{'cite_spans': [],
 'ref_spans': [{'end': 360,
                'ref_id': 'FIGREF50',
                'start': 351,
                'text': 'figure 1A'}],
 'section': '',
 'text': 'VP3, and VP0 (which is further processed to VP2 and VP4 during virus '
         'assembly) (6). The P2 64 and P3 regions encode the non-structural '
         'proteins 2B and 2C and 3A, 3B (1-3) (VPg), 3C pro and 4 structural '
         'protein-coding region is replaced by reporter genes, allow the study '
         'of genome 68 replication without the requirement for high '
         'containment (9, 10) ( figure 1A ).'}


## bib_entries / ref_entries / back_matter

In [10]:
# Pretty-print the bibliography entries, a dictionary keyed by IDs such as 'BIBREF0'.
pprint(example_json['bib_entries'])

{'BIBREF0': {'authors': [{'first': 'T',
                          'last': 'Jackson',
                          'middle': [],
                          'suffix': ''},
                         {'first': 'T',
                          'last': 'Tuthill',
                          'middle': ['J'],
                          'suffix': ''},
                         {'first': 'D',
                          'last': 'Rowlands',
                          'middle': ['J'],
                          'suffix': ''},
                         {'first': 'N',
                          'last': 'Stonehouse',
                          'middle': ['J'],
                          'suffix': ''}],
             'issn': '',
             'other_ids': {},
             'pages': '',
             'ref_id': 'b0',
             'title': 'Genetic economy in 598 picornaviruses: Foot-and-mouth '
                      'disease virus replication exploits alternative '
                      'precursor 599 cleavage pathways',
    

In [11]:
# Pretty-print the reference entries, a dictionary keyed by IDs such as 'FIGREF0'.
pprint(example_json['ref_entries'])

{'FIGREF0': {'latex': None,
             'text': 'and-mouth disease virus (FMDV) is a single stranded '
                     'positive sense RNA virus of the 45 genus Aphthovirus in '
                     'the family Picornaviridae. It occurs as seven, '
                     'antigenically diverse 46 serotypes; A, O, C, Asia 1, '
                     'South African Territories (SAT) 1, 2 and 3. It is the '
                     'causative agent 47 of foot-and-mouth disease (FMD), a '
                     'highly contagious disease of cloven-hooved animals 48 '
                     'affecting most notably cattle, pigs, sheep and goats in '
                     'addition to wild species such as the 49 African buffalo. '
                     'Disease outbreaks have serious economic implications '
                     'resulting from trade 50 restrictions, reduced '
                     'productivity and the slaughter of infected and at-risk '
                     'animals (1). The 51 2001 

In [12]:
# Pretty-print the back matter, a list whose entries carry the same keys
# as the abstract entries (cite_spans, ref_spans, section, text).
pprint(example_json['back_matter'])

[{'cite_spans': [],
  'ref_spans': [],
  'section': 'annex',
  'text': 'author/funder. All rights reserved. No reuse allowed without '
          'permission.The copyright holder for this preprint (which was not '
          'peer-reviewed) is the The copyright holder for this preprint (which '
          'was not peer-reviewed) is the . '
          'https://doi.org/10.1101/2020.01.10.901801 doi: bioRxiv preprint'}]


# Compile data

In [13]:
# Number of papers to compile while prototyping; raise it (or slice with
# `filenames[:]`) to process more of the subset.
N_PAPERS = 10

# Build one string per paper by concatenating the `text` field of every
# `body_text` entry.
papers = []
for filename in filenames[:N_PAPERS]:
    # The context manager closes each file handle right after parsing,
    # so handles do not accumulate while looping over many files.
    with open(os.path.join(data_dir, filename), 'rb') as paper_file:
        paper = json.load(paper_file)
    # NOTE(review): entries are joined with no separator, so words at the
    # boundary between two entries run together - confirm this is intended.
    papers.append(''.join(entry['text'] for entry in paper['body_text']))

In [14]:
# Wrap the compiled texts in a single-column DataFrame (one row per paper)
# and preview the first five rows.
clean_df = pd.DataFrame(data=papers)
clean_df.head(5)

Unnamed: 0,0
0,"VP3, and VP0 (which is further processed to VP..."
1,"In December 2019, a novel coronavirus, SARS-Co..."
2,The 2019-nCoV epidemic has spread across China...
3,"Metagenomic sequencing, which allows us to dir..."
4,"Infectious bronchitis (IB), which is caused by..."
