In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import yaml
from itertools import chain
import os

In [2]:
def get_file_names(path):
    # get list of papers .json from path
    file_names = []
    for file_name in os.listdir(path):
        file_names.append(file_name)
    return file_names

# Scraper

Scraper to fetch article data online based on article information

In [11]:
# init session
session = requests.session()

In [4]:
# import configurations
with open('config.yaml','r') as ymlfile:
    cfg = yaml.load(ymlfile)

  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
# load metadata
df_meta = pd.read_csv('metadata.csv')

In [13]:
sources = df_meta['source_x'].unique()
for source in sources:
    #print(source)
    papers_df = df_meta[df_meta['source_x']==source][['sha','doi']]
    papers_array = papers_df.to_numpy()
    print('There are {} paper ids from {}.'.format(len(papers_array),source))
    
    # get papers relative to source

There are 19471 paper ids from Elsevier.
There are 153 paper ids from CZI.
There are 1154 paper ids from WHO.
There are 22309 paper ids from PMC.
There are 83 paper ids from PMC_new.
There are 605 paper ids from biorxiv.
There are 445 paper ids from medrxiv.


In [14]:
df_meta.groupby(['source_x'])['journal'].count()

source_x
CZI           153
Elsevier    19409
PMC         12425
PMC_new        82
WHO          1104
biorxiv         0
medrxiv         0
Name: journal, dtype: int64

## Scrape papers from bioRxiv

In [10]:
def scrape_biorxiv(paper):
    print('Fetching paper data at {}'.format(paper['url']))
    html = session.get(paper['url'])
    tmp_soup = BeautifulSoup(html.content, 'html.parser')
    paper_webdata = biorxiv_html_paper_data_extractor(tmp_soup)
    paper = dict(chain(paper.items(),paper_webdata.items()))
    return paper

In [18]:
def get_biorxiv_authors(authors_html):
    ''' HTML list of authors
    '''
    authors = []
    for author in authors_html:
        try:
            given_name = author.find('span',attrs={'class':'nlm-given-names'}).text
        except:
            given_name = ''
        try:
            surname = author.find('span',attrs={'class':'nlm-surname'}).text
        except:
            surname = ''
        authors.append(given_name + ' ' + surname)
    return authors

In [7]:
def get_biorxiv_references(references_html):
    ''' HTML list of references
    soup.find('div',attrs={'class':'section ref-list'}).find_all('li')
    '''
    references = []
    for ref in references_html:
        reference = {}
        # label
        try:
            reference['label'] = ref.find('span',attrs={'class':'ref-label'}).text
        except:
            reference['label'] = None

        # authors
        author_names = []
        try:
            authors = ref.find('cite').find_all('span',attrs={'class':'cit-auth'})
            for author in authors:
                surname = author.find('span',attrs={'class':'cit-name-surname'}).text
                given_names = author.find('span',attrs={'class':'cit-name-given-names'}).text
                author_name = surname + ' ' + given_names
                author_names.append(author_name)
            reference['authors'] = author_names
        except:
            reference['authors'] = author_names

        # date
        try:
            reference['date'] = ref.find('cite').find('span',attrs={'class':'cit-pub-date'}).text
        except:
            reference['date'] = None

        # title
        try:
            reference['title'] = ref.find('cite').find('span',attrs={'class':'cit-article-title'}).text
        except:
            reference['title'] = ''

        # links
        ref_links = []
        try:
            links = ref.find('div',attrs={'class':'cit-extra'}).find_all('a')
            for link in links:
                if link.text != 'OpenUrl':
                    link_name = link.text
                    link_url = link['href']
                    ref_links.append((link_name,link_url))
            reference['links'] = ref_links
        except:
            reference['links'] = ref_links
        references.append(reference)
    return references

In [157]:
def get_biorxiv_sections(sections_html):
    '''
    '''
    sections = []
    for section in sections_html:
        # ignore references
        if 'ref-list' not in section.attrs['class'] :
            sections.append(explore_section(section))
    return sections

In [139]:
def get_biorxiv_tables(html_tables):
    tables = []
    for html_table in html_tables:
        table = {}
        table['id'] = html_table['id']

        table_label = html_table.find('span',attrs={'class':'table-label'})
        if table_label:
            table['label'] = table_label.text

        table_caption = html_table.find('span',attrs={'class':'caption-title'})
        if table_caption:
            table['caption'] = table_caption.text

        table['description'] = ''
        for p in html_table.find_all('p'):
            table['description'] += p.text + '\n'
        table['image'] = None
        tables.append(table)
    return tables

In [141]:
def get_biorxiv_figures(figures_html):
    figures = []
    for html_figure in figures_html:
        figure = {}

        figure_label = html_figure.find('span',attrs={'class':'fig-label'})
        if figure_label:
            figure['label'] = figure_label.text

        figure_title = html_figure.find('span',attrs={'class':'caption-title'})
        if figure_title:
            figure['title'] = figure_title.text

        figure['description'] = ''
        figure_ps = html_figure.find_all('p')
        for p in figure_ps:
            figure['description'] += p.text + '\n'
        figures.append(figure)
        figure_img = html_figure.find('img')
        if figure_img:
            figure['img'] = figure_img['data-src']
    return figures

In [130]:
def process_paragraph(paragraph):
    ''' Removes and modifies html tags in paragraphs
    '''
    for anchor in paragraph.find_all('a'):
        anchor.replace_with('[' + anchor['href'].strip('#') + ',' + anchor.text + ']')
    return paragraph.text

In [131]:
def explore_section(root_tag):
    '''Recursively explore section and all subsections for content
    '''
    content = {}
    content['title'] = ''
    content['content'] = []
    for child in root_tag.findChildren(recursive=False):
        if child.name == 'h2' or child.name == 'h3' or child.name == 'h4':
            content['title'] = child.text
        elif child.name == 'p':
            content['content'].append(process_paragraph(child))
        if child.name == 'div':
            if 'class' in child.attrs:
                if 'subsection' in child['class']:
                    # if subsection explore recursively
                    content['content'].append(explore_section(child))
            elif 'id' in child.attrs.keys():
                if 'T' in child['id']:
                    content.append({'id':child['id'],'type':'table'})
                elif 'F' in child['id']:
                    content.append({'id':child['id'],'type':'figure'})
    else:
        return content

In [9]:
def biorxiv_html_paper_data_extractor(soup):
    '''
        Extract paper information from html webpage of biorxiv
    '''
    paper = {}
    paper['title'] = soup.h1.text

    authors_html = soup.find('span',attrs={'class':'highwire-citation-authors'}) \
                       .find_all('span',attrs={'class':'highwire-citation-author'})
    paper['authors'] = get_biorxiv_authors(authors_html)

    sections_html = soup.find_all('div',attrs={'class':'section'})
    paper['sections'] = get_biorxiv_sections(sections_html)

    tables_html = soup.find_all('div',attrs={'class':'table'})
    paper['tables'] = get_biorxiv_tables(tables_html)

    # get figures
    figures_html = soup.find_all('div',attrs={'class':'fig'})
    

    paper['figures'] = get_biorxiv_figures(figures_html)

    # get references
    references_html = soup.find('div',attrs={'class':'section ref-list'}).find_all('li')
    paper['references'] = get_biorxiv_references(references_html)
    
    return paper

In [15]:
papers_df = df_meta[df_meta['source_x']=='biorxiv'][['sha','doi']]
papers_array = papers_df.to_numpy()

### Single Paper Test

In [16]:
row = papers_array[0]
print(row)

['f056da9c64fbf00a4645ae326e8a4339d015d155' '10.1101/001727']


In [142]:
paper = {}
paper['source'] = 'biorxiv'
paper['id'] = row[0]
paper['url'] = 'https://biorxiv.org/content/' + row[1].strip('doi.org') + 'v1.full'
print('Fetching paper data at {}'.format(paper['url']))
html = session.get(paper['url'])
tmp_soup = BeautifulSoup(html.content,'html.parser')
paper_webdata = biorxiv_html_paper_data_extractor(tmp_soup)
paper = dict(chain(paper.items(),paper_webdata.items()))

Fetching paper data at https://biorxiv.org/content/10.1101/001727v1.full


In [145]:
i = 0
for section in paper['sections']:
    print(i,section['title'], len(section['content']))
    i += 1

0 Abstract 1
1 Introduction 2
2 Methods 2
3 Results 4
4 Discussion 4
5 Appendix 1: Target Pathogen Database 308
6 Appendix 2: Viral Database 1717
7 Appendix 3: Bacterial Database 1068
8 References 0


In [149]:
paper['references']

[{'label': '',
  'authors': ['Ames SK',
   'Hysom DA',
   'Gardner SN',
   'Lloyd GS',
   'Gokhale MB',
   'Allen JE'],
  'date': '2013',
  'title': 'Scalable metagenomic taxonomy classification using a reference genome database',
  'links': [('CrossRef',
    '/lookup/external-ref?access_num=10.1093/bioinformatics/btt389&link_type=DOI'),
   ('PubMed',
    '/lookup/external-ref?access_num=23828782&link_type=MED&atom=%2Fbiorxiv%2Fearly%2F2014%2F01%2F10%2F001727.atom')]},
 {'label': '',
  'authors': ['Bazinet AL', 'Cummings MP'],
  'date': '2012',
  'title': 'A comparative evaluation of sequence classification programs',
  'links': [('CrossRef',
    '/lookup/external-ref?access_num=10.1186/1471-2105-13-92&link_type=DOI'),
   ('PubMed',
    '/lookup/external-ref?access_num=22574964&link_type=MED&atom=%2Fbiorxiv%2Fearly%2F2014%2F01%2F10%2F001727.atom')]},
 {'label': '',
  'authors': ['Berendzen J',
   'Bruno WJ',
   'Cohn JD',
   'Hengartner NW',
   'Kuske CR',
   'McMahon BH',
   'Wolinsky

In [132]:
section = explore_section(methods)

In [137]:
section['content'][0]['content'][0]

'Metagenomic classification methods are based on a wide variety of theoretical underpinnings. The basic varieties include alignment of reads to various nucleotide databases or exact matching to nucleotide or protein signature sequences (or kmers). A representative set of recent methods are described in [T1,Table 1] (also see [ref-2,Bazinet & Cummings 2012]).'

### Get All Papers

In [53]:
# get papers from data/biorxiv
existing_paper_ids = set([name.strip('.json') for name in get_file_names('data/biorxiv')])

# go through all papers
# new_paper_cnt = 0
for row in papers_array:
    if row[0] not in existing_paper_ids:
        paper = {}
        paper['source'] = 'biorxiv'
        paper['id'] = row[0]
        paper['url'] = 'https://biorxiv.org/content/' + row[1].strip('doi.org') + 'v1.full'
        print('Fetching paper data at {}'.format(paper['url']))
        html = session.get(paper['url'])
        tmp_soup = BeautifulSoup(html.content,'html.parser')
        paper_webdata = biorxiv_html_paper_data_extractor(tmp_soup)
        paper = dict(chain(paper.items(),paper_webdata.items()))    
        with open('data/biorxiv/{}.json'.format(paper['id']), 'w') as fp:
            json.dump(paper, fp, indent=4)
        print('Paper ID: {} saved to data/biorxiv.'.format(paper['id']))
        new_paper_cnt += 1
        print('-'*60)
print('Added {} papers to data/biorxiv.'.format(new_paper_cnt))

Fetching paper data at https://biorxiv.org/content/10.1101/010389v1.full
Paper ID: eccef80cfbe078235df22398f195d5db462d8000 saved to data/biorxiv.
------------------------------------------------------------
Fetching paper data at https://biorxiv.org/content/10.1101/012070v1.full
Paper ID: c41fdb2efd6d61384a92a84cbba3f8233629a41b saved to data/biorxiv.
------------------------------------------------------------
Fetching paper data at https://biorxiv.org/content/10.1101/018481v1.full
Paper ID: 33565294e6bc67fb7ee14dcae6cfdb08148f4ea5 saved to data/biorxiv.
------------------------------------------------------------
Fetching paper data at https://biorxiv.org/content/10.1101/027722v1.full
Paper ID: 1f9d3f9a1a0e8db6a086e0a2b5ba50cf9f235dae saved to data/biorxiv.
------------------------------------------------------------
Fetching paper data at https://biorxiv.org/content/10.1101/029397v1.full
Paper ID: 01e3b313e78a352593be2ff64927192af66619b5 saved to data/biorxiv.
---------------------

AttributeError: 'NoneType' object has no attribute 'text'

In [41]:
url = 'https://biorxiv.org/content/10.1101/010389v1.full'
html = session.get(url)
tmp_soup = BeautifulSoup(html.content,'html.parser')

In [51]:
sections = {}
for section in tmp_soup.find_all('div',attrs={'class':'section'}):
    section_header = section.find('h2')
    if section_header:
        section_name = section_header.text
    
    if section_header and section_name != 'References' and section_name != 'Acknowledgments':
        #print(section_name)
        sections[section_name] = ''
        for p in section.find_all('p'):
            sections[section_name] += p.text + '\n'
        #print(sections[section_name])
        #print()
print(sections)

{'Abstract': 'Background Developing methods to reconstruct transmission histories for viral outbreaks could provide critical information to support locating sources of disease transmission. Phylogenetic methods used to measure the degree of relatedness among sequenced viral samples have proven useful in identifying potential outbreak sources. The complex nature of infectious disease, however, makes it difficult to assign a rigorously defined quantitative confidence value assessing the likelihood of a true direct transmission event using genetic data alone.\nResults A new method is presented to calculate a confidence value assessing the likelihood of a transmission event using both phylogenetic inference and limited knowledge of incubation and infectious duration times. The method is applied to simulations of a foot and mouth disease (FMD) outbreak to demonstrate how the combination of both phylogenetic and epidemiology data can be used to strengthen the assessment of the likelihood of 

In [18]:
# get tables
tables = []
html_tables = tmp_soup.find_all('div',attrs={'class':'table'})
for html_table in html_tables:
    table = {}
    table['id'] = html_table['id']
    label = html_table.find('span',attrs={'class':'table-label'})
    if label:
        table['label'] = label.text
    caption = html_table.find('span',attrs={'class':'caption-title'})
    if caption:
        table['caption'] = caption.text
    table['description'] = ''
    for p in html_table.find_all('p'):
        table['description'] += p.text + '\n'
    table['image'] = None
    tables.append(table)

for table in tables:
    print(table)

{'id': 'T1', 'label': 'Table 1.', 'description': 'Summary of methods for metagenomic classification.\n', 'image': None}
{'id': 'T2', 'label': 'Table 2.', 'caption': 'The abundance of each target organism in each set of simulated datasets. Each set is indicated by the number in the top row, and was generated with 50 replicates.', 'description': '', 'image': None}


In [231]:
html_figures[0].find_all('p')

[<p class="first-child" id="p-9">A) For a group of strains belonging to two different species, some regions may be unique to each species (region 1), while other regions may be unique to strains within each species (regions 2 and 3). B) A set of reads are aligned to these genomes, and the ones that align in a species- or strain-specific manner are identified by the combination of genomes to which they align. In this example, Strain B of Species I is the organism identified.</p>]

## Scrape papers from medrxiv

In [None]:
# get papers from data/medrxiv
existing_paper_ids = set([name.strip('.json') for name in get_file_names('data/medrxiv')])

## Scrape papers from CZI
Journals: 1198

In [169]:
df_meta[df_meta['source_x']=='CZI']

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text
0,c630ebcdf30652f0422c3ec12a00b50241dc9bd9,CZI,Angiotensin-converting enzyme 2 (ACE2) as a SA...,10.1007/s00134-020-05985-9,,32125455.0,cc-by-nc,,2020,"Zhang, Haibo; Penninger, Josef M.; Li, Yimin; ...",Intensive Care Med,2.002765e+09,#3252,True
1,53eccda7977a31e3d0f565c884da036b1e85438e,CZI,Comparative genetic analysis of the novel coro...,10.1038/s41421-020-0147-1,,,cc-by,,2020,"Cao, Yanan; Li, Lin; Feng, Zhimin; Wan, Shengq...",Cell Discovery,3.003431e+09,#1861,True
2,210a892deb1c61577f6fba58505fd65356ce6636,CZI,Incubation Period and Other Epidemiological Ch...,10.3390/jcm9020538,,,cc-by,The geographic spread of 2019 novel coronaviru...,2020,"Linton, M. Natalie; Kobayashi, Tetsuro; Yang, ...",Journal of Clinical Medicine,3.006065e+09,#1043,True
3,e3b40cc8e0e137c416b4a2273a4dca94ae8178cc,CZI,Characteristics of and Public Health Responses...,10.3390/jcm9020575,,32093211.0,cc-by,"In December 2019, cases of unidentified pneumo...",2020,"Deng, Sheng-Qun; Peng, Hong-Juan",J Clin Med,1.776631e+08,#1999,True
4,92c2c9839304b4f2bc1276d41b1aa885d8b364fd,CZI,Imaging changes in severe COVID-19 pneumonia,10.1007/s00134-020-05976-w,,32125453.0,cc-by-nc,,2020,"Zhang, Wei",Intensive Care Med,3.006643e+09,#3242,False
5,0df0d5270a9399cf4e23c0cdd877a80616a9725e,CZI,An updated estimation of the risk of transmiss...,10.1016/j.idm.2020.02.001,,,cc-by-nc-nd,The basic reproduction number of an infectious...,2020,"Tang, Biao; Bragazzi, Nicola Luigi; Li, Qian; ...",Infectious Disease Modelling,3.006029e+09,#729,True
6,f24242580be243d5fc3f432915d86af6854bb8b7,CZI,Real-time forecasts of the 2019-nCoV epidemic ...,10.1016/j.idm.2020.02.002,,,cc-by-nc-nd,The initial cluster of severe pneumonia cases ...,2020,"Roosa, K.; Lee, Y.; Luo, R.; Kirpich, A.; Roth...",Infectious Disease Modelling,3.006029e+09,#865,True
7,d13a685f861b0f1ba05afa6e005311ad1820fd3a,CZI,RETRACTED: Chinese medical staff request inter...,10.1016/s2214-109x(20)30065-6,,32105614.0,cc-by,,2020,"Zeng, Yingchun; Zhen, Yan",The Lancet. Global health,2.627046e+09,#5386,False
8,e1b336d8be1a4c0ccc5a1bf41e48b3b004d3ece1,CZI,COVID-19 outbreak on the Diamond Princess crui...,10.1093/jtm/taaa030,,,cc-by-nc,Cruise ships carry a large number of people in...,2020,"Rocklöv, J.; Sjödin, H.; Wilder-Smith, A.",Journal of Travel Medicine,3.006304e+09,#2926,True
9,e9239100c5493ea914dc23c3d7a262f4326022ac,CZI,Distinct Roles for Sialoside and Protein Recep...,10.1128/mBio.02764-19,,,cc-by,Coronaviruses (CoVs) are common human and anim...,2020,"Qing, Enya; Hantak, Michael; Perlman, Stanley;...",mBio,3.005811e+09,#2427,True


## Scrape papers from PMC
Journals: 16593

In [176]:
papers_df = df_meta[(df_meta['source_x']=='PMC')&(df_meta['sha'].notnull())]
papers_array = papers_df[['sha','doi','pmcid','publish_time','journal']].to_numpy()

In [186]:
pmc_base_url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/'
for row in papers_array[:1]:
    paper = {}
    paper['id'] = row[0]
    paper['doi'] = row[1]
    paper['PMCID'] = row[2]
    paper['url'] = pmc_base_url + paper['PMCID']
    paper['date'] = row[3]
    paper['journal'] = row[4]
    print(paper['url'])
    

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1054884


In [187]:
paper['url']

'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1054884'

In [188]:
html = session.get(paper['url'])
tmp_soup = BeautifulSoup(html.content,'html.parser')

In [189]:
tmp_soup

<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
<head><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<!-- AppResources meta begin -->
<script type="text/javascript">var ncbi_startTime = new Date();</script>
<!-- AppResources meta end -->
<!-- TemplateResources meta begin -->
<meta content="" name="paf_template"/>
<!-- TemplateResources meta end -->
<!-- Logger begin -->
<meta content="pmc" name="ncbi_db"/><meta content="article" name="ncbi_pdid"/><meta content="" name="ncbi_acc"/><meta content="plosbiol" name="ncbi_domain"/><meta content="record" name="ncbi_report"/><meta content="fulltext" name="ncbi_type"/><meta content="" name="ncbi_objectid"/><meta content="/articles/PMC1054884/" name="ncbi_pcid"/><meta content="pmc" name="ncbi_app"/>
<!-- Logger end -->
<title>Recombination Every 

In [193]:
# title
tmp_soup.find('h1',attrs={'class':'content-title'}).text

'Recombination Every Day: Abundant Recombination in a Virus during a Single Multi-Cellular Host Infection'

In [200]:
# authors
authors = []
for a in tmp_soup.find('div',attrs={'class':'contrib-group fm-author'}).find_all('a'):
    authors.append(a.text)
print(authors)

['Remy Froissart', 'Denis Roze', 'Marilyne Uzest', 'Lionel Galibert', 'Stephane Blanc', 'Yannis Michalakis']


In [216]:
sections = {}
headers = tmp_soup.find_all('h2')
for header in headers:
    print(header.text)
    paragraphs = header.parent.find_all('p')
    for paragraph in paragraphs:
        print(paragraph.text)
    print()

Abstract
Viral recombination can dramatically impact evolution and epidemiology. In viruses, the recombination rate depends on the frequency of genetic exchange between different viral genomes within an infected host cell and on the frequency at which such co-infections occur. While the recombination rate has been recently evaluated in experimentally co-infected cell cultures for several viruses, direct quantification at the most biologically significant level, that of a host infection, is still lacking. This study fills this gap using the cauliflower mosaic virus as a model. We distributed four neutral markers along the viral genome, and co-inoculated host plants with marker-containing and wild-type viruses. The frequency of recombinant genomes was evaluated 21 d post-inoculation. On average, over 50% of viral genomes recovered after a single host infection were recombinants, clearly indicating that recombination is very frequent in this virus. Estimates of the recombination rate show

In [54]:
import random

hash = random.getrandbits(128)

print("hash value: %032x" % hash)

hash value: 4fee63fd9f20798a7ceea04311081650


# Test

In [106]:
source = 'biorxiv'
print("\nThis script only scrapes articles from {}, if not already downloaded.".format(source))

session = requests.session()

# import list of articles in data folder
existing_shas = [name.strip('.json') for name in get_file_names('data/{}'.format(source))]

# import metadata dataframe
df_meta = pd.read_csv('metadata.csv')

'''
sources = df_meta['source_x'].unique()
for source in sources:
    papers_df = df_meta[df_meta['source_x']==source][['sha','doi']]
    papers_array = papers_df.to_numpy()
    print('There are {} papers ids from {}.'.format(len(papers_array), source))
'''

missing_articles = df_meta[(df_meta['source_x']=='biorxiv')&(~df_meta['sha'].isin(existing_shas))]
print("{} articles to scrape".format(missing_articles.shape[0]))

papers_df = missing_articles[['sha','doi']]
papers_array = papers_df.to_numpy()
indexes = papers_df.index.to_numpy()


This script only scrapes articles from biorxiv, if not already downloaded.
581 articles to scrape


In [115]:
# keep track of index
i = 0
modified = False
for row in papers_array:
    paper = {}
    paper['source'] = source
    # if no sha => generate one that doesn't exist.
    if str(row[0]) == 'nan':
        paper['id'] = generate_sha(existing_shas)
        # save new hash and add to existing_shas
        existing_shas.append(paper['id'])
        df_meta.loc[indexes[i], 'sha'] = paper['id']
        modified = True
    else:
        paper['id'] = row[0]
    paper['url'] = 'https://biorxiv.org/content/' + row[1].strip('doi.org') + 'v1.full'
    if source == 'biorxiv':
        paper = scrape_biorxiv(paper)
    else:
        print('Error source not biorxiv. ({})'.format(source))
    print(paper['title'],paper['id'])
    '''
    with open('data/{}/{}.json'.format(source,paper['id'])):
        json.dump(paper, fp, indent=4)
    '''
    print('Paper ID: {} saved to data/{}'.format(paper['id'], source))
    print('-'*60)
    i += 1

# if dataframe was changed, save it to csv
if modified:
    df_meta.to_csv('new_metadata.csv')

Fetching paper data at https://biorxiv.org/content/10.1101/030742v1.full
What’s in my pot? Real-time species identification on the MinION™ 4e5ebec665ed176a51fac9eda002eb08
Paper ID: 4e5ebec665ed176a51fac9eda002eb08 saved to data/biorxiv
------------------------------------------------------------
Fetching paper data at https://biorxiv.org/content/10.1101/060434v1.full
The ATP synthase subunit β (ATP5B) is an entry factor for the hepatitis E virus 7b3cb4af46c7492fb2cdd23e6e3bd751
Paper ID: 7b3cb4af46c7492fb2cdd23e6e3bd751 saved to data/biorxiv
------------------------------------------------------------
Fetching paper data at https://biorxiv.org/content/10.1101/073098v1.full


AttributeError: 'NoneType' object has no attribute 'text'

In [114]:
df_meta.to_csv('new_metadata.csv')

In [109]:
df_meta.loc[indexes[i], 'sha']

nan