In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import yaml
from itertools import chain
import os

In [2]:
def get_file_names(path):
    """Return the names of all entries found directly inside ``path``.

    The names are whatever ``os.listdir`` reports (no directory prefix), in
    the order it reports them.
    """
    return [entry for entry in os.listdir(path)]

# Scraper

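Scraper that fetches article content from the web (title, authors, sections, tables, figures, references) using the identifiers listed in the dataset metadata, such as each paper's DOI or PMCID.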

In [3]:
# init session: a single HTTP session shared by every request in this notebook.
# `requests.Session()` is the supported constructor; the lowercase
# `requests.session()` helper is a deprecated alias kept only for backwards
# compatibility.
session = requests.Session()

In [4]:
# import configurations
# yaml.safe_load builds only plain YAML types. Calling yaml.load without an
# explicit Loader is deprecated (PyYAML 5.x warns, 6.x raises TypeError) and can
# construct arbitrary Python objects from the file.
with open('config.yaml','r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)

  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
# Load the all-sources metadata CSV into df_meta.
# cfg['data-path'] is concatenated directly with the relative file name, so it
# presumably has to end with a path separator — TODO confirm in config.yaml.
source = 'biorxiv'
# NOTE: despite its name, `url` is simply the location string handed to pd.read_csv.
url = cfg['data-path'] + '2020-03-13/all_sources_metadata_2020-03-13.csv'
df_meta = pd.read_csv(url)

In [6]:
# Distinct values of the `source_x` column; the loop in the next cell iterates over them.
sources = df_meta['source_x'].unique()

In [149]:
# Report how many (sha, doi) rows the metadata holds for each source.
for source in sources:
    papers_df = df_meta.loc[df_meta['source_x'] == source, ['sha', 'doi']]
    papers_array = papers_df.to_numpy()
    print('There are {} paper ids from {}.'.format(papers_array.shape[0], source))

There are 1236 paper ids from CZI.
There are 27337 paper ids from PMC.
There are 566 paper ids from biorxiv.
There are 361 paper ids from medrxiv.


In [160]:
# Per source, count the rows whose `journal` value is present (count() skips missing values).
df_meta.groupby(['source_x'])['journal'].count()

source_x
CZI         1198
PMC        16593
biorxiv        0
medrxiv        0
Name: journal, dtype: int64

In [167]:
# Inspect the metadata rows whose source is PMC.
df_meta[df_meta['source_x']=='PMC']

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text
1236,03203ab50eb64271a9e825f94a1b1a6c46ea14b3,PMC,Recombination Every Day: Abundant Recombinatio...,http://dx.doi.org/10.1371/journal.pbio.0030089,PMC1054884,15737066.0,CC BY,Viral recombination can dramatically impact ev...,2005 Mar 1,"['Froissart, Remy', 'Roze, Denis', 'Uzest, Mar...",PLoS Biol,,,True
1237,98a3b0606a67d829816c1d934e2d1a7196985151,PMC,Prospective evaluation of an internet-linked h...,http://dx.doi.org/10.1186/cc2967,PMC1065064,15566586.0,CC BY,INTRODUCTION: Critical care physicians may ben...,2004 Oct 14,"['Lapinsky, Stephen E', 'Wax, Randy', 'Showalt...",Crit Care,,,True
1238,d617306cda56236d02117ae7a5fc5e7fcd015554,PMC,Subversion of Cellular Autophagosomal Machiner...,http://dx.doi.org/10.1371/journal.pbio.0030156,PMC1084330,15884975.0,CC BY,Infection of human cells with poliovirus induc...,2005 May 26,"['Jackson, William T', 'Giddings, Thomas H', '...",PLoS Biol,,,True
1239,d619c3ceec4db4f3f350c3d5fb3842bd83f04a80,PMC,The immediate effects of the severe acute resp...,http://dx.doi.org/10.1186/1471-2458-5-30,PMC1084353,15804368.0,CC BY,BACKGROUND: When an emerging infectious diseas...,2005 Apr 4,"['Lee, Cheng-Hua', 'Huang, Nicole', 'Chang, Ho...",BMC Public Health,,,True
1240,d0c6b0c2d387baae89eb2898969913218b3bedff,PMC,Molecular advances in the cell biology of SARS...,http://dx.doi.org/10.1186/1743-422X-2-35,PMC1087510,15833113.0,CC BY,"In the aftermath of the SARS epidemic, there h...",2005 Apr 15,"['Stark, Caren J', 'Atreya, CD']",Virol J,,,True
1241,e6184d2db86268ba31e49b03a5aab475d5ce5ca6,PMC,Individual sequences in large sets of gene seq...,http://dx.doi.org/10.1186/1471-2105-6-90,PMC1090557,15817134.0,CC BY,BACKGROUND: Most current DNA diagnostic tests ...,2005 Apr 8,"['Gibbs, Mark J', 'Armstrong, John S', 'Gibbs,...",BMC Bioinformatics,,,True
1242,c6008b68c8b16e3a6a48a2cb892bac5c9353df86,PMC,Absence of association between angiotensin con...,http://dx.doi.org/10.1186/1471-2334-5-26,PMC1090578,15819995.0,CC BY,BACKGROUND: It has been postulated that geneti...,2005 Apr 9,"['Chan, KC Allen', 'Tang, Nelson LS', 'Hui, Da...",BMC Infect Dis,,,True
1243,6f07f87e8ef78f0416556e69c88247e588f9192c,PMC,A Three-Stemmed mRNA Pseudoknot in the SARS Co...,http://dx.doi.org/10.1371/journal.pbio.0030172,PMC1110908,15884978.0,CC BY,A wide range of RNA viruses use programmed −1 ...,2005 Jun 17,"['Plant, Ewan P', 'Pérez-Alvarado, Gabriela C'...",PLoS Biol,,,True
1244,99b74061d99f96f6842cf3efea27058d680ed188,PMC,New Frameshifting Pseudoknot Found in SARS Virus,http://dx.doi.org/10.1371/journal.pbio.0030199,PMC1110910,,CC BY,,2005 Jun 17,,PLoS Biol,,,False
1245,9ffde004c991e9cf3c63e9143946a64ffaa9ee2a,PMC,The Microbial Rosetta Stone Database: A compil...,http://dx.doi.org/10.1186/1471-2180-5-19,PMC1127111,15850481.0,CC BY,BACKGROUND: Thousands of different microorgani...,2005 Apr 25,"['Ecker, David J', 'Sampath, Rangarajan', 'Wil...",BMC Microbiol,,,True


In [158]:
# Print every distinct value of the `journal` column, one per line.
journals = df_meta['journal'].unique()
for journal in journals:
    print(journal)

Intensive Care Med
Cell Discovery
Journal of Clinical Medicine
J Clin Med
Infectious Disease Modelling
The Lancet. Global health
Journal of Travel Medicine
mBio
Global Health Research and Policy
Eurosurveillance
Bioinformatics (Oxford, England)
Journal of Korean Medical Science
Pathogens
EClinicalMedicine
Infection, Genetics and Evolution
Microbes and Infection
Cell Research
Journal of travel medicine
Hong Kong Medical Journal
Innovative Biosystems and Bioengineering
Precision Clinical Medicine
Frontiers in Microbiology
International Journal of Environmental Research and Public Health
Protein Cell
Critical Care
Osong Public Health and Research Perspectives
Viruses
Intensive Care Medicine
Data in Brief
Epidemiol Infect
Nature
National Science Review
Biosafety and Health
International Journal of Infectious Diseases
The Lancet Global Health
Infection Control & Hospital Epidemiology
Journal of Immunology Research
Clinical & Translational Immunology
The Lancet Planetary Health
Chin Med J (E

Healthc Policy
Ecol Lett
J Community Hosp Intern Med Perspect
Cell Host Microbe
J Pharmacopuncture
Iran J Basic Med Sci
J Ginseng Res
Clin Exp Emerg Med
J Virus Erad.; 2(Suppl 1):21-52
Int J Cancer
Cell Biochem Funct
J Clin Transl Hepatol
World J Gastroenterol
World J Clin Pediatr
Antiviral Res
Cold Spring Harb Mol Case Stud
Asia Pac J Oncol Nurs
J Autoimmun
Indian Heart J
Mutagenesis
Ann Rehabil Med
Iran Red Crescent Med J
World J Crit Care Med
Korean J Thorac Cardiovasc Surg
Intrinsically Disord Proteins
Disaster Health
Anatol J Cardiol
Intern Med
Curr Drug Targets
J Public Health Africa
J Asthma Allergy
World J Clin Cases
JFMS Open Rep
Int J Surg Case Rep
Reumatologia
Local Reg Anesth
J Cell Death
Curr HIV Res
Comb Chem High Throughput Screen
Mult Scler J Exp Transl Clin
Respir Med Case Rep
Autops Case Rep
Saudi Pharm J
Haematologica
Health Hum Rights
Glob Pediatr Health
Kidney Res Clin Pract
Clin Case Rep
Viral Immunol
Curr Ther Res Clin Exp
Acad Pathol
J Clin Neurol
Ann Neurol
J P

In [156]:
# Inspect the metadata rows whose source is CZI.
# The former extra test `source_x != 'NaN'` compared against the literal string
# 'NaN' and was already implied by `source_x == 'CZI'`, so it never filtered
# anything. Missing values have to be tested with .notnull() / .isnull().
df_meta[df_meta['source_x'] == 'CZI']

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text
0,c630ebcdf30652f0422c3ec12a00b50241dc9bd9,CZI,Angiotensin-converting enzyme 2 (ACE2) as a SA...,10.1007/s00134-020-05985-9,,32125455.0,cc-by-nc,,2020,"Zhang, Haibo; Penninger, Josef M.; Li, Yimin; ...",Intensive Care Med,2.002765e+09,#3252,True
1,53eccda7977a31e3d0f565c884da036b1e85438e,CZI,Comparative genetic analysis of the novel coro...,10.1038/s41421-020-0147-1,,,cc-by,,2020,"Cao, Yanan; Li, Lin; Feng, Zhimin; Wan, Shengq...",Cell Discovery,3.003431e+09,#1861,True
2,210a892deb1c61577f6fba58505fd65356ce6636,CZI,Incubation Period and Other Epidemiological Ch...,10.3390/jcm9020538,,,cc-by,The geographic spread of 2019 novel coronaviru...,2020,"Linton, M. Natalie; Kobayashi, Tetsuro; Yang, ...",Journal of Clinical Medicine,3.006065e+09,#1043,True
3,e3b40cc8e0e137c416b4a2273a4dca94ae8178cc,CZI,Characteristics of and Public Health Responses...,10.3390/jcm9020575,,32093211.0,cc-by,"In December 2019, cases of unidentified pneumo...",2020,"Deng, Sheng-Qun; Peng, Hong-Juan",J Clin Med,1.776631e+08,#1999,True
4,92c2c9839304b4f2bc1276d41b1aa885d8b364fd,CZI,Imaging changes in severe COVID-19 pneumonia,10.1007/s00134-020-05976-w,,32125453.0,cc-by-nc,,2020,"Zhang, Wei",Intensive Care Med,3.006643e+09,#3242,False
5,0df0d5270a9399cf4e23c0cdd877a80616a9725e,CZI,An updated estimation of the risk of transmiss...,10.1016/j.idm.2020.02.001,,,cc-by-nc-nd,The basic reproduction number of an infectious...,2020,"Tang, Biao; Bragazzi, Nicola Luigi; Li, Qian; ...",Infectious Disease Modelling,3.006029e+09,#729,True
6,f24242580be243d5fc3f432915d86af6854bb8b7,CZI,Real-time forecasts of the 2019-nCoV epidemic ...,10.1016/j.idm.2020.02.002,,,cc-by-nc-nd,The initial cluster of severe pneumonia cases ...,2020,"Roosa, K.; Lee, Y.; Luo, R.; Kirpich, A.; Roth...",Infectious Disease Modelling,3.006029e+09,#865,True
7,d13a685f861b0f1ba05afa6e005311ad1820fd3a,CZI,RETRACTED: Chinese medical staff request inter...,10.1016/s2214-109x(20)30065-6,,32105614.0,cc-by,,2020,"Zeng, Yingchun; Zhen, Yan",The Lancet. Global health,2.627046e+09,#5386,False
8,e1b336d8be1a4c0ccc5a1bf41e48b3b004d3ece1,CZI,COVID-19 outbreak on the Diamond Princess crui...,10.1093/jtm/taaa030,,,cc-by-nc,Cruise ships carry a large number of people in...,2020,"Rocklöv, J.; Sjödin, H.; Wilder-Smith, A.",Journal of Travel Medicine,3.006304e+09,#2926,True
9,e9239100c5493ea914dc23c3d7a262f4326022ac,CZI,Distinct Roles for Sialoside and Protein Recep...,10.1128/mBio.02764-19,,,cc-by,Coronaviruses (CoVs) are common human and anim...,2020,"Qing, Enya; Hantak, Michael; Perlman, Stanley;...",mBio,3.005811e+09,#2427,True


In [140]:
# Inspect the metadata rows whose source is medrxiv.
df_meta[df_meta['source_x']=='medrxiv']

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text
29139,090b6c8b3df30bc248221869f673a2d970caa1b9,medrxiv,,doi.org/10.1101/19008417,,,See https://www.medrxiv.org/submit-a-manuscript,,,,,,,True
29140,10525ac89e46be4cb9cb9fd1131d28411a902047,medrxiv,,doi.org/10.1101/19011940,,,See https://www.medrxiv.org/submit-a-manuscript,,,,,,,True
29141,b0899a264af548a89d649154aec569889717b295,medrxiv,,doi.org/10.1101/2019.12.17.19013490,,,See https://www.medrxiv.org/submit-a-manuscript,,,,,,,True
29142,e93ba7d8a047795d5ec114741f32f5b18e8567c7,medrxiv,,doi.org/10.1101/2020.01.15.19015693,,,See https://www.medrxiv.org/submit-a-manuscript,,,,,,,True
29143,6cb02c7565f6f74a3d165e14196de5e9e87d2d04,medrxiv,,doi.org/10.1101/2020.01.23.20018549,,,See https://www.medrxiv.org/submit-a-manuscript,,,,,,,True
29144,cbc05d14c57b91081970a232ab83bc993f998fe2,medrxiv,,doi.org/10.1101/2020.01.26.20018754,,,See https://www.medrxiv.org/submit-a-manuscript,,,,,,,True
29145,fdf006a84a946f24f5905dcea8c5c1ee266c26d2,medrxiv,,doi.org/10.1101/2020.01.26.20018887,,,See https://www.medrxiv.org/submit-a-manuscript,,,,,,,False
29146,25d49e49a4cb420ec8ed2a703c0ed88d7cd5d0d0,medrxiv,,doi.org/10.1101/2020.01.27.20018952,,,See https://www.medrxiv.org/submit-a-manuscript,,,,,,,False
29147,12fac9aedb1a09a3922a3c084ce4723708e463d6,medrxiv,,doi.org/10.1101/2020.01.27.20018986,,,See https://www.medrxiv.org/submit-a-manuscript,,,,,,,True
29148,30b5c7b8faf95265f67ae59f4686eaf9b2772893,medrxiv,,doi.org/10.1101/2020.01.28.20019224,,,See https://www.medrxiv.org/submit-a-manuscript,,,,,,,True


## Scrape papers from bioRxiv

In [26]:
def _node_text(parent, name, css_class):
    """Return the text of the first `name` tag with class `css_class` under `parent`, or None."""
    if parent is None:
        return None
    node = parent.find(name, attrs={'class': css_class})
    return node.text if node is not None else None


def _join_name_parts(*parts):
    """Join the name parts that are present with single spaces."""
    return ' '.join(part for part in parts if part)


def _biorxiv_authors(soup):
    """Collect the paper's authors as 'given-names surname' strings."""
    container = soup.find('span', attrs={'class': 'highwire-citation-authors'})
    if container is None:
        return []
    authors = []
    for node in container.find_all('span', attrs={'class': 'highwire-citation-author'}):
        full_name = _join_name_parts(_node_text(node, 'span', 'nlm-given-names'),
                                     _node_text(node, 'span', 'nlm-surname'))
        if full_name:
            authors.append(full_name)
    return authors


def _biorxiv_sections(soup):
    """Map each section heading (except 'References') to its paragraph text."""
    sections = {}
    for section in soup.find_all('div', attrs={'class': 'section'}):
        heading = section.find('h2')
        if heading is None:
            # Without a heading there is no key to store the text under.
            continue
        section_name = heading.text
        if section_name == 'References':
            # References are parsed separately into structured records.
            continue
        sections[section_name] = ''.join(p.text + '\n' for p in section.find_all('p'))
    return sections


def _biorxiv_tables(soup):
    """Collect id, label, caption and description of every table container."""
    tables = []
    for html_table in soup.find_all('div', attrs={'class': 'table'}):
        table = {'id': html_table.get('id')}
        label = _node_text(html_table, 'span', 'table-label')
        if label is not None:
            table['label'] = label
        caption = _node_text(html_table, 'span', 'caption-title')
        if caption is not None:
            table['caption'] = caption
        table['description'] = ''.join(p.text + '\n' for p in html_table.find_all('p'))
        table['image'] = None
        tables.append(table)
    return tables


def _biorxiv_figures(soup):
    """Collect label, title, description and image URL of every figure container."""
    figures = []
    for html_figure in soup.find_all('div', attrs={'class': 'fig'}):
        figure = {}
        label = _node_text(html_figure, 'span', 'fig-label')
        if label is not None:
            figure['label'] = label
        title = _node_text(html_figure, 'span', 'caption-title')
        if title is not None:
            figure['title'] = title
        figure['description'] = ''.join(p.text + '\n' for p in html_figure.find_all('p'))
        figure_img = html_figure.find('img')
        if figure_img is not None:
            img_url = figure_img.get('data-src')
            if img_url is not None:
                figure['img'] = img_url
        figures.append(figure)
    return figures


def _biorxiv_references(soup):
    """Parse the reference list into dicts (label, authors, date, title, links)."""
    ref_list = soup.find('div', attrs={'class': 'section ref-list'})
    if ref_list is None:
        return []
    references = []
    for ref in ref_list.find_all('li'):
        cite = ref.find('cite')
        reference = {'label': _node_text(ref, 'span', 'ref-label')}

        # authors, stored as 'surname given-names'
        author_names = []
        if cite is not None:
            for author in cite.find_all('span', attrs={'class': 'cit-auth'}):
                author_name = _join_name_parts(
                    _node_text(author, 'span', 'cit-name-surname'),
                    _node_text(author, 'span', 'cit-name-given-names'))
                if author_name:
                    author_names.append(author_name)
        reference['authors'] = author_names

        reference['date'] = _node_text(cite, 'span', 'cit-pub-date')

        ref_title = _node_text(cite, 'span', 'cit-article-title')
        reference['title'] = ref_title if ref_title is not None else ''

        # links: every anchor in the extra block except the generic 'OpenUrl' one
        ref_links = []
        extra = ref.find('div', attrs={'class': 'cit-extra'})
        if extra is not None:
            for link in extra.find_all('a'):
                if link.text == 'OpenUrl':
                    continue
                link_url = link.get('href')
                if link_url is not None:
                    ref_links.append((link.text, link_url))
        reference['links'] = ref_links
        references.append(reference)
    return references


def biorxiv_html_paper_data_extractor(soup):
    '''
        Extract paper information from html webpage of biorxiv

        Parameters
        ----------
        soup : parsed page (BeautifulSoup object) of a bioRxiv full-text article.
            Only this argument is read; the function no longer depends on a
            notebook-level `tmp_soup` variable.

        Returns
        -------
        dict with the keys
            'title'      : text of the first <h1>, or None when there is none
            'authors'    : list of 'given-names surname' strings
            'sections'   : dict mapping section heading -> paragraph text
            'tables'     : list of dicts (id, optional label/caption, description, image)
            'figures'    : list of dicts (optional label/title, description, optional img)
            'references' : list of dicts (label, authors, date, title, links)

        Parts of the page that are missing yield None / empty containers instead
        of raising, so callers should check the result (e.g. an empty title)
        before treating a page as successfully parsed.
    '''
    title_tag = soup.find('h1')
    return {
        'title': title_tag.text if title_tag is not None else None,
        'authors': _biorxiv_authors(soup),
        'sections': _biorxiv_sections(soup),
        'tables': _biorxiv_tables(soup),
        'figures': _biorxiv_figures(soup),
        'references': _biorxiv_references(soup),
    }

In [8]:
# Restrict the work list to bioRxiv: papers_array holds one [sha, doi] row per paper.
source = 'biorxiv'
papers_df = df_meta[df_meta['source_x']==source][['sha','doi']]
papers_array = papers_df.to_numpy()

In [218]:
# get papers from data/biorxiv
# Only a real '.json' suffix is removed from each file name. str.strip('.json')
# strips any of the characters '.', 'j', 's', 'o', 'n' from BOTH ends, which
# only happened to work because the ids are hexadecimal.
existing_paper_ids = set(
    name[:-len('.json')] if name.endswith('.json') else name
    for name in get_file_names('data/biorxiv')
)

# go through all papers (the [:1] slice limits the run to the first row)
new_paper_cnt = 0
for row in papers_array[:1]:
    paper_id, doi = row[0], row[1]
    if paper_id in existing_paper_ids:
        continue
    if not isinstance(doi, str):
        # A missing DOI is read by pandas as NaN (a float); no URL can be built.
        print('Skipping paper ID: {} (no DOI).'.format(paper_id))
        continue

    # Drop a literal leading 'doi.org' (str.strip would strip a character set
    # from both ends) and make sure the path starts with exactly one '/'.
    doi_path = doi[len('doi.org'):] if doi.startswith('doi.org') else doi
    doi_path = '/' + doi_path.lstrip('/')

    paper = {}
    paper['source'] = 'biorxiv'
    paper['id'] = paper_id
    paper['url'] = 'https://biorxiv.org/content' + doi_path + 'v1.full'
    print('Fetching paper data at {}'.format(paper['url']))
    html = session.get(paper['url'])
    if not html.ok:
        # Do not parse/save an error page: the file would mark the paper as done.
        print('Skipping paper ID: {} (HTTP {}).'.format(paper['id'], html.status_code))
        continue
    tmp_soup = BeautifulSoup(html.content,'html.parser')
    paper_webdata = biorxiv_html_paper_data_extractor(tmp_soup)
    # Scraped fields are added after (and override) the identifying fields.
    paper.update(paper_webdata)
    with open('data/biorxiv/{}.json'.format(paper['id']), 'w') as fp:
        json.dump(paper, fp, indent=4)
    print('Paper ID: {} saved to data/biorxiv.'.format(paper['id']))
    new_paper_cnt += 1
print('Added {} papers to data/biorxiv.'.format(new_paper_cnt))

Fetching paper data at https://biorxiv.org/content/10.1101/001727v1.full
Paper ID: f056da9c64fbf00a4645ae326e8a4339d015d155 saved to data/biorxiv.
Added 1 papers to data/biorxiv.


In [13]:
# Fetch and parse the first bioRxiv paper's full-text page for the exploratory cells below.
doi = papers_array[0][1]
# Remove only a literal leading 'doi.org'; str.strip('doi.org') would strip any of
# the characters 'd', 'o', 'i', '.', 'r', 'g' from both ends of the DOI.
if doi.startswith('doi.org'):
    doi = doi[len('doi.org'):]
url = 'https://biorxiv.org/content' + doi + 'v1.full'
html = session.get(url)
tmp_soup = BeautifulSoup(html.content,'html.parser')

In [25]:
# get figures: one dict per <div class="fig"> of the parsed page in tmp_soup
figures = []
html_figures = tmp_soup.find_all('div',attrs={'class':'fig'})
for html_figure in html_figures:
    figure = {}
    figure_label = html_figure.find('span',attrs={'class':'fig-label'})
    if figure_label:
        figure['label'] = figure_label.text
    figure_title = html_figure.find('span',attrs={'class':'caption-title'})
    if figure_title:
        # Was `title.text`: `title` is not defined anywhere, so any figure that
        # has a caption title raised NameError.
        figure['title'] = figure_title.text
    figure['description'] = ''
    figure_ps = html_figure.find_all('p')
    for p in figure_ps:
        figure['description'] += p.text + '\n'
    figure_img = html_figure.find('img')
    if figure_img:
        figure['img'] = figure_img['data-src']
    figures.append(figure)

for figure in figures:
    print(figure)
    print()

{'label': 'Figure 1.', 'description': 'A) For a group of strains belonging to two different species, some regions may be unique to each species (region 1), while other regions may be unique to strains within each species (regions 2 and 3). B) A set of reads are aligned to these genomes, and the ones that align in a species- or strain-specific manner are identified by the combination of genomes to which they align. In this example, Strain B of Species I is the organism identified.\n', 'img': 'https://www.biorxiv.org/content/biorxiv/early/2014/01/10/001727/F1.medium.gif'}

{'label': 'Figure 2.', 'description': 'Relationship of reported value for each program (horizontal axis, log scale) to the empirically-determined Positive Predictive Value (PPV), shown on the vertical axis. While the exact values depend on the test data used, the general values at significant cutoff values (0.8, 0.9, 0.95 PPV) remain relatively constant across different datasets (data not shown).\n', 'img': 'https://ww

In [18]:
# get tables: one dict per <div class="table"> of the parsed page in tmp_soup
html_tables = tmp_soup.find_all('div',attrs={'class':'table'})
tables = []
for html_table in html_tables:
    table = {'id': html_table['id']}
    label = html_table.find('span',attrs={'class':'table-label'})
    if label is not None:
        table['label'] = label.text
    caption = html_table.find('span',attrs={'class':'caption-title'})
    if caption is not None:
        table['caption'] = caption.text
    # Description is the text of every paragraph, each followed by a newline.
    table['description'] = ''.join(p.text + '\n' for p in html_table.find_all('p'))
    table['image'] = None
    tables.append(table)

for table in tables:
    print(table)

{'id': 'T1', 'label': 'Table 1.', 'description': 'Summary of methods for metagenomic classification.\n', 'image': None}
{'id': 'T2', 'label': 'Table 2.', 'caption': 'The abundance of each target organism in each set of simulated datasets. Each set is indicated by the number in the top row, and was generated with 50 replicates.', 'description': '', 'image': None}


In [231]:
# Paragraph tags inside the first figure container; relies on `html_figures`
# having been populated by the figures cell.
html_figures[0].find_all('p')

[<p class="first-child" id="p-9">A) For a group of strains belonging to two different species, some regions may be unique to each species (region 1), while other regions may be unique to strains within each species (regions 2 and 3). B) A set of reads are aligned to these genomes, and the ones that align in a species- or strain-specific manner are identified by the combination of genomes to which they align. In this example, Strain B of Species I is the organism identified.</p>]

## Scrape papers from medrxiv

In [None]:
# get papers from data/medrxiv
# Only a real '.json' suffix is removed from each file name. str.strip('.json')
# strips any of the characters '.', 'j', 's', 'o', 'n' from BOTH ends of the name.
existing_paper_ids = set(
    name[:-len('.json')] if name.endswith('.json') else name
    for name in get_file_names('data/medrxiv')
)

## Scrape papers from CZI
Journals: 1198

In [169]:
# Inspect the metadata rows whose source is CZI.
df_meta[df_meta['source_x']=='CZI']

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text
0,c630ebcdf30652f0422c3ec12a00b50241dc9bd9,CZI,Angiotensin-converting enzyme 2 (ACE2) as a SA...,10.1007/s00134-020-05985-9,,32125455.0,cc-by-nc,,2020,"Zhang, Haibo; Penninger, Josef M.; Li, Yimin; ...",Intensive Care Med,2.002765e+09,#3252,True
1,53eccda7977a31e3d0f565c884da036b1e85438e,CZI,Comparative genetic analysis of the novel coro...,10.1038/s41421-020-0147-1,,,cc-by,,2020,"Cao, Yanan; Li, Lin; Feng, Zhimin; Wan, Shengq...",Cell Discovery,3.003431e+09,#1861,True
2,210a892deb1c61577f6fba58505fd65356ce6636,CZI,Incubation Period and Other Epidemiological Ch...,10.3390/jcm9020538,,,cc-by,The geographic spread of 2019 novel coronaviru...,2020,"Linton, M. Natalie; Kobayashi, Tetsuro; Yang, ...",Journal of Clinical Medicine,3.006065e+09,#1043,True
3,e3b40cc8e0e137c416b4a2273a4dca94ae8178cc,CZI,Characteristics of and Public Health Responses...,10.3390/jcm9020575,,32093211.0,cc-by,"In December 2019, cases of unidentified pneumo...",2020,"Deng, Sheng-Qun; Peng, Hong-Juan",J Clin Med,1.776631e+08,#1999,True
4,92c2c9839304b4f2bc1276d41b1aa885d8b364fd,CZI,Imaging changes in severe COVID-19 pneumonia,10.1007/s00134-020-05976-w,,32125453.0,cc-by-nc,,2020,"Zhang, Wei",Intensive Care Med,3.006643e+09,#3242,False
5,0df0d5270a9399cf4e23c0cdd877a80616a9725e,CZI,An updated estimation of the risk of transmiss...,10.1016/j.idm.2020.02.001,,,cc-by-nc-nd,The basic reproduction number of an infectious...,2020,"Tang, Biao; Bragazzi, Nicola Luigi; Li, Qian; ...",Infectious Disease Modelling,3.006029e+09,#729,True
6,f24242580be243d5fc3f432915d86af6854bb8b7,CZI,Real-time forecasts of the 2019-nCoV epidemic ...,10.1016/j.idm.2020.02.002,,,cc-by-nc-nd,The initial cluster of severe pneumonia cases ...,2020,"Roosa, K.; Lee, Y.; Luo, R.; Kirpich, A.; Roth...",Infectious Disease Modelling,3.006029e+09,#865,True
7,d13a685f861b0f1ba05afa6e005311ad1820fd3a,CZI,RETRACTED: Chinese medical staff request inter...,10.1016/s2214-109x(20)30065-6,,32105614.0,cc-by,,2020,"Zeng, Yingchun; Zhen, Yan",The Lancet. Global health,2.627046e+09,#5386,False
8,e1b336d8be1a4c0ccc5a1bf41e48b3b004d3ece1,CZI,COVID-19 outbreak on the Diamond Princess crui...,10.1093/jtm/taaa030,,,cc-by-nc,Cruise ships carry a large number of people in...,2020,"Rocklöv, J.; Sjödin, H.; Wilder-Smith, A.",Journal of Travel Medicine,3.006304e+09,#2926,True
9,e9239100c5493ea914dc23c3d7a262f4326022ac,CZI,Distinct Roles for Sialoside and Protein Recep...,10.1128/mBio.02764-19,,,cc-by,Coronaviruses (CoVs) are common human and anim...,2020,"Qing, Enya; Hantak, Michael; Perlman, Stanley;...",mBio,3.005811e+09,#2427,True


## Scrape papers from PMC
Journals: 16593

In [176]:
# PMC rows that have a non-null `sha`, reduced to the columns used to build a paper record.
# Column order here fixes the positional indices (row[0]..row[4]) used in the next cell.
papers_df = df_meta[(df_meta['source_x']=='PMC')&(df_meta['sha'].notnull())]
papers_array = papers_df[['sha','doi','pmcid','publish_time','journal']].to_numpy()

In [186]:
# Build the record skeleton for the first PMC row and print its article URL.
# `paper` stays defined after the loop and is read by the following cells.
pmc_base_url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/'
for row in papers_array[:1]:
    paper = {}
    paper['id'] = row[0]
    paper['doi'] = row[1]
    paper['PMCID'] = row[2]
    # The article URL is the base URL followed directly by the PMCID.
    paper['url'] = pmc_base_url + paper['PMCID']
    paper['date'] = row[3]
    paper['journal'] = row[4]
    print(paper['url'])
    

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1054884


In [187]:
# URL of the PMC record built in the previous cell.
paper['url']

'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1054884'

In [188]:
# Download the PMC article page and parse it.
# NOTE: this overwrites `html` and `tmp_soup`, which the bioRxiv cells also assign.
html = session.get(paper['url'])
tmp_soup = BeautifulSoup(html.content,'html.parser')

In [189]:
# Display the whole parsed PMC document (produces a very long output).
tmp_soup

<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
<head><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<!-- AppResources meta begin -->
<script type="text/javascript">var ncbi_startTime = new Date();</script>
<!-- AppResources meta end -->
<!-- TemplateResources meta begin -->
<meta content="" name="paf_template"/>
<!-- TemplateResources meta end -->
<!-- Logger begin -->
<meta content="pmc" name="ncbi_db"/><meta content="article" name="ncbi_pdid"/><meta content="" name="ncbi_acc"/><meta content="plosbiol" name="ncbi_domain"/><meta content="record" name="ncbi_report"/><meta content="fulltext" name="ncbi_type"/><meta content="" name="ncbi_objectid"/><meta content="/articles/PMC1054884/" name="ncbi_pcid"/><meta content="pmc" name="ncbi_app"/>
<!-- Logger end -->
<title>Recombination Every 

In [193]:
# title: text of the <h1 class="content-title"> element of the PMC page
tmp_soup.find('h1',attrs={'class':'content-title'}).text

'Recombination Every Day: Abundant Recombination in a Virus during a Single Multi-Cellular Host Infection'

In [200]:
# authors: text of every link inside the author contribution block of the PMC page
author_block = tmp_soup.find('div',attrs={'class':'contrib-group fm-author'})
authors = [link.text for link in author_block.find_all('a')]
print(authors)

['Remy Froissart', 'Denis Roze', 'Marilyne Uzest', 'Lionel Galibert', 'Stephane Blanc', 'Yannis Michalakis']


In [216]:
# Walk every <h2> heading of the PMC page and print the text of all <p> tags
# found under the heading's parent element, followed by a blank line.
# NOTE(review): `sections` is created here but nothing in this cell fills it.
sections = {}
headers = tmp_soup.find_all('h2')
for header in headers:
    print(header.text)
    # Paragraphs are looked up from the heading's parent, not the heading itself.
    paragraphs = header.parent.find_all('p')
    for paragraph in paragraphs:
        print(paragraph.text)
    print()

Abstract
Viral recombination can dramatically impact evolution and epidemiology. In viruses, the recombination rate depends on the frequency of genetic exchange between different viral genomes within an infected host cell and on the frequency at which such co-infections occur. While the recombination rate has been recently evaluated in experimentally co-infected cell cultures for several viruses, direct quantification at the most biologically significant level, that of a host infection, is still lacking. This study fills this gap using the cauliflower mosaic virus as a model. We distributed four neutral markers along the viral genome, and co-inoculated host plants with marker-containing and wild-type viruses. The frequency of recombinant genomes was evaluated 21 d post-inoculation. On average, over 50% of viral genomes recovered after a single host infection were recombinants, clearly indicating that recombination is very frequent in this virus. Estimates of the recombination rate show