In [0]:
import numpy as np
import pandas as pd
import os
import json
from pprint import pprint
from copy import deepcopy
from tqdm.notebook import tqdm

In [0]:
metadata = pd.read_csv('all_sources_metadata_2020-03-13.csv')

In [0]:
metadata.head()

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text
0,c630ebcdf30652f0422c3ec12a00b50241dc9bd9,CZI,Angiotensin-converting enzyme 2 (ACE2) as a SA...,10.1007/s00134-020-05985-9,,32125455.0,cc-by-nc,,2020,"Zhang, Haibo; Penninger, Josef M.; Li, Yimin; ...",Intensive Care Med,2002765000.0,#3252,True
1,53eccda7977a31e3d0f565c884da036b1e85438e,CZI,Comparative genetic analysis of the novel coro...,10.1038/s41421-020-0147-1,,,cc-by,,2020,"Cao, Yanan; Li, Lin; Feng, Zhimin; Wan, Shengq...",Cell Discovery,3003431000.0,#1861,True
2,210a892deb1c61577f6fba58505fd65356ce6636,CZI,Incubation Period and Other Epidemiological Ch...,10.3390/jcm9020538,,,cc-by,The geographic spread of 2019 novel coronaviru...,2020,"Linton, M. Natalie; Kobayashi, Tetsuro; Yang, ...",Journal of Clinical Medicine,3006065000.0,#1043,True
3,e3b40cc8e0e137c416b4a2273a4dca94ae8178cc,CZI,Characteristics of and Public Health Responses...,10.3390/jcm9020575,,32093211.0,cc-by,"In December 2019, cases of unidentified pneumo...",2020,"Deng, Sheng-Qun; Peng, Hong-Juan",J Clin Med,177663100.0,#1999,True
4,92c2c9839304b4f2bc1276d41b1aa885d8b364fd,CZI,Imaging changes in severe COVID-19 pneumonia,10.1007/s00134-020-05976-w,,32125453.0,cc-by-nc,,2020,"Zhang, Wei",Intensive Care Med,3006643000.0,#3242,False


In [0]:
metadata.shape

(29500, 14)

# BIORXIV

In [0]:
biorxiv_dir = 'biorxiv_medrxiv/biorxiv_medrxiv/'
filenames = os.listdir(biorxiv_dir)
print('No of articles from biorxiv: ',len(filenames))

No of articles from biorxiv:  803


In [0]:
all_files = []

for filename in tqdm(filenames):
    filename = biorxiv_dir + filename
    file = json.load(open(filename,'rb'))
    all_files.append(file)

HBox(children=(FloatProgress(value=0.0, max=803.0), HTML(value='')))




In [0]:
file = all_files[0]
print('Dictionary Keys:', file.keys())

Dictionary Keys: dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])


In [0]:
def format_body(body_text):
    texts = [(subset['section'],subset['text']) for subset in body_text]
    texts_subset = {subset['section']: "" for subset in body_text}
    
    for section,text in texts:
        texts_subset[section] += text
    
    body = ''
    for section,text in texts_subset.items():
        body += text
        body += '\n\n'
    
    return body

In [0]:
def format_name(author):
    mid_name = ' '.join(author['middle'])
    if author['middle']:
        return ' '.join([author['first'],mid_name,author['last']])
    else:
        return ' '.join([author['first'],author['last']])

In [0]:
def format_authors(authors,with_affiliation = False):
    name_list = []
    
    for author in authors:
        name = format_name(author)
        if with_affiliation:
            affiliation = format_affiliation(author['affiliation'])
            if affiliation:
                name_list.append(f"{name} ({affiliation})")
            else:
                name_list.append(name)
        else:
            name_list.append(name)
            
    return ", ".join(name_list)

In [0]:
def format_affiliation(affiliation):
    text = []
    location = affiliation.get('location')
    if location:
        text.extend(list(affiliation['location'].values()))
        
    institution = affiliation.get('instution')
    if institution:
        text = [institution] + text
    return ', '.join(text)

In [0]:
def format_location(authors,with_affiliation = False):
    location_list = []
    for author in authors:
        if with_affiliation:
            affiliation = author['affiliation']
            location = affiliation.get('location')
            if location:
                location_list.append(set(affiliation['location'].values()))
            else:
                pass
        else:
            pass
    location_list_clean = [i for n, i in enumerate(location_list) if i not in location_list[n + 1:]] 
    return location_list_clean

In [0]:
def format_institution(authors,with_affiliation = False):
    institution_list = []
    for author in authors:
        if with_affiliation:
            affiliation = author['affiliation']
            institution = affiliation.get('institution')
            if institution:
                institution_list.append(affiliation['institution'])
            else:
                pass
        else:
            pass
    inst_list_clean = [i for n, i in enumerate(institution_list) if i not in institution_list[n + 1:]] 
    return inst_list_clean

In [0]:
def format_bib(bibs):
    if  type(bibs) == dict:
        bibs = list(bibs.values())
    bibs = deepcopy(bibs)
    formatted_text = []
    
    for bib in bibs:
        bib['authors'] = format_authors(bib['authors'],with_affiliation = False)
        formatted_list =  [str(bib[key]) for key in ['title','authors','venue','year']]
        formatted_text.append(', '.join(formatted_list))
        return '; '.join(formatted_text)

In [0]:
print(format_body(file['body_text'])[:3000])

VP3, and VP0 (which is further processed to VP2 and VP4 during virus assembly) (6). The P2 64 and P3 regions encode the non-structural proteins 2B and 2C and 3A, 3B (1-3) (VPg), 3C pro and 4 structural protein-coding region is replaced by reporter genes, allow the study of genome 68 replication without the requirement for high containment (9, 10) ( figure 1A ).

The FMDV 5′ UTR is the largest known picornavirus UTR, comprising approximately 1300 71 nucleotides and containing several highly structured regions. The first 360 nucleotides at the 5′ 72 end are predicted to fold into a single large stem loop termed the S-fragment, followed by a The PKs were originally predicted in 1987 and consist of two to four tandem repeats of a ~48 86 nucleotide region containing a small stem loop and downstream interaction site (figure 1B) 87 (12). Due to the sequence similarity between the PKs (figure 1C), it is speculated that they 88 were formed by duplication events during viral replication, probabl

In [0]:
def json_to_df():
    clean_files = []

    for file in tqdm(all_files):
        features = [
            file['paper_id'],
            file['metadata']['title'],
            format_authors(file['metadata']['authors']),
            format_authors(file['metadata']['authors'], with_affiliation =True),
            format_body(file['abstract']),
            format_body(file['body_text']),
            format_bib(file['bib_entries']),
            file['metadata']['authors'],
            file['bib_entries'],
            format_location(file['metadata']['authors'],with_affiliation =True),
            format_institution(file['metadata']['authors'],with_affiliation =True)
            ]
        clean_files.append(features)
        
    print('Cleaning of files completed.')    
    column_names = [
        'paper_id',
        'title',
        'authors',
        'affiliations',
        'abstract',
        'text',
        'bibliography',
        'raw_authors',
        'raw_bibliography',
        'location',
        'institution'
        ]
    
    df =  pd.DataFrame(clean_files,columns =column_names)
    return df

In [0]:
df = json_to_df()
df.head()

HBox(children=(FloatProgress(value=0.0, max=803.0), HTML(value='')))


Cleaning of files completed.


Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography,location,institution
0,0015023cc06b5362d332b3baf348d11567ca2fbb,The RNA pseudoknots in foot-and-mouth disease ...,"Joseph C Ward, Lidia Lasecka-Dykes, Chris Neil...","Joseph C Ward, Lidia Lasecka-Dykes, Chris Neil...",word count: 194 22 Text word count: 5168 23 24...,"VP3, and VP0 (which is further processed to VP...",Genetic economy in 598 picornaviruses: Foot-an...,"[{'first': 'Joseph', 'middle': ['C'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Genetic...",[],[]
1,004f0f8bb66cf446678dc13cf2701feec4f36d76,Healthcare-resource-adjusted vulnerabilities t...,"Hanchu Zhou, Jiannan Yang, Kaicheng Tang, † , ...","Hanchu Zhou (Hong Kong, China), Jiannan Yang (...",,The 2019-nCoV epidemic has spread across China...,World Health Organizations. Novel Coronavirus ...,"[{'first': 'Hanchu', 'middle': [], 'last': 'Zh...","{'BIBREF0': {'ref_id': 'b0', 'title': 'World H...","[{Hong Kong, China}, {Beijing, China}]","[City University of Hong Kong, Chinese Academy..."
2,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Real-time, MinION-based, amplicon sequencing f...","Salman L Butt, Eric C Erwood, Jian Zhang, Holl...","Salman L Butt (30602, Athens, GA, USA), Eric C...",Infectious bronchitis (IB) causes significant ...,"Infectious bronchitis (IB), which is caused by...",Emergence of novel strains of avian infectious...,"[{'first': 'Salman', 'middle': ['L'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Emergen...","[{Blacksburg, 24061, USA, VA}, {USA, 30602, GA...",[Virginia Polytechnical Institute and State Un...
3,013d9d1cba8a54d5d3718c229b812d7cf91b6c89,Assessing spread risk of Wuhan novel coronavir...,"Shengjie Lai, Isaac I Bogoch, Nick W Ruktanonc...","Shengjie Lai (UK), Isaac I Bogoch (Toronto, Ca...",Background: A novel coronavirus (2019-nCoV) em...,"In December 2019, a cluster of patients with p...",A Novel Coronavirus Genome Identified in a Clu...,"[{'first': 'Shengjie', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'A Novel...","[{China, Changsha}, {China, Shanghai}, {Canada...","[St. Michael's Hospital, National University o..."
4,01d162d7fae6aaba8e6e60e563ef4c2fca7b0e18,"TWIRLS, an automated topic-wise inference meth...","Xiaoyang Ji, Chunming Zhang, Yubo Zhai, Zhongh...","Xiaoyang Ji (Beijing, China), Chunming Zhang (...",Faced with the current large-scale public heal...,The sudden outbreak of the new coronavirus (SA...,A pneumonia outbreak associated with a new cor...,"[{'first': 'Xiaoyang', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'A pneum...","[{Beijing, China}]","[Phil Rivers Technology, Chinese Academy of Sc..."


In [0]:
df.to_csv('biorxiv_medrxiv.csv',index = False, header = True)

In [0]:
comm_use_subset_dir = 'comm_use_subset/comm_use_subset/'
filenames = os.listdir(comm_use_subset_dir)
print('No of articles from comm_use_subset: ',len(filenames))

No of articles from comm_use_subset:  9000


In [0]:
all_files = []

for filename in tqdm(filenames):
    filename = comm_use_subset_dir + filename
    file = json.load(open(filename,'rb'))
    all_files.append(file)
    
file = all_files[0]
print('Dictionary Keys:', file.keys())

HBox(children=(FloatProgress(value=0.0, max=9000.0), HTML(value='')))


Dictionary Keys: dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])


In [0]:
df_comm = json_to_df()
df_comm.head()
df_comm.to_csv('comm_use_subset.csv',index = False, header = True)

HBox(children=(FloatProgress(value=0.0, max=9000.0), HTML(value='')))


Cleaning of files completed.


In [0]:
noncomm_use_subset

In [0]:
noncomm_use_subset_dir = 'noncomm_use_subset/noncomm_use_subset/'
filenames = os.listdir(noncomm_use_subset_dir)
print('No of articles from comm_use_subset: ',len(filenames))

all_files = []

for filename in tqdm(filenames):
    filename = noncomm_use_subset_dir + filename
    file = json.load(open(filename,'rb'))
    all_files.append(file)
    
file = all_files[0]
print('Dictionary Keys:', file.keys())

No of articles from comm_use_subset:  1973


HBox(children=(FloatProgress(value=0.0, max=1973.0), HTML(value='')))


Dictionary Keys: dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])


In [0]:
df_noncomm = json_to_df()
df_noncomm.head()
df_noncomm.to_csv('noncomm_use_subset.csv',index = False, header = True)

HBox(children=(FloatProgress(value=0.0, max=1973.0), HTML(value='')))


Cleaning of files completed.


In [0]:
pmc_custom_license_dir = 'pmc_custom_license/pmc_custom_license/'
filenames = os.listdir(pmc_custom_license_dir)
print('No of articles from comm_use_subset: ',len(filenames))

all_files = []

for filename in tqdm(filenames):
    filename = pmc_custom_license_dir + filename
    file = json.load(open(filename,'rb'))
    all_files.append(file)
    
file = all_files[0]
print('Dictionary Keys:', file.keys())

No of articles from comm_use_subset:  1426


HBox(children=(FloatProgress(value=0.0, max=1426.0), HTML(value='')))


Dictionary Keys: dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])


In [0]:
df_pmc = json_to_df()
df_pmc.head()
df_pmc.to_csv('pmc_custom_license.csv',index = False, header = True)

HBox(children=(FloatProgress(value=0.0, max=1426.0), HTML(value='')))


Cleaning of files completed.


In [0]:
df.head()

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography,location,institution
0,002f09dfc9a1323a15bf72e349d8b733ac97a2aa,,,,,T he modern word camel is derived from the La...,Ancient and modern DNA reveal dynamics of dome...,[],"{'BIBREF0': {'ref_id': 'b0', 'title': 'Ancient...",[],[]
1,0036e8891c93ae63611bde179ada1e03e8577dea,Stable Occupancy of the Crimean-Congo Hemorrha...,"Florine E M Scholte, Brian L Hua, Jessica R Sp...","Florine E M Scholte (Atlanta, Georgia, USA), B...",Abstract Crimean-Congo hemorrhagic fever virus...,to Western Europe with the assistance of migr...,Seroepidemiological studies of Crimean-Congo h...,"[{'first': 'Florine', 'middle': ['E M'], 'last...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Seroepi...","[{Athens, USA, Georgia}, {Atlanta, USA, Georgia}]","[University of Georgia, National Center for Em..."
2,00573277e6be50669016f770bc28ec2da0639a8f,Asymptomatic Severe Acute Respiratory Syndrome...,,,,We identified a nurse who was asymptomatic fo...,Coronavirus as a possible cause of severe acut...,[],"{'BIBREF0': {'ref_id': 'b0', 'title': 'Coronav...",[],[]
3,00683d59d56123ae85e080d00ef1b3edd3f7405d,A Rift Valley fever (RVF) epidemic affecting a...,"Raphaëlle Métras, Marc Baguelin, W John Edmund...","Raphaëlle Métras, Marc Baguelin, W John Edmund...",Abstract The first cases occurred after heavy ...,R ift Valley fever (RVF) is a zoonotic arbovir...,Rift Valley fever virus (Bunyaviridae: Phlebov...,"[{'first': 'Raphaëlle', 'middle': [], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Rift Va...",[],[]
4,0104f6ceccf92ae8567a0102f89cbb976969a774,BMC Medical Genetics Association of HLA class ...,"Marie Lin, Hsiang-Kuang Tseng, Jean A Trejaut,...","Marie Lin (Taipei, Taiwan), Hsiang-Kuang Tseng...",Abstract The human leukocyte antigen (HLA) sys...,"patient group, a further significant increase...",for surveillance of severe acute respiratory s...,"[{'first': 'Marie', 'middle': [], 'last': 'Lin...","{'BIBREF0': {'ref_id': 'b0', 'title': 'for sur...","[{Taipei, Taiwan}]","[Taipei Municipal Hoping Hospital, National Ta..."


In [0]:
df.abstract[0]

'Abstract word count: 194 22 Text word count: 5168 23 24 25 author/funder. All rights reserved. No reuse allowed without permission. Abstract 27 The positive stranded RNA genomes of picornaviruses comprise a single large open reading 28 frame flanked by 5′ and 3′ untranslated regions (UTRs). Foot-and-mouth disease virus (FMDV) 29 has an unusually large 5′ UTR (1.3 kb) containing five structural domains. These include the 30 internal ribosome entry site (IRES), which facilitates initiation of translation, and the cis-acting 31 replication element (cre). Less well characterised structures are a 5′ terminal 360 nucleotide 32 stem-loop, a variable length poly-C-tract of approximately 100-200 nucleotides and a series of 33 two to four tandemly repeated pseudoknots (PKs). We investigated the structures of the PKs 34 by selective 2′ hydroxyl acetylation analysed by primer extension (SHAPE) analysis and 35 determined their contribution to genome replication by mutation and deletion experiments

In [0]:
df.abstract[0]

'SECTION: Abstract\n\n TEXT: word count: 194 22 Text word count: 5168 23 24 25 author/funder. All rights reserved. No reuse allowed without permission. Abstract 27 The positive stranded RNA genomes of picornaviruses comprise a single large open reading 28 frame flanked by 5′ and 3′ untranslated regions (UTRs). Foot-and-mouth disease virus (FMDV) 29 has an unusually large 5′ UTR (1.3 kb) containing five structural domains. These include the 30 internal ribosome entry site (IRES), which facilitates initiation of translation, and the cis-acting 31 replication element (cre). Less well characterised structures are a 5′ terminal 360 nucleotide 32 stem-loop, a variable length poly-C-tract of approximately 100-200 nucleotides and a series of 33 two to four tandemly repeated pseudoknots (PKs). We investigated the structures of the PKs 34 by selective 2′ hydroxyl acetylation analysed by primer extension (SHAPE) analysis and 35 determined their contribution to genome replication by mutation and d

In [0]:
df.head()

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography,location,institution
0,0015023cc06b5362d332b3baf348d11567ca2fbb,The RNA pseudoknots in foot-and-mouth disease ...,"Joseph C Ward, Lidia Lasecka-Dykes, Chris Neil...","Joseph C Ward, Lidia Lasecka-Dykes, Chris Neil...",SECTION: Abstract\n\n TEXT: word count: 194 22...,"SECTION: \n\n TEXT: VP3, and VP0 (which is fur...",Genetic economy in 598 picornaviruses: Foot-an...,"[{'first': 'Joseph', 'middle': ['C'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Genetic...",[],[]
1,004f0f8bb66cf446678dc13cf2701feec4f36d76,Healthcare-resource-adjusted vulnerabilities t...,"Hanchu Zhou, Jiannan Yang, Kaicheng Tang, † , ...","Hanchu Zhou (Hong Kong, China), Jiannan Yang (...",,SECTION: Introduction\n\n TEXT: The 2019-nCoV ...,World Health Organizations. Novel Coronavirus ...,"[{'first': 'Hanchu', 'middle': [], 'last': 'Zh...","{'BIBREF0': {'ref_id': 'b0', 'title': 'World H...","[{China, Hong Kong}, {China, Beijing}]","[City University of Hong Kong, Chinese Academy..."
2,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Real-time, MinION-based, amplicon sequencing f...","Salman L Butt, Eric C Erwood, Jian Zhang, Holl...","Salman L Butt (30602, Athens, GA, USA), Eric C...",SECTION: Abstract\n\n TEXT: Infectious bronchi...,SECTION: Introduction\n\n TEXT: Infectious bro...,Emergence of novel strains of avian infectious...,"[{'first': 'Salman', 'middle': ['L'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Emergen...","[{VA, Blacksburg, 24061, USA}, {Athens, USA, G...",[Virginia Polytechnical Institute and State Un...
3,013d9d1cba8a54d5d3718c229b812d7cf91b6c89,Assessing spread risk of Wuhan novel coronavir...,"Shengjie Lai, Isaac I Bogoch, Nick W Ruktanonc...","Shengjie Lai (UK), Isaac I Bogoch (Toronto, Ca...",SECTION: Abstract\n\n TEXT: Background: A nove...,SECTION: Introduction\n\n TEXT: In December 20...,A Novel Coronavirus Genome Identified in a Clu...,"[{'first': 'Shengjie', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'A Novel...","[{China, Changsha}, {Shanghai, China}, {Toront...","[St. Michael's Hospital, National University o..."
4,01d162d7fae6aaba8e6e60e563ef4c2fca7b0e18,"TWIRLS, an automated topic-wise inference meth...","Xiaoyang Ji, Chunming Zhang, Yubo Zhai, Zhongh...","Xiaoyang Ji (Beijing, China), Chunming Zhang (...",SECTION: Abstract\n\n TEXT: Faced with the cur...,SECTION: Introduction\n\n TEXT: The sudden out...,A pneumonia outbreak associated with a new cor...,"[{'first': 'Xiaoyang', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'A pneum...","[{China, Beijing}]","[Phil Rivers Technology, Chinese Academy of Sc..."


In [0]:
ls


 Volume in drive C has no label.
 Volume Serial Number is 42D6-4E29

 Directory of C:\Users\Dipanjan Chowdhury\Documents\COVID-19\2020-03-13

03/18/2020  06:12 AM    <DIR>          .
03/18/2020  06:12 AM    <DIR>          ..
03/18/2020  04:46 AM    <DIR>          .ipynb_checkpoints
03/14/2020  01:23 AM        49,211,745 all_sources_metadata_2020-03-13.csv
03/14/2020  01:23 AM             1,000 all_sources_metadata_2020-03-13.readme
03/17/2020  03:58 PM    <DIR>          biorxiv_medrxiv
03/18/2020  05:31 AM        33,127,098 biorxiv_medrxiv.csv
03/17/2020  03:59 PM    <DIR>          comm_use_subset
03/18/2020  05:09 AM       581,464,168 comm_use_subset.csv
03/14/2020  01:23 AM            26,690 COVID.DATA.LIC.AGMT.pdf
03/18/2020  06:12 AM            65,812 covid-19 Data Parsing.ipynb
03/17/2020  04:19 PM           616,960 COVID-19-geographic-disbtribution-worldwide-2020-03-16.xls
03/18/2020  06:08 AM    <DIR>          data
03/17/2020  05:15 PM           121,099 Data Understanding.ipynb


In [0]:
pwd

'C:\\Users\\Dipanjan Chowdhury\\Documents\\COVID-19\\2020-03-13'

In [0]:
import os
import time

start_time = time.time()
final_df = []
os.chdir(r'C:/Users/Dipanjan Chowdhury/Documents/COVID-19/2020-03-13/data')
for filename in os.listdir(r'C:/Users/Dipanjan Chowdhury/Documents/COVID-19/2020-03-13/data'):
    file = filename.replace('.csv','')
    print(filename)
    dataframe = pd.read_csv(filename, encoding = 'iso-8859-1')
    print('Execution Time after reading the data for: ',filename,end = "")
    print('----%s seconds ----' % (time.time() - start_time ))
    df =  dataframe
    df.insert(loc = 0, column = 'source', value = file)
    final_df.append(df)
        
output_csv = pd.concat(final_df,ignore_index = True)

os.chdir(r'C:/Users/Dipanjan Chowdhury/Documents/COVID-19/2020-03-13')
output_csv.to_csv('data_v1.csv',index =False)
print('Total Execution Time : ',end = "")
print('----%s seconds ----' % (time.time() - start_time ))

biorxiv_medrxiv.csv
Execution Time after reading the data for:  biorxiv_medrxiv.csv----0.5312206745147705 seconds ----
comm_use_subset.csv
Execution Time after reading the data for:  comm_use_subset.csv----8.390045881271362 seconds ----
noncomm_use_subset.csv
Execution Time after reading the data for:  noncomm_use_subset.csv----9.811823844909668 seconds ----
pmc_custom_license.csv
Execution Time after reading the data for:  pmc_custom_license.csv----10.546168804168701 seconds ----
Total Execution Time : ----30.57609987258911 seconds ----


In [0]:
output_csv.head()

Unnamed: 0,source,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography,location,institution
0,biorxiv_medrxiv,0015023cc06b5362d332b3baf348d11567ca2fbb,The RNA pseudoknots in foot-and-mouth disease ...,"Joseph C Ward, Lidia Lasecka-Dykes, Chris Neil...","Joseph C Ward, Lidia Lasecka-Dykes, Chris Neil...",SECTION: Abstract\n\n TEXT: word count: 194 22...,"SECTION: \n\n TEXT: VP3, and VP0 (which is fur...",Genetic economy in 598 picornaviruses: Foot-an...,"[{'first': 'Joseph', 'middle': ['C'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Genetic...",[],[]
1,biorxiv_medrxiv,004f0f8bb66cf446678dc13cf2701feec4f36d76,Healthcare-resource-adjusted vulnerabilities t...,"Hanchu Zhou, Jiannan Yang, Kaicheng Tang, â ...","Hanchu Zhou (Hong Kong, China), Jiannan Yang (...",,SECTION: Introduction\n\n TEXT: The 2019-nCoV ...,World Health Organizations. Novel Coronavirus ...,"[{'first': 'Hanchu', 'middle': [], 'last': 'Zh...","{'BIBREF0': {'ref_id': 'b0', 'title': 'World H...","[{'China', 'Hong Kong'}, {'China', 'Beijing'}]","['City University of Hong Kong', 'Chinese Acad..."
2,biorxiv_medrxiv,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Real-time, MinION-based, amplicon sequencing f...","Salman L Butt, Eric C Erwood, Jian Zhang, Holl...","Salman L Butt (30602, Athens, GA, USA), Eric C...",SECTION: Abstract\n\n TEXT: Infectious bronchi...,SECTION: Introduction\n\n TEXT: Infectious bro...,Emergence of novel strains of avian infectious...,"[{'first': 'Salman', 'middle': ['L'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Emergen...","[{'VA', 'Blacksburg', '24061', 'USA'}, {'Athen...",['Virginia Polytechnical Institute and State U...
3,biorxiv_medrxiv,013d9d1cba8a54d5d3718c229b812d7cf91b6c89,Assessing spread risk of Wuhan novel coronavir...,"Shengjie Lai, Isaac I Bogoch, Nick W Ruktanonc...","Shengjie Lai (UK), Isaac I Bogoch (Toronto, Ca...",SECTION: Abstract\n\n TEXT: Background: A nove...,SECTION: Introduction\n\n TEXT: In December 20...,A Novel Coronavirus Genome Identified in a Clu...,"[{'first': 'Shengjie', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'A Novel...","[{'China', 'Changsha'}, {'Shanghai', 'China'},...","[""St. Michael's Hospital"", 'National Universit..."
4,biorxiv_medrxiv,01d162d7fae6aaba8e6e60e563ef4c2fca7b0e18,"TWIRLS, an automated topic-wise inference meth...","Xiaoyang Ji, Chunming Zhang, Yubo Zhai, Zhongh...","Xiaoyang Ji (Beijing, China), Chunming Zhang (...",SECTION: Abstract\n\n TEXT: Faced with the cur...,SECTION: Introduction\n\n TEXT: The sudden out...,A pneumonia outbreak associated with a new cor...,"[{'first': 'Xiaoyang', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'A pneum...","[{'China', 'Beijing'}]","['Phil Rivers Technology', 'Chinese Academy of..."


In [0]:
q1= metadata[metadata['sha']=="0015023cc06b5362d332b3baf348d11567ca2fbb"]
q1

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text
28966,0015023cc06b5362d332b3baf348d11567ca2fbb,biorxiv,The RNA pseudoknots in foot-and-mouth disease ...,doi.org/10.1101/2020.01.10.901801,,,See https://www.biorxiv.org/about-biorxiv,The positive stranded RNA genomes of picornavi...,2020-01-11,"Ward, J. C. J.; Lasecka-Dykes, L.; Neil, C.; A...",,,,True


In [0]:
train = output_csv
train.head()

Unnamed: 0,source,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography,location,institution
0,biorxiv_medrxiv,0015023cc06b5362d332b3baf348d11567ca2fbb,The RNA pseudoknots in foot-and-mouth disease ...,"Joseph C Ward, Lidia Lasecka-Dykes, Chris Neil...","Joseph C Ward, Lidia Lasecka-Dykes, Chris Neil...",SECTION: Abstract\n\n TEXT: word count: 194 22...,"SECTION: \n\n TEXT: VP3, and VP0 (which is fur...",Genetic economy in 598 picornaviruses: Foot-an...,"[{'first': 'Joseph', 'middle': ['C'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Genetic...",[],[]
1,biorxiv_medrxiv,004f0f8bb66cf446678dc13cf2701feec4f36d76,Healthcare-resource-adjusted vulnerabilities t...,"Hanchu Zhou, Jiannan Yang, Kaicheng Tang, â ...","Hanchu Zhou (Hong Kong, China), Jiannan Yang (...",,SECTION: Introduction\n\n TEXT: The 2019-nCoV ...,World Health Organizations. Novel Coronavirus ...,"[{'first': 'Hanchu', 'middle': [], 'last': 'Zh...","{'BIBREF0': {'ref_id': 'b0', 'title': 'World H...","[{'China', 'Hong Kong'}, {'China', 'Beijing'}]","['City University of Hong Kong', 'Chinese Acad..."
2,biorxiv_medrxiv,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Real-time, MinION-based, amplicon sequencing f...","Salman L Butt, Eric C Erwood, Jian Zhang, Holl...","Salman L Butt (30602, Athens, GA, USA), Eric C...",SECTION: Abstract\n\n TEXT: Infectious bronchi...,SECTION: Introduction\n\n TEXT: Infectious bro...,Emergence of novel strains of avian infectious...,"[{'first': 'Salman', 'middle': ['L'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Emergen...","[{'VA', 'Blacksburg', '24061', 'USA'}, {'Athen...",['Virginia Polytechnical Institute and State U...
3,biorxiv_medrxiv,013d9d1cba8a54d5d3718c229b812d7cf91b6c89,Assessing spread risk of Wuhan novel coronavir...,"Shengjie Lai, Isaac I Bogoch, Nick W Ruktanonc...","Shengjie Lai (UK), Isaac I Bogoch (Toronto, Ca...",SECTION: Abstract\n\n TEXT: Background: A nove...,SECTION: Introduction\n\n TEXT: In December 20...,A Novel Coronavirus Genome Identified in a Clu...,"[{'first': 'Shengjie', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'A Novel...","[{'China', 'Changsha'}, {'Shanghai', 'China'},...","[""St. Michael's Hospital"", 'National Universit..."
4,biorxiv_medrxiv,01d162d7fae6aaba8e6e60e563ef4c2fca7b0e18,"TWIRLS, an automated topic-wise inference meth...","Xiaoyang Ji, Chunming Zhang, Yubo Zhai, Zhongh...","Xiaoyang Ji (Beijing, China), Chunming Zhang (...",SECTION: Abstract\n\n TEXT: Faced with the cur...,SECTION: Introduction\n\n TEXT: The sudden out...,A pneumonia outbreak associated with a new cor...,"[{'first': 'Xiaoyang', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'A pneum...","[{'China', 'Beijing'}]","['Phil Rivers Technology', 'Chinese Academy of..."


In [0]:
train['abstract'][2]

"SECTION: Abstract\n\n TEXT: Infectious bronchitis (IB) causes significant economic losses in the global poultry industry. Control of infectious bronchitis is hindered by the genetic diversity of the causative agent, infectious bronchitis virus (IBV), which has led to the emergence of several serotypes that lack complete serologic cross-protection. While serotyping by definition requires immunologic characterization, genotyping is an efficient means to identify IBVs detected in samples. Sanger sequencing of the S1 subunit of the spike gene is currently used to genotype IBV; however, the universal S1 PCR was created to work from cultured IBV and it is inefficient at detecting mixed isolates. This paper describes a MinION-based AmpSeq method that genetically typed IBV from clinical samples, including samples with multiple isolates. Total RNA was extracted from fifteen tracheal scrapings and choanal cleft swab samples, randomly reverse transcribed, and PCR amplified using modified S1-targ

In [0]:
df = pd.read_csv('data_v1.csv')

In [0]:
import csv

csv_file = 'data_v1.csv'
txt_file = 'covid19_kaggle.txt'

with open(txt_file,"w") as my_output_file:
    with open(csv_file,'r',encoding='utf8') as my_input_file:
        [my_output_file.write(" ".join(row)+'\n') for row in csv.reader(my_input_file)]
    my_output_file.close()

UnicodeEncodeError: 'charmap' codec can't encode character '\x80' in position 952: character maps to <undefined>

Unnamed: 0,source,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography,location,institution
0,biorxiv_medrxiv,0015023cc06b5362d332b3baf348d11567ca2fbb,The RNA pseudoknots in foot-and-mouth disease ...,"Joseph C Ward, Lidia Lasecka-Dykes, Chris Neil...","Joseph C Ward, Lidia Lasecka-Dykes, Chris Neil...",SECTION: Abstract\n\n TEXT: word count: 194 22...,"SECTION: \n\n TEXT: VP3, and VP0 (which is fur...",Genetic economy in 598 picornaviruses: Foot-an...,"[{'first': 'Joseph', 'middle': ['C'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Genetic...",[],[]
1,biorxiv_medrxiv,004f0f8bb66cf446678dc13cf2701feec4f36d76,Healthcare-resource-adjusted vulnerabilities t...,"Hanchu Zhou, Jiannan Yang, Kaicheng Tang, â ...","Hanchu Zhou (Hong Kong, China), Jiannan Yang (...",,SECTION: Introduction\n\n TEXT: The 2019-nCoV ...,World Health Organizations. Novel Coronavirus ...,"[{'first': 'Hanchu', 'middle': [], 'last': 'Zh...","{'BIBREF0': {'ref_id': 'b0', 'title': 'World H...","[{'China', 'Hong Kong'}, {'China', 'Beijing'}]","['City University of Hong Kong', 'Chinese Acad..."
2,biorxiv_medrxiv,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Real-time, MinION-based, amplicon sequencing f...","Salman L Butt, Eric C Erwood, Jian Zhang, Holl...","Salman L Butt (30602, Athens, GA, USA), Eric C...",SECTION: Abstract\n\n TEXT: Infectious bronchi...,SECTION: Introduction\n\n TEXT: Infectious bro...,Emergence of novel strains of avian infectious...,"[{'first': 'Salman', 'middle': ['L'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Emergen...","[{'VA', 'Blacksburg', '24061', 'USA'}, {'Athen...",['Virginia Polytechnical Institute and State U...
3,biorxiv_medrxiv,013d9d1cba8a54d5d3718c229b812d7cf91b6c89,Assessing spread risk of Wuhan novel coronavir...,"Shengjie Lai, Isaac I Bogoch, Nick W Ruktanonc...","Shengjie Lai (UK), Isaac I Bogoch (Toronto, Ca...",SECTION: Abstract\n\n TEXT: Background: A nove...,SECTION: Introduction\n\n TEXT: In December 20...,A Novel Coronavirus Genome Identified in a Clu...,"[{'first': 'Shengjie', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'A Novel...","[{'China', 'Changsha'}, {'Shanghai', 'China'},...","[""St. Michael's Hospital"", 'National Universit..."
4,biorxiv_medrxiv,01d162d7fae6aaba8e6e60e563ef4c2fca7b0e18,"TWIRLS, an automated topic-wise inference meth...","Xiaoyang Ji, Chunming Zhang, Yubo Zhai, Zhongh...","Xiaoyang Ji (Beijing, China), Chunming Zhang (...",SECTION: Abstract\n\n TEXT: Faced with the cur...,SECTION: Introduction\n\n TEXT: The sudden out...,A pneumonia outbreak associated with a new cor...,"[{'first': 'Xiaoyang', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'A pneum...","[{'China', 'Beijing'}]","['Phil Rivers Technology', 'Chinese Academy of..."
...,...,...,...,...,...,...,...,...,...,...,...,...
13197,pmc_custom_license,ff365ebbc0fc55476886b0abd129e227c1f8a527,Article focus Hip,"M Pahuta, J M Smolders, J L Van Susante, J Pec...","M Pahuta, J M Smolders, J L Van Susante, J Pec...",Abstract\n\nWe report a systematic review and ...,introduction\n\nDespite the fact that total hi...,The operation of the century: total hip replac...,"[{'first': 'M', 'middle': [], 'last': 'Pahuta'...","{'BIBREF0': {'ref_id': 'b0', 'title': 'The ope...",[],[]
13198,pmc_custom_license,ff7d49ac4008f60ef9c5a437e0d504dcefd1246f,,Alex R Cook,"Alex R Cook (2-3-6 Minami, Wako, Saitama, Japan)",,\n\nresults of studies conducted in other coun...,Epidemiology of infl uenza A(H1N1) virus infec...,"[{'first': 'Alex', 'middle': ['R'], 'last': 'C...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Epidemi...","[{'Japan', 'Saitama', 'Wako', '2-3-6 Minami'}]",['National Institute of Public Health']
13199,pmc_custom_license,ffb381668d93248759ca3855425e05722cb9f562,,,,,\n\nH uman coronaviruses (HCoVs) were first re...,"Identification of a human coronavirus, L Van D...",[],"{'BIBREF0': {'ref_id': 'b0', 'title': 'Identif...",[],[]
13200,pmc_custom_license,ffd3a93b927e221ded4cf76536ad31bef2c74b89,Fatal Respiratory Infections Associated with R...,"Le Thanh Hai, Vu Thi, Ngoc Bich, Le Kien Ngai,...","Le Thanh Hai, Vu Thi, Ngoc Bich, Le Kien Ngai,...",Abstract\n\nDuring an outbreak of severe acute...,\n\nDuring an outbreak of severe acute respira...,Children: reducing mortality. World Health Org...,"[{'first': 'Le', 'middle': ['Thanh'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Childre...",[],[]
