Work on Kaggle's ["COVID-19 Open Research Dataset Challenge (CORD-19)"](https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge/)

In [1]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path

In [2]:
def peek(dataframe):
    print(dataframe.shape)
    print(dataframe.columns)
    return dataframe.head()

def open_json(path):
    with open(path) as json_file:
        data = json.load(json_file)
    return data

class Paper:
    def __init__(self, path):
        self.path = path
        self.d = open_json(path)
        self.id = self.d['paper_id']
        self.title = self.d['metadata']['title']
        self.authors = self.d['metadata']['authors']
        self.body_text = ''.join([x['text'] for x in self.d['body_text']])
        self.bib_entries = self.d['bib_entries']
        self.ref_entries = self.d['ref_entries']
        self.back_matter = self.d['back_matter']
        
        try:
            self.abstract = self.d['abstract'][0]['text']
        except:
            self.abstract = None


In [3]:
PROJECT_PATH = Path("/Users/bmcmahon/Code/data/kaggle_covid19/")
os.listdir(PROJECT_PATH)

['.DS_Store',
 'custom_license',
 'metadata.readme',
 'json_schema.txt',
 'noncomm_use_subset',
 'cord_19_embeddings_4_17',
 'metadata.csv',
 'CORD-19-research-challenge.zip',
 'biorxiv_medrxiv',
 'COVID.DATA.LIC.AGMT.pdf',
 'comm_use_subset']

In [28]:
os.listdir(PROJECT_PATH / "noncomm_use_subset/noncomm_use_subset/pmc_json/")[0]

'PMC4834006.xml.json'

### Data Exploration

#### Metadata

In [4]:
metadata = pd.read_csv(PROJECT_PATH / "metadata.csv")
peek(metadata)

(52398, 18)
Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal',
       'Microsoft Academic Paper ID', 'WHO #Covidence', 'has_pdf_parse',
       'has_pmc_xml_parse', 'full_text_file', 'url'],
      dtype='object')


Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,xqhn0vbp,1e1286db212100993d03cc22374b624f7caee956,PMC,Airborne rhinovirus detection and effect of ul...,10.1186/1471-2458-3-5,PMC140314,12525263.0,no-cc,"BACKGROUND: Rhinovirus, the most common cause ...",2003-01-13,"Myatt, Theodore A; Johnston, Sebastian L; Rudn...",BMC Public Health,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
1,gi6uaa83,8ae137c8da1607b3a8e4c946c07ca8bda67f88ac,PMC,Discovering human history from stomach bacteria,10.1186/gb-2003-4-5-213,PMC156578,12734001.0,no-cc,Recent analyses of human pathogens have reveal...,2003-04-28,"Disotell, Todd R",Genome Biol,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
2,le0ogx1s,,PMC,A new recruit for the army of the men of death,10.1186/gb-2003-4-7-113,PMC193621,12844350.0,no-cc,"The army of the men of death, in John Bunyan's...",2003-06-27,"Petsko, Gregory A",Genome Biol,,,False,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
3,fy4w7xz8,0104f6ceccf92ae8567a0102f89cbb976969a774,PMC,Association of HLA class I with severe acute r...,10.1186/1471-2350-4-9,PMC212558,12969506.0,no-cc,BACKGROUND: The human leukocyte antigen (HLA) ...,2003-09-12,"Lin, Marie; Tseng, Hsiang-Kuang; Trejaut, Jean...",BMC Med Genet,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...
4,0qaoam29,5b68a553a7cbbea13472721cd1ad617d42b40c26,PMC,A double epidemic model for the SARS propagation,10.1186/1471-2334-3-19,PMC222908,12964944.0,no-cc,BACKGROUND: An epidemic of a Severe Acute Resp...,2003-09-10,"Ng, Tuen Wai; Turinici, Gabriel; Danchin, Antoine",BMC Infect Dis,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...


In [29]:
metadata[metadata['pmcid']=="PMC4834006"]

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
6267,bxxsmftw,d24afd9ee025b53015824be203db539009964fbd,PMC,Thin-section Computed Tomography Detects Long-...,10.4103/0366-6999.154285,PMC4834006,25836610.0,cc-by-nc-sa,BACKGROUND: The aim of this research was to ev...,2015-04-05,"Xing, Zhi-Heng; Sun, Xin; Xu, Long; Wu, Qi; Li...",Chin Med J (Engl),,,True,True,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...


#### Embeddings

In [5]:
embeddings = pd.read_csv(PROJECT_PATH / "cord_19_embeddings_4_17/cord_19_embeddings_4_17.csv")
peek(embeddings)

(52397, 769)
Index(['xqhn0vbp', '0.2771094739437103', '-2.9749464988708496',
       '1.10358726978302', '9.312440872192383', '1.3013277053833008',
       '-4.281131267547607', '-0.5318851470947266', '-4.497644424438477',
       '-0.6967141628265381',
       ...
       '-0.31924888491630554', '4.262275695800781', '2.845273971557617',
       '-1.656341552734375', '-2.5844340324401855', '-4.678825855255127',
       '3.175534248352051', '4.8904242515563965', '-1.488284707069397',
       '-1.3013025522232056'],
      dtype='object', length=769)


Unnamed: 0,xqhn0vbp,0.2771094739437103,-2.9749464988708496,1.10358726978302,9.312440872192383,1.3013277053833008,-4.281131267547607,-0.5318851470947266,-4.497644424438477,-0.6967141628265381,...,-0.31924888491630554,4.262275695800781,2.845273971557617,-1.656341552734375,-2.5844340324401855,-4.678825855255127,3.175534248352051,4.8904242515563965,-1.488284707069397,-1.3013025522232056
0,gi6uaa83,-0.34476,-4.762074,3.647769,2.616938,3.856543,-0.460272,0.899565,2.284138,0.590927,...,-0.185362,0.603534,1.475987,1.804034,2.81978,-4.039684,-3.027682,0.997251,-0.661524,1.590745
1,le0ogx1s,-3.257521,-1.720686,1.438893,-1.614458,-3.205178,1.125262,-1.420612,-4.270585,-2.557855,...,-1.691883,0.594955,1.336509,0.036567,-2.065922,0.360089,0.832659,2.954574,-4.601091,-1.097578
2,fy4w7xz8,0.920366,-3.546179,-2.537739,6.372102,0.025263,-1.991629,-0.612892,0.983194,-2.316126,...,-0.412801,0.831824,2.101388,3.283052,-3.075248,-4.381618,1.925288,4.607265,-1.254582,-1.320425
3,0qaoam29,-1.145982,-5.231421,-1.958305,5.035599,-1.601498,-0.756577,1.757929,1.490937,0.192156,...,0.131644,3.83646,1.115987,1.524282,-2.017589,-5.10214,3.864315,1.679577,1.871223,-0.295825
4,qj4dh6rg,-1.728899,-5.419925,-0.482808,9.170734,-1.273682,-1.007223,1.836062,3.512957,1.623566,...,-0.16427,4.492704,1.499324,4.371634,-2.533002,-4.532452,2.206654,1.812408,0.771027,-1.834923


In [6]:
# os.listdir(PROJECT_PATH / "biorxiv_medrxiv/biorxiv_medrxiv/pdf_json")

#### Biorxiv

In [7]:
for path in os.listdir(PROJECT_PATH / "biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/"):
    paper = Paper(PROJECT_PATH / f"biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/{path}")
    print(paper.title + "\n")    

Relationship between Average Daily Temperature and Average Cumulative Daily Rate of Confirmed Cases of COVID-19

Multimerization of HIV-1 integrase hinges on conserved SH3-docking platforms

Virus shedding patterns in nasopharyngeal and fecal specimens of COVID-19 patients 2 3

Time-varying transmission dynamics of Novel Coronavirus Pneumonia in China At a Glance Commentary Scientific Knowledge on the Subject: Since

p53 is not necessary for DUX4 pathology

Virological assessment of hospitalized cases of coronavirus disease 2019 *equal contribution **senior authors with equal contribution

Potential impact of seasonal forcing on a SARS-CoV-2 pandemic

Molecular profiling of COVID-19 sera Proteomic and Metabolomic Characterization of COVID-19 Patient Sera Molecular profiling of COVID-19 sera

Evaluating performance of metagenomic characterization algorithms using in silico datasets generated with FASTQSim

Clinical Features of COVID-19-Related Liver Damage

Comparative in vitro transcri

In [8]:
bio1 = Paper(PROJECT_PATH / "biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/005d189d5bd7ac01aee65e934fd3d5186a3f7b27.json")

In [9]:
bio1.title

'Relationship between Average Daily Temperature and Average Cumulative Daily Rate of Confirmed Cases of COVID-19'

In [10]:
path = PROJECT_PATH / "biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/005d189d5bd7ac01aee65e934fd3d5186a3f7b27.json"

In [11]:
bio1 = Paper(path)

In [12]:
bio1.d.keys()

dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])

In [13]:
bio1.d['bib_entries']

{'BIBREF0': {'ref_id': 'b0',
  'title': 'WHO, Coronavirus disease (COVID-2019) situation reports',
  'authors': [],
  'year': 2020,
  'venue': '',
  'volume': '',
  'issn': '',
  'pages': '',
  'other_ids': {}},
 'BIBREF1': {'ref_id': 'b1',
  'title': 'Importation and human-to-human transmission of a novel coronavirus in Vietnam',
  'authors': [{'first': 'L', 'middle': ['T'], 'last': 'Phan', 'suffix': ''},
   {'first': 'T', 'middle': ['V'], 'last': 'Nguyen', 'suffix': ''},
   {'first': 'Q', 'middle': ['C'], 'last': 'Luong', 'suffix': ''},
   {'first': 'T', 'middle': ['V'], 'last': 'Nguyen', 'suffix': ''},
   {'first': 'H', 'middle': ['T'], 'last': 'Nguyen', 'suffix': ''},
   {'first': 'H', 'middle': ['Q'], 'last': 'Le', 'suffix': ''},
   {'first': '.', 'middle': ['.'], 'last': 'Pham', 'suffix': ''},
   {'first': 'Q', 'middle': ['D'], 'last': '', 'suffix': ''}],
  'year': 2020,
  'venue': 'New England Journal of Medicine',
  'volume': '382',
  'issn': '9',
  'pages': '872--874',
  'othe

In [14]:
''.join([x['text'] for x in bio1.d['body_text']])

"The outbreak of infectious diseases has always been one of the most important health problems in the world. With the rapid spread of the new coronavirus (COVID- 19) , much attention has been paid to subjects like rate of an epidemic, transition ways, prevention methods, and remaining time of the virus in the environment [1] [2] [3] [4] [5] . Therefore, the behavior of the virus in relation to environmental factors is critical. The coronavirus is one of the epidemic diseases that many countries have experienced in the new type of it, COVID-19, and can become a serious challenge and affect the previous efforts on sustainable development [6] [7] [8] [9] .It has been found that there are some delays between the detection of confirmed cases and the actual infection date due to the laboratory test, the time of the announcement of confirmed cases in media, and the time for the patients showing initial symptoms for a total usually of 3 to 5 days [10] [11] [12] . The analysis of Kampf et al. (

In [15]:
bio_pdf1 = open_json(PROJECT_PATH / "biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/005d189d5bd7ac01aee65e934fd3d5186a3f7b27.json")
bio_pdf1

{'paper_id': '005d189d5bd7ac01aee65e934fd3d5186a3f7b27',
 'metadata': {'title': 'Relationship between Average Daily Temperature and Average Cumulative Daily Rate of Confirmed Cases of COVID-19',
  'authors': [{'first': 'Behzad',
    'middle': [],
    'last': 'Pirouz',
    'suffix': '',
    'affiliation': {'laboratory': '',
     'institution': 'University of Calabria',
     'location': {'postCode': '87036',
      'settlement': 'Modelling, Rende',
      'country': 'Italy'}},
    'email': 'behzadpirouz@gmail.com'},
   {'first': 'Amirsina',
    'middle': [],
    'last': 'Golmohammadi',
    'suffix': '',
    'affiliation': {'laboratory': '',
     'institution': 'Islamic Azad University of Medical Sciences',
     'location': {'settlement': 'Tehran', 'country': 'Iran'}},
    'email': ''},
   {'first': 'Hasti',
    'middle': ['Saeidpour'],
    'last': 'Masouleh',
    'suffix': '',
    'affiliation': {'laboratory': '',
     'institution': 'Guilan University of Medical Sciences',
     'location'

In [16]:
bio_pdf1 = pd.read_json(PROJECT_PATH / "biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/005d189d5bd7ac01aee65e934fd3d5186a3f7b27.json", orient='index')
bio_pdf1

Unnamed: 0,0
paper_id,005d189d5bd7ac01aee65e934fd3d5186a3f7b27
metadata,{'title': 'Relationship between Average Daily ...
abstract,[{'text': 'The rapid outbreak of the new Coron...
body_text,[{'text': 'The outbreak of infectious diseases...
bib_entries,"{'BIBREF0': {'ref_id': 'b0', 'title': 'WHO, Co..."
ref_entries,{'FIGREF0': {'text': 'Other studies in this ar...
back_matter,[{'text': 'We are grateful to Professor Manlio...


#### Noncomm Use Subset

In [17]:
noncomm_pmc1 = pd.read_json(PROJECT_PATH / "noncomm_use_subset/noncomm_use_subset/pmc_json/PMC4834006.xml.json", orient='index')
noncomm_pmc1

Unnamed: 0,0
paper_id,PMC4834006
metadata,{'title': 'Thin-section Computed Tomography De...
body_text,"[{'text': 'In 2009, a novel type A influenza (..."
ref_entries,{'TABREF0': {'text': 'Table 1: Demographic dat...
back_matter,[]
bib_entries,{'BIBREF0': {'title': 'Pneumonia and respirato...


In [18]:
noncomm_pmc1.loc['metadata',0]

{'title': 'Thin-section Computed Tomography Detects Long-term Pulmonary Sequelae 3 Years after Novel Influenza A Virus-associated Pneumonia',
 'authors': [{'first': 'Zhi-Heng',
   'middle': [],
   'last': 'Xing',
   'suffix': '',
   'email': None,
   'affiliation': {}},
  {'first': 'Xin',
   'middle': [],
   'last': 'Sun',
   'suffix': '',
   'email': None,
   'affiliation': {}},
  {'first': 'Long',
   'middle': [],
   'last': 'Xu',
   'suffix': '',
   'email': None,
   'affiliation': {}},
  {'first': 'Qi',
   'middle': [],
   'last': 'Wu',
   'suffix': '',
   'email': None,
   'affiliation': {}},
  {'first': 'Li',
   'middle': [],
   'last': 'Li',
   'suffix': '',
   'email': None,
   'affiliation': {}},
  {'first': 'Xian-Jie',
   'middle': [],
   'last': 'Wu',
   'suffix': '',
   'email': None,
   'affiliation': {}},
  {'first': 'Xu-Guang',
   'middle': [],
   'last': 'Shao',
   'suffix': '',
   'email': None,
   'affiliation': {}},
  {'first': 'Xin-Qian',
   'middle': [],
   'last':

In [19]:
noncomm_pdf1 = pd.read_json(PROJECT_PATH / "noncomm_use_subset/noncomm_use_subset/pdf_json/b2f67d533f2749807f2537f3775b39da3b186051.json", orient='index')
noncomm_pdf1

Unnamed: 0,0
paper_id,b2f67d533f2749807f2537f3775b39da3b186051
metadata,{'title': 'Caring for persons in detention suf...
abstract,[]
body_text,[{'text': 'There is a disproportionate number ...
bib_entries,"{'BIBREF0': {'ref_id': 'b0', 'title': 'Influen..."
ref_entries,{}
back_matter,[]


In [20]:
noncomm_pdf1.loc['metadata',0]

{'title': 'Caring for persons in detention suffering with mental illness during the Covid-19 outbreak',
 'authors': [{'first': 'Michael',
   'middle': [],
   'last': 'Liebrenz',
   'suffix': '',
   'affiliation': {'laboratory': '',
    'institution': 'University of Bern',
    'location': {'settlement': 'Bern', 'country': 'Switzerland'}},
   'email': ''},
  {'first': 'Dinesh',
   'middle': [],
   'last': 'Bhugra',
   'suffix': '',
   'affiliation': {},
   'email': ''},
  {'first': 'Anna',
   'middle': [],
   'last': 'Buadze',
   'suffix': '',
   'affiliation': {'laboratory': '',
    'institution': 'University of Zurich',
    'location': {'settlement': 'Zurich', 'country': 'Switzerland'}},
   'email': ''},
  {'first': 'Roman',
   'middle': [],
   'last': 'Schleifer',
   'suffix': '',
   'affiliation': {'laboratory': '',
    'institution': 'University of Bern',
    'location': {'settlement': 'Bern', 'country': 'Switzerland'}},
   'email': ''}]}

#### Comm Use Subset

In [21]:
comm_pmc1 = pd.read_json(PROJECT_PATH / "comm_use_subset/comm_use_subset/pmc_json/PMC5396510.xml.json", orient='index')
comm_pmc1

Unnamed: 0,0
paper_id,PMC5396510
metadata,{'title': 'Immunological properties of gold na...
body_text,[{'text': 'Gold nanoparticles (GNPs) have attr...
ref_entries,{}
back_matter,[]
bib_entries,"{'BIBREF0': {'title': '', 'authors': [], 'year..."


In [22]:
comm_pmc1.loc['metadata',0]

{'title': 'Immunological properties of gold nanoparticles',
 'authors': [{'first': 'Lev',
   'middle': ['A.'],
   'last': 'Dykman',
   'suffix': '',
   'email': None,
   'affiliation': {}},
  {'first': 'Nikolai',
   'middle': ['G.'],
   'last': 'Khlebtsov',
   'suffix': '',
   'email': None,
   'affiliation': {}}]}

In [23]:
comm_pdf1 = pd.read_json(PROJECT_PATH / "comm_use_subset/comm_use_subset/pmc_json/PMC5396510.xml.json", orient='index')
comm_pdf1

Unnamed: 0,0
paper_id,PMC5396510
metadata,{'title': 'Immunological properties of gold na...
body_text,[{'text': 'Gold nanoparticles (GNPs) have attr...
ref_entries,{}
back_matter,[]
bib_entries,"{'BIBREF0': {'title': '', 'authors': [], 'year..."


In [24]:
comm_pdf1.loc['metadata',0]

{'title': 'Immunological properties of gold nanoparticles',
 'authors': [{'first': 'Lev',
   'middle': ['A.'],
   'last': 'Dykman',
   'suffix': '',
   'email': None,
   'affiliation': {}},
  {'first': 'Nikolai',
   'middle': ['G.'],
   'last': 'Khlebtsov',
   'suffix': '',
   'email': None,
   'affiliation': {}}]}

In [25]:
comm_pdf1.loc['body_text',0]

[{'text': 'Gold nanoparticles (GNPs) have attracted significant interest as a novel platform in nanobiotechnology and biomedicine because of their convenient surface bioconjugation with molecular probes1 and their remarkable optical2 and immunological3 properties. Recently published examples include applications of GNPs to genomics, biosensorics, immunoassays, clinical chemistry, detection and control of microorganisms, cancer cell photothermolysis, targeted delivery of drugs or other substances, and optical imaging and monitoring of biological cells and tissues.4–6 Noteworthy is the fact that GNPs are being increasingly administered to animals and humans parenterally. In particular, they serve as carriers for the delivery of drugs, genetic materials, and antigens. “Colloidal metallic gold is not bio-inert”—such is the name Brown et al.\n7 gave to their article so as to stress the importance of nanometer size in biological effects, even for such a seemingly inert material as gold.',
  