# Metadata Analysis

In [1]:
import pandas as pd
import numpy as np
import re
%load_ext autoreload
%autoreload 2
from cord.cord19 import ResearchPapers
from pathlib import Path, PurePath
from IPython.display import display

# Notebook-wide pandas display settings: show up to 200 characters per cell
# (long titles/abstracts) and up to 4000 rows before a frame is truncated.
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_rows', 4000)

In [2]:
# Root folder of the CORD-19 dataset, relative to the current working directory.
data_path = Path('data', 'CORD-19-research-challenge')
# Location of the metadata CSV inside that folder; kept as a PurePath, i.e. a
# path value with no filesystem methods (exists(), open(), ...).
metadata_path = PurePath(data_path, 'metadata.csv')

## Load Metadata

In [3]:
from cord.core import describe_dataframe
# Load the CORD-19 metadata table. load_metadata() is called without arguments,
# so the `data_path` defined above is not passed in; the log output below shows
# the loader reading from data\CORD-19-research-challenge and then cleaning and
# tagging the metadata.
metadata = ResearchPapers.load_metadata()
# Per-column overview; in the recorded output the columns are non-null, null,
# unique, duplicate and most common value. Kept as the last expression so the
# notebook renders the result.
describe_dataframe(metadata)

Loading metadata from data\CORD-19-research-challenge
Cleaning metadata
Applying tags to metadata


Unnamed: 0,non-null,null,unique,duplicate,most common
cord_uid,47298,0,47296,2,4fbr8fx8
sha,34283,13015,34275,8,4644c32551fb23aa873a7738ecc8d777bd49877e
source,47298,0,6,47292,PMC
title,47298,0,44864,2434,
doi,47298,0,43955,3343,
pmcid,28038,19260,28038,0,PMC3521240
pubmed_id,35409,11889,35404,5,27381971
license,47298,0,13,47285,els-covid
abstract,47298,0,45578,1720,
published,47289,9,6412,40877,2020-01-01 00:00:00


In [29]:
# List every paper whose (non-null) sha is shared with at least one other row,
# ordered by sha so that records sharing a hash sit next to each other.
# Rows without a sha are discarded up front, so missing values never count as
# duplicates of one another.
(
    metadata
    .dropna(subset=['sha'])
    .loc[lambda papers: papers['sha'].duplicated(keep=False)]
    .sort_values(by=['sha'])
)

Unnamed: 0,cord_uid,sha,source,title,doi,pmcid,pubmed_id,license,abstract,published,...,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url,when,covid_related,virus,coronavirus,sars
881,7gm6ovet,45e40b2d7d973ed5c9798da613fb3cfa4427e2e2,Elsevier,"Vaccine biotechnology edited by James L. Bittle and Frederick L. Murphy Academic Press, 1989. US$75.00 (iv + 444 pages) ISBN 0 12 039233 X",10.1016/0167-7799(89)90108-x,,,els-covid,"Vaccine biotechnology edited by James L. Bittle and Frederick L. Murphy Academic Press, 1989. US$75.00 (iv + 444 pages) ISBN 0 12 039233 X",1989-08-31,...,,True,False,custom_license,https://doi.org/10.1016/0167-7799(89)90108-x,31 years ago,False,False,False,False
882,c75e60nm,45e40b2d7d973ed5c9798da613fb3cfa4427e2e2,Elsevier,,10.1016/0167-7799(89)90109-1,,,els-covid,,1989-08-31,...,,True,False,custom_license,https://doi.org/10.1016/0167-7799(89)90109-1,31 years ago,False,False,False,False
14322,4k1k02r8,4644c32551fb23aa873a7738ecc8d777bd49877e,Elsevier,PIV-33 Detection of oseltamivir-resistant influenza A(H1N1) viruses with H274Y mutation during 2007–2008 influenza season from central and eastern part of Turkey,10.1016/s1386-6532(09)70129-5,,,els-covid,PIV-33 Detection of oseltamivir-resistant influenza A(H1N1) viruses with H274Y mutation during 2007–2008 influenza season from central and eastern part of Turkey,2009-09-30,...,,True,False,custom_license,https://doi.org/10.1016/s1386-6532(09)70129-5,10 years ago,False,True,False,False
14323,bcn6tpnx,4644c32551fb23aa873a7738ecc8d777bd49877e,Elsevier,PIV-34 A fast procedure for the detection of the new influenza virus A/H1N1 variant,10.1016/s1386-6532(09)70130-1,,,els-covid,PIV-34 A fast procedure for the detection of the new influenza virus A/H1N1 variant,2009-09-30,...,,True,False,custom_license,https://doi.org/10.1016/s1386-6532(09)70130-1,10 years ago,False,True,False,False
14324,n8n1folf,4644c32551fb23aa873a7738ecc8d777bd49877e,Elsevier,PIV-35 Evaluation of two newly developed QIAsymphony sp protocols for efficient isolation of influenza virus RNA from different respiratory samples,10.1016/s1386-6532(09)70131-3,,,els-covid,PIV-35 Evaluation of two newly developed QIAsymphony sp protocols for efficient isolation of influenza virus RNA from different respiratory samples,2009-09-30,...,,True,False,custom_license,https://doi.org/10.1016/s1386-6532(09)70131-3,10 years ago,False,True,False,False
14325,jof3mnnk,4644c32551fb23aa873a7738ecc8d777bd49877e,Elsevier,PIV-36 Performance of the Qiagen Resplex II ver. 2.0 & ver. 3.0 multiplex assays for the detection of (H1N1V) pandemic influenza A in a London teaching hospital,10.1016/s1386-6532(09)70132-5,,,els-covid,PIV-36 Performance of the Qiagen Resplex II ver. 2.0 & ver. 3.0 multiplex assays for the detection of (H1N1V) pandemic influenza A in a London teaching hospital,2009-09-30,...,,True,False,custom_license,https://doi.org/10.1016/s1386-6532(09)70132-5,10 years ago,False,False,False,False
6655,mq8rwnqu,9ce0a6cfd53840cd985f7a1439708c7a48bb7f23,Elsevier,Coronavirus and gastroenteritis in foals,10.1016/s0140-6736(75)80058-4,,78197.0,els-covid,Coronavirus and gastroenteritis in foals,1975-10-25,...,,True,False,custom_license,https://doi.org/10.1016/s0140-6736(75)80058-4,44 years ago,False,True,True,False
6656,fa21ckbr,9ce0a6cfd53840cd985f7a1439708c7a48bb7f23,Elsevier,Hypo-osmolality in beer drinkers,10.1016/s0140-6736(75)80059-6,,78198.0,els-covid,Hypo-osmolality in beer drinkers,1975-10-25,...,,True,False,custom_license,https://doi.org/10.1016/s0140-6736(75)80059-6,44 years ago,False,False,False,False
6657,l2banbg8,9ce0a6cfd53840cd985f7a1439708c7a48bb7f23,Elsevier,A pamphlet to answer patients' questions,10.1016/s0140-6736(75)80060-2,,78196.0,els-covid,A pamphlet to answer patients' questions,1975-10-25,...,,True,False,custom_license,https://doi.org/10.1016/s0140-6736(75)80060-2,44 years ago,False,False,False,False
6683,qfyiu47z,ba4afe00e152de82121a4445aed52c46833d927e,Elsevier,FAILURE OF SALT TO MOBILISE RENAL DOPAMINE IN ESSENTIAL HYPERTENSION,10.1016/s0140-6736(80)92432-0,,6109183.0,els-covid,FAILURE OF SALT TO MOBILISE RENAL DOPAMINE IN ESSENTIAL HYPERTENSION,1980-12-27,...,,True,False,custom_license,https://doi.org/10.1016/s0140-6736(80)92432-0,39 years ago,False,False,False,False


In [30]:
# Build the ResearchPapers collection with the default index. Per the log output
# below, this loads/cleans/tags the metadata again and creates a BM25 index from
# the paper abstracts (about 40 seconds in the recorded run).
research_papers = ResearchPapers.load()

Loading metadata from data\CORD-19-research-challenge
Cleaning metadata
Applying tags to metadata

Indexing research papers
Creating the BM25 index from the abstracts of the papers
Use index="text" if you want to index the texts of the paper instead
Finished Indexing in 40.0 seconds


In [38]:
# Look up a single paper by its cord_uid. "mq8rwnqu" is one of the records from
# the duplicate-sha listing above ("Coronavirus and gastroenteritis in foals").
research_papers["mq8rwnqu"]

Unnamed: 0,published,when,authors,covid_related,doi,journal
,1975-10-25,44 years ago,"Bass, E.P.; Sharpee, R.L.",False,10.1016/s0140-6736(75)80058-4,The Lancet


In [None]:
# cord_uid values taken from the duplicate-sha listing above (rows 882 and 14322).
# NOTE(review): this cell was never executed (In [None]) and `cord_dups` is not
# used by any visible cell — confirm whether it is still needed.
cord_dups = ['c75e60nm', '4k1k02r8']

In [34]:
# Index the collection with an empty string. In the recorded run this returned a
# single paper (row 882) whose title and abstract are blank.
# NOTE(review): which field the empty key is matched against is not visible
# here — confirm against ResearchPapers' item lookup.
research_papers['']

Unnamed: 0,Papers,Oldest,Newest,SARS-COV-2,SARS,Coronavirus,Virus,Antivirals
,1,1989-08-31,1989-08-31,0,0,0,0,0

Unnamed: 0,title,abstract,journal,authors,published,when
882,,,Trends in Biotechnology,,1989-08-31,31 years ago


In [14]:
# Fetch one paper using an integer key and keep it for the following cells.
# NOTE(review): presumably 100 is a positional/row index — confirm.
paper = research_papers[100]
# Display the paper's authors (a plain string in the recorded output).
paper.authors

'K T Chong, K Apostolov'

In [18]:
# Display the summary of the `paper` fetched in the previous cell; in the
# recorded output this is a string of sentences separated by newlines.
paper.summary

'Nephritis in chickens caused by infectious bronchitis virus (IBV) was studied by virological, histological and electron microscopical methods.\nThe T strain of the virus caused only mild respiratory signs in both Rhode Island Red (RIR) and White Leghorn (WL) breeds; the 50 per cent mortality induced was due to acute nephritis.\nAll the infected birds developed high titres of antibody to IBV for up to 30 weeks.\nIn spite of the persistence of antibody, about 35 per cent of the RIR developed chronic progressive nephritis.\nEvidence of extensive coronavirus replication was found in the cells of the tubules.\nIt is concluded that direct virus-induced cell lysis is the primary cause of IBV nephritis.\nIn addition, about 50 per cent of the chronically infected birds also developed brush-border auto-antibody.'

## Load Research Papers, Indexing the Full Texts

In [19]:
# Reload the collection, this time indexing the paper texts instead of the
# abstracts. Per the log output below, the BM25 index is built from the text
# contents, read from json cache files for each subset (about 68 seconds in
# the recorded run).
# NOTE: this rebinds `research_papers`, so later cells use the text index only
# if this cell was the most recent load to run.
# NOTE(review): the earlier load's log hint says index="text" while this call
# passes 'texts' — confirm which spelling the loader expects.
research_papers = ResearchPapers.load(index='texts')

Loading metadata from data\CORD-19-research-challenge
Cleaning metadata
Applying tags to metadata

Indexing research papers
Creating the BM25 index from the text contents of the papers
Loading json cache files for comm_use_subset


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Loading json cache files for biorxiv_medrxiv
Loading json cache files for noncomm_use_subset
Loading json cache files for custom_license


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

Finished Indexing texts in 68.0 seconds


In [21]:
# Draw 100 papers from the collection and display their summary table.
# NOTE(review): no seed is passed, so presumably the selection differs between
# runs — confirm whether sample() accepts a random state for reproducibility.
research_papers.sample(100)

Unnamed: 0,Papers,Oldest,Newest,SARS-COV-2,SARS,Coronavirus,Virus,Antivirals
,100,1978-07-15,2020-06-01,8,7,18,60,6

Unnamed: 0,title,abstract,journal,authors,published,when
1977,CHAPTER 1 Overview of Viruses and Virus Infection,CHAPTER 1 Overview of Viruses and Virus Infection,Viruses and Human Disease,"STRAUSS, JAMES H.; STRAUSS, ELLEN G.",2008-12-31,11 years ago
10335,Why G3139 works poorly in cancer trials but might work well against HIV,"The antisense drug G3139 (oblimersen sodium, Genta, Inc.) is a phosphorothioate oligodeoxynucleotide (ODN) containing unmethylated CpG units, which is targeted to suppress Bcl-2. To date, its eff...",Medical Hypotheses,"Parris, George E.",2007-12-31,12 years ago
30805,Zinc Sulfate in Narrow Range as an In Vitro Anti-HSV-1 Assay,This report explains the employing of a combination test of traditional cell culture with a quantitative real-time PCR for assessment of the antiviral effect of zinc sulfate (ZnSO(4)) on herpes si...,Biol Trace Elem Res,"Fani, Mona; Khodadad, Nastaran; Ebrahimi, Saeedeh; Nahidsamiei, Rahil; Makvandi, Manoocher; Teimoori, Ali; Langari, Hadis",2019-04-26,11 months ago
16637,Antidiabetes and Anti-obesity Activity of Lagerstroemia speciosa,"The leaves of Lagerstroemia speciosa (Lythraceae), a Southeast Asian tree more commonly known as banaba, have been traditionally consumed in various forms by Philippinos for treatment of diabetes ...",Evid Based Complement Alternat Med,"Klein, Guy; Kim, Jaekyung; Himmeldirk, Klaus; Cao, Yanyan; Chen, Xiaozhuo",2007-03-14,13 years ago
40215,GFSWeb: A web tool for genome-based identification of proteins from mass spectrometric samples,"The interpretation of mass spectrometry data for protein identification has become a vital component of proteomics research. However, since most existing software tools rely on protein databases, ...",Journal of Proteome Research,"Wisz, Michael S.; Suarez, Melissa Kimball; Holmes, Mark R.; Giddings, Morgan C.",2004-12-01,15 years ago
41464,Diarrhea may be underestimated: a missing link in 2019 novel coronavirus,"The outbreak of pneumonia caused by the 2019 Novel Coronavirus (2019-nCoV) was reported in Wuhan City, China. However, the clinical symptoms varied in different reports. Based on the results of in...",,Weicheng Liang; Zhijie Feng; Shitao Rao; Cuicui Xiao; Zexiao Lin; Qi Zhang; Wei Qi,2020-02-11,1 month ago
7838,6-Thioguanine inhibits rotavirus replication through suppression of Rac1 GDP/GTP cycling,Rotavirus infection has emerged as an important cause of complications in organ transplantation recipients and might play a role in the pathogenesis of inflammatory bowel disease (IBD). 6-Thiogua...,Antiviral Research,"Yin, Yuebang; Chen, Sunrui; Hakim, Mohamad S.; Wang, Wenshi; Xu, Lei; Dang, Wen; Qu, Changbo; Verhaar, Auke P.; Su, Junhong; Fuhler, Gwenny M.; Peppelenbosch, Maikel P.; Pan, Qiuwei",2018-08-31,2 years ago
9721,Seasonal and Pandemic A (H1N1) 2009 influenza vaccination coverage and attitudes among health-care workers in a Spanish University Hospital,Influenza vaccination coverage among health-care workers (HCWs) remains the lowest compared with other priority groups for immunization. Little is known about the acceptability and compliance wit...,Vaccine,"Vírseda, Silvia; Restrepo, María Alejandra; Arranz, Elena; Magán-Tapia, Purificación; Fernández-Ruiz, Mario; de la Cámara, Agustín Gómez; Aguado, José María; López-Medrano, Francisco",2010-07-05,10 years ago
7959,Zoonotic disease risk perceptions and infection control practices of Australian veterinarians: Call for change in work culture,"This study was conducted to determine the perceptions of zoonotic disease risk among Australian veterinarians, the infection control practices they use to protect themselves from zoonotic disease...",Preventive Veterinary Medicine,"Dowd, Karen; Taylor, Melanie; Toribio, Jenny-Ann L.M.L.; Hooker, Claire; Dhand, Navneet K.",2013-08-01,7 years ago
37144,Glycoprotein D of Bovine Herpesvirus 5 (BoHV-5) Confers an Extended Host Range to BoHV-1 but Does Not Contribute to Invasion of the Brain,"Bovine herpesvirus 1 (BoHV-1) and BoHV-5 are closely related pathogens of cattle, but only BoHV-5 is considered a neuropathogen. We engineered intertypic gD exchange mutants with BoHV-1 and BoHV-5...",Journal of Virology,"Gabev, Evgeni; Tobler, Kurt; Abril, Carlos; Hilbe, Monika; Senn, Claudia; Franchini, Marco; Campadelli-Fiume, Gabriella; Fraefel, Cornel; Ackermann, Mathias",2010-03-10,10 years ago


In [22]:
# Search the collection using the exact title of one of the papers shown in the
# sample output above (row 10335) as the query text.
research_papers.search('Why G3139 works poorly in cancer trials but might work well against HIV')