# Metadata Analysis

In [294]:
import pandas as pd
import numpy as np
import re
from cord.cord19 import ResearchPapers
from pathlib import Path, PurePath
from IPython.display import display

pd.options.display.max_colwidth=200
pd.options.display.max_rows = 4000

## Load Metadata

In [281]:
metadata = ResearchPapers.load_metadata()
data_path = Path('data') / 'CORD-19-research-challenge'
metadata_path = PurePath(data_path) / 'metadata.csv'
metadata = pd.read_csv(metadata_path,
                               dtype={'Microsoft Academic Paper ID': str,
                                      'pubmed_id': str})

## Metadata Utility Functions

In [304]:
def describe_column(series):
    col_counts = series.describe().loc[['count', 'unique','top']]
    col_counts.loc['null'] = series.isnull().sum()
    col_counts['duplicate'] = series.dropna().duplicated().sum()
    df = col_counts.to_frame().T
    df = df[['count', 'null', 'unique', 'duplicate','top']] \
                .rename(columns={'count' : 'non-null', 'top':'most common'}).T
    return df

def describe_columns(df, columns=None):
    columns = columns or df.columns
    column_descs = [describe_column(df[col]).T for col in columns]
    return pd.concat(column_descs)

def show_common(data, column, head=20):
    common_column = data[column].value_counts().to_frame()
    common_column = common_column[common_column[column] > 1]
    return common_column.head(head)

## Describe Metadata

In [266]:
describe_columns(metadata)

Unnamed: 0,non-null,null,unique,duplicate,most common
sha,28462,15758,28450,12,72a5640aa0c307fbe171ca7ad55d3fda48b53988
source_x,44220,0,7,44213,PMC
title,43996,224,42230,1766,Index
doi,40750,3470,40750,0,10.1016/j.jinf.2020.01.014
pmcid,23319,20901,23319,0,PMC5841229
pubmed_id,22943,21277,22943,0,27185404
license,44220,0,13,44207,els-covid
abstract,35806,8414,35099,707,Unknown
publish_time,34197,10023,7045,27152,2020
authors,41074,3146,39285,1789,"Parry, Jane"


## Common Titles

In [267]:
show_common(metadata, 'title')

Unnamed: 0,title
Index,348
Subject Index,83
Subject index,76
Author index,69
Contents,68
Articles of Significant Interest Selected from This Issue by the Editors,67
Information for Authors,66
Graphical contents list,36
Table of Contents,29
Infectious disease surveillance update,24


### Common Abstracts

In [268]:
show_common(metadata, 'abstract')

Unnamed: 0,abstract
Unknown,654
Journal Watch presents a brief description of articles recently published in other journals and thought to be of relevance or interest to the AIC readership. Readers are encouraged to refer to the full article for complete information.,3
"Abstract During the millions of years they have coexisted with their hosts, viruses have learned how to manipulate host immune control mechanisms. Viral gene functions provide an overview of many relevant principles in cell biology and immunology. Our knowledge of viral gene functions must be integrated into virus–host interaction networks to understand viral pathogenesis, and could lead to new anti-viral strategies and the ability to exploit viral functions as tools in medicine.",3
"Resumen La faringoamigdalitis aguda (FAA) en el adulto es una de las enfermedades infecciosas más comunes en la consulta del médico de familia. La etiología más frecuente es viral. Dentro de la etiología bacteriana, el principal agente responsable es Streptococcus pyogenes o estreptococo β-hemolítico del grupo A (EBHGA), causante del 5-30% de los casos. En el manejo diagnóstico, las escalas de valoración clínica para predecir la posible etiología bacteriana, son una buena ayuda para seleccionar a qué pacientes se deben practicar las técnicas de detección rápida de antígeno estreptocócico. Es conocido que, en general, sin estas técnicas se tiende al sobrediagnóstico de FAA estreptocócica, con la consiguiente prescripción innecesaria de antibióticos, muchas veces de amplio espectro. Así, con el manejo de las escalas y la técnica de diagnóstico rápido, elaboramos los algoritmos de manejo de la FAA. Los objetivos del tratamiento son acelerar la resolución de los síntomas, reducir el tiempo de contagio y prevenir las complicaciones supurativas locales y no supurativas. Los antibióticos de elección para el tratamiento de la FAA estreptocócica son penicilina y amoxicilina. La asociación de amoxicilina y clavulánico no está indicada en el tratamiento inicial en la infección aguda. Los macrólidos tampoco son un tratamiento de primera elección; su uso debe reservarse para pacientes con alergia a la penicilina. Es importante en nuestro país adecuar tanto el diagnóstico de la FAA bacteriana y la prescripción de antibióticos a la evidencia científica disponible. La implantación de protocolos de actuación en las farmacias comunitarias puede ser de utilidad para identificar y cribar los casos que no requieran tratamiento antibiótico. Abstract Acute pharyngitis in adults is one of the most common infectious diseases seen in general practitioners’ consultations. Viral aetiology is the most common. Among bacterial causes, the main agent is Streptococcus pyogenes or group A β-haemolytic streptococcus (GABHS), which causes 5%-30% of the episodes. In the diagnostic process, clinical assessment scales can help clinicians to better predict suspected bacterial aetiology by selecting patients who should undergo a rapid antigen detection test. If these techniques are not performed, an overdiagnosis of streptococcal pharyngitis often occurs, resulting in unnecessary prescriptions of antibiotics, most of which are broad spectrum. Consequently, management algorithms that include the use of predictive clinical rules and rapid tests have been set up. The aim of the treatment is speeding up symptom resolution, reducing the contagious time span and preventing local suppurative and non-suppurative complications. Penicillin and amoxicillin are the antibiotics of choice for the treatment of pharyngitis. The association of amoxicillin and clavulanate is not indicated as the initial treatment of acute infection. Neither are macrolides indicated as first-line therapy; they should be reserved for patients allergic to penicillin. The appropriate diagnosis of bacterial pharyngitis and proper use of antibiotics based on the scientific evidence available are crucial. Using management algorithms can be helpful in identifying and screening the cases that do not require antibiotic therapy.",3
"Abstract In 2009, a novel H1N1 Influenza virus has emerged and on June 11 the World Health Organization declared it as pandemic. It may cause acute respiratory failure ranging from severe Acute Respiratory Distress Syndrome to exacerbations of airflow limitation. Non-invasive ventilation is now considered first-line intervention for different causes of acute respiratory failure and may be considered in the context of H1N1 pandemic. Although infection control issues have been arisen, non-invasive ventilation was effective and safe during the Severe Acute Respiratory Syndrome in Asia. It is reasonable to recommend non-invasive ventilation in H1N1-related exacerbations of chronic respiratory diseases, especially in negative-pressure wards. Treatment of early Acute Respiratory Distress Syndrome associated with H1N1 using non-invasive ventilation could be tried rapidly identifying those who fail without delaying endotracheal intubation. Considering the high demand for critical care beds during the pandemic, non-invasive ventilation may have a role in reducing the estimated load.",2
[Image: see text],2
"The critical period for the prevention and control of novel coronavirus pneumonia (NCP) in China, in response to requirements for accelerating the modernization of the disease prevention and control system, we analyzed and summarized the current situation, existing problems, and deficiencies in China's modernization of disease prevention and control system. In addition, we put forward the contents and countermeasures for the modernization of the disease prevention and control system. The modernization of the disease prevention and control system should be built around governance modernization, talent modernization, equipment modernization, scientific research modernization, and modernization of the regulatory system. The countermeasures and suggestions need to reposition the disease prevention and control system, rationalize the management system and operating mechanism, strengthen the modernization of talents and equipment, strengthen scientific research on disease prevention and control, and further improve the disease prevention and control legal system.",2
"Interferon induced transmembrane proteins (IFITMs) inhibit the cellular entry of a broad range of viruses, but it has been suspected that for HIV-1 IFITMs may also inhibit a post-integration replicative step. We show that IFITM expression reduces HIV-1 viral protein synthesis by preferentially excluding viral mRNA transcripts from translation and thereby restricts viral production. Codon-optimization of proviral DNA rescues viral translation, implying that IFITM-mediated restriction requires recognition of viral RNA elements. In addition, we find that expression of the viral accessory protein Nef can help overcome the IFITM-mediated inhibition of virus production. Our studies identify a novel role for IFITMs in inhibiting HIV replication at the level of translation, but show that the effects can be overcome by the lentiviral protein Nef.",2
"Résumé L’oxygénation extracorporelle ou extra-corporeal membrane oxygenation (ECMO) constitue une suppléance de l’échangeur respiratoire (oxygénation et décarboxylation) pour laquelle un regain d’intérêt survient depuis quelques années. Cet intérêt a été amplifié par la pandémie grippale H1N1 mais est également suscité par des améliorations technologiques significatives qui autorisent aujourd’hui la mise en œuvre chez des patients éveillés et mobiles. Mais c’est essentiellement l’émergence d’études randomisées comme dans le syndrome de détresse respiratoire de l’adulte avec des résultats encourageants qui valorisent cette thérapeutique. Connaître la terminologie et les principes généraux de cette technique ainsi que les situations et perspectives futures auxquelles le pneumologue et le réanimateur peuvent être confrontés fait l’objet de cette revue générale. Deux points seront en particulier évoqués tels que les perspectives d’épuration extracorporelle du CO2 dans le cadre des décompensations respiratoires hypercapniques et les situations de suppléances au cours de la transplantation pulmonaire depuis la période préopératoire, jusqu’à la phase postopératoire. Summary Extra-corporeal membrane oxygenation (ECMO) effectively replaces the lung in providing oxygenation and carbon dioxide (CO2) removal. For some years, and in parallel to the H1N1 influenza pandemic, this technique has gained interest in relation to significant technological improvements, leading to new concepts of “awake and mobile ECMO” or rehabilitation with ECMO. Finally, the publication of randomized controlled trials giving encouraging results in the adult respiratory distress syndrome (ARDS) has helped to validate this technique and further studies are warranted. This general review aims to outline the definition, classification and principles of ECMO and to give some current information about the indications and possibilities of the technique to the pulmonologist and intensivist. Further possible uses for this technique include extra-corporeal removal of CO2 during hypercapnic respiratory failure and assistance during lung transplantation from the preoperative to the early postoperative period.",2
"Coronavirus testing, mortality, vaccine development, containment vs mitigation, and more. Anthony S. Fauci, MD discusses the latest developments in the global spread of COVID-19 and the SARS-CoV-2 virus with JAMA Editor Howard Bauchner, MD. - What's the difference between COVID-19 and SARS-CoV-2? (01:15) - What's the status and accuracy of diagnostic testing in the US? (01:58) - What's the case-fatality rate for the virus? (05:31) - Scientific advances and vaccine development (25:06) - Are current clinical trials providing a picture of treatments? (13:41) - Risk communication: how do we present information so there's faith that it's accurate? (15:24) - Risk groups (children, the elderly, pregnant women) (16:26) - Containment vs mitigation vs quarantine vs isolation (19:10) - Protecting the elderly and nursing home resident (23:52) - Public health prospects in Latin America, Africa (26:35) - Will coronavirus wane in warmer months like influenza? (27:52) - Why is anxiety so high about this disease?- Does the US have capacity to care for COVID19 infection? (31:03) - What is your daily schedule like? (32:23)",2


In [269]:
abstract_title = ['abstract', 'title']

## Conditions

In [270]:
# The title is null or short
title_null = metadata.title.isnull()
title_short = metadata.title.fillna('').apply(len) < 30

# Some titles are is short and unrelated to viruses
# This regex keeps some short titles if they seem relevant
_relevant_re_ = '.*vir.*|.*sars.*|.*mers.*|.*corona.*|.*ncov.*|.*immun.*|.*nosocomial.*'
_relevant_re_ = _relevant_re_ +  '.*epidem.*|.*emerg.*|.*vacc.*|.*cytokine.*'
title_relevant = metadata.title.fillna('').str.match(_relevant_re_, case=False)
title_junk = title_short & ~title_relevant

# The abstract is null, duplicate, short or missing
abstract_null = metadata.abstract.isnull()
abstract_duplicate = metadata.abstract.duplicated()
abstract_unknown = metadata.abstract == 'Unknown'
abstract_short = metadata.abstract.fillna('').apply(len) < 20
abstract_missing = abstract_null | abstract_unknown
sha_null = metadata.sha.isnull()
doi_null = metadata.doi.isnull()

## Text Processing 

In [305]:
_abstract_terms_ = '(Publisher|Abstract|Summary|BACKGROUND|INTRODUCTION)'
def remove_common_terms(abstract):
    return re.sub(_abstract_terms_, '', abstract)

def start(data):
    return data.copy()

def clean_title(data):
    # Set junk titles to NAN
    data.loc[title_junk, 'title'] = ''
    return data

def clean_abstract(data):
    # Set unknowns to NAN
    data.loc[abstract_unknown, 'abstract'] = np.nan
    
    # Fill missing abstract with the title
    data.abstract = data.abstract.fillna(data.title)
    
    # Remove common terms like publisher
    data.abstract = data.abstract.fillna('').apply(remove_common_terms)
   
    return data
   
clean_metadata = metadata.pipe(start) \
                         .pipe(clean_title) \
                         .pipe(clean_abstract)

### Common Titles in Clean Metadata

In [306]:
show_common(clean_metadata, 'title')

Unnamed: 0,title
,3022
Articles of Significant Interest Selected from This Issue by the Editors,67
Infectious disease surveillance update,24
Opportunities from the Center for Perioperative Education,15
PNAS Plus Significance Statements,12
Current Awareness on Comparative and Functional Genomics,10
New nucleotide sequence data on the EMBL File Server.,8
Bibliography of the current world literature,7
Contents of other veterinary journals from Elsevier,6
Viral gastroenteritis,6


### Common Abstracts in Clean Metadata

In [307]:
show_common(clean_metadata, 'abstract')

Unnamed: 0,abstract
,2531
Articles of Significant Interest Selected from This Issue by the Editors,67
Infectious disease surveillance update,24
Opportunities from the Center for Perioperative Education,15
PNAS Plus Significance Statements,12
Current Awareness on Comparative and Functional Genomics,10
New nucleotide sequence data on the EMBL File Server.,8
Bibliography of the current world literature,7
Contents of other veterinary journals from Elsevier,6
Immunity to infection,5
