In [2]:
import spacy

# Loads NLP English model
nlp = spacy.load('en')

In [3]:
# Reads text
text = "On March 11, 2020, the World Health Organization declared the coronavirus disease 2019, COVID-19, a global pandemic. In \
an unprecedented collective efort, massive amounts of data are now being collected worldwide to estimate the immediate and \
long-term impact of this pandemic on the health system and the global economy. However, the precise timeline of the disease, \
its transmissibility, and the efect of mitigation strategies remain incompletely understood. Here we integrate a global network \
model with a local epidemic SEIR model to quantify the outbreak dynamics of COVID-19 in China and the United States. For \
the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, \
and an infectious period of 17.82 ± 2.95 days. We postulate that the latent and infectious periods are disease-specifc, whereas \
the contact period is behavior-specifc and can vary between diferent provinces, states, or countries. Our network model predicts \
that—without the massive political mitigation strategies that are in place today— \
the United States would have faced a basic reproduction number of 5.30 ± 0.95 and a nationwide peak of the outbreak on May \
10, 2020 with 3 million infections. Our results demonstrate how mathematical modeling can help estimate outbreak dynamics \
and provide decision guidelines for successful outbreak control. We anticipate that our model will become a valuable tool to \
estimate the potential of vaccination and quantify the efect of relaxing political measures including total lockdown, shelter in \
place, and travel restrictions for low-risk subgroups of the population or for the population as a whole."

In [4]:
# Convert text into NLP object
textdoc = nlp(text)

In [5]:
from spacy import displacy
# Named Entity Recognition
displacy.serve(textdoc, style='ent')




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [6]:
sents = list(str(sent) for sent in list(textdoc.sents))
sents[:5]

['On March 11, 2020, the World Health Organization declared the coronavirus disease 2019, COVID-19, a global pandemic.',
 'In an unprecedented collective efort, massive amounts of data are now being collected worldwide to estimate the immediate and long-term impact of this pandemic on the health system and the global economy.',
 'However, the precise timeline of the disease, its transmissibility, and the efect of mitigation strategies remain incompletely understood.',
 'Here we integrate a global network model with a local epidemic SEIR model to quantify the outbreak dynamics of COVID-19 in China and the United States.',
 'For the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, and an infectious period of 17.82 ± 2.95 days.']

In [7]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)
# Put all sentences in the dataframe
sentsdf = pd.DataFrame(sents, columns = ['Sentence'])
sentsdf

  


Unnamed: 0,Sentence
0,"On March 11, 2020, the World Health Organization declared the coronavirus disease 2019, COVID-19, a global pandemic."
1,"In an unprecedented collective efort, massive amounts of data are now being collected worldwide to estimate the immediate and long-term impact of this pandemic on the health system and the global economy."
2,"However, the precise timeline of the disease, its transmissibility, and the efect of mitigation strategies remain incompletely understood."
3,Here we integrate a global network model with a local epidemic SEIR model to quantify the outbreak dynamics of COVID-19 in China and the United States.
4,"For the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, and an infectious period of 17.82 ± 2.95 days."
5,"We postulate that the latent and infectious periods are disease-specifc, whereas the contact period is behavior-specifc and can vary between diferent provinces, states, or countries."
6,"Our network model predicts that—without the massive political mitigation strategies that are in place today— the United States would have faced a basic reproduction number of 5.30 ± 0.95 and a nationwide peak of the outbreak on May 10, 2020 with 3 million infections."
7,Our results demonstrate how mathematical modeling can help estimate outbreak dynamics and provide decision guidelines for successful outbreak control.
8,"We anticipate that our model will become a valuable tool to estimate the potential of vaccination and quantify the efect of relaxing political measures including total lockdown, shelter in place, and travel restrictions for low-risk subgroups of the population or for the population as a whole."


In [8]:
# Listing keywords
numeric_keywords = ['time','number*','ratio','proportion','period','±','total*','estimate*','%']
specific_keywords = ['infections','death*','transmis*','laten*','contact','infectious','incubat*','casualties','mortal*','morbid*','outbreak*']
contextual_keywords = ['GPE','DATE','TIME','PRODUCT']

# Creating a regular expression using keywords for searching and filtering 
numeric_regex = '|'.join(numeric_keywords)
specific_regex = '|'.join(specific_keywords)

trait_keywords = numeric_keywords + specific_keywords
trait_regex = '|'.join(trait_keywords)
trait_regex

'time|number*|ratio|proportion|period|±|total*|estimate*|%|infections|death*|transmis*|laten*|contact|infectious|incubat*|casualties|mortal*|morbid*|outbreak*'

In [9]:
import re

'''
Parses a sentence, looking for keywords.

args: sentence - string of sentence.
return: specific_keyword_count - total number of specific keywords in sentence
        numeric_keyword_count - total number of numeric keywords in sentence
        contextual_keyword_count - total number of contextual entity labels in sentence
        cardinality - total number of numeric entity labels in sentence
'''
def countKeywords(sentence):
    specific_keyword_count = numeric_keyword_count = contextual_keyword_count = cardinality = 0
    
    # counts specific and numeric keywords first
    for word in sentence.split():
        if re.match(specific_regex, word): specific_keyword_count += 1
        elif re.match(numeric_regex, word): numeric_keyword_count += 1
    
    # counts key entity labels
    sentence_obj = nlp(sentence)
    ent_labels = [ent.label_ for ent in sentence_obj.ents]
    for label in ent_labels:
        if label in contextual_keywords: contextual_keyword_count += 1
        elif label in ['CARDINAL']: cardinality += 1
        
    return specific_keyword_count, numeric_keyword_count, contextual_keyword_count, cardinality

In [11]:
sentsdf['SKC'],  sentsdf['NKC'],\
sentsdf['CKC'], sentsdf['Cardinality'] = zip(*sentsdf['Sentence'].apply(countKeywords))
sentsdf

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
0,"On March 11, 2020, the World Health Organization declared the coronavirus disease 2019, COVID-19, a global pandemic.",0,0,2,0
1,"In an unprecedented collective efort, massive amounts of data are now being collected worldwide to estimate the immediate and long-term impact of this pandemic on the health system and the global economy.",0,1,0,0
2,"However, the precise timeline of the disease, its transmissibility, and the efect of mitigation strategies remain incompletely understood.",1,1,0,0
3,Here we integrate a global network model with a local epidemic SEIR model to quantify the outbreak dynamics of COVID-19 in China and the United States.,1,0,2,0
4,"For the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, and an infectious period of 17.82 ± 2.95 days.",4,6,3,4
5,"We postulate that the latent and infectious periods are disease-specifc, whereas the contact period is behavior-specifc and can vary between diferent provinces, states, or countries.",3,2,0,0
6,"Our network model predicts that—without the massive political mitigation strategies that are in place today— the United States would have faced a basic reproduction number of 5.30 ± 0.95 and a nationwide peak of the outbreak on May 10, 2020 with 3 million infections.",2,2,3,2
7,Our results demonstrate how mathematical modeling can help estimate outbreak dynamics and provide decision guidelines for successful outbreak control.,2,1,0,0
8,"We anticipate that our model will become a valuable tool to estimate the potential of vaccination and quantify the efect of relaxing political measures including total lockdown, shelter in place, and travel restrictions for low-risk subgroups of the population or for the population as a whole.",0,2,0,0


In [12]:
# An example of filtering
sentsdf_filt = sentsdf.sort_values(by="Cardinality", ascending=False)
sentsdf_filt

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
4,"For the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, and an infectious period of 17.82 ± 2.95 days.",4,6,3,4
6,"Our network model predicts that—without the massive political mitigation strategies that are in place today— the United States would have faced a basic reproduction number of 5.30 ± 0.95 and a nationwide peak of the outbreak on May 10, 2020 with 3 million infections.",2,2,3,2
0,"On March 11, 2020, the World Health Organization declared the coronavirus disease 2019, COVID-19, a global pandemic.",0,0,2,0
1,"In an unprecedented collective efort, massive amounts of data are now being collected worldwide to estimate the immediate and long-term impact of this pandemic on the health system and the global economy.",0,1,0,0
2,"However, the precise timeline of the disease, its transmissibility, and the efect of mitigation strategies remain incompletely understood.",1,1,0,0
3,Here we integrate a global network model with a local epidemic SEIR model to quantify the outbreak dynamics of COVID-19 in China and the United States.,1,0,2,0
5,"We postulate that the latent and infectious periods are disease-specifc, whereas the contact period is behavior-specifc and can vary between diferent provinces, states, or countries.",3,2,0,0
7,Our results demonstrate how mathematical modeling can help estimate outbreak dynamics and provide decision guidelines for successful outbreak control.,2,1,0,0
8,"We anticipate that our model will become a valuable tool to estimate the potential of vaccination and quantify the efect of relaxing political measures including total lockdown, shelter in place, and travel restrictions for low-risk subgroups of the population or for the population as a whole.",0,2,0,0


In [14]:
# Sentences with most relevance
sentsdf_top = sentsdf.sort_values(by='SKC',ascending=False)
sentsdf_top.head()

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
4,"For the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, and an infectious period of 17.82 ± 2.95 days.",4,6,3,4
5,"We postulate that the latent and infectious periods are disease-specifc, whereas the contact period is behavior-specifc and can vary between diferent provinces, states, or countries.",3,2,0,0
6,"Our network model predicts that—without the massive political mitigation strategies that are in place today— the United States would have faced a basic reproduction number of 5.30 ± 0.95 and a nationwide peak of the outbreak on May 10, 2020 with 3 million infections.",2,2,3,2
7,Our results demonstrate how mathematical modeling can help estimate outbreak dynamics and provide decision guidelines for successful outbreak control.,2,1,0,0
2,"However, the precise timeline of the disease, its transmissibility, and the efect of mitigation strategies remain incompletely understood.",1,1,0,0


In [17]:
'''
Prints all sentences from Dataframe with provided keywords.
args: filter_words: Words to filter results by. By default, the trait keywords.
    df - The Dataframe to filter. By default, sentsdf.
return: Returns Dataframe with only sentences including keywords from the list.
'''
def sentencesWith(filter_words=trait_keywords, df=sentsdf_top):
    if isinstance(filter_words, str):
        filter_regex = filter_words
    elif isinstance(filter_words, list):
        filter_regex = '|'.join(filter_words)
    return df[df.Sentence.str.lower().str.contains(filter_regex)]

df1 = sentencesWith()
df1.loc[df1.Cardinality > 0]

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
4,"For the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, and an infectious period of 17.82 ± 2.95 days.",4,6,3,4
6,"Our network model predicts that—without the massive political mitigation strategies that are in place today— the United States would have faced a basic reproduction number of 5.30 ± 0.95 and a nationwide peak of the outbreak on May 10, 2020 with 3 million infections.",2,2,3,2


In [18]:
import numpy as np 

dict_map = {
    "infectious":"infectious", 
    "contact":"contact",
    "latency":"latency",
    "latent":"latency",
    "reproduction":"reproduction"
}
statsdf = pd.DataFrame(index = set(dict_map.values()), columns=['Estimates','Citation','Rule'])
statsdf.index.name = 'Parameter'
statsdf

Unnamed: 0_level_0,Estimates,Citation,Rule
Parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
infectious,,,
contact,,,
reproduction,,,
latency,,,


In [19]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

idx = None
def funnel_values(estimates, parameter, rule_name):
    global statsdf
    if parameter:
        statsdf.at[parameter,'Estimates'] = estimates
        statsdf.at[parameter,'Citation'] = idx
        statsdf.at[parameter,'Rule'] = rule_name
    elif idx not in list(statsdf.Citation) and\
        estimates not in list(statsdf.Estimates.loc[statsdf.Citation == idx]) :
        statsdf.loc[len(statsdf)] = [estimates, idx, rule_name]

In [20]:
sentsdf.Sentence[4]

'For the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, and an infectious period of 17.82 ± 2.95 days.'

In [21]:
pm_rule = [{"IS_ALPHA":True},{"IS_ALPHA":True},{"IS_ALPHA":True},\
           {"LIKE_NUM":True}, {"TEXT":"±"}, {"LIKE_NUM":True}]

def pm_map(matcher, doc, id, matches):
    for match_id, start, end in matches:
        string = str(doc[start:end])
        split_span = string.split()
        
        avg = round(float(split_span[-3]),2)
        moe = round(float(split_span[-1]),2)
        
        estimates = (avg-moe,avg+moe)
        parameter = dict_map.get(split_span[0])
        
        funnel_values(estimates, parameter, "pm_map")
        
matcher.add("pm_rule", pm_map, pm_rule)

In [22]:
statsdf = pd.DataFrame(index = set(dict_map.values()), columns=['Estimates','Citation','Rule'])
statsdf.index.name = 'Parameter'
statsdf

sents_filt = sentencesWith(specific_keywords)
for idx in sents_filt.index:
    sentence = sents_filt.Sentence[idx]
    sentence = sentence.replace("–"," – ")
    doc = nlp(sentence)
    matches = matcher(doc)
statsdf

Unnamed: 0_level_0,Estimates,Citation,Rule
Parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
infectious,"(14.870000000000001, 20.77)",4,pm_map
contact,"(1.15, 1.79)",4,pm_map
reproduction,"(4.35, 6.25)",6,pm_map
latency,"(1.84, 3.2800000000000002)",4,pm_map
