In [1]:
import spacy

# Loads NLP English model
nlp = spacy.load('en')

In [2]:
# Reads text
text = "On March 11, 2020, the World Health Organization declared the coronavirus disease 2019, COVID-19, a global pandemic. In \
an unprecedented collective efort, massive amounts of data are now being collected worldwide to estimate the immediate and \
long-term impact of this pandemic on the health system and the global economy. However, the precise timeline of the disease, \
its transmissibility, and the efect of mitigation strategies remain incompletely understood. Here we integrate a global network \
model with a local epidemic SEIR model to quantify the outbreak dynamics of COVID-19 in China and the United States. For \
the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, \
and an infectious period of 17.82 ± 2.95 days. We postulate that the latent and infectious periods are disease-specifc, whereas \
the contact period is behavior-specifc and can vary between diferent provinces, states, or countries. Our network model predicts \
that—without the massive political mitigation strategies that are in place today— \
the United States would have faced a basic reproduction number of 5.30 ± 0.95 and a nationwide peak of the outbreak on May \
10, 2020 with 3 million infections. Our results demonstrate how mathematical modeling can help estimate outbreak dynamics \
and provide decision guidelines for successful outbreak control. We anticipate that our model will become a valuable tool to \
estimate the potential of vaccination and quantify the efect of relaxing political measures including total lockdown, shelter in \
place, and travel restrictions for low-risk subgroups of the population or for the population as a whole."

In [3]:
# Convert text into NLP object
textdoc = nlp(text)

In [4]:
sents = list(str(sent) for sent in list(textdoc.sents))
sents[:5]

['On March 11, 2020, the World Health Organization declared the coronavirus disease 2019, COVID-19, a global pandemic.',
 'In an unprecedented collective efort, massive amounts of data are now being collected worldwide to estimate the immediate and long-term impact of this pandemic on the health system and the global economy.',
 'However, the precise timeline of the disease, its transmissibility, and the efect of mitigation strategies remain incompletely understood.',
 'Here we integrate a global network model with a local epidemic SEIR model to quantify the outbreak dynamics of COVID-19 in China and the United States.',
 'For the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, and an infectious period of 17.82 ± 2.95 days.']

In [6]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
# Put all sentences in the dataframe
sentsdf = pd.DataFrame(sents, columns = ['Sentence'])
sentsdf

Unnamed: 0,Sentence
0,"On March 11, 2020, the World Health Organization declared the coronavirus disease 2019, COVID-19, a global pandemic."
1,"In an unprecedented collective efort, massive amounts of data are now being collected worldwide to estimate the immediate and long-term impact of this pandemic on the health system and the global economy."
2,"However, the precise timeline of the disease, its transmissibility, and the efect of mitigation strategies remain incompletely understood."
3,Here we integrate a global network model with a local epidemic SEIR model to quantify the outbreak dynamics of COVID-19 in China and the United States.
4,"For the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, and an infectious period of 17.82 ± 2.95 days."
5,"We postulate that the latent and infectious periods are disease-specifc, whereas the contact period is behavior-specifc and can vary between diferent provinces, states, or countries."
6,"Our network model predicts that—without the massive political mitigation strategies that are in place today— the United States would have faced a basic reproduction number of 5.30 ± 0.95 and a nationwide peak of the outbreak on May 10, 2020 with 3 million infections."
7,Our results demonstrate how mathematical modeling can help estimate outbreak dynamics and provide decision guidelines for successful outbreak control.
8,"We anticipate that our model will become a valuable tool to estimate the potential of vaccination and quantify the efect of relaxing political measures including total lockdown, shelter in place, and travel restrictions for low-risk subgroups of the population or for the population as a whole."


In [12]:
import numpy as np 

dict_map = {
    "infectious":"infectious", 
    "contact":"contact",
    "latency":"latency",
    "latent":"latency",
    "reproduction":"reproduction"
}
statsdf = pd.DataFrame(index = set(dict_map.values()), columns=['Estimates','Citation','Rule'])
statsdf.index.name = 'Parameter'

Unnamed: 0_level_0,Estimates,Citation,Rule
Parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
latency,,,
infectious,,,
contact,,,
reproduction,,,


In [13]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

idx = None
def funnel_values(estimates, parameter, rule_name):
    global statsdf
    if parameter:
        statsdf.at[parameter,'Estimates'] = estimates
        statsdf.at[parameter,'Citation'] = idx
        statsdf.at[parameter,'Rule'] = rule_name
    elif idx not in list(statsdf.Citation) and\
        estimates not in list(statsdf.Estimates.loc[statsdf.Citation == idx]) :
        statsdf.loc[len(statsdf)] = [estimates, idx, rule_name]

In [9]:
sentsdf.Sentence[4]

'For the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, and an infectious period of 17.82 ± 2.95 days.'

In [10]:
pm_rule = [{"IS_ALPHA":True},{"IS_ALPHA":True},{"IS_ALPHA":True},\
           {"LIKE_NUM":True}, {"TEXT":"±"}, {"LIKE_NUM":True}]

def pm_map(matcher, doc, id, matches):
    for match_id, start, end in matches:
        string = str(doc[start:end])
        split_span = string.split()
        
        avg = round(float(split_span[-3]),2)
        moe = round(float(split_span[-1]),2)
        
        estimates = (avg-moe,avg+moe)
        parameter = dict_map.get(split_span[0])
        
        funnel_values(estimates, parameter, "pm_map")
        
matcher.add("pm_rule", pm_map, pm_rule)

In [11]:
statsdf = pd.DataFrame(index = set(dict_map.values()), columns=['Estimates','Citation','Rule'])
statsdf.index.name = 'Parameter'
statsdf

for idx in sentsdf.index:
    sentence = sentsdf.Sentence[idx]
    doc = nlp(sentence)
    matches = matcher(doc)
statsdf

Unnamed: 0_level_0,Estimates,Citation,Rule
Parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
latency,"(1.84, 3.2800000000000002)",4,pm_map
infectious,"(14.870000000000001, 20.77)",4,pm_map
contact,"(1.15, 1.79)",4,pm_map
reproduction,"(4.35, 6.25)",6,pm_map
