In [446]:
import spacy    
import unicodedata
import re

In [447]:
# Loads NLP English model
nlp = spacy.load('en')

In [448]:
oliver_testpaper = 'data/texts/ebola/10.1007/s00705-020-04768-3.txt'

# Reads text
if 0:
    with open(oliver_testpaper, 'r') as file:
        text = unicodedata.normalize("NFKD", file.read().replace('\n', ' '))
        text = re.sub(r'(?<=[.])(?=[^\s])', r' ', text)
else:
    text = "On March 11, 2020, the World Health Organization declared the coronavirus disease 2019, COVID-19, a global pandemic. In \
an unprecedented collective efort, massive amounts of data are now being collected worldwide to estimate the immediate and \
long-term impact of this pandemic on the health system and the global economy. However, the precise timeline of the disease, \
its transmissibility, and the efect of mitigation strategies remain incompletely understood. Here we integrate a global network \
model with a local epidemic SEIR model to quantify the outbreak dynamics of COVID-19 in China and the United States. For \
the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, \
and an infectious period of 17.82 ± 2.95 days. We postulate that the latent and infectious periods are disease-specifc, whereas \
the contact period is behavior-specifc and can vary between diferent provinces, states, or countries. For the early stages of the \
outbreak in the United States, in n = 50 states, we adopted the disease-specifc values from China and found a contact period of \
3.38 ± 0.69 days. Our network model predicts that—without the massive political mitigation strategies that are in place today— \
the United States would have faced a basic reproduction number of 5.30 ± 0.95 and a nationwide peak of the outbreak on May \
10, 2020 with 3 million infections. Our results demonstrate how mathematical modeling can help estimate outbreak dynamics \
and provide decision guidelines for successful outbreak control. We anticipate that our model will become a valuable tool to \
estimate the potential of vaccination and quantify the efect of relaxing political measures including total lockdown, shelter in \
place, and travel restrictions for low-risk subgroups of the population or for the population as a whole."

In [449]:
# Convert text into NLP object
textdoc = nlp(text)

In [450]:
# Named Entity Recognition
from spacy import displacy 
displacy.render(textdoc, style='ent')

In [451]:
sents = list(str(sent) for sent in list(textdoc.sents))
sents[:5]

['On March 11, 2020, the World Health Organization declared the coronavirus disease 2019, COVID-19, a global pandemic.',
 'In an unprecedented collective efort, massive amounts of data are now being collected worldwide to estimate the immediate and long-term impact of this pandemic on the health system and the global economy.',
 'However, the precise timeline of the disease, its transmissibility, and the efect of mitigation strategies remain incompletely understood.',
 'Here we integrate a global network model with a local epidemic SEIR model to quantify the outbreak dynamics of COVID-19 in China and the United States.',
 'For the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, and an infectious period of 17.82 ± 2.95 days.']

In [452]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

# Put all sentences in the dataframe
sentsdf = pd.DataFrame(sents, columns = ['Sentence'])
sentsdf

Unnamed: 0,Sentence
0,"On March 11, 2020, the World Health Organization declared the coronavirus disease 2019, COVID-19, a global pandemic."
1,"In an unprecedented collective efort, massive amounts of data are now being collected worldwide to estimate the immediate and long-term impact of this pandemic on the health system and the global economy."
2,"However, the precise timeline of the disease, its transmissibility, and the efect of mitigation strategies remain incompletely understood."
3,Here we integrate a global network model with a local epidemic SEIR model to quantify the outbreak dynamics of COVID-19 in China and the United States.
4,"For the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, and an infectious period of 17.82 ± 2.95 days."
5,"We postulate that the latent and infectious periods are disease-specifc, whereas the contact period is behavior-specifc and can vary between diferent provinces, states, or countries."
6,"For the early stages of the outbreak in the United States, in n = 50 states, we adopted the disease-specifc values from China and found a contact period of 3.38 ± 0.69 days."
7,"Our network model predicts that—without the massive political mitigation strategies that are in place today— the United States would have faced a basic reproduction number of 5.30 ± 0.95 and a nationwide peak of the outbreak on May 10, 2020 with 3 million infections."
8,Our results demonstrate how mathematical modeling can help estimate outbreak dynamics and provide decision guidelines for successful outbreak control.
9,"We anticipate that our model will become a valuable tool to estimate the potential of vaccination and quantify the efect of relaxing political measures including total lockdown, shelter in place, and travel restrictions for low-risk subgroups of the population or for the population as a whole."


In [453]:
# Listing keywords
numeric_keywords = ['distribution*','time','number*','ratio','proportion','period','±','total*','estimate*','%','review','parameter*','mean','period','value']
specific_keywords = ['r0','reproduct*''infections','death*','transmis*','laten*','contact','infectious','incubat*','casualties','mortal*','morbid*','outbreak*','epideme*']
contextual_keywords = ['GPE','DATE','TIME','PRODUCT']

# Creating a regular expression using keywords for searching and filtering 
numeric_regex = '|'.join(numeric_keywords)
specific_regex = '|'.join(specific_keywords)

trait_keywords = numeric_keywords + specific_keywords
trait_regex = '|'.join(trait_keywords)
trait_regex

numReg = re.compile(numeric_regex)
specReg = re.compile(specific_regex)
traitReg = re.compile(trait_regex)

In [454]:
import re

'''
Parses a sentence, looking for keywords.

args: sentence - string of sentence.
return: specific_keyword_count - total number of specific keywords in sentence
        numeric_keyword_count - total number of numeric keywords in sentence
        contextual_keyword_count - total number of contextual entity labels in sentence
        cardinality - total number of numeric entity labels in sentence
'''
def countKeywords(sentence):
    specific_keyword_count = numeric_keyword_count = contextual_keyword_count = cardinality = 0
    
    # counts specific and numeric keywords first
    for word in sentence.split():
        if re.match(specific_regex, word): specific_keyword_count += 1
        elif re.match(numeric_regex, word): numeric_keyword_count += 1
    
    # counts key entity labels
    sentence_obj = nlp(sentence)
    ent_labels = [ent.label_ for ent in sentence_obj.ents]
    for label in ent_labels:
        if label in contextual_keywords: contextual_keyword_count += 1
        elif label in ['CARDINAL']: cardinality += 1
        
    return specific_keyword_count, numeric_keyword_count, contextual_keyword_count, cardinality

In [455]:
sentsdf['SKC'],  sentsdf['NKC'], sentsdf['CKC'], sentsdf['Cardinality'] = zip(*sentsdf['Sentence'].apply(countKeywords))
sentsdf

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
0,"On March 11, 2020, the World Health Organization declared the coronavirus disease 2019, COVID-19, a global pandemic.",0,0,2,0
1,"In an unprecedented collective efort, massive amounts of data are now being collected worldwide to estimate the immediate and long-term impact of this pandemic on the health system and the global economy.",0,1,0,0
2,"However, the precise timeline of the disease, its transmissibility, and the efect of mitigation strategies remain incompletely understood.",1,1,0,0
3,Here we integrate a global network model with a local epidemic SEIR model to quantify the outbreak dynamics of COVID-19 in China and the United States.,2,0,2,0
4,"For the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, and an infectious period of 17.82 ± 2.95 days.",4,6,3,4
5,"We postulate that the latent and infectious periods are disease-specifc, whereas the contact period is behavior-specifc and can vary between diferent provinces, states, or countries.",3,2,0,0
6,"For the early stages of the outbreak in the United States, in n = 50 states, we adopted the disease-specifc values from China and found a contact period of 3.38 ± 0.69 days.",2,3,3,2
7,"Our network model predicts that—without the massive political mitigation strategies that are in place today— the United States would have faced a basic reproduction number of 5.30 ± 0.95 and a nationwide peak of the outbreak on May 10, 2020 with 3 million infections.",1,2,3,2
8,Our results demonstrate how mathematical modeling can help estimate outbreak dynamics and provide decision guidelines for successful outbreak control.,2,1,0,0
9,"We anticipate that our model will become a valuable tool to estimate the potential of vaccination and quantify the efect of relaxing political measures including total lockdown, shelter in place, and travel restrictions for low-risk subgroups of the population or for the population as a whole.",0,2,0,0


In [456]:
# An example of filtering
sentsdf_filt = sentsdf.sort_values(by="Cardinality", ascending=False)
sentsdf_filt

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
4,"For the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, and an infectious period of 17.82 ± 2.95 days.",4,6,3,4
6,"For the early stages of the outbreak in the United States, in n = 50 states, we adopted the disease-specifc values from China and found a contact period of 3.38 ± 0.69 days.",2,3,3,2
7,"Our network model predicts that—without the massive political mitigation strategies that are in place today— the United States would have faced a basic reproduction number of 5.30 ± 0.95 and a nationwide peak of the outbreak on May 10, 2020 with 3 million infections.",1,2,3,2
0,"On March 11, 2020, the World Health Organization declared the coronavirus disease 2019, COVID-19, a global pandemic.",0,0,2,0
1,"In an unprecedented collective efort, massive amounts of data are now being collected worldwide to estimate the immediate and long-term impact of this pandemic on the health system and the global economy.",0,1,0,0
2,"However, the precise timeline of the disease, its transmissibility, and the efect of mitigation strategies remain incompletely understood.",1,1,0,0
3,Here we integrate a global network model with a local epidemic SEIR model to quantify the outbreak dynamics of COVID-19 in China and the United States.,2,0,2,0
5,"We postulate that the latent and infectious periods are disease-specifc, whereas the contact period is behavior-specifc and can vary between diferent provinces, states, or countries.",3,2,0,0
8,Our results demonstrate how mathematical modeling can help estimate outbreak dynamics and provide decision guidelines for successful outbreak control.,2,1,0,0
9,"We anticipate that our model will become a valuable tool to estimate the potential of vaccination and quantify the efect of relaxing political measures including total lockdown, shelter in place, and travel restrictions for low-risk subgroups of the population or for the population as a whole.",0,2,0,0


In [457]:
# Sentences with most relevance
sentsdf_top = sentsdf.sort_values(by='SKC',ascending=False)
sentsdf_top.head()

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
4,"For the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, and an infectious period of 17.82 ± 2.95 days.",4,6,3,4
5,"We postulate that the latent and infectious periods are disease-specifc, whereas the contact period is behavior-specifc and can vary between diferent provinces, states, or countries.",3,2,0,0
3,Here we integrate a global network model with a local epidemic SEIR model to quantify the outbreak dynamics of COVID-19 in China and the United States.,2,0,2,0
6,"For the early stages of the outbreak in the United States, in n = 50 states, we adopted the disease-specifc values from China and found a contact period of 3.38 ± 0.69 days.",2,3,3,2
8,Our results demonstrate how mathematical modeling can help estimate outbreak dynamics and provide decision guidelines for successful outbreak control.,2,1,0,0


In [458]:
'''
Displays the text surrounding the sentence with the provided index. To refer to the context, perhaps.
args: idx - index of sentence in question.
    df = Dataframe, default is sentsdf.
    pm = plus or minus for the indices of surrounding sentences.
'''
def displaySurroundingText(idx, df = sentsdf, pm = 1):
    n = len(df)
    if (idx-pm) < 0 and (idx+pm) > n:
        display(df)
        return
    elif (idx-pm) < 0:
        display(df.loc[0:idx+pm,:])
        return
    elif (idx-pm) > n:
        display(df.loc[idx-pm:n,:])
        return
    display(df.loc[idx-pm:idx+pm,:])
    return

from collections import Counter

'''
Print entity labels and their occurences within a given sentence.
args: sentence - The sentence in question.
    from_keywords - Whether or not to only include info on labels in the contextual keywords. Does not by default.
return: list containing tuples of entity labels and their occurences.
'''
def printEntityLabels(sentence, from_keywords = False):
    sentence_obj = nlp(sentence)
    ent_labels = [ent.label_ for ent in sentence_obj.ents if (not from_keywords or ent.label_ in contextual_keywords)]
    labels = Counter(ent_labels).keys()
    counts = Counter(ent_labels).values()
    return list(zip(labels, counts))

'''
Prints all sentences from Dataframe with provided keywords.
args: filter_words: Words to filter results by. By default, the trait keywords.
    df - The Dataframe to filter. By default, sentsdf.
return: Returns Dataframe with only sentences including keywords from the list.
'''
def sentencesWith(filter_words=trait_keywords, df=sentsdf_top):
    if isinstance(filter_words, str):
        filter_regex = filter_words
    elif isinstance(filter_words, list):
        filter_regex = '|'.join(filter_words)
    return df[df.Sentence.str.lower().str.contains(filter_regex)]

'''
Calculates the relevance of a given dataframe, based on matches in the trait keyword  list. 
Used to determine relevance of the entire article.
args: df - The dataframe. By default, sentsdf.
return: The numeric approximation of the relevance of the provided dataframe.
'''
def calculateRelevance(df = sentsdf_top.head()):
    return sum(df.SKC) / len(df) * 10.000

In [459]:
calculateRelevance()

26.0

In [460]:
df1 = sentencesWith()
df1.loc[df1.Cardinality > 0]

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
4,"For the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, and an infectious period of 17.82 ± 2.95 days.",4,6,3,4
6,"For the early stages of the outbreak in the United States, in n = 50 states, we adopted the disease-specifc values from China and found a contact period of 3.38 ± 0.69 days.",2,3,3,2
7,"Our network model predicts that—without the massive political mitigation strategies that are in place today— the United States would have faced a basic reproduction number of 5.30 ± 0.95 and a nationwide peak of the outbreak on May 10, 2020 with 3 million infections.",1,2,3,2


In [461]:
import numpy as np

f_out = open("testOutput.txt", "w")

dict_map = {
    "infectious":"infectious", 
    "contact":"contact",
    "latency":"latency",
    "latent":"latency",
    "reproduction":"reproduction",
    "incubation":"incubation",
    "r0":"reproduction",
}
statsdf = pd.DataFrame([],columns=['Parameter','Estimates','n','Citation','Rule'])
print(statsdf)

from spacy.matcher import Matcher
bigMatcher = Matcher(nlp.vocab)
smallMatcher = Matcher(nlp.vocab)
smallMatcher.add("n_rule1",None,[{'TEXT':'n'},{'TEXT':'='},{'LIKE_NUM':True}])
idx = None     

def find_n(doc):
    n_match = smallMatcher(doc)
    if n_match:
        match_id, start, end = n_match[0]
        return int(doc[end-1].text)
    return np.NaN
    
def funnel_values(estimates, parameter, rule_name, doc):
    global statsdf
    
    n = find_n(doc) #could make this faster
    # Parameter IS in dictionary
    if parameter:
        # Checking if estimates already included, in which case doesn't add a new row
        if not ((statsdf['Parameter'] == parameter) &\
             (statsdf['Estimates'] == estimates)).any():
            new_row = {'Parameter':parameter,\
                       'Estimates':estimates,\
                       'n':n,\
                       'Citation':idx,\
                       'Rule':rule_name}
            statsdf = statsdf.append(new_row,ignore_index=True)
         
    # Parameter is NOT in dictionary
    elif idx not in list(statsdf.Citation) and\
        estimates not in list(statsdf.Estimates.loc[statsdf.Citation == idx]):
        statsdf.loc[len(statsdf)] = [parameter, estimates, n, idx, rule_name]
        
def pm_map(matcher, doc, id, matches):
    for match_id, start, end in matches:
        string = str(doc[start:end])
        split_span = string.split()
        
        avg = round(float(split_span[-3]),2)
        moe = round(float(split_span[-1]),2)
        
        estimates = (avg-moe,avg+moe)
        parameter = dict_map.get(split_span[0])
        
        funnel_values(estimates, parameter, "pm_map", doc)

        
def bw_map(matcher, doc, id, matches):
    for match_id, start, end in matches:  
        lower = round(float(doc[start].text),2)
        upper = round(float(doc[end-1].text),2)
        
        estimates = (lower,upper)
        parameter = dict_map.get(str(doc[0]))
        #f_out.write(doc.text + "(SRC: BW_MAP)")
        
        funnel_values(estimates, parameter, "bw_map", doc)

# days_map finds matches for "# days", then searches the rest of the sentences' tokens for specific keywords to
# store as the corresponding parameter.
def days_map(matcher, doc, id, matches):
    for match_id, start, end in matches: 
        est_token = doc[end-2]
        estimate = est_token.text
        parameter = "None"
        for token in doc:
            closestIdx = abs(0 - est_token.i)
            # specReg is the regex for specific keywords
            if re.search(specReg, token.text.lower()):
                proximity_to_est = abs(token.i - est_token.i)
                # find the closest keyword to our estimate, as long as its within an arbitrary range (i said 15)
                if (proximity_to_est < closestIdx) & (proximity_to_est < 15):
                    closestIdx = token.i
                    parameter = token.text.lower()
        funnel_values(estimate, parameter, "days_map", doc)
        
# and_map works like days map, but for ranges specified with "and"
def and_map(matcher, doc, id, matches):
    for match_id, start, end in matches:
        lower_est_token = doc[start]
        lower = round(float(doc[start].text),2)
        upper = round(float(doc[end-1].text),2)
        
        estimates = (lower,upper)
        parameter = "None"
        for token in doc:
            closestIdx = abs(0 - lower_est_token.i)
            if re.search(specReg, token.text.lower()):
                proximity_to_est = abs(token.i - lower_est_token.i)
                if (proximity_to_est < closestIdx) & (proximity_to_est < 15):
                    closestIdx = token.i
                    parameter = token.text.lower()
        f_out.write(parameter)
        funnel_values(estimates, dict_map.get(parameter), "and_map", doc)

Empty DataFrame
Columns: [Parameter, Estimates, n, Citation, Rule]
Index: []


In [462]:
pm_rule = [{"IS_ALPHA":True},{"IS_ALPHA":True},{"IS_ALPHA":True},\
           {"LIKE_NUM":True}, {"TEXT":"±"}, {"LIKE_NUM":True}]
bw_rule = [{"LIKE_NUM":True}, {"TEXT":"–"}, {"LIKE_NUM":True}]
days_rule = [{"IS_ALPHA":True}, {"IS_ALPHA":True}, {"LIKE_NUM":True}, {"TEXT":"days"}]
and_rule = [{"LIKE_NUM":True}, {"TEXT":"and"}, {"LIKE_NUM":True}]

bigMatcher.add("pm_rule", pm_map, pm_rule)
bigMatcher.add("bw_rule", bw_map, bw_rule)
bigMatcher.add("and_rule", and_map, and_rule)
bigMatcher.add("days_rule", days_map, days_rule)

In [463]:
statsdf = statsdf.iloc[0:0]

sents_filt = sentencesWith(specific_keywords)
for idx in sents_filt.index:
    sentence = sents_filt.Sentence[idx]
    doc = nlp(sentence)
    matches = bigMatcher(doc)

In [464]:
# Order the Parameters column variables so that "None" is ranked lowest (for readability)
from pandas.api.types import CategoricalDtype


statsparams = list(statsdf.Parameter)
# If our set of parameters has no duplicates (does not apply to sample abstract, which stores values across countries)
# We probably should figure out how to separate the data from a paper if it deals with multiple countries?
if len(statsparams) == len(set(statsparams)):
    if ("None" in statsparams):
        statsparams.append(statsparams.pop(statsparams.index("None")))
    statsdf["Parameter"].astype(CategoricalDtype(categories=statsparams, ordered=True))
    # Sorted stats dataframe:
    statsdf = statsdf.sort_values('Parameter', ascending=False)

statsdf

Unnamed: 0,Parameter,Estimates,n,Citation,Rule
0,latency,"(1.84, 3.2800000000000002)",30.0,4,pm_map
1,contact,"(1.15, 1.79)",30.0,4,pm_map
2,infectious,"(14.870000000000001, 20.77)",30.0,4,pm_map
3,contact,"(2.69, 4.07)",50.0,6,pm_map
4,reproduction,"(4.35, 6.25)",,7,pm_map
