In [46]:
import spacy
from spacy.matcher import Matcher
import numpy as np
from spacy import displacy                                                 
import pandas as pd
pd.set_option('display.max_colwidth', None)
import re

In [47]:
# Loads NLP English model
nlp = spacy.load('en')

In [49]:
# Reads text
file = open('abstract_example','r')
text = file.read()
text

'"On March 11, 2020, the World Health Organization declared the coronavirus disease 2019, COVID-19, a global pandemic. In \\\nan unprecedented collective efort, massive amounts of data are now being collected worldwide to estimate the immediate and \\\nlong-term impact of this pandemic on the health system and the global economy. However, the precise timeline of the disease, \\\nits transmissibility, and the efect of mitigation strategies remain incompletely understood. Here we integrate a global network \\\nmodel with a local epidemic SEIR model to quantify the outbreak dynamics of COVID-19 in China and the United States. For \\\nthe outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, \\\nand an infectious period of 17.82 ± 2.95 days. We postulate that the latent and infectious periods are disease-specifc, whereas \\\nthe contact period is behavior-specifc and can vary between diferent provinces, states, or countr

In [57]:
# Convert text into NLP object
textdoc = nlp(text)

In [58]:
# Named Entity Recognition
displacy.render(textdoc, style='ent')

In [59]:
sents = list(str(sent) for sent in list(textdoc.sents))
sents[:5]

['"On March 11, 2020, the World Health Organization declared the coronavirus disease 2019, COVID-19, a global pandemic.',
 'In \\\nan unprecedented collective efort, massive amounts of data are now being collected worldwide to estimate the immediate and \\\nlong-term impact of this pandemic on the health system and the global economy.',
 'However, the precise timeline of the disease, \\\n',
 'its transmissibility, and the efect of mitigation strategies remain incompletely understood.',
 'Here we integrate a global network \\\nmodel with a local epidemic SEIR model to quantify the outbreak dynamics of COVID-19 in China and the United States.']

In [60]:
# Put all sentences in the dataframe
sentsdf = pd.DataFrame(sents, columns = ['Sentence'])
sentsdf

Unnamed: 0,Sentence
0,"""On March 11, 2020, the World Health Organization declared the coronavirus disease 2019, COVID-19, a global pandemic."
1,"In \\nan unprecedented collective efort, massive amounts of data are now being collected worldwide to estimate the immediate and \\nlong-term impact of this pandemic on the health system and the global economy."
2,"However, the precise timeline of the disease, \\n"
3,"its transmissibility, and the efect of mitigation strategies remain incompletely understood."
4,Here we integrate a global network \\nmodel with a local epidemic SEIR model to quantify the outbreak dynamics of COVID-19 in China and the United States.
5,"For \\nthe outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, \\n"
6,and an infectious period of 17.82 ± 2.95 days.
7,"We postulate that the latent and infectious periods are disease-specifc, whereas \\n"
8,"the contact period is behavior-specifc and can vary between diferent provinces, states, or countries."
9,For the early stages of the \\n


In [61]:
# Listing keywords
numeric_keywords = ['time','number*','ratio','proportion','period','±','total*','estimate*','%']
specific_keywords = ['infections','death*','transmis*','laten*','contact','infectious','incubat*','casualties','mortal*','morbid*','outbreak*']
contextual_keywords = ['GPE','DATE','TIME','PRODUCT']

# Creating a regular expression using keywords for searching and filtering 
numeric_regex = '|'.join(numeric_keywords)
specific_regex = '|'.join(numeric_keywords)

trait_keywords = numeric_keywords + specific_keywords
trait_regex = '|'.join(trait_keywords)
trait_regex

'time|number*|ratio|proportion|period|±|total*|estimate*|%|infections|death*|transmis*|laten*|contact|infectious|incubat*|casualties|mortal*|morbid*|outbreak*'

In [62]:
'''
Parses a sentence, looking for keywords.

args: sentence - string of sentence.
return: specific_keyword_count - total number of specific keywords in sentence
        numeric_keyword_count - total number of numeric keywords in sentence
        contextual_keyword_count - total number of contextual entity labels in sentence
        cardinality - total number of numeric entity labels in sentence
'''
def countKeywords(sentence):
    specific_keyword_count = numeric_keyword_count = contextual_keyword_count = cardinality = 0
    
    # counts specific and numeric keywords first
    for word in sentence.split():
        if re.match(specific_regex, word): specific_keyword_count += 1
        elif re.match(numeric_regex, word): numeric_keyword_count += 1
    
    # counts key entity labels
    sentence_obj = nlp(sentence)
    ent_labels = [ent.label_ for ent in sentence_obj.ents]
    for label in ent_labels:
        if label in contextual_keywords: contextual_keyword_count += 1
        elif label in ['CARDINAL']: cardinality += 1
        
    return specific_keyword_count, numeric_keyword_count, contextual_keyword_count, cardinality

In [63]:
sentsdf['SKC'],  sentsdf['NKC'], sentsdf['CKC'], sentsdf['Cardinality'] = zip(*sentsdf['Sentence'].apply(countKeywords))
sentsdf

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
0,"""On March 11, 2020, the World Health Organization declared the coronavirus disease 2019, COVID-19, a global pandemic.",0,0,2,0
1,"In \\nan unprecedented collective efort, massive amounts of data are now being collected worldwide to estimate the immediate and \\nlong-term impact of this pandemic on the health system and the global economy.",1,0,0,0
2,"However, the precise timeline of the disease, \\n",1,0,0,0
3,"its transmissibility, and the efect of mitigation strategies remain incompletely understood.",0,0,0,0
4,Here we integrate a global network \\nmodel with a local epidemic SEIR model to quantify the outbreak dynamics of COVID-19 in China and the United States.,0,0,2,0
5,"For \\nthe outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, \\n",4,0,2,3
6,and an infectious period of 17.82 ± 2.95 days.,2,0,1,1
7,"We postulate that the latent and infectious periods are disease-specifc, whereas \\n",1,0,0,0
8,"the contact period is behavior-specifc and can vary between diferent provinces, states, or countries.",1,0,0,0
9,For the early stages of the \\n,0,0,0,0


In [64]:
# An example of filtering
sentsdf_filt = sentsdf.sort_values(by="Cardinality", ascending=False)
sentsdf_filt

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
5,"For \\nthe outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, \\n",4,0,2,3
10,"outbreak in the United States, in n = 50 states, we adopted the disease-specifc values from China and found a contact period of \\n",1,0,2,1
14,"\\n10, 2020 with 3 million infections.",0,0,1,1
13,the United States would have faced a basic reproduction number of 5.30 ± 0.95 and a nationwide peak of the outbreak on May,2,0,2,1
6,and an infectious period of 17.82 ± 2.95 days.,2,0,1,1
11,3.38 ± 0.69 days.,1,0,0,1
0,"""On March 11, 2020, the World Health Organization declared the coronavirus disease 2019, COVID-19, a global pandemic.",0,0,2,0
15,Our results demonstrate how mathematical modeling can help estimate outbreak dynamics \\nand provide decision guidelines for successful outbreak control.,1,0,0,0
12,Our network model predicts that—without the massive political mitigation strategies that are in place today— \\n,0,0,1,0
8,"the contact period is behavior-specifc and can vary between diferent provinces, states, or countries.",1,0,0,0


In [65]:
# Sentences with most relevance
sentsdf_top = sentsdf.sort_values(by='SKC',ascending=False)
sentsdf_top.head()

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
5,"For \\nthe outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, \\n",4,0,2,3
16,"We anticipate that our model will become a valuable tool to \\nestimate the potential of vaccination and quantify the efect of relaxing political measures including total lockdown, shelter in \\nplace, and travel restrictions for low-risk subgroups of the population or for the population as a whole.""\n",2,0,0,0
13,the United States would have faced a basic reproduction number of 5.30 ± 0.95 and a nationwide peak of the outbreak on May,2,0,2,1
6,and an infectious period of 17.82 ± 2.95 days.,2,0,1,1
1,"In \\nan unprecedented collective efort, massive amounts of data are now being collected worldwide to estimate the immediate and \\nlong-term impact of this pandemic on the health system and the global economy.",1,0,0,0


In [66]:
'''
Displays the text surrounding the sentence with the provided index. To refer to the context, perhaps.
args: idx - index of sentence in question.
    df = Dataframe, default is sentsdf.
    pm = plus or minus for the indices of surrounding sentences.
'''
def displaySurroundingText(idx, df = sentsdf, pm = 1):
    n = len(df)
    if (idx-pm) < 0 and (idx+pm) > n:
        display(df)
        return
    elif (idx-pm) < 0:
        display(df.loc[0:idx+pm,:])
        return
    elif (idx-pm) > n:
        display(df.loc[idx-pm:n,:])
        return
    display(df.loc[idx-pm:idx+pm,:])
    return

from collections import Counter

'''
Print entity labels and their occurences within a given sentence.
args: sentence - The sentence in question.
    from_keywords - Whether or not to only include info on labels in the contextual keywords. Does not by default.
return: list containing tuples of entity labels and their occurences.
'''
def printEntityLabels(sentence, from_keywords = False):
    sentence_obj = nlp(sentence)
    ent_labels = [ent.label_ for ent in sentence_obj.ents if (not from_keywords or ent.label_ in contextual_keywords)]
    labels = Counter(ent_labels).keys()
    counts = Counter(ent_labels).values()
    return list(zip(labels, counts))

'''
Prints all sentences from Dataframe with provided keywords.
args: filter_words: Words to filter results by. By default, the trait keywords.
    df - The Dataframe to filter. By default, sentsdf.
return: Returns Dataframe with only sentences including keywords from the list.
'''
def sentencesWith(filter_words=trait_keywords, df=sentsdf_top):
    if isinstance(filter_words, str):
        filter_regex = filter_words
    elif isinstance(filter_words, list):
        filter_regex = '|'.join(filter_words)
    return df[df.Sentence.str.lower().str.contains(filter_regex)]

'''
Calculates the relevance of a given dataframe, based on matches in the trait keyword  list. 
Used to determine relevance of the entire article.
args: df - The dataframe. By default, sentsdf.
return: The numeric approximation of the relevance of the provided dataframe.
'''
def calculateRelevance(df = sentsdf_top.head()):
    return sum(df.SKC) / len(df) * 10.000

In [67]:
calculateRelevance()


22.0

In [68]:
df1 = sentencesWith()
df1.loc[df1.Cardinality > 0]

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
5,"For \\nthe outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, \\n",4,0,2,3
13,the United States would have faced a basic reproduction number of 5.30 ± 0.95 and a nationwide peak of the outbreak on May,2,0,2,1
6,and an infectious period of 17.82 ± 2.95 days.,2,0,1,1
11,3.38 ± 0.69 days.,1,0,0,1
10,"outbreak in the United States, in n = 50 states, we adopted the disease-specifc values from China and found a contact period of \\n",1,0,2,1
14,"\\n10, 2020 with 3 million infections.",0,0,1,1


In [69]:
dict_map = {
    "infectious":"infectious", 
    "contact":"contact",
    "latency":"latency",
    "latent":"latency",
    "reproduction":"reproduction"
}
stats_df = pd.DataFrame(index = set(dict_map.values()), columns=['Estimates','Citation','Rule'])
stats_df.index.name = 'Parameter'
stats_df

Unnamed: 0_level_0,Estimates,Citation,Rule
Parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
reproduction,,,
latency,,,
contact,,,
infectious,,,


In [70]:
matcher = Matcher(nlp.vocab)
idx = None
        

def funnel_values(estimates, parameter, rule_name):
    global stats_df
    if parameter:
        stats_df.at[parameter,'Estimates'] = estimates
        stats_df.at[parameter,'Citation'] = idx
        stats_df.at[parameter,'Rule'] = rule_name
    elif idx not in list(stats_df.Citation) and\
        estimates not in list(stats_df.Estimates.loc[stats_df.Citation == idx]) :
        stats_df.loc[len(stats_df)] = [estimates, idx, rule_name]

def pm_map(matcher, doc, id, matches):
    for match_id, start, end in matches:
        string = str(doc[start:end])
        split_span = string.split()
        
        avg = round(float(split_span[-3]),2)
        moe = round(float(split_span[-1]),2)
        
        estimates = (avg-moe,avg+moe)
        parameter = dict_map.get(split_span[0])
        
        funnel_values(estimates, parameter, "pm_map")

        
def bw_map(matcher, doc, id, matches):
    for match_id, start, end in matches:
        string = str(doc[start:end])
        split_span = string.split('–')
        
        lower = round(float(split_span[0]),2)
        upper = round(float(split_span[1]),2)
        
        estimates = (lower,upper)
        parameter = dict_map.get(split_span[0])
        
        funnel_values(estimates, parameter, "bw_map")

In [71]:
pm_rule = [{"LIKE_NUM":True}, {"TEXT":"±"}, {"LIKE_NUM":True}]
bw_rule = [{"LIKE_NUM":True}, {"TEXT":"–"}, {"LIKE_NUM":True}]

matcher.add("pm_rule", pm_map, pm_rule)
matcher.add("bw_rule", bw_map, bw_rule)

In [72]:
stats_df = pd.DataFrame(index = set(dict_map.values()), columns=['Estimates','Citation','Rule'])
stats_df.index.name = 'Parameter'
stats_df

sents_filt = sentencesWith(trait_keywords)
for idx in sents_filt.index:
    sentence = sents_filt.Sentence[idx]
    sentence = sentence.replace("–"," – ")
    doc = nlp(sentence)
    matches = matcher(doc)
stats_df

Unnamed: 0_level_0,Estimates,Citation,Rule
Parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
reproduction,,,
latency,,,
contact,,,
infectious,,,
4,"(1.84, 3.2800000000000002)",5.0,pm_map
5,"(4.35, 6.25)",13.0,pm_map
6,"(14.870000000000001, 20.77)",6.0,pm_map
7,"(2.69, 4.07)",11.0,pm_map


In [43]:
if re.match(trait_regex,"tim"): print(1)

In [44]:
x = nlp('US is Canada')
x

US is Canada

In [45]:
x.ents

(US, Canada)