In [1]:
import spacy
from spacy.matcher import Matcher
import numpy as np
from spacy import displacy                                                 
import pandas as pd
pd.set_option('display.max_colwidth', -1)
import re

  


In [2]:
# Loads NLP English model
nlp = spacy.load('en')

In [3]:
# Reads text
file = open('random_full.txt','r')
text = file.read()
text

"Abstract\nInfluenza infection natural history is often described as a progression through four successive stages: Susceptible–Exposed/Latent–Infectious–Removed (SEIR). The duration of each stage determines the average generation time, the time between infection of a case and infection of his/her infector.\n\nRecently, several authors have justified somewhat arbitrary choices in stage durations by how close the resulting generation time distribution was to viral excretion over time after infection. Taking this reasoning one step further, we propose that the viral excretion profile over time can be used directly to estimate the required parameters in an SEIR model. In our approach, the latency and infectious period distributions are estimated by minimizing the Kullback–Leibler divergence between the model-based generation time probability density function and the normalized average viral excretion profile.\n\nFollowing this approach, we estimated that the latency and infectious period l

In [4]:
# Convert text into NLP object
textdoc = nlp(text)

In [5]:
# Named Entity Recognition
#displacy.serve(doc, style='ent')

In [6]:
sents = list(str(sent) for sent in list(textdoc.sents))
sents

['Abstract\nInfluenza infection natural history is often described as a progression through four successive stages: Susceptible–Exposed/Latent–Infectious–Removed (SEIR).',
 'The duration of each stage determines the average generation time, the time between infection of a case and infection of his/her infector.\n\n',
 'Recently, several authors have justified somewhat arbitrary choices in stage durations by how close the resulting generation time distribution was to viral excretion over time after infection.',
 'Taking this reasoning one step further, we propose that the viral excretion profile over time can be used directly to estimate the required parameters in an SEIR model.',
 'In our approach, the latency and infectious period distributions are estimated by minimizing the Kullback–Leibler divergence between the model-based generation time probability density function and the normalized average viral excretion profile.\n\n',
 'Following this approach, we estimated that the latency 

In [7]:
# Put all sentences in the dataframe
sentsdf = pd.DataFrame(sents, columns = ['Sentence'])
sentsdf

Unnamed: 0,Sentence
0,Abstract\nInfluenza infection natural history is often described as a progression through four successive stages: Susceptible–Exposed/Latent–Infectious–Removed (SEIR).
1,"The duration of each stage determines the average generation time, the time between infection of a case and infection of his/her infector.\n\n"
2,"Recently, several authors have justified somewhat arbitrary choices in stage durations by how close the resulting generation time distribution was to viral excretion over time after infection."
3,"Taking this reasoning one step further, we propose that the viral excretion profile over time can be used directly to estimate the required parameters in an SEIR model."
4,"In our approach, the latency and infectious period distributions are estimated by minimizing the Kullback–Leibler divergence between the model-based generation time probability density function and the normalized average viral excretion profile.\n\n"
...,...
267,"If reliable data on the dynamics of contacts after infection were available, these data could, together with data on viral excretion profiles, lead to a more accurate description of the generation time distribution."
268,Our method could then be applied to estimate the corresponding durations of latency and infectious periods.\n\n
269,"Finally, a natural continuation of this work, which would require individual data on viral excretion profiles after infection, would be to compare the stochastic SEIR model, calibrated using our method, to the stochastic model in which each individual is assumed infectious proportionally to his/her viral load."
270,"While, as we have shown, the dynamics of the deterministic models are the same, the dynamics of the stochastic versions should be different."


In [8]:
# Listing keywords
general_numerical_keywords = ['time','number*','ratio','proportion','period','±','total*','estimate*','%']
specific_numerical_keywords = ['infections','death*','transmis*','laten*','contact','infectious','incubat*','casualties','mortal*','morbid*','outbreak*']
contextual_keywords = ['GPE','DATE','TIME','PRODUCT']

# Creating a regular expression using keywords for searching and filtering 
trait_keywords = general_numerical_keywords + specific_numerical_keywords
trait_keywords_regex = '|'.join(trait_keywords)
trait_keywords_regex

'time|number*|ratio|proportion|period|±|total*|estimate*|%|infections|death*|transmis*|laten*|contact|infectious|incubat*|casualties|mortal*|morbid*|outbreak*'

In [9]:
'''
Parses a sentence, looking for trait-related keywords.

args: sentence - string of sentence.
return: trait_keyword_match_count - the number how many trait-related keywords the sentence contains.
'''
def countTraitKeywords(sentence):
    sentence_obj = nlp(sentence)
    trait_keyword_match_count = len(re.findall(trait_keywords_regex, str(sentence_obj)))
    return trait_keyword_match_count

'''
Parses a sentence, looking for context-related keywords.

args: sentence - string of sentence.
return: trait_keyword_match_count - the number how many context-related keywords the sentence contains.
'''
def countContextualKeywords(sentence):
    sentence_obj = nlp(sentence)
    ent_labels = [ent.label_ for ent in sentence_obj.ents]
    contextual_keyword_match_count = len([label for label in ent_labels if label in contextual_keywords])
    return contextual_keyword_match_count

'''
Parses a sentence, counting occurences of cardinal elements.

args: sentence - string of sentence.
return: numericness - how many numbers the sentence contains.
'''
def findCardinality(sentence):
    sentence_obj = nlp(sentence)
    ent_labels = [ent.label_ for ent in sentence_obj.ents]
    cardinality = len([label for label in ent_labels if label == 'CARDINAL'])
    return cardinality

In [10]:
sentsdf['TKMC'] = sentsdf.Sentence.map(countTraitKeywords)
sentsdf['CKMC'] = sentsdf.Sentence.map(countContextualKeywords)
sentsdf['SKMC'] = sentsdf['TKMC'] + sentsdf['CKMC']
sentsdf['Cardinality'] = sentsdf.Sentence.map(findCardinality)

# My attempt at guessing how many 'numeric' each sentence is
sentsdf['Numericness'] = sentsdf.TKMC * sentsdf.Cardinality

In [11]:
sentsdf

Unnamed: 0,Sentence,TKMC,CKMC,SKMC,Cardinality,Numericness
0,Abstract\nInfluenza infection natural history is often described as a progression through four successive stages: Susceptible–Exposed/Latent–Infectious–Removed (SEIR).,0,0,0,1,0
1,"The duration of each stage determines the average generation time, the time between infection of a case and infection of his/her infector.\n\n",4,0,4,0,0
2,"Recently, several authors have justified somewhat arbitrary choices in stage durations by how close the resulting generation time distribution was to viral excretion over time after infection.",4,0,4,0,0
3,"Taking this reasoning one step further, we propose that the viral excretion profile over time can be used directly to estimate the required parameters in an SEIR model.",2,0,2,1,2
4,"In our approach, the latency and infectious period distributions are estimated by minimizing the Kullback–Leibler divergence between the model-based generation time probability density function and the normalized average viral excretion profile.\n\n",6,0,6,0,0
...,...,...,...,...,...,...
267,"If reliable data on the dynamics of contacts after infection were available, these data could, together with data on viral excretion profiles, lead to a more accurate description of the generation time distribution.",3,0,3,0,0
268,Our method could then be applied to estimate the corresponding durations of latency and infectious periods.\n\n,5,0,5,0,0
269,"Finally, a natural continuation of this work, which would require individual data on viral excretion profiles after infection, would be to compare the stochastic SEIR model, calibrated using our method, to the stochastic model in which each individual is assumed infectious proportionally to his/her viral load.",2,0,2,0,0
270,"While, as we have shown, the dynamics of the deterministic models are the same, the dynamics of the stochastic versions should be different.",0,0,0,0,0


In [12]:
# An example of filtering
sentsdf_filt = sentsdf.loc[(sentsdf.SKMC >= 3) & (sentsdf.Cardinality >=2)]
print(len(sentsdf_filt))
sentsdf_filt

5


Unnamed: 0,Sentence,TKMC,CKMC,SKMC,Cardinality,Numericness
33,"For example, recent publications used a range of mean latency period from 0.64 (Fraser et al., 2009) to 3.0 (Pourbohloul et al., 2009) days and from 1.27 (Fraser et al., 2009) to 8.0 (",2,4,6,4,8
132,"To estimate the variability of parameter estimates, we used a leave one out (jackknife) approach (Shao, 2003): we obtained 12 different average viral excretion profiles by systematically omitting one of the 12 studies (and thereby removing one of the 12 average profiles of viral excretion) considered by Carrat et al.",2,1,3,5,10
138,We considered values of the average infectious period between 0.25 and 5.75 days (step 0.25) and values of the average latency period between 0.25 and 3.25 days (step 0.25).,4,2,6,2,8
172,"0.26 ± 0.08 days and the infectious period distribution average 0.99 ± 0.25 days and sd 0.96 ± 0.15 days, as shown in Fig.",5,2,7,3,15
174,"The infectious period is typically short, with 95% (respectively 99%) of cases having an infectious period shorter than 2.90 ± 0.55 days (respectively 4.41 ± 0.63 days).\n\n\n",8,1,9,2,16


In [13]:
# Sentences with most relevance
sentsdf_top = sentsdf.sort_values(by=['TKMC'],ascending=False)
sentsdf_top.head()

Unnamed: 0,Sentence,TKMC,CKMC,SKMC,Cardinality,Numericness
163,"Using the estimated infectious period distribution, we computed the proportion of secondary infections which could be avoided if all symptomatic cases were isolated (or treated with a treatment reducing infectivity by 100%) at time σ after onset of symptoms (assumed equal to the start of infectious period), and for a duration τ.\n\n",11,0,11,0,0
221,"(2010), the only other study where a realistic generation time distribution was used and latency and infectious period were estimated: their estimated average latency period was 1.40 days (versus 0.99 in our study), with a 95% probability interval of [0.19–3.90]",10,2,12,1,10
38,"Indeed, detailed studies of influenza transmission are scarce; direct observation of the duration of latency or infectious period is impossible and nontrivial statistical analyses are necessary to estimate the time course of infectivity from observed chains of transmission.",8,0,8,0,0
245,"Based on our new estimate of the infectious period distribution, and under the optimistic but commonly adopted scenario where symptoms onset coincides with the beginning of the infectious period, in order to achieve a 50% reduction in the number of secondary cases, it is necessary to isolate or treat index cases at most within 16 h after symptoms onset.",8,0,8,1,8
174,"The infectious period is typically short, with 95% (respectively 99%) of cases having an infectious period shorter than 2.90 ± 0.55 days (respectively 4.41 ± 0.63 days).\n\n\n",8,1,9,2,16


In [14]:
'''
Displays the text surrounding the sentence with the provided index. To refer to the context, perhaps.
args: idx - index of sentence in question.
    df = Dataframe, default is sentsdf.
    pm = plus or minus for the indices of surrounding sentences.
'''
def displaySurroundingText(idx, df = sentsdf, pm = 1):
    n = len(df)
    if (idx-pm) < 0 and (idx+pm) > n:
        display(df)
        return
    elif (idx-pm) < 0:
        display(df.loc[0:idx+pm,:])
        return
    elif (idx-pm) > n:
        display(df.loc[idx-pm:n,:])
        return
    display(df.loc[idx-pm:idx+pm,:])
    return

from collections import Counter

'''
Print entity labels and their occurences within a given sentence.
args: sentence - The sentence in question.
    from_keywords - Whether or not to only include info on labels in the contextual keywords. Does not by default.
return: list containing tuples of entity labels and their occurences.
'''
def printEntityLabels(sentence, from_keywords = False):
    sentence_obj = nlp(sentence)
    ent_labels = [ent.label_ for ent in sentence_obj.ents if (not from_keywords or ent.label_ in contextual_keywords)]
    labels = Counter(ent_labels).keys()
    counts = Counter(ent_labels).values()
    return list(zip(labels, counts))

'''
Prints all sentences from Dataframe with provided keywords.
args: filter_words: Words to filter results by. By default, the trait keywords.
    df - The Dataframe to filter. By default, sentsdf.
return: Returns Dataframe with only sentences including keywords from the list.
'''
def sentencesWith(filter_words=trait_keywords, df=sentsdf_top):
    if isinstance(filter_words, str):
        filter_regex = filter_words
    elif isinstance(filter_words, list):
        filter_regex = '|'.join(filter_words)
    return df[df.Sentence.str.lower().str.contains(filter_regex)]

'''
Calculates the relevance of a given dataframe, based on matches in the trait keyword  list. 
Used to determine relevance of the entire article.
args: df - The dataframe. By default, sentsdf.
return: The numeric approximation of the relevance of the provided dataframe.
'''
def calculateRelevance(df = sentsdf_top.head()):
    return sum(df.TKMC) / len(df) * 10.000

In [15]:
calculateRelevance()

90.0

In [16]:
df1 = sentencesWith()
df1.loc[df1.Cardinality > 0]

Unnamed: 0,Sentence,TKMC,CKMC,SKMC,Cardinality,Numericness
221,"(2010), the only other study where a realistic generation time distribution was used and latency and infectious period were estimated: their estimated average latency period was 1.40 days (versus 0.99 in our study), with a 95% probability interval of [0.19–3.90]",10,2,12,1,10
245,"Based on our new estimate of the infectious period distribution, and under the optimistic but commonly adopted scenario where symptoms onset coincides with the beginning of the infectious period, in order to achieve a 50% reduction in the number of secondary cases, it is necessary to isolate or treat index cases at most within 16 h after symptoms onset.",8,0,8,1,8
174,"The infectious period is typically short, with 95% (respectively 99%) of cases having an infectious period shorter than 2.90 ± 0.55 days (respectively 4.41 ± 0.63 days).\n\n\n",8,1,9,2,16
207,"Using time profiles of viral excretion in subjects experimentally infected with influenza, we have estimated that the mean latency and infectious period for influenza are 1.6 (95%CI 1.5–1.7) and 1.0 days (95%CI 0.5–1.7).",7,2,9,1,7
77,"The natural history of influenza described by these two models will be different at the individual level, as the first assumes a constant hazard of transmission during a defined infectious period, while the other proposes time dependent hazard of transmission related to viral excretion.",6,0,6,1,6
105,"A common description of transmission models is that infected individuals make infectious contacts in time according to a time inhomogeneous Poisson process with intensity λk(t), where λ is a random amount of infectivity, and k(t)",5,1,6,1,5
217,"1 shows that the average duration of the latency period found in our study compares with that generally used for modeling influenza, although our distribution of the latency period is narrower than most distributions used in the literature.",5,0,5,1,5
172,"0.26 ± 0.08 days and the infectious period distribution average 0.99 ± 0.25 days and sd 0.96 ± 0.15 days, as shown in Fig.",5,2,7,3,15
8,"We estimate that, under a best-case scenario where symptoms appear at the end of the latency period, index cases must be isolated or treated at most within 16 h after symptoms onset to avoid 50% of secondary cases.\n\n",5,0,5,1,5
119,"From those equations, it is clear that two epidemics in the same population, with the same reproduction number, the same generation time distribution, and the same initial conditions, will have exactly the same dynamics of incidence and number of susceptibles.\n\n",4,0,4,1,4


In [48]:
dict_map = {
    "infectious":"infectious", 
    "contact":"contact",
    "latency":"latency",
    "latent":"latency",
    "reproduction":"reproduction"
}
stats_df = pd.DataFrame(index = set(dict_map.values()), columns=['Estimates','Citation','Rule'])
stats_df.index.name = 'Parameter'
stats_df

Unnamed: 0_level_0,Estimates,Citation,Rule
Parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
reproduction,,,
contact,,,
latency,,,
infectious,,,


In [45]:
matcher = Matcher(nlp.vocab)
idx = None
        

def funnel_values(estimates, parameter, rule_name):
    global stats_df
    if parameter:
        stats_df.at[parameter,'Estimates'] = estimates
        stats_df.at[parameter,'Citation'] = idx
        stats_df.at[parameter,'Rule'] = rule_name
    elif idx not in list(stats_df.Citation) and\
        estimates not in list(stats_df.Estimates.loc[stats_df.Citation == idx]) :
        stats_df.loc[len(stats_df)] = [estimates, idx, rule_name]

def pm_map(matcher, doc, id, matches):
    for match_id, start, end in matches:
        string = str(doc[start:end])
        split_span = string.split()
        
        avg = round(float(split_span[-3]),2)
        moe = round(float(split_span[-1]),2)
        
        estimates = (avg-moe,avg+moe)
        parameter = dict_map.get(split_span[0])
        
        funnel_values(estimates, parameter, "pm_map")

        
def bw_map(matcher, doc, id, matches):
    for match_id, start, end in matches:
        string = str(doc[start:end])
        split_span = string.split('–')
        
        lower = round(float(split_span[0]),2)
        upper = round(float(split_span[1]),2)
        
        estimates = (lower,upper)
        parameter = dict_map.get(split_span[0])
        
        funnel_values(estimates, parameter, "bw_map")

In [46]:
pm_rule = [{"LIKE_NUM":True}, {"TEXT":"±"}, {"LIKE_NUM":True}]
bw_rule = [{"LIKE_NUM":True}, {"TEXT":"–"}, {"LIKE_NUM":True}]

matcher.add("pm_rule", pm_map, pm_rule)
matcher.add("bw_rule", bw_map, bw_rule)

In [47]:
stats_df = pd.DataFrame(index = set(dict_map.values()), columns=['Estimates','Citation','Rule'])
stats_df.index.name = 'Parameter'
stats_df

sents_filt = sentencesWith(specific_numerical_keywords)
for idx in sents_filt.index:
    sentence = sents_filt.Sentence[idx]
    sentence = sentence.replace("–"," – ")
    doc = nlp(sentence)
    matches = matcher(doc)
stats_df

Unnamed: 0_level_0,Estimates,Citation,Rule
Parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
reproduction,,,
contact,,,
latency,,,
infectious,,,
4,"(0.19, 3.9)",221.0,bw_map
5,"(2.3499999999999996, 3.45)",174.0,pm_map
6,"(1.5, 1.7)",207.0,bw_map
7,"(0.18, 0.34)",172.0,pm_map
8,"(1.5699999999999998, 1.69)",171.0,pm_map


In [50]:
displaySurroundingText(207)

Unnamed: 0,Sentence,TKMC,CKMC,SKMC,Cardinality,Numericness
206,"Here, we proposed that viral excretion profile can be linked to the generation time distribution in the standard SEIR model, and we estimated parameters under this assumption.\n\n",3,0,3,0,0
207,"Using time profiles of viral excretion in subjects experimentally infected with influenza, we have estimated that the mean latency and infectious period for influenza are 1.6 (95%CI 1.5–1.7) and 1.0 days (95%CI 0.5–1.7).",7,2,9,1,7
208,"In particular, we found that 95% of cases were infectious for less than 2.9 days.",2,1,3,0,0


In [41]:
senten

Unnamed: 0,Sentence,TKMC,CKMC,SKMC,Cardinality,Numericness
233,(2005) compared infectivity profiles with viral excretion on the natural scale.,0,1,1,0,0
234,"Quantitatively, we found that the difference was not negligible: the mean GT was 3.5 days (Jackknife 95% CI 3.2–3.9 days) using the normalized viral excretion on the log scale, but only 2.6 days (Jackknife 95% CI 2.4–2.8 days) using the natural scale.",2,4,6,0,0
235,Although the former option leads to a generation interval in agreement with data reported in a recent study where the average GT was 3.6 days (95% CI 2.9–4.3 days),2,2,4,0,0
