In [20]:
import spacy    
import unicodedata
import re

In [21]:
# Loads NLP English model
nlp = spacy.load('en')

In [22]:
oliver_testpaper = 'data/texts/ebola/10.1007/s00705-020-04768-3.txt'

# Reads text
if 1:
    with open(oliver_testpaper, 'r') as file:
        text = unicodedata.normalize("NFKD", file.read().replace('\n', ' ')) # replaces \xa0 with " "
        text = re.sub(r'(?<=[.])(?=[^\s])', r' ', text) # spaces out concatenated sentences
else:
    text = "On March 11, 2020, the World Health Organization declared the coronavirus disease 2019, COVID-19, a global pandemic. In \
an unprecedented collective efort, massive amounts of data are now being collected worldwide to estimate the immediate and \
long-term impact of this pandemic on the health system and the global economy. However, the precise timeline of the disease, \
its transmissibility, and the efect of mitigation strategies remain incompletely understood. Here we integrate a global network \
model with a local epidemic SEIR model to quantify the outbreak dynamics of COVID-19 in China and the United States. For \
the outbreak in China, in n = 30 provinces, we found a latent period of 2.56 ± 0.72 days, a contact period of 1.47 ± 0.32 days, \
and an infectious period of 17.82 ± 2.95 days. We postulate that the latent and infectious periods are disease-specifc, whereas \
the contact period is behavior-specifc and can vary between diferent provinces, states, or countries. For the early stages of the \
outbreak in the United States, in n = 50 states, we adopted the disease-specifc values from China and found a contact period of \
3.38 ± 0.69 days. Our network model predicts that—without the massive political mitigation strategies that are in place today— \
the United States would have faced a basic reproduction number of 5.30 ± 0.95 and a nationwide peak of the outbreak on May \
10, 2020 with 3 million infections. Our results demonstrate how mathematical modeling can help estimate outbreak dynamics \
and provide decision guidelines for successful outbreak control. We anticipate that our model will become a valuable tool to \
estimate the potential of vaccination and quantify the efect of relaxing political measures including total lockdown, shelter in \
place, and travel restrictions for low-risk subgroups of the population or for the population as a whole."

In [23]:
# Convert text into NLP object
textdoc = nlp(text)

In [24]:
# Named Entity Recognition
from spacy import displacy 
displacy.render(textdoc, style='ent')

In [25]:
sents = list(str(sent) for sent in list(textdoc.sents))
sents[:5]

['Potential strategies for combating COVID-19  Abstract Coronavirus disease 2019, also known as COVID-19, is caused by a novel coronavirus named severe acute respiratory syndrome coronavirus 2, or SARS-CoV-2.',
 'The infection has now catapulted into a full-blown pandemic across the world, which has affected more than 2 million people and has led to approximately 150,000 fatalities all over the world (WHO).',
 'In this review, we elaborate all currently available data that shed light on possible methods for treatment of COVID-19, such as antiviral drugs, corticosteroids, convalescent plasma, and potentially effective vaccines.',
 'Additionally, ongoing and discontinued clinical trials that have been carried out for validating probable treatments for COVID-19 are discussed.',
 'The review also elaborates the prospective approach and the possible advantages of using convalescent plasma and stem cells for the improvement of clinical symptoms and meeting the demand for an instantaneous cur

In [26]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

# Put all sentences in the dataframe
sentsdf = pd.DataFrame(sents, columns = ['Sentence'])
sentsdf

Unnamed: 0,Sentence
0,"Potential strategies for combating COVID-19 Abstract Coronavirus disease 2019, also known as COVID-19, is caused by a novel coronavirus named severe acute respiratory syndrome coronavirus 2, or SARS-CoV-2."
1,"The infection has now catapulted into a full-blown pandemic across the world, which has affected more than 2 million people and has led to approximately 150,000 fatalities all over the world (WHO)."
2,"In this review, we elaborate all currently available data that shed light on possible methods for treatment of COVID-19, such as antiviral drugs, corticosteroids, convalescent plasma, and potentially effective vaccines."
3,"Additionally, ongoing and discontinued clinical trials that have been carried out for validating probable treatments for COVID-19 are discussed."
4,The review also elaborates the prospective approach and the possible advantages of using convalescent plasma and stem cells for the improvement of clinical symptoms and meeting the demand for an instantaneous cure.
...,...
477,"Although extensive research is being conducted to develop a suitable vaccine against COVID-19, there is a dire need to shift a major part of the ongoing research towards the treatment of pneumonia in patients, which is often fatal."
478,The astounding data regarding mesenchymal stem cells offers a hopeful approach to the use of an endogenous pathway for the treatment of disease.
479,"In the case of pneumonia caused by COVID-19, the balance between the antiviral response and the regulation of LIF action against a cytokine storm may be lost, which can reduce the overall beneficial effect of therapy."
480,"Nevertheless, the study findings indicating the safe and effective use of mesenchymal stem cell therapy in treating COVID-19 are indeed remarkable and present a modern approach to the efficient and safe treatment of critically ill patients."


In [27]:
# Listing keywords
numeric_keywords = ['distribution*','time','number*','ratio','proportion','period','±','total*','estimate*','%','review','parameter*','mean','period','value']
specific_keywords = ['r0','reproduct*''infections','death*','transmis*','laten*','contact','infectious','incubat*','casualties','mortal*','morbid*','outbreak*','epideme*']
contextual_keywords = ['GPE','DATE','TIME','PRODUCT']

# Creating a regular expression using keywords for searching and filtering 
numeric_regex = '|'.join(numeric_keywords)
specific_regex = '|'.join(specific_keywords)

trait_keywords = numeric_keywords + specific_keywords
trait_regex = '|'.join(trait_keywords)
trait_regex

numReg = re.compile(numeric_regex)
specReg = re.compile(specific_regex)
traitReg = re.compile(trait_regex)

In [28]:
import re

'''
Parses a sentence, looking for keywords.

args: sentence - string of sentence.
return: specific_keyword_count - total number of specific keywords in sentence
        numeric_keyword_count - total number of numeric keywords in sentence
        contextual_keyword_count - total number of contextual entity labels in sentence
        cardinality - total number of numeric entity labels in sentence
'''
def countKeywords(sentence):
    specific_keyword_count = numeric_keyword_count = contextual_keyword_count = cardinality = 0
    
    # counts specific and numeric keywords first
    for word in sentence.split():
        if re.match(specific_regex, word): specific_keyword_count += 1
        elif re.match(numeric_regex, word): numeric_keyword_count += 1
    
    # counts key entity labels
    sentence_obj = nlp(sentence)
    ent_labels = [ent.label_ for ent in sentence_obj.ents]
    for label in ent_labels:
        if label in contextual_keywords: contextual_keyword_count += 1
        elif label in ['CARDINAL']: cardinality += 1
        
    return specific_keyword_count, numeric_keyword_count, contextual_keyword_count, cardinality

In [29]:
sentsdf['SKC'],  sentsdf['NKC'], sentsdf['CKC'], sentsdf['Cardinality'] = zip(*sentsdf['Sentence'].apply(countKeywords))
sentsdf

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
0,"Potential strategies for combating COVID-19 Abstract Coronavirus disease 2019, also known as COVID-19, is caused by a novel coronavirus named severe acute respiratory syndrome coronavirus 2, or SARS-CoV-2.",0,0,1,1
1,"The infection has now catapulted into a full-blown pandemic across the world, which has affected more than 2 million people and has led to approximately 150,000 fatalities all over the world (WHO).",0,0,0,2
2,"In this review, we elaborate all currently available data that shed light on possible methods for treatment of COVID-19, such as antiviral drugs, corticosteroids, convalescent plasma, and potentially effective vaccines.",0,1,0,0
3,"Additionally, ongoing and discontinued clinical trials that have been carried out for validating probable treatments for COVID-19 are discussed.",0,0,0,0
4,The review also elaborates the prospective approach and the possible advantages of using convalescent plasma and stem cells for the improvement of clinical symptoms and meeting the demand for an instantaneous cure.,0,1,0,0
...,...,...,...,...,...
477,"Although extensive research is being conducted to develop a suitable vaccine against COVID-19, there is a dire need to shift a major part of the ongoing research towards the treatment of pneumonia in patients, which is often fatal.",0,0,0,0
478,The astounding data regarding mesenchymal stem cells offers a hopeful approach to the use of an endogenous pathway for the treatment of disease.,0,0,0,0
479,"In the case of pneumonia caused by COVID-19, the balance between the antiviral response and the regulation of LIF action against a cytokine storm may be lost, which can reduce the overall beneficial effect of therapy.",0,0,0,0
480,"Nevertheless, the study findings indicating the safe and effective use of mesenchymal stem cell therapy in treating COVID-19 are indeed remarkable and present a modern approach to the efficient and safe treatment of critically ill patients.",0,0,0,0


In [30]:
# An example of filtering
sentsdf_filt = sentsdf.sort_values(by="Cardinality", ascending=False)
sentsdf_filt

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
349,"Although its efficiency as a treatment is still unknown, it is known to prevent the occurrence of diseaseFull size imageHistorical precedentsSerum antibodies have long been used in the treatment of many diseases caused by viruses, such as poliomyelitis [140], measles [141, 142], mumps [143], and influenza [144].",0,0,0,5
280,"The heterogeneity of observations reported, from no effect [121], to a reduction in the death rate [121, 122], to severe side effects and death [123] of ICU-admitted patients [124], prevents clear conclusions from being drawn.",2,0,0,5
187,"The reported efficacious dosage of chloroquine was more than 400 mg, and therefore, the dosage of hydroxychloroquine was set accordingly in another study [63], which found two dosages that gave a significant effect, first being an oral dose of more than 1000 mg on the first day, followed by less than 500 mg on each of the following days.",0,0,2,4
212,"[82], who showed that the administration of these two drugs failed to boost the action of INF-β against the virus, where the EC50 values showed the effect of the two drugs to be similar in the case of MERS-CoV and SARS-CoV-1.",0,1,1,3
219,"This was further substantiated by another study on more than 70 lopinavir/ritonavir-treated patients who exhibited an overall lower fatality rate than patients who did not receive the two drugs as an initial treatment, which was significant, as advantageous effects of lopinavir/ritonavir were only observed in those patients who received these drugs as an initial treatment [103].",0,0,0,3
...,...,...,...,...,...
298,"The innate antiviral response is triggered by interferon (α, β) and was investigated in SARS-CoV-2 and MERS-CoV infections.",0,0,0,0
299,"When interferon was used in combination with ribavirin, the mortality rate was not improved.",1,0,0,0
161,"If they fail to meet these criteria, suffer from organ failure, have renal and hepatic disease, or receive dialysis, they are exempted from compassionate use.",0,0,0,0
56,"The second stage comprises the communal dissemination of the virus, restricted mainly to dense neighborhoods and families.",0,0,0,0


In [31]:
# Sentences with most relevance
sentsdf_top = sentsdf.sort_values(by='SKC',ascending=False)
sentsdf_top.head()

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
48,"The characteristics of the infection and transmission are given below:In the early days of the outbreak, when cases were restricted only to China, the mean incubation period of SARS-CoV-2 was reported to be about 5 days [2].",3,2,3,1
60,The virus can spread by through direct transmission via respiratory droplets or by indirect transmission via surfaces and materials contaminated with the virus [8].,2,0,0,1
361,"In both of these viral outbreaks, high mortality and contagion of disease were observed, which made the spread hard to curb.",2,0,0,0
360,"The SARS epidemic was subdued with strict management, but the second epidemic of MERS soon spread from the Middle Eastern countries, followed by a second wave of infection in South Korea.",2,0,1,0
27,"New human CoVs arise through zoonosis, with the virus first being transmitted from animals to humans, who infect other humans via close contact.",2,0,0,0


In [39]:
'''
Displays the text surrounding the sentence with the provided index. To refer to the context, perhaps.
args: idx - index of sentence in question.
    df = Dataframe, default is sentsdf.
    pm = plus or minus for the indices of surrounding sentences.
'''
def context(idx, pm = 1, df = sentsdf):
    n = len(df)
    
    if (idx-pm) < 0 and (idx+pm) > n:
        return df
    elif (idx-pm) < 0:
        return df.loc[0:idx+pm,:]
    elif (idx-pm) > n:
        return df.loc[idx-pm:n,:]
    return df.loc[idx-pm:idx+pm,:]

from collections import Counter

'''
Print entity labels and their occurences within a given sentence.
args: sentence - The sentence in question.
    from_keywords - Whether or not to only include info on labels in the contextual keywords. Does not by default.
return: list containing tuples of entity labels and their occurences.
'''
def printEntityLabels(sentence, from_keywords = False):
    sentence_obj = nlp(sentence)
    ent_labels = [ent.label_ for ent in sentence_obj.ents if (not from_keywords or ent.label_ in contextual_keywords)]
    labels = Counter(ent_labels).keys()
    counts = Counter(ent_labels).values()
    return list(zip(labels, counts))

'''
Prints all sentences from Dataframe with provided keywords.
args: filter_words: Words to filter results by. By default, the trait keywords.
    df - The Dataframe to filter. By default, sentsdf.
return: Returns Dataframe with only sentences including keywords from the list.
'''
def sentencesWith(filter_words=trait_keywords, df=sentsdf_top):
    if isinstance(filter_words, str):
        filter_regex = filter_words
    elif isinstance(filter_words, list):
        filter_regex = '|'.join(filter_words)
    return df[df.Sentence.str.lower().str.contains(filter_regex)]

'''
Calculates the relevance of a given dataframe, based on matches in the trait keyword  list. 
Used to determine relevance of the entire article.
args: df - The dataframe. By default, sentsdf.
return: The numeric approximation of the relevance of the provided dataframe.
'''
def calculateRelevance(df = sentsdf_top.head()):
    return sum(df.SKC) / len(df) * 10.000

In [33]:
calculateRelevance()

22.0

In [34]:
df1 = sentencesWith()
df1.loc[df1.Cardinality > 0]

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
48,"The characteristics of the infection and transmission are given below:In the early days of the outbreak, when cases were restricted only to China, the mean incubation period of SARS-CoV-2 was reported to be about 5 days [2].",3,2,3,1
60,The virus can spread by through direct transmission via respiratory droplets or by indirect transmission via surfaces and materials contaminated with the virus [8].,2,0,0,1
356,"Its use reduced mortality in infected patients compared to those undergoing routine treatment, as observed in a study conducted in Africa during the Ebola outbreak [147].",2,0,0,1
280,"The heterogeneity of observations reported, from no effect [121], to a reduction in the death rate [121, 122], to severe side effects and death [123] of ICU-admitted patients [124], prevents clear conclusions from being drawn.",2,0,0,5
208,"[98] reported the action of various antiviral drugs against SARS-CoV-1 during its outbreak, and the administration of lopinavir and ribavirin was observed to block replication of the virus after just two days of therapy, indicating heightened efficacy when used in combination [98].",1,0,1,2
...,...,...,...,...,...
234,"The groups did not differ in their time from the initial symptoms to their randomization, and no significant difference was reported for symptom improvement or fatality rate between the two groups.",0,1,0,1
137,The government of Canada is contributing a grant of CA $192 million for vaccine production and related research studies [46].,0,0,1,1
147,"For its use against SARS-CoV-2, the tested dosage for treatment was more than 150 mg, with intravenous administration on the first day, followed by reduction of the dosage by half for ten days.",0,0,3,1
192,"An example can be described to explain the point: If two equally capable compounds were taken and analyzed from the PBPK data for a dosage of hydroxychloroquine less than 1000 mg and varying regimens, RLTEC values would be produced that would be different on consecutive days, which would vary drastically from the dosage of chloroquine taken two times every day with RLTEC values.",0,3,3,2


In [47]:
import numpy as np

f_out = open("testOutput.txt", "w")

dict_map = {
    "infectious":"infectious", 
    "contact":"contact",
    "latency":"latency",
    "latent":"latency",
    "reproduction":"reproduction",
    "incubation":"incubation",
    "r0":"reproduction",
}
statsdf = pd.DataFrame([],columns=['Parameter','Estimates','n','Citation','Rule'])
print(statsdf)

from spacy.matcher import Matcher
bigMatcher = Matcher(nlp.vocab)
smallMatcher = Matcher(nlp.vocab)
smallMatcher.add("n_rule1",None,[{'TEXT':'n'},{'TEXT':'='},{'LIKE_NUM':True}])
idx = None     

def find_n(doc):
    n_match = smallMatcher(doc)
    if n_match:
        match_id, start, end = n_match[0]
        return int(doc[end-1].text)
    return np.NaN
    
def funnel_values(estimates, parameter, rule_name, doc):
    global statsdf
    
    n = find_n(doc) #could make this faster
    # Parameter IS in dictionary
    if parameter:
        # Checking if estimates already included, in which case doesn't add a new row
        if not ((statsdf['Parameter'] == parameter) &\
             (statsdf['Estimates'] == estimates)).any():
            new_row = {'Parameter':parameter,\
                       'Estimates':estimates,\
                       'n':n,\
                       'Citation':idx,\
                       'Rule':rule_name}
            statsdf = statsdf.append(new_row,ignore_index=True)
         
    # Parameter is NOT in dictionary
    elif idx not in list(statsdf.Citation) and\
        estimates not in list(statsdf.Estimates.loc[statsdf.Citation == idx]):
        statsdf.loc[len(statsdf)] = [parameter, estimates, n, idx, rule_name]
        
def pm_map(matcher, doc, id, matches):
    for match_id, start, end in matches:
        string = str(doc[start:end])
        split_span = string.split()
        
        avg = round(float(split_span[-3]),2)
        moe = round(float(split_span[-1]),2)
        
        estimates = (avg-moe,avg+moe)
        parameter = dict_map.get(split_span[0])
        
        funnel_values(estimates, parameter, "pm_map", doc)

        
def bw_map(matcher, doc, id, matches):
    for match_id, start, end in matches:  
        lower = round(float(doc[start].text),2)
        upper = round(float(doc[end-1].text),2)
        
        estimates = (lower,upper)
        parameter = dict_map.get(str(doc[0]))
        #f_out.write(doc.text + "(SRC: BW_MAP)")
        
        funnel_values(estimates, parameter, "bw_map", doc)

# days_map finds matches for "# days", then searches the rest of the sentences' tokens for specific keywords to
# store as the corresponding parameter.
def days_map(matcher, doc, id, matches):
    for match_id, start, end in matches: 
        est_token = doc[end-2]
        estimate = est_token.text
        parameter = "None"
        for token in doc:
            closestIdx = abs(0 - est_token.i)
            # specReg is the regex for specific keywords
            if re.search(specReg, token.text.lower()):
                proximity_to_est = abs(token.i - est_token.i)
                # find the closest keyword to our estimate, as long as its within an arbitrary range (i said 15)
                if (proximity_to_est < closestIdx) & (proximity_to_est < 15):
                    closestIdx = token.i
                    parameter = token.text.lower()
        funnel_values(estimate, parameter, "days_map", doc)
        
# and_map works like days map, but for ranges specified with "and"
def and_map(matcher, doc, id, matches):
    for match_id, start, end in matches:
        lower_est_token = doc[start]
        lower = round(float(doc[start].text),2)
        upper = round(float(doc[end-1].text),2)
        
        estimates = (lower,upper)
        parameter = "None"
        for token in doc:
            closestIdx = abs(0 - lower_est_token.i)
            if re.search(specReg, token.text.lower()):
                proximity_to_est = abs(token.i - lower_est_token.i)
                if (proximity_to_est < closestIdx) & (proximity_to_est < 15):
                    closestIdx = token.i
                    parameter = token.text.lower()
        f_out.write(parameter)
        funnel_values(estimates, dict_map.get(parameter), "and_map", doc)

Empty DataFrame
Columns: [Parameter, Estimates, n, Citation, Rule]
Index: []


In [48]:
pm_rule = [{"IS_ALPHA":True},{"IS_ALPHA":True},{"IS_ALPHA":True},\
           {"LIKE_NUM":True}, {"TEXT":"±"}, {"LIKE_NUM":True}]
bw_rule = [{"LIKE_NUM":True}, {"TEXT":"–"}, {"LIKE_NUM":True}]
days_rule1 = [{"IS_ALPHA":True}, {"IS_ALPHA":True}, {"LIKE_NUM":True}, {"TEXT":"days"}]
days_rule2 = [{"IS_ALPHA":True}, {"IS_ALPHA":True}, {"LIKE_NUM":True}, {"TEXT":"days,"}]
days_rule3 = [{"IS_ALPHA":True}, {"IS_ALPHA":True}, {"LIKE_NUM":True}, {"TEXT":"days."}]
and_rule = [{"LIKE_NUM":True}, {"TEXT":"and"}, {"LIKE_NUM":True}]

bigMatcher.add("pm_rule", pm_map, pm_rule)
bigMatcher.add("bw_rule", bw_map, bw_rule)
bigMatcher.add("and_rule", and_map, and_rule)
bigMatcher.add("days_rule1", days_map, days_rule1)
bigMatcher.add("days_rule2", days_map, days_rule2)
bigMatcher.add("days_rule3", days_map, days_rule2)

In [52]:
def cases_deaths_map(matcher, doc, id, matches):
    for match_id, start, end in matches:
        estimates = doc[start]
        parameter = doc[end-1]
        funnel_values(estimates, parameter, "cases_deaths_map", doc)
        
cases_deaths_rule1 = [{"LIKE_NUM":True},{"TEXT":"deaths"}]
cases_deaths_rule2 = [{"LIKE_NUM":True},{"TEXT":"cases"}]
cases_deaths_rule3 = [{"LIKE_NUM":True},{"TEXT":"fatalities"}]

bigMatcher.add("cases_deaths_rule1", cases_deaths_map, cases_deaths_rule1)
bigMatcher.add("cases_deaths_rule2", cases_deaths_map, cases_deaths_rule2)
bigMatcher.add("cases_deaths_rule3", cases_deaths_map, cases_deaths_rule3)

In [57]:
statsdf = statsdf.iloc[0:0]

sents_filt = sentsdf
for idx in sents_filt.index:
    sentence = sents_filt.Sentence[idx]
    doc = nlp(sentence)
    matches = bigMatcher(doc)

In [58]:
# Order the Parameters column variables so that "None" is ranked lowest (for readability)
from pandas.api.types import CategoricalDtype

statsparams = list(statsdf.Parameter)
# If our set of parameters has no duplicates (does not apply to sample abstract, which stores values across countries)
# We probably should figure out how to separate the data from a paper if it deals with multiple countries?
if len(statsparams) == len(set(statsparams)):
    if ("None" in statsparams):
        statsparams.append(statsparams.pop(statsparams.index("None")))
    statsdf["Parameter"].astype(CategoricalDtype(categories=statsparams, ordered=True))
    # Sorted stats dataframe:
    statsdf = statsdf.sort_values('Parameter', ascending=False)

statsdf

Unnamed: 0,Parameter,Estimates,n,Citation,Rule
0,fatalities,150000,,1,cases_deaths_map
1,incubation,5,,48,days_map
2,transmission,7,,49,days_map
3,reproduction,"(2.0, 8.0)",,50,and_map
4,,"(5.0, 12.0)",,52,and_map
5,,nine,,53,days_map
6,,fourteen,,79,days_map
7,,ten,,147,days_map
8,,two,,208,days_map
9,,13,,238,days_map


In [59]:
context(51)

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
50,The R0 value was found to be between 2 and 8.,0,1,1,1
51,The incubation period varied according to the severity of symptoms.,1,1,0,0
52,"Patients with mild symptoms reported the mean interval to be approximately 5 and 12 days from the beginning of infection to their first treatment and hospital care, respectively, and in patients with severe symptoms, the interval was 7 and 8 days, respectively [1].",0,1,2,1


In [60]:
context(50)

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
49,"The time on average for its exponential growth (and infection of other individuals) was observed to be about 7 days, while the mean interval for transmission from person to person was also about 7 days.",1,2,2,0
50,The R0 value was found to be between 2 and 8.,0,1,1,1
51,The incubation period varied according to the severity of symptoms.,1,1,0,0
