In [143]:
import spacy    
import unicodedata
import re

In [144]:
# Loads NLP English model
nlp = spacy.load('en')

In [145]:
paper = "data/texts/ebola/10.1007/s00705-020-04768-3.txt" # oliver test paper
#paper = 'random_full.txt'

# Reads text
if 1:
    with open(paper, 'r') as file:
        text = unicodedata.normalize("NFKD", file.read().replace('\n', ' ')) # replaces \xa0 with " "
        text = re.sub(r'(?<=[.])(?=[^\s])', r' ', text) # spaces out concatenated sentences
else:
    text = "In Fig. 2 and Tables 3 (available online only), 4 (available online only), 5, we reprint the estimated CFR for each Ebola outbreak (by virus) and for Marburg virus. The Ebola Zaire virus is the most lethal with an overall estimated CFR ranging from 69 to 88%2,5,25,38,43,49,50 (Table 3 (available online only)). The CFR of outbreaks due to Ebola Sudan virus ranged from 53 to 69%1,24,51–53 (Table 4 (available online only)), and the CFR of outbreaks due to Ebola Bundibugyo ranged from 34 to 42%19,46,47 (Table 5). For the ongoing outbreak in West Africa due to Ebola Zaire, the estimated CFR, as measured among confirmed and probable cases with definitive outcome (recovered or fatal), is approximately 70%, and varies little among the three most affected countries (Guinea, Liberia and Sierra Leone; Table 6 (available online only) and Data Citation 2)38. The CFR among EVD cases reported by Nigeria (n=20) was 40%54). A second, unrelated EVD outbreak occurred in Équateur province, DRC between July and October 2014 resulting in 69 confirmed and probable cases with a CFR of 74%49. The CFR for Marburg is approximately 80%55–57)."
    
processed = False

In [146]:
def processText(text):
    global processed
    if not processed:
        processed = True
        return text.replace('%','% ').replace('-',' – ')
text = processText(text)
text

"Potential strategies for combating COVID – 19  Abstract Coronavirus disease 2019, also known as COVID – 19, is caused by a novel coronavirus named severe acute respiratory syndrome coronavirus 2, or SARS – CoV – 2. The infection has now catapulted into a full – blown pandemic across the world, which has affected more than 2 million people and has led to approximately 150,000 fatalities all over the world (WHO). In this review, we elaborate all currently available data that shed light on possible methods for treatment of COVID – 19, such as antiviral drugs, corticosteroids, convalescent plasma, and potentially effective vaccines. Additionally, ongoing and discontinued clinical trials that have been carried out for validating probable treatments for COVID – 19 are discussed. The review also elaborates the prospective approach and the possible advantages of using convalescent plasma and stem cells for the improvement of clinical symptoms and meeting the demand for an instantaneous cure. 

In [147]:
# Convert text into NLP object
textdoc = nlp(text)

In [148]:
# Named Entity Recognition
from spacy import displacy 
displacy.render(textdoc, style='ent')

In [149]:
sents = list(str(sent) for sent in list(textdoc.sents))

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Put all sentences in the dataframe
sentsdf = pd.DataFrame(sents, columns = ['Sentence'])
sentsdf

Unnamed: 0,Sentence
0,"Potential strategies for combating COVID – 19 Abstract Coronavirus disease 2019, also known as COVID – 19, is caused by a novel coronavirus named severe acute respiratory syndrome coronavirus 2, or SARS – CoV – 2."
1,"The infection has now catapulted into a full – blown pandemic across the world, which has affected more than 2 million people and has led to approximately 150,000 fatalities all over the world (WHO)."
2,"In this review, we elaborate all currently available data that shed light on possible methods for treatment of COVID – 19, such as antiviral drugs, corticosteroids, convalescent plasma, and potentially effective vaccines."
3,"Additionally, ongoing and discontinued clinical trials that have been carried out for validating probable treatments for COVID – 19 are discussed."
4,The review also elaborates the prospective approach and the possible advantages of using convalescent plasma and stem cells for the improvement of clinical symptoms and meeting the demand for an instantaneous cure.
...,...
482,"Although extensive research is being conducted to develop a suitable vaccine against COVID – 19, there is a dire need to shift a major part of the ongoing research towards the treatment of pneumonia in patients, which is often fatal."
483,The astounding data regarding mesenchymal stem cells offers a hopeful approach to the use of an endogenous pathway for the treatment of disease.
484,"In the case of pneumonia caused by COVID – 19, the balance between the antiviral response and the regulation of LIF action against a cytokine storm may be lost, which can reduce the overall beneficial effect of therapy."
485,"Nevertheless, the study findings indicating the safe and effective use of mesenchymal stem cell therapy in treating COVID – 19 are indeed remarkable and present a modern approach to the efficient and safe treatment of critically ill patients."


In [150]:
# Listing keywords
numeric_keywords = ['distribution*','time','number*','ratio','proportion','period','±','total*','estimate*','%','review','parameter*','mean','period','value']
specific_keywords = ['CFR','case-fatality','r0','reproduct*','infect*','infections','death*','transmis*','laten*','contact','infectious','incubat*','casualties','mortal*','morbid*','outbreak*','epideme*']
contextual_keywords = ['GPE','DATE','TIME','PRODUCT']
statistical_keywords = ['fatalities', 'deaths', 'cases']

# Creating a regular expression using keywords for searching and filtering 
numeric_regex = '|'.join(numeric_keywords)
specific_regex = '|'.join(specific_keywords)
statistical_regex = '|'.join(specific_keywords)

trait_keywords = numeric_keywords + specific_keywords
trait_regex = '|'.join(trait_keywords)
trait_regex

numReg = re.compile(numeric_regex)
specReg = re.compile(specific_regex)
traitReg = re.compile(trait_regex)
statReg = re.compile(statistical_regex)

In [151]:
import re

'''
Parses a sentence, looking for keywords.

args: sentence - string of sentence.
return: specific_keyword_count - total number of specific keywords in sentence
        numeric_keyword_count - total number of numeric keywords in sentence
        contextual_keyword_count - total number of contextual entity labels in sentence
        cardinality - total number of numeric entity labels in sentence
'''
def countKeywords(sentence):
    specific_keyword_count = numeric_keyword_count = contextual_keyword_count = cardinality = 0
    
    # counts specific and numeric keywords first
    for word in sentence.split():
        if re.match(specific_regex, word): specific_keyword_count += 1
        elif re.match(numeric_regex, word): numeric_keyword_count += 1
    
    # counts key entity labels
    sentence_obj = nlp(sentence)
    ent_labels = [ent.label_ for ent in sentence_obj.ents]
    for label in ent_labels:
        if label in contextual_keywords: contextual_keyword_count += 1
        elif label in ['CARDINAL']: cardinality += 1
        
    return specific_keyword_count, numeric_keyword_count, contextual_keyword_count, cardinality

In [152]:
sentsdf['SKC'],  sentsdf['NKC'], sentsdf['CKC'], sentsdf['Cardinality'] = zip(*sentsdf['Sentence'].apply(countKeywords))
sentsdf

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
0,"Potential strategies for combating COVID – 19 Abstract Coronavirus disease 2019, also known as COVID – 19, is caused by a novel coronavirus named severe acute respiratory syndrome coronavirus 2, or SARS – CoV – 2.",0,0,1,4
1,"The infection has now catapulted into a full – blown pandemic across the world, which has affected more than 2 million people and has led to approximately 150,000 fatalities all over the world (WHO).",1,0,0,2
2,"In this review, we elaborate all currently available data that shed light on possible methods for treatment of COVID – 19, such as antiviral drugs, corticosteroids, convalescent plasma, and potentially effective vaccines.",0,1,0,1
3,"Additionally, ongoing and discontinued clinical trials that have been carried out for validating probable treatments for COVID – 19 are discussed.",0,0,0,1
4,The review also elaborates the prospective approach and the possible advantages of using convalescent plasma and stem cells for the improvement of clinical symptoms and meeting the demand for an instantaneous cure.,0,1,0,0
...,...,...,...,...,...
482,"Although extensive research is being conducted to develop a suitable vaccine against COVID – 19, there is a dire need to shift a major part of the ongoing research towards the treatment of pneumonia in patients, which is often fatal.",0,0,0,1
483,The astounding data regarding mesenchymal stem cells offers a hopeful approach to the use of an endogenous pathway for the treatment of disease.,0,0,0,0
484,"In the case of pneumonia caused by COVID – 19, the balance between the antiviral response and the regulation of LIF action against a cytokine storm may be lost, which can reduce the overall beneficial effect of therapy.",0,0,1,0
485,"Nevertheless, the study findings indicating the safe and effective use of mesenchymal stem cell therapy in treating COVID – 19 are indeed remarkable and present a modern approach to the efficient and safe treatment of critically ill patients.",0,0,0,1


In [153]:
# An example of filtering
sentsdf_filt = sentsdf.sort_values(by="Cardinality", ascending=False)
sentsdf_filt

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
279,"The heterogeneity of observations reported, from no effect [121], to a reduction in the death rate [121, 122], to severe side effects and death [123] of ICU – admitted patients [124], prevents clear conclusions from being drawn.",2,0,0,5
349,"Although its efficiency as a treatment is still unknown, it is known to prevent the occurrence of diseaseFull size imageHistorical precedentsSerum antibodies have long been used in the treatment of many diseases caused by viruses, such as poliomyelitis [140], measles [141, 142], mumps [143], and influenza [144].",0,0,0,5
0,"Potential strategies for combating COVID – 19 Abstract Coronavirus disease 2019, also known as COVID – 19, is caused by a novel coronavirus named severe acute respiratory syndrome coronavirus 2, or SARS – CoV – 2.",0,0,1,4
104,"In a WHO project named “Solidarity Trial”, the four most potent existing antiviral drugs were tested against COVID – 19 in ten different countries, and the results were reported [16].",0,0,0,4
157,"Nonetheless, it is important to note that the treatment of three out of twelve COVID – 19 patients with remdesivir in the US was carried out under the protocol for compassionate use [91] due to the lack of sufficient data to evaluate its safety or efficacy.",0,0,1,4
...,...,...,...,...,...
91,"To date, no approved antiviral medicine has been reported.",0,0,0,0
289,"A recent consensus statement from the Chinese Thoracic Society recommends a lower dose, ≤ 0.",0,0,0,0
291,"Pharmacological treatments in which risks outweigh benefitsRibavirin with or without interferonRibavirin was approved in the 1980s for viral hemorrhagic fever, respiratory syncytial virus infection, and hepatitis C, together with interferon.",1,0,1,0
292,It is a guanosine analogue known for interfering with transcription by inhibiting the synthesis of RNA.,0,0,0,0


In [154]:
# Sentences with most relevance
sentsdf_top = sentsdf.sort_values(by='SKC',ascending=False)
sentsdf_top.head()

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
49,"The characteristics of the infection and transmission are given below:In the early days of the outbreak, when cases were restricted only to China, the mean incubation period of SARS – CoV – 2 was reported to be about 5 days [2].",4,2,3,2
29,"New human CoVs arise through zoonosis, with the virus first being transmitted from animals to humans, who infect other humans via close contact.",3,0,0,0
360,"The SARS epidemic was subdued with strict management, but the second epidemic of MERS soon spread from the Middle Eastern countries, followed by a second wave of infection in South Korea.",3,0,1,0
356,"Its use reduced mortality in infected patients compared to those undergoing routine treatment, as observed in a study conducted in Africa during the Ebola outbreak [147].",3,0,0,1
417,Mesenchymal stem cell technology Clinicians are aiming to save the lives of COVID – 19 patients by controlling the rate of infection and death [168].,2,0,0,2


In [155]:
'''
Displays the text surrounding the sentence with the provided index. To refer to the context, perhaps.
args: idx - index of sentence in question.
    df = Dataframe, default is sentsdf.
    pm = plus or minus for the indices of surrounding sentences.
'''
def context(idx, pm = 1, df = sentsdf):
    n = len(df)
    
    if (idx-pm) < 0 and (idx+pm) > n:
        return df
    elif (idx-pm) < 0:
        return df.loc[0:idx+pm,:]
    elif (idx-pm) > n:
        return df.loc[idx-pm:n,:]
    return df.loc[idx-pm:idx+pm,:]

from collections import Counter

'''
Print entity labels and their occurences within a given sentence.
args: sentence - The sentence in question.
    from_keywords - Whether or not to only include info on labels in the contextual keywords. Does not by default.
return: list containing tuples of entity labels and their occurences.
'''
def printEntityLabels(sentence, from_keywords = False):
    sentence_obj = nlp(sentence)
    ent_labels = [ent.label_ for ent in sentence_obj.ents if (not from_keywords or ent.label_ in contextual_keywords)]
    labels = Counter(ent_labels).keys()
    counts = Counter(ent_labels).values()
    return list(zip(labels, counts))

'''
Prints all sentences from Dataframe with provided keywords.
args: filter_words: Words to filter results by. By default, the trait keywords.
    df - The Dataframe to filter. By default, sentsdf.
return: Returns Dataframe with only sentences including keywords from the list.
'''
def sentencesWith(filter_words, df=sentsdf):
    if isinstance(filter_words, str):
        filter_regex = filter_words
    elif isinstance(filter_words, list):
        filter_regex = '|'.join(filter_words)
    return df[df.Sentence.str.lower().str.contains(filter_regex)]

'''
Calculates the relevance of a given dataframe, based on matches in the trait keyword  list. 
Used to determine relevance of the entire article.
args: df - The dataframe. By default, sentsdf.
return: The numeric approximation of the relevance of the provided dataframe.
'''
def calculateRelevance(df = sentsdf_top.head()):
    return sum(df.SKC) / len(df) * 10.000

In [156]:
calculateRelevance()

30.0

In [157]:
import numpy as np

f_out = open("testOutput.txt", "w")

dict_map = {
    # Infectious:
    "infectious":"infectious",
    "infection":"infectious",
    # Contact:
    "contact":"contact",
    # Incubation:
    "incubation":"incubation",
    # Latency:
    "latency":"latency",
    "latent":"latency",
    # Reproduction:
    "reproductive":"reproduction",
    "reproduction":"reproduction",
    "r0":"reproduction",
    # Cases/Deaths
    "fatalities":"deaths",
    "deaths":"deaths",
    "cases":"cases",
    # Transmission
    "transmission":"transmission",
    "transmi*":"transmission",
    # Case-fatality rate:
    "case-fatality":"CFR",
    "CFR":"CFR"
}

statsdf = pd.DataFrame([],columns=['Parameter','Estimates','n','Citation','Rule'])
print(statsdf)

from spacy.matcher import Matcher
bigMatcher = Matcher(nlp.vocab)
smallMatcher = Matcher(nlp.vocab)
smallMatcher.add("n_rule",None,[{'TEXT':'n'},{'TEXT':'='},{'LIKE_NUM':True}])
idx = None     

def find_n(doc):
    n_match = smallMatcher(doc)
    if n_match:
        match_id, start, end = n_match[0]
        return int(doc[end-1].text)
    return np.NaN

# takes an NLP doc, a token of the matched estimate, a regex pattern produced by re.compile, and whether or not
# we search bidirectionally (backwards & forwards) or not (just backwards)
def proximitySearch(doc, est_token, regex, bidirectional = False):
    parameter = "None"
    # We start with the first token (index 0):
    closestIdx = abs(0 - est_token.i)
    for token in doc:
        # If the token comes before the estimate and matches a specific keyword:
        if (re.search(regex, token.text.lower())):
            if (((not bidirectional) & (token.i <= est_token.i)) or bidirectional):
                proximity_to_est = abs(token.i - est_token.i)
                # find the closest keyword to our estimate, as long as its within an arbitrary range (i said 15)
                if (proximity_to_est < closestIdx) & (proximity_to_est < 15):
                    closestIdx = token.i
                    parameter = token.text.lower()        
    return parameter

def proximitySearchPrevSentence(doc, regex, idx):
    parameter = "None"
    if (idx > 0):
        prevSent = sents_filt.loc[idx - 1].Sentence
        prevdoc = nlp(prevSent)
        closestIdx = 0
        for token in prevdoc:
            if (re.search(regex, token.text.lower())):
                    if (token.i > closestIdx):
                        closestIdx = token.i
                        parameter = token.text.lower()
    return parameter
    
def funnel_values(estimates, parameter, rule_name, doc):
    global statsdf
    
    n = find_n(doc) #could make this faster
    # Parameter IS in dictionary
    if parameter:
        # Checking if estimates already included, in which case doesn't add a new row
        if not ((statsdf['Parameter'] == parameter) &\
             (statsdf['Estimates'] == estimates)).any():
            new_row = {'Parameter':parameter,\
                       'Estimates':estimates,\
                       'n':n,\
                       'Citation':idx,\
                       'Rule':rule_name}
            statsdf = statsdf.append(new_row,ignore_index=True)
         
    # Parameter is NOT in dictionary
    elif not (idx in list(statsdf.Citation) and\
        estimates in list(statsdf.Estimates.loc[statsdf.Citation == idx])):
        statsdf.loc[len(statsdf)] = [parameter, estimates, n, idx, rule_name]
        
def pm_map(matcher, doc, id, matches):
    for match_id, start, end in matches:
        string = str(doc[start:end])
        split_span = string.split()
        
        # Get the rule
        rule = nlp.vocab.strings[match_id]
        
        # Get our leftmost estimate token for search:
        avg_token = doc[end-3]
        
        # Compute interval
        moe = round(float(doc[end-1].text),2)
        avg = round(float(avg_token.text),2)
        estimates = (avg-moe,avg+moe)
        
        # Search for corresponding parameter
        parameter = proximitySearch(doc, avg_token, specReg)
        
        # Failed? Try searching last sentence:
        if (parameter == "None"):
            parameter = proximitySearchPrevSentence(doc, specReg, idx)
            rule = rule + " (prev sentence)"
        
        funnel_values(estimates, dict_map.get(parameter), rule, doc)

        
def bw_map(matcher, doc, id, matches):
    for match_id, start, end in matches:  
        # Get the rule
        rule = nlp.vocab.strings[match_id]
        
        # Get our leftmost estimate token for search:
        lower_token = doc[start]
        
        # Compute interval:
        lower = round(float(lower_token.text),2)
        upper = round(float(doc[end-1].text),2)
        estimates = (lower,upper)
        
        # Search for corresponding parameter:
        parameter = proximitySearch(doc, lower_token, specReg)
        
        # Failed? Try searching last sentence:
        if (parameter == "None"):
            parameter = proximitySearchPrevSentence(doc, specReg, idx)
            rule = rule + " (prev sentence)"
        
        funnel_values(estimates, dict_map.get(parameter), "bw_map", doc)

# days_map finds matches for "# days", then searches the rest of the sentences' tokens for specific keywords to
# store as the corresponding parameter.
def num_map(matcher, doc, id, matches):
    for match_id, start, end in matches: 
        rule = nlp.vocab.strings[match_id]
        currSent = doc.text
        sentIdx = sents_filt.index[sents_filt.Sentence == currSent]
        est_token = doc[end-2]
        estimate = est_token.text
        
        # Search for corresponding parameter:
        parameter = proximitySearch(doc, est_token, specReg)
        
        # Failed? Try searching last sentence:
        if (parameter == "None"):
            parameter = proximitySearchPrevSentence(doc, specReg, idx)
            rule = rule + " (prev sentence)"
        
        funnel_values(estimate, dict_map.get(parameter), rule, doc)
        
# and_map works like days map, but for ranges specified with "and"
def and_map(matcher, doc, id, matches):
    for match_id, start, end in matches:
        rule = nlp.vocab.strings[match_id]
        # Find leftmost token (for searching):
        lower_est_token = doc[start]
        
        # Compute interval:
        lower = round(float(lower_est_token.text),2)
        upper = round(float(doc[end-1].text),2)
        estimates = (lower,upper)
        
        # Search for corresponding parameter:
        parameter = proximitySearch(doc, lower_est_token, specReg)
        
        # Failed? Try searching last sentence:
        if (parameter == "None"):
            parameter = proximitySearchPrevSentence(doc, specReg, idx)
            rule = rule + " (prev sentence)"
            
        funnel_values(estimates, dict_map.get(parameter), rule, doc)

Empty DataFrame
Columns: [Parameter, Estimates, n, Citation, Rule]
Index: []


In [158]:
pm_rule = [{"IS_ALPHA":True},{"IS_ALPHA":True},{"IS_ALPHA":True},\
           {"LIKE_NUM":True}, {"TEXT":"±"}, {"LIKE_NUM":True}]
num_rule = [{"IS_ALPHA":True}, {"IS_ALPHA":True}, {"LIKE_NUM":True}, {"TEXT":{"REGEX":"day*|%"}}]

bigMatcher.add("pm_rule", pm_map, pm_rule)
bigMatcher.add("and_rule", and_map,\
               [{"IS_DIGIT":True}, {"TEXT":"and"}, {"IS_DIGIT":True}],\
               [{"IS_DIGIT":True}, {"TEXT":"–"}, {"IS_DIGIT":True}],\
               [{"IS_DIGIT":True}, {"TEXT":"to"}, {"IS_DIGIT":True}])
bigMatcher.add("num_rule", num_map, num_rule)

In [159]:
def cases_deaths_map(matcher, doc, id, matches):
    for match_id, start, end in matches:
        estimates = int(doc[start].text.replace(",",""))
        parameter = doc[end-1].text.replace("fatalities","deaths")
        funnel_values(estimates, parameter, "cases_deaths_map", doc)
        
bigMatcher.add("cases_deaths_rule", cases_deaths_map, 
               [{"LIKE_NUM":True},{"TEXT":{"REGEX":"fatalities|deaths|cases"}}])

In [None]:
statsdf = statsdf.iloc[0:0]

sents_filt = sentsdf
for idx in sents_filt.index:
    sentence = sents_filt.Sentence[idx]
    doc = nlp(sentence)
    matches = bigMatcher(doc)

In [None]:
# Order the Parameters column variables so that "None" is ranked lowest (for readability)
from pandas.api.types import CategoricalDtype

statsparams = list(statsdf.Parameter)
# If our set of parameters has no duplicates (does not apply to sample abstract, which stores values across countries)
# We probably should figure out how to separate the data from a paper if it deals with multiple countries?
if len(statsparams) == len(set(statsparams)):
    if ("None" in statsparams):
        statsparams.append(statsparams.pop(statsparams.index("None")))
    statsdf["Parameter"].astype(CategoricalDtype(categories=statsparams, ordered=True))
    # Sorted stats dataframe:
    statsdf = statsdf.sort_values('Parameter', ascending=False)

display(statsdf)
sentsdf.iloc[list(set(statsdf.Citation))]