In [2]:
import spacy    
import unicodedata
import re

In [3]:
# Loads NLP English model
nlp = spacy.load('en')

In [5]:
processed = False
paper = 'data/texts/"hepatitis E virus"/10.14218/JCTH.2015.00009.txt' # oliver test paper
#paper = 'random_full.txt'

# Reads text
if 1:
    with open(paper, 'r') as file:
        text = file.read().replace('\n', ' ')
else:
    text = "In Fig. 2 and Tables 3 (available online only), 4 (available online only), 5, we reprint the estimated CFR for each Ebola outbreak (by virus) and for Marburg virus. The Ebola Zaire virus is the most lethal with an overall estimated CFR ranging from 69 to 88%2,5,25,38,43,49,50 (Table 3 (available online only)). The CFR of outbreaks due to Ebola Sudan virus ranged from 53 to 69%1,24,51–53 (Table 4 (available online only)), and the CFR of outbreaks due to Ebola Bundibugyo ranged from 34 to 42%19,46,47 (Table 5). For the ongoing outbreak in West Africa due to Ebola Zaire, the estimated CFR, as measured among confirmed and probable cases with definitive outcome (recovered or fatal), is approximately 70%, and varies little among the three most affected countries (Guinea, Liberia and Sierra Leone; Table 6 (available online only) and Data Citation 2)38. The CFR among EVD cases reported by Nigeria (n=20) was 40%54). A second, unrelated EVD outbreak occurred in Équateur province, DRC between July and October 2014 resulting in 69 confirmed and probable cases with a CFR of 74%49. The CFR for Marburg is approximately 80%55–57)."
    
text = unicodedata.normalize("NFKD", text) # replaces \xa0 with " "
text = re.sub(r'(?<=[.]) (?=[^\s])', r' ', text) # spaces out concatenated sentences
text

'Current Knowledge on Hepatitis E https://dx.doi.org/10.14218/JCTH.2015.00009  Abstract  Although only a single serotype of hepatitis E virus (HEV), the causative agent of hepatitis E, has been identified, there is great genetic variation among the different HEV isolates reported. There are at least four major recognized genotypes of HEV: genotypes 1 and 2 are mainly restricted to humans and linked to epidemic outbreaks in nonindustrialized countries, whereas genotypes 3 and 4 are zoonotic in both developing and industrialized countries. Besides human strains, genotype 3 and 4 strains of HEV have been genetically characterized from swine, sika deer, mongooses, sheep, and rabbits. Currently, there are approximately 11,000 human and animal sequences of HEV available at the International Nucleotide Sequence Database Collaboration. HEV is the major cause of waterborne outbreaks of hepatitis in areas of poor sanitation. Additionally, it is responsible for sporadic cases of viral hepatitis i

In [6]:
def processText(text):
    global processed
    if not processed:
        processed = True
        text = text.replace('%','% ').replace('–',' – ').replace('-',' – ').replace('=',' = ').replace(',',', ')
        text = re.sub("[\[].*?[\]]", "", text)
        string1_protected = re.sub(r"(\d)\.(\d)", r"\1[PROTECTED_DOT]\2", text)  
        # now split (and remove empty lines)
        lines_protected = [line + "." for line in string1_protected.split(".") if line]   
        # now re-replace all "[PROTECTED_DOT]"s
        lines = [line.replace("[PROTECTED_DOT]", ".") for line in lines_protected]
        text = ' '.join(lines)
        text = re.sub(r'\s([?.!"](?:\s|$))', r'\1', text)
        text = re.sub(' +', ' ',text)
        return text
    else:
        print("Already processed")
        return text
text = processText(text)
text

'Current Knowledge on Hepatitis E https://dx. doi. org/10.14218/JCTH. 2015.00009 Abstract Although only a single serotype of hepatitis E virus (HEV), the causative agent of hepatitis E, has been identified, there is great genetic variation among the different HEV isolates reported. There are at least four major recognized genotypes of HEV: genotypes 1 and 2 are mainly restricted to humans and linked to epidemic outbreaks in nonindustrialized countries, whereas genotypes 3 and 4 are zoonotic in both developing and industrialized countries. Besides human strains, genotype 3 and 4 strains of HEV have been genetically characterized from swine, sika deer, mongooses, sheep, and rabbits. Currently, there are approximately 11, 000 human and animal sequences of HEV available at the International Nucleotide Sequence Database Collaboration. HEV is the major cause of waterborne outbreaks of hepatitis in areas of poor sanitation. Additionally, it is responsible for sporadic cases of viral hepatitis

In [7]:
# Convert text into NLP object
textdoc = nlp(text)

In [8]:
# Named Entity Recognition
from spacy import displacy 
displacy.render(textdoc, style='ent')

In [9]:
sents = list(str(sent) for sent in list(textdoc.sents))

import pandas as pd
pd.set_option('display.max_colwidth', None)

# Put all sentences in the dataframe
sentsdf = pd.DataFrame(sents, columns = ['Sentence'])
sentsdf

Unnamed: 0,Sentence
0,Current Knowledge on Hepatitis E https://dx.
1,doi.
2,org/10.14218/JCTH.
3,2015.00009 Abstract
4,"Although only a single serotype of hepatitis E virus (HEV), the causative agent of hepatitis E, has been identified, there is great genetic variation among the different HEV isolates reported."
...,...
305,HEV mainly causes large outbreaks of acute hepatitis in endemic areas and sporadic cases in industrialized countries.
306,"Although most infections are subclinical, the clinical symptoms may range from acute hepatitis, chronic infection in immunosuppressed and transplant patients to severe acute failure in pregnant women."
307,"Transmission may occur via water, food and blood."
308,Laboratory diagnosis relies on serological assays and testing for HEV RNA in blood.


In [10]:
# Listing keywords
numeric_keywords = ['distribution*','time','number*','ratio','proportion','period','±','total*','estimate*','%','review','parameter*','mean','period','value']
specific_keywords = ['cases','fatalities','n =','CFR','case-fatality','r0','reproduct*','infect*','infections','death*','transmis*','laten*','contact','infectious','incubat*','casualties','mortal*','morbid*','outbreak*','epideme*']
contextual_keywords = ['GPE','DATE','TIME','PRODUCT']
statistical_keywords = ['fatalities', 'deaths', 'cases']

# Creating a regular expression using keywords for searching and filtering 
numeric_regex = '|'.join(numeric_keywords)
specific_regex = '|'.join(specific_keywords)
statistical_regex = '|'.join(specific_keywords)

trait_keywords = numeric_keywords + specific_keywords
trait_regex = '|'.join(trait_keywords)
trait_regex

numReg = re.compile(numeric_regex)
specReg = re.compile(specific_regex)
traitReg = re.compile(trait_regex)
statReg = re.compile(statistical_regex)

In [11]:
'''
Parses a sentence, looking for keywords.

args: sentence - string of sentence.
return: specific_keyword_count - total number of specific keywords in sentence
        numeric_keyword_count - total number of numeric keywords in sentence
        contextual_keyword_count - total number of contextual entity labels in sentence
        cardinality - total number of numeric entity labels in sentence
'''
def countKeywords(sentence):
    specific_keyword_count = numeric_keyword_count = contextual_keyword_count = cardinality = 0
    
    # counts specific and numeric keywords first
    for word in sentence.split():
        if re.match(specific_regex, word): specific_keyword_count += 1
        elif re.match(numeric_regex, word): numeric_keyword_count += 1
    
    # counts key entity labels
    sentence_obj = nlp(sentence)
    ent_labels = [ent.label_ for ent in sentence_obj.ents]
    for label in ent_labels:
        if label in contextual_keywords: contextual_keyword_count += 1
        elif label in ['CARDINAL']: cardinality += 1
        
    return specific_keyword_count, numeric_keyword_count, contextual_keyword_count, cardinality

In [12]:
sentsdf['SKC'],sentsdf['NKC'],sentsdf['CKC'],sentsdf['Cardinality'] = zip(*sentsdf['Sentence'].apply(countKeywords))
sentsdf

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
0,Current Knowledge on Hepatitis E https://dx.,0,0,0,0
1,doi.,0,0,0,0
2,org/10.14218/JCTH.,0,0,0,0
3,2015.00009 Abstract,0,0,0,1
4,"Although only a single serotype of hepatitis E virus (HEV), the causative agent of hepatitis E, has been identified, there is great genetic variation among the different HEV isolates reported.",0,0,0,0
...,...,...,...,...,...
305,HEV mainly causes large outbreaks of acute hepatitis in endemic areas and sporadic cases in industrialized countries.,2,0,0,0
306,"Although most infections are subclinical, the clinical symptoms may range from acute hepatitis, chronic infection in immunosuppressed and transplant patients to severe acute failure in pregnant women.",2,0,0,0
307,"Transmission may occur via water, food and blood.",0,0,0,0
308,Laboratory diagnosis relies on serological assays and testing for HEV RNA in blood.,0,0,0,0


In [13]:
'''
Displays the text surrounding the sentence with the provided index. To refer to the context, perhaps.
args: idx - index of sentence in question.
    df = Dataframe, default is sentsdf.
    pm = plus or minus for the indices of surrounding sentences.
'''
def context(idx, pm = 1, df = sentsdf):
    n = len(df)
    
    if (idx-pm) < 0 and (idx+pm) > n:
        return df
    elif (idx-pm) < 0:
        return df.loc[0:idx+pm,:]
    elif (idx-pm) > n:
        return df.loc[idx-pm:n,:]
    return df.loc[idx-pm:idx+pm,:]

from collections import Counter

'''
Print entity labels and their occurences within a given sentence.
args: sentence - The sentence in question.
    from_keywords - Whether or not to only include info on labels in the contextual keywords. Does not by default.
return: list containing tuples of entity labels and their occurences.
'''
def printEntityLabels(sentence, from_keywords = False):
    sentence_obj = nlp(sentence)
    ent_labels = [ent.label_ for ent in sentence_obj.ents if (not from_keywords or ent.label_ in contextual_keywords)]
    labels = Counter(ent_labels).keys()
    counts = Counter(ent_labels).values()
    return list(zip(labels, counts))

'''
Prints all sentences from Dataframe with provided keywords.
args: filter_words: Words to filter results by. By default, the trait keywords.
    df - The Dataframe to filter. By default, sentsdf.
return: Returns Dataframe with only sentences including keywords from the list.
'''
def sentencesWith(filter_words, df=sentsdf):
    if isinstance(filter_words, str):
        filter_regex = filter_words
    elif isinstance(filter_words, list):
        filter_regex = '|'.join(filter_words)
    return df[df.Sentence.str.lower().str.contains(filter_regex)]

In [14]:
import numpy as np

f_out = open("testOutput.txt", "w")

dict_map = {
    # Infectious:
    "infectious":"infectious",
    "infection":"infectious",
    # Contact:
    "contact":"contact",
    # Incubation:
    "incubation":"incubation",
    # Latency:
    "latency":"latency",
    "latent":"latency",
    # Reproduction:
    "reproductive":"reproduction",
    "reproduction":"reproduction",
    "r0":"reproduction",
    # Cases/Deaths
    "fatalities":"deaths",
    "deaths":"deaths",
    "cases":"cases",
    # Transmission
    "transmission":"transmission",
    "transmi*":"transmission",
    # Case-fatality rate:
    "case - fatality":"CFR",
    "case-fatality":"CFR",
    "CFR":"CFR"
}

statsdf = pd.DataFrame([],columns=['Parameter','Estimates','n','Citation','Rule'])
print(statsdf)

from spacy.matcher import Matcher
bigMatcher = Matcher(nlp.vocab)
smallMatcher = Matcher(nlp.vocab)
smallMatcher.add("n_rule",None,[{'TEXT':'n'},{'TEXT':'='},{'LIKE_NUM':True}])
idx = None
error_indices = set()

def find_n(doc):
    n_match = smallMatcher(doc)
    if n_match:
        match_id, start, end = n_match[0]
        return int(doc[end-1].text)
    return np.NaN

# takes an NLP doc, a token of the matched estimate, a regex pattern produced by re.compile, and whether or not
# we search bidirectionally (backwards & forwards) or not (just backwards)
def parameterSearch(doc, est_token, regex, bidirectional = False):
    parameter = "None"
    # We start with the first token (index 0):
    closestIdx = abs(0 - est_token.i)
    for token in doc:
        # If the token comes before the estimate and matches a specific keyword:
        if (re.search(regex, token.text.lower())):
            if (((not bidirectional) & (token.i <= est_token.i)) or bidirectional):
                proximity_to_est = abs(token.i - est_token.i)
                # find the closest keyword to our estimate, as long as its within an arbitrary range (i said 15)
                if (proximity_to_est < closestIdx) & (proximity_to_est < 15):
                    closestIdx = token.i
                    parameter = token.text.lower()        
    return parameter

def parameterSearchPrevSentence(doc, regex, idx):
    parameter = "None"
    if (idx > 0):
        prevSent = sents_filt.loc[idx - 1].Sentence
        prevdoc = nlp(prevSent)
        closestIdx = 0
        for token in prevdoc:
            if (re.search(regex, token.text.lower())):
                    if (token.i > closestIdx):
                        closestIdx = token.i
                        parameter = token.text.lower()
    return parameter
    
def funnel_values(estimates, parameter, rule_name, doc):
    global statsdf
    
    n = find_n(doc) #could make this faster
    # Parameter IS in dictionary
    if parameter:
        # Checking if estimates already included, in which case doesn't add a new row
        if not ((statsdf['Parameter'] == parameter) &\
             (statsdf['Estimates'] == estimates)).any():
            new_row = {'Parameter':parameter,\
                       'Estimates':estimates,\
                       'n':n,\
                       'Citation':idx,\
                       'Rule':rule_name}
            statsdf = statsdf.append(new_row,ignore_index=True)
         
    # Parameter is NOT in dictionary
    elif not (idx in list(statsdf.Citation) and\
        estimates in list(statsdf.Estimates.loc[statsdf.Citation == idx])):
        statsdf.loc[len(statsdf)] = [parameter, estimates, n, idx, rule_name]
        
def pm_map(matcher, doc, id, matches):
    global error_indices
    
    for match_id, start, end in matches:
        try:
            string = str(doc[start:end])
            split_span = string.split()

            # Get the rule
            rule = nlp.vocab.strings[match_id]

            # Get our leftmost estimate token for search:
            avg_token = doc[end-3]

            # Compute interval
            moe = round(float(doc[end-1].text),2)
            avg = round(float(avg_token.text),2)
            estimates = (avg-moe,avg+moe)

            # Search for corresponding parameter
            parameter = parameterSearch(doc, avg_token, specReg)

            # Failed? Try searching last sentence:
            if (parameter == "None"):
                parameter = parameterSearchPrevSentence(doc, specReg, idx)
                rule = rule + " (from prev sentence)"

            funnel_values(estimates, dict_map.get(parameter), rule, doc)
        except:
            error_indices.add(idx)
            print("Error at",idx,"with pm_map")

def num_map(matcher, doc, id, matches):
    global error_indices
    
    for match_id, start, end in matches: 
        try:
            rule = nlp.vocab.strings[match_id]
            currSent = doc.text
            sentIdx = sents_filt.index[sents_filt.Sentence == currSent]
            est_token = doc[end-2]
            estimate = est_token.text

            # Search for corresponding parameter:
            parameter = parameterSearch(doc, est_token, specReg)

            # Failed? Try searching last sentence:
            if (parameter == "None"):
                parameter = parameterSearchPrevSentence(doc, specReg, idx)
                rule = rule + " (from prev sentence)"

            funnel_values(estimate, dict_map.get(parameter), rule, doc)
        except:
            error_indices.add(idx)
            print("Error at",idx,"with num_map")
    
def range_map(matcher, doc, id, matches):
    global error_indices
    
    for match_id, start, end in matches:
        try:
            rule = nlp.vocab.strings[match_id]
            # Find leftmost token (for searching):
            lower_est_token = doc[start]

            # Compute interval:
            lower = round(float(lower_est_token.text),2)
            upper = round(float(doc[end-1].text),2)
            estimates = (lower,upper)

            # Search for corresponding parameter:
            parameter = parameterSearch(doc, lower_est_token, specReg)

            # Failed? Try searching last sentence:
            if (parameter == "None"):
                parameter = parameterSearchPrevSentence(doc, specReg, idx)
                rule = rule + " (from prev sentence)"

            funnel_values(estimates, dict_map.get(parameter), rule, doc)
        except:
            error_indices.add(idx)
            print("Error at",idx,"with range_map")
        
def cases_deaths_map(matcher, doc, id, matches):
    global error_indices
    
    for match_id, start, end in matches:
        try:
            estimates = int(float(doc[start].text.replace(",","")))
            parameter = doc[end-1].text.replace("fatalities","deaths")
            funnel_values(estimates, parameter, "cases_deaths_map", doc)
        except:
            error_indices.add(idx)
            print("Error at",idx,"with cases_deaths_map")
        
def r0_map(matcher, doc, id, matches):
    global error_indices
    
    for match_id, start, end in matches:
        try:
            funnel_values(round(float(doc[end-1].text),2),"reproduction","r0_map",doc)
        except:
            error_indices.add(idx)
            print("Error at",idx,"with r0_map")

Empty DataFrame
Columns: [Parameter, Estimates, n, Citation, Rule]
Index: []


In [15]:
# Plus/Minus rule:
pm_rule = [{"IS_ALPHA":True},{"IS_ALPHA":True},{"IS_ALPHA":True}, {"LIKE_NUM":True}, {"TEXT":"±"}, {"LIKE_NUM":True}]

# Range rules:
range_rule1 = [{"IS_DIGIT":True}, {"TEXT":"and"}, {"IS_DIGIT":True}]
range_rule2 = [{"IS_DIGIT":True}, {"TEXT":"–"}, {"IS_DIGIT":True}]
range_rule3 = [{"IS_DIGIT":True}, {"TEXT":"to"}, {"IS_DIGIT":True}]

# Exact number rule:
num_rule1 = [{"IS_ALPHA":True},{"IS_ALPHA":True},{"LIKE_NUM":True},{"TEXT":{"REGEX":"day*|%"}}]
num_rule2 = [{"TEXT":{"REGEX":"was|is"}},{"LIKE_NUM":True},{"TEXT":{"REGEX":"day*|%"}}]

# Reproductive number (R0) rule:
r0_rule = [{"LOWER":"r0"},{"TEXT":"="},{"LIKE_NUM":True}]

# Cases/Deaths Rule:
case_deaths_rule = [{"LIKE_NUM":True},{"TEXT":{"REGEX":"fatalities|deaths|cases"}}]


# Add each the rules to our matcher:
bigMatcher.add("pm_rule", pm_map, pm_rule)

bigMatcher.add("range_rule1", range_map, range_rule1)
bigMatcher.add("range_rule2", range_map, range_rule2)
bigMatcher.add("range_rule2", range_map, range_rule3)

bigMatcher.add("num_rule1", num_map, num_rule1)
bigMatcher.add("num_rule2", num_map, num_rule2)

bigMatcher.add("cases_deaths_rule", cases_deaths_map, case_deaths_rule)

bigMatcher.add("r0_map",r0_map,r0_rule)

In [16]:
statsdf = statsdf.iloc[0:0]

sents_filt = sentsdf
for idx in sents_filt.index:
    sentence = sents_filt.Sentence[idx]
    doc = nlp(sentence)
    matches = bigMatcher(doc)

None_indices =  [i for i, x in enumerate(list(statsdf.Parameter)) if x == None]
non_None_indices = list(set(list(statsdf.index))^set(None_indices))

statsdf_without_None = statsdf.iloc[non_None_indices]
statsdf_with_None = statsdf.iloc[None_indices]

statsdf = pd.concat([statsdf_without_None, statsdf_with_None])
pd.set_option("display.max_rows", None)
from IPython.core.display import display
display(statsdf)

Error at 272 with range_map
Error at 294 with cases_deaths_map


Unnamed: 0,Parameter,Estimates,n,Citation,Rule
4,infectious,25,,13,num_rule1
13,transmission,"(1.0, 2.0)",,79,range_rule1 (from prev sentence)
14,transmission,"(3.0, 4.0)",,81,range_rule1 (from prev sentence)
15,infectious,"(1.0, 2.0)",,89,range_rule1 (from prev sentence)
16,infectious,"(3.0, 4.0)",,89,range_rule1 (from prev sentence)
21,infectious,"(64.0, 66.0)",,128,range_rule2 (from prev sentence)
22,incubation,40,,140,num_rule1
23,incubation,"(2.0, 3.0)",,141,range_rule2 (from prev sentence)
24,cases,"(1.0, 4.0)",,147,range_rule2 (from prev sentence)
30,deaths,"(7.0, 13.0)",,197,range_rule2


In [251]:
mostSKCdf = sentsdf.sort_values(by="SKC", ascending=False).head(20)
matchesdf = sentsdf.iloc[list(set(statsdf.Citation).union(error_indices))].sort_index(inplace=False)
relevantdf_indices = list(set(mostSKCdf.index)-set(matchesdf.index))
relevantdf = pd.concat([matchesdf, sentsdf.iloc[relevantdf_indices].sort_index(inplace=False)])
relevantdf

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
3,"org/10.1007/s13337 – 020 – 00632 – 9 Abstract Coronaviruses are single stranded RNA viruses usually present in bats (reservoir hosts), and are generally lethal, highly transmissible, and",1,0,1,2
7,About 8096 cases and 774 deaths were reported worldwide with the SARS – CoV infection during year 2002; 2229 cases and 791 deaths were reported for the MERS – CoV that emerged during 2012.,5,0,3,3
8,"Recently ~ 33, 849, 737 cases and 1, 012, 742 deaths (data as on 30 Sep 2020) were reported from the recent evolver SARS – CoV – 2 infection.",3,0,2,6
38,SARS – CoV genome has shown 87 – 92% sequence homology to the SARS like coronavirus isolated from the bats through anal swabs and the one isolated from civets with a 29 – nucleotide signature site in their genome.,0,0,0,2
41,"Further, CoV virus isolated from the palm civets and raccoon dogs has shown 99.8% sequence homology with the human CoV viruses.",0,0,0,0
69,The non – structural proteins encoded by ORF 1b (protein 12 – 16) is comparatively less expressed than the proteins encoded by ORF 1a (protein 1 – 11).,0,0,0,5
72,"5′ exonuclease with a clear proofreading activity , protein 15 with unclear endo – ribonuclease activity , protein 7 and 8, which acts as activating co – factors of polymerase, protein 10, acts as 2′ O – methyltransferase , protein 3, 4, and 6, are three transmembrane proteins that acts as anchoring protein of replicative transcription complex to the membrane , protein 13, a conserved region helps in unwinding the RNA from 5′ – 3′ direction , and protein 9, unique to coronavirus that protects the genome from degradation during replication.",0,0,2,11
109,"Similarly, stated that though 99% nucleotide homology was shared between the 8 strains isolated from South Korea with Riyadh strain, about 13 variant nucleotides were observed in 24 – 27 nucleotides position across the genome namely 6 variations in ORF1ab gene, 5 in S gene, and 1 each in ORF 4a and ORF 5 evidencing the micro –",0,0,3,8
127,Gly (residue position 482 – 485) has made the ridge more compact enabling better contact with the N – terminal helix of human ACE2.,1,0,0,2
141,Genome of coronavirus exhibits several intrinsic point mutations in the order of about 1 × 10− 6 per site in the RNA – dependent – RNA – polymerase gene and possess a property of acquiring 100 – 1000 base – pair fragments from other coronaviruses during co – infection in single species.,1,0,0,4
