In [1]:
import spacy
from spacy.matcher import Matcher
import numpy as np
from spacy import displacy                                                 
import pandas as pd
pd.set_option('display.max_colwidth', -1)
import re

  


In [2]:
# Loads NLP English model
nlp = spacy.load('en')

In [3]:
# Reads text
file = open('random_full.txt','r')
text = file.read()
text

"Since January 2020 Elsevier has created a COVID-19 resource centre with\nfree information in English and Mandarin on the novel coronavirus COVID19. The COVID-19 resource centre is hosted on Elsevier Connect, the\ncompany's public news and information website.\nElsevier hereby grants permission to make all its COVID-19-related\nresearch that is available on the COVID-19 resource centre - including this\nresearch content - immediately available in PubMed Central and other\npublicly funded repositories, such as the WHO COVID database with rights\nfor unrestricted research re-use and analyses in any form or by any means\nwith acknowledgement of the original source. These permissions are\ngranted for free by Elsevier for as long as the COVID-19 resource centre\nremains active.\nShort Communication\nThe assessment of transmission efficiency and latent infection\nperiod in asymptomatic carriers of SARS-CoV-2 infection\nZhirong Liua,c,1\n, Ruilin Chub,1\n, Lei Gonga,c,1\n, Bin Sua,c\n, Jiabin

In [29]:
# Convert text into NLP object
textdoc = nlp(text)

In [115]:
# Named Entity Recognition
#displacy.serve(textdoc, style='ent')

In [114]:
sents = list(str(sent) for sent in list(textdoc.sents))
sents[:5]

['Since January 2020 Elsevier has created a COVID-19 resource centre with\nfree information in English and Mandarin on the novel coronavirus COVID19.',
 "The COVID-19 resource centre is hosted on Elsevier Connect, the\ncompany's public news and information website.\n",
 'Elsevier hereby grants permission to make all its COVID-19-related\nresearch that is available on the COVID-19 resource centre - including this\nresearch content - immediately available in PubMed Central and other\npublicly funded repositories, such as the WHO COVID database with rights\nfor unrestricted research re-use and analyses in any form or by any means\nwith acknowledgement of the original source.',
 'These permissions are\ngranted for free by Elsevier for as long as the COVID-19 resource centre\nremains active.\n',
 'Short Communication\n']

In [117]:
# Put all sentences in the dataframe
sentsdf = pd.DataFrame(sents, columns = ['Sentence'])
sentsdf

Unnamed: 0,Sentence
0,Since January 2020 Elsevier has created a COVID-19 resource centre with\nfree information in English and Mandarin on the novel coronavirus COVID19.
1,"The COVID-19 resource centre is hosted on Elsevier Connect, the\ncompany's public news and information website.\n"
2,"Elsevier hereby grants permission to make all its COVID-19-related\nresearch that is available on the COVID-19 resource centre - including this\nresearch content - immediately available in PubMed Central and other\npublicly funded repositories, such as the WHO COVID database with rights\nfor unrestricted research re-use and analyses in any form or by any means\nwith acknowledgement of the original source."
3,These permissions are\ngranted for free by Elsevier for as long as the COVID-19 resource centre\nremains active.\n
4,Short Communication\n
...,...
221,"Assessment of latent infectivity, based on 16 confirmed cases who had transferred from being asymptomatic carriers.\n"
222,Z. Liu et al.
223,/
224,International Journal of Infectious Diseases 99 (2020


In [119]:
# Listing keywords
numeric_keywords = ['time','number*','ratio','proportion','period','±','total*','estimate*','%']
specific_keywords = ['infections','death*','transmis*','laten*','contact','infectious','incubat*','casualties','mortal*','morbid*','outbreak*']
contextual_keywords = ['GPE','DATE','TIME','PRODUCT']

# Creating a regular expression using keywords for searching and filtering 
numeric_regex = '|'.join(numeric_keywords)
specific_regex = '|'.join(numeric_keywords)

trait_keywords = numeric_keywords + specific_keywords
trait_regex = '|'.join(trait_keywords)
trait_regex

'time|number*|ratio|proportion|period|±|total*|estimate*|%|infections|death*|transmis*|laten*|contact|infectious|incubat*|casualties|mortal*|morbid*|outbreak*'

In [111]:
'''
Parses a sentence, looking for keywords.

args: sentence - string of sentence.
return: specific_keyword_count - total number of specific keywords in sentence
        numeric_keyword_count - total number of numeric keywords in sentence
        contextual_keyword_count - total number of contextual entity labels in sentence
        cardinality - total number of numeric entity labels in sentence
'''
def countKeywords(sentence):
    specific_keyword_count = numeric_keyword_count = contextual_keyword_count = cardinality = 0
    
    # counts specific and numeric keywords first
    for word in sentence.split():
        if re.match(specific_regex, word): specific_keyword_count += 1
        elif re.match(numeric_regex, word): numeric_keyword_count += 1
    
    # counts key entity labels
    sentence_obj = nlp(sentence)
    ent_labels = [ent.label_ for ent in sentence_obj.ents]
    for label in ent_labels:
        if label in contextual_keywords: context_keyword_count += 1
        elif label in ['CARDINAL']: cardinality += 1
        
    return specific_keyword_count, numeric_keyword_count, contextual_keyword_count, cardinality

In [103]:
sentsdf['SKC'],  sentsdf['NKC'], sentsdf['CKC'], sentsdf['Cardinality'] = zip(*sentsdf['Sentence'].apply(countKeywords))
sentsdf

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
0,Since January 2020 Elsevier has created a COVID-19 resource centre with\nfree information in English and Mandarin on the novel coronavirus COVID19.,0,0,1,0
1,"The COVID-19 resource centre is hosted on Elsevier Connect, the\ncompany's public news and information website.\n",0,0,1,0
2,"Elsevier hereby grants permission to make all its COVID-19-related\nresearch that is available on the COVID-19 resource centre - including this\nresearch content - immediately available in PubMed Central and other\npublicly funded repositories, such as the WHO COVID database with rights\nfor unrestricted research re-use and analyses in any form or by any means\nwith acknowledgement of the original source.",0,0,0,0
3,These permissions are\ngranted for free by Elsevier for as long as the COVID-19 resource centre\nremains active.\n,0,0,0,0
4,Short Communication\n,0,0,0,0
...,...,...,...,...,...
221,"Assessment of latent infectivity, based on 16 confirmed cases who had transferred from being asymptomatic carriers.\n",0,0,0,1
222,Z. Liu et al.,0,0,0,0
223,/,0,0,0,0
224,International Journal of Infectious Diseases 99 (2020,0,0,0,1


In [104]:
# An example of filtering
sentsdf_filt = sentsdf.sort_values(by="Cardinality", ascending=False)
sentsdf_filt

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
98,"Two cases had tested positive 5 days before developing\nsymptoms, one case 4 days before, three cases 3 days before, seven\ncases 2 days before, and three cases just 1 day prior to becoming ill.\n",0,0,5,5
134,) 123 (93.9) 16 (100.0)\n,0,0,0,4
104,"The 16 confirmed cases who\nhad previously been asymptomatic accounted for 236 close\ncontacts, with a second attack rate of 9.7%, while the remaining\n131 asymptomatic carriers accounted for 914 close contacts, with a\nsecond attack rate of 2.6%.",0,0,0,4
123,(30.6) 41 (31.3) 4 (25.0)\n,0,0,1,4
125,(19.7) 24 (18.3) 5 (31.2)\n,0,0,1,4
...,...,...,...,...,...
39,Human-to-human\ntransmission of SARS,0,0,0,0
37,"(Li et al., 2020; WHO, 2020;\nZhu et al., 2019).",0,0,3,0
116,Age 0.195\n,0,0,0,0
36,"The outbreak originated in Wuhan city, Hubei\nProvince in December 2019, and quickly spread to provinces and\ncities across the country and aboard",0,0,3,0


In [109]:
# Sentences with most relevance
sentsdf_top = sentsdf.sort_values(by='SKC',ascending=False)
sentsdf_top.head()

Unnamed: 0,Sentence,SKC,NKC,CKC,Cardinality
28,"Our study indicated that COVID-19 cases are\ncontagious during the incubation period, and that close contact screening should be extended to include\nthe incubation period.",2,0,0,0
26,"The possible latent infection period was found to range from 1–5 days\nbefore onset, with a median time of 2 days.",2,0,2,0
94,A total of 16 asymptomatic carriers developed\nsymptoms during the following 14-day observation period.\n,2,0,1,1
76,"clinical symptoms, including fever, chill, cough, and fatigue\nif these developed during the 14-day observation period; (3)\nlaboratory testing related to SARS-CoV-2 infection,including nucleic\ntest result and timing of test; (4) close contact information, such as\nnumbers, confirmed cases, and asymptomatic carriers.\n",2,0,1,2
169,Commission Emergency Research Project of Novel Coronavirus\nInfection (grant numbers 202004a07020002; 202004a07020004).\n,1,0,0,0


In [14]:
'''
Displays the text surrounding the sentence with the provided index. To refer to the context, perhaps.
args: idx - index of sentence in question.
    df = Dataframe, default is sentsdf.
    pm = plus or minus for the indices of surrounding sentences.
'''
def displaySurroundingText(idx, df = sentsdf, pm = 1):
    n = len(df)
    if (idx-pm) < 0 and (idx+pm) > n:
        display(df)
        return
    elif (idx-pm) < 0:
        display(df.loc[0:idx+pm,:])
        return
    elif (idx-pm) > n:
        display(df.loc[idx-pm:n,:])
        return
    display(df.loc[idx-pm:idx+pm,:])
    return

from collections import Counter

'''
Print entity labels and their occurences within a given sentence.
args: sentence - The sentence in question.
    from_keywords - Whether or not to only include info on labels in the contextual keywords. Does not by default.
return: list containing tuples of entity labels and their occurences.
'''
def printEntityLabels(sentence, from_keywords = False):
    sentence_obj = nlp(sentence)
    ent_labels = [ent.label_ for ent in sentence_obj.ents if (not from_keywords or ent.label_ in contextual_keywords)]
    labels = Counter(ent_labels).keys()
    counts = Counter(ent_labels).values()
    return list(zip(labels, counts))

'''
Prints all sentences from Dataframe with provided keywords.
args: filter_words: Words to filter results by. By default, the trait keywords.
    df - The Dataframe to filter. By default, sentsdf.
return: Returns Dataframe with only sentences including keywords from the list.
'''
def sentencesWith(filter_words=trait_keywords, df=sentsdf_top):
    if isinstance(filter_words, str):
        filter_regex = filter_words
    elif isinstance(filter_words, list):
        filter_regex = '|'.join(filter_words)
    return df[df.Sentence.str.lower().str.contains(filter_regex)]

'''
Calculates the relevance of a given dataframe, based on matches in the trait keyword  list. 
Used to determine relevance of the entire article.
args: df - The dataframe. By default, sentsdf.
return: The numeric approximation of the relevance of the provided dataframe.
'''
def calculateRelevance(df = sentsdf_top.head()):
    return sum(df.TKMC) / len(df) * 10.000

In [15]:
calculateRelevance()

42.0

In [16]:
df1 = sentencesWith()
df1.loc[df1.Cardinality > 0]

Unnamed: 0,Sentence,TKMC,CKMC,SKMC,Cardinality,Numericness
76,"clinical symptoms, including fever, chill, cough, and fatigue\nif these developed during the 14-day observation period; (3)\nlaboratory testing related to SARS-CoV-2 infection,including nucleic\ntest result and timing of test; (4) close contact information, such as\nnumbers, confirmed cases, and asymptomatic carriers.\n",4,1,5,2,8
104,"The 16 confirmed cases who\nhad previously been asymptomatic accounted for 236 close\ncontacts, with a second attack rate of 9.7%, while the remaining\n131 asymptomatic carriers accounted for 914 close contacts, with a\nsecond attack rate of 2.6%.",4,0,4,4,16
27,"The second attack rate for the 16 confirmed cases who had\ntransferred from being asymptomatic carriers was 9.7% (23/236 close contacts), while for the 131\nasymptomatic carriers the rate was 2.6% (24/914 close contacts), showing a significant difference in\nsecond attack rate between the two groups (p＜0.001).",4,0,4,3,12
94,A total of 16 asymptomatic carriers developed\nsymptoms during the following 14-day observation period.\n,2,1,3,1,2
103,47 close contacts tested positive for SARS-CoV-2 infection\nduring the 14-day observation period.,2,1,3,1,2
102,A total of 1150 close\ncontacts was determined in relation to the 147 asymptomatic\ncarriers.,2,0,2,2,4
99,The median period was calculated as 2 days (range 1–5).\n,2,1,3,1,2
97,The latent infectivity period was evaluated using the 16\nconfirmed cases who had transferred from being asymptomatic\ncarriers.,2,0,2,1,2
217,Clinical characteristics of 24\nasymptomatic infections with COVID-19 screened among close contacts in\nChina.,2,1,3,1,2
96,The results of our latent infectivity assessment are shown in\nFigure 1.,1,0,1,1,1


In [17]:
dict_map = {
    "infectious":"infectious", 
    "contact":"contact",
    "latency":"latency",
    "latent":"latency",
    "reproduction":"reproduction"
}
stats_df = pd.DataFrame(index = set(dict_map.values()), columns=['Estimates','Citation','Rule'])
stats_df.index.name = 'Parameter'
stats_df

Unnamed: 0_level_0,Estimates,Citation,Rule
Parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
infectious,,,
reproduction,,,
latency,,,
contact,,,


In [18]:
matcher = Matcher(nlp.vocab)
idx = None
        

def funnel_values(estimates, parameter, rule_name):
    global statsdf
    if parameter:
        statsdf.at[parameter,'Estimates'] = estimates
        statsdf.at[parameter,'Citation'] = idx
        statsdf.at[parameter,'Rule'] = rule_name
    elif idx not in list(statsdf.Citation) and\
        estimates not in list(statsdf.Estimates.loc[stats_df.Citation == idx]) :
        statsdf.loc[len(statsdf)] = [estimates, idx, rule_name]

def pm_map(matcher, doc, id, matches):
    for match_id, start, end in matches:
        string = str(doc[start:end])
        split_span = string.split()
        
        avg = round(float(split_span[-3]),2)
        moe = round(float(split_span[-1]),2)
        
        estimates = (avg-moe,avg+moe)
        parameter = dict_map.get(split_span[0])
        
        funnel_values(estimates, parameter, "pm_map")

        
def bw_map(matcher, doc, id, matches):
    for match_id, start, end in matches:
        string = str(doc[start:end])
        split_span = string.split('–')
        
        lower = round(float(split_span[0]),2)
        upper = round(float(split_span[1]),2)
        
        estimates = (lower,upper)
        parameter = dict_map.get(split_span[0])
        
        funnel_values(estimates, parameter, "bw_map")

In [19]:
pm_rule = [{"LIKE_NUM":True}, {"TEXT":"±"}, {"LIKE_NUM":True}]
bw_rule = [{"LIKE_NUM":True}, {"TEXT":"–"}, {"LIKE_NUM":True}]

matcher.add("pm_rule", pm_map, pm_rule)
matcher.add("bw_rule", bw_map, bw_rule)

In [20]:
stats_df = pd.DataFrame(index = set(dict_map.values()), columns=['Estimates','Citation','Rule'])
stats_df.index.name = 'Parameter'
stats_df

sents_filt = sentencesWith(specific_numerical_keywords)
for idx in sents_filt.index:
    sentence = sents_filt.Sentence[idx]
    sentence = sentence.replace("–"," – ")
    doc = nlp(sentence)
    matches = matcher(doc)
statsdf

Unnamed: 0_level_0,Estimates,Citation,Rule
Parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
infectious,,,
reproduction,,,
latency,,,
contact,,,
4,"(1.0, 5.0)",26.0,bw_map
5,"(1.0, 5.0)",158.0,bw_map
6,"(1.0, 5.0)",99.0,bw_map


In [91]:
if re.match(trait_regex,"tim"): print(1)

In [96]:
x = nlp('US is Canada')
x

US is Canada

In [97]:
x.ents

(US, Canada)