In [5]:
import spacy
from spacy.matcher import Matcher
import numpy as np
from spacy import displacy                                                 
import pandas as pd
# Show full cell contents instead of truncating long sentences.
# None means "no limit"; the older -1 sentinel is rejected by current pandas.
pd.set_option('display.max_colwidth', None)
import re

ModuleNotFoundError: No module named 'spacy'

In [4]:
# Load spaCy's English pipeline; the later cells parse text by calling `nlp`.
# NOTE(review): 'en' is a model shortcut name - presumably it only resolves on
# spaCy 2.x installs where the shortcut link exists; confirm against the
# installed spaCy version.
nlp = spacy.load('en')

NameError: name 'spacy' is not defined

In [None]:
# Read the whole article into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open.
with open('random_full.txt','r') as file:
    text = file.read()
text

In [None]:
# Parse the article text with the loaded pipeline; `textdoc` is the parsed
# document whose sentences are extracted in the following cells.
textdoc = nlp(text)

In [None]:
# Named Entity Recognition
#displacy.serve(doc, style='ent')

In [None]:
# Split the parsed document into sentences and keep each one as plain text
sents = [str(sentence_span) for sentence_span in textdoc.sents]
sents

In [None]:
# Put all sentences in a one-column dataframe: one row per sentence, column 'Sentence'
sentsdf = pd.DataFrame(sents, columns = ['Sentence'])
sentsdf

In [None]:
# Listing keywords. A trailing '*' is a wildcard for "any word ending"
# (e.g. 'death*' covers death and deaths); every other entry is literal text.
general_numerical_keywords = ['time','number*','ratio','proportion','period','±','total*','estimate*','%']
specific_numerical_keywords = ['infections','death*','transmis*','laten*','contact','infectious','incubat*','casualties','mortal*','morbid*','outbreak*']
# Entity labels that count as context when scoring a sentence
contextual_keywords = ['GPE','DATE','TIME','PRODUCT']

# Creating a regular expression using keywords for searching and filtering.
# Each keyword is escaped so it matches literally and a trailing '*' becomes
# '\w*'. Joining the raw keywords would turn '*' into a regex quantifier on the
# last letter, so 'laten*' would match the 'late' inside "related".
trait_keywords = general_numerical_keywords + specific_numerical_keywords
trait_keywords_regex = '|'.join(
    re.escape(keyword[:-1]) + r'\w*' if keyword.endswith('*') else re.escape(keyword)
    for keyword in trait_keywords
)
trait_keywords_regex

In [None]:
def countTraitKeywords(sentence):
    '''
    Counts the trait-related keyword matches in a sentence.

    The count is the number of non-overlapping matches of the module-level
    trait_keywords_regex. Matching is case-sensitive.

    args: sentence - string of sentence.
    return: trait_keyword_match_count - how many trait-related keyword matches the sentence contains.
    '''
    # Match on the raw string. spaCy tokenization is non-destructive, so parsing
    # the sentence with nlp() and converting the result back to a string yields
    # the same text at the cost of a full pipeline run per sentence.
    trait_keyword_match_count = len(re.findall(trait_keywords_regex, sentence))
    return trait_keyword_match_count

def countContextualKeywords(sentence):
    '''
    Counts the entities in a sentence whose label is a contextual keyword.

    args: sentence - string of sentence.
    return: contextual_keyword_match_count - how many entities found in the
        sentence carry a label listed in contextual_keywords.
    '''
    contextual_keyword_match_count = 0
    for entity in nlp(sentence).ents:
        if entity.label_ in contextual_keywords:
            contextual_keyword_match_count += 1
    return contextual_keyword_match_count

def findCardinality(sentence):
    '''
    Counts the occurrences of cardinal elements in a sentence.

    args: sentence - string of sentence.
    return: cardinality - how many entities labelled 'CARDINAL' the sentence contains.
    '''
    return sum(1 for entity in nlp(sentence).ents if entity.label_ == 'CARDINAL')

In [None]:
# Per-sentence scores, added as new columns on sentsdf:
#   TKMC        - trait keyword match count (countTraitKeywords)
#   CKMC        - contextual keyword match count (countContextualKeywords)
#   SKMC        - sum of the two keyword counts
#   Cardinality - number of CARDINAL entities (findCardinality)
sentsdf['TKMC'] = sentsdf.Sentence.map(countTraitKeywords)
sentsdf['CKMC'] = sentsdf.Sentence.map(countContextualKeywords)
sentsdf['SKMC'] = sentsdf['TKMC'] + sentsdf['CKMC']
sentsdf['Cardinality'] = sentsdf.Sentence.map(findCardinality)

# Heuristic score for how 'numeric' each sentence is: trait keyword count
# multiplied by the cardinal count, so it is 0 whenever either factor is 0.
sentsdf['Numericness'] = sentsdf.TKMC * sentsdf.Cardinality

In [None]:
sentsdf

In [None]:
# An example of filtering: keep sentences with at least 3 keyword matches in
# total (SKMC) and at least 2 cardinal entities, then report how many remain.
sentsdf_filt = sentsdf.loc[(sentsdf.SKMC >= 3) & (sentsdf.Cardinality >=2)]
print(len(sentsdf_filt))
sentsdf_filt

In [None]:
# Sentences with most relevance: a copy of sentsdf ordered by trait keyword
# match count, highest first. The original row labels are kept.
sentsdf_top = sentsdf.sort_values(by=['TKMC'],ascending=False)
sentsdf_top.head()

In [None]:
def displaySurroundingText(idx, df = sentsdf, pm = 1):
    '''
    Displays the text surrounding the sentence with the provided index, e.g. to
    read a sentence in its context.

    Rows are selected by index label, with 0 and len(df) - 1 used as the first
    and last label, so df is expected to carry the default integer index. A
    window that reaches past either end is clipped to the dataframe.

    args: idx - index of sentence in question.
        df - Dataframe, default is sentsdf (bound when this function is defined).
        pm - plus or minus for the indices of surrounding sentences.
    return: None; the selected rows are rendered with display().
    '''
    last_label = len(df) - 1
    starts_before_first = (idx - pm) < 0
    # The upper check looks at the END of the window (idx + pm) and compares it
    # with the last valid label, not with the row count.
    ends_after_last = (idx + pm) > last_label

    if starts_before_first and ends_after_last:
        display(df)
    elif starts_before_first:
        display(df.loc[0:idx+pm,:])
    elif ends_after_last:
        display(df.loc[idx-pm:last_label,:])
    else:
        display(df.loc[idx-pm:idx+pm,:])

from collections import Counter

def printEntityLabels(sentence, from_keywords = False):
    '''
    Tallies the entity labels found in a given sentence. Despite the name,
    nothing is printed; the tally is returned.

    args: sentence - The sentence in question.
        from_keywords - Whether or not to only include labels that are in the
            contextual keywords. Does not by default.
    return: list containing tuples of entity labels and their occurrences.
    '''
    label_tally = Counter()
    for entity in nlp(sentence).ents:
        if from_keywords and entity.label_ not in contextual_keywords:
            continue
        label_tally[entity.label_] += 1
    return list(label_tally.items())

def sentencesWith(filter_words=trait_keywords, df=sentsdf_top):
    '''
    Returns all sentences from a Dataframe that contain the provided keywords.
    Matching is done against the lower-cased sentence text.

    args: filter_words - Words to filter results by. Either a single regular
            expression string, used as-is, or a list/tuple of keywords. Each
            keyword in a list is matched literally, and a trailing '*' is a
            wildcard for any word ending. By default, the trait keywords.
        df - The Dataframe to filter. By default, sentsdf_top (bound when this
            function is defined).
    return: Dataframe with only sentences including keywords from the list.
    raises: TypeError if filter_words is neither a string nor a list/tuple.
    '''
    if isinstance(filter_words, str):
        filter_regex = filter_words
    elif isinstance(filter_words, (list, tuple)):
        # Escape each keyword and drop its trailing '*'. A substring search
        # already matches any ending, whereas a raw '*' would act as a regex
        # quantifier ('laten*' would match the 'late' inside "related").
        filter_regex = '|'.join(re.escape(word.rstrip('*')) for word in filter_words)
    else:
        raise TypeError(
            'filter_words must be a string or a list of keywords, got '
            + type(filter_words).__name__
        )
    # na=False keeps rows with a missing sentence out of the result instead of
    # producing a mask that cannot be used for boolean indexing.
    return df[df.Sentence.str.lower().str.contains(filter_regex, na=False)]

def calculateRelevance(df = sentsdf_top.head()):
    '''
    Calculates the relevance of a given dataframe, based on matches in the
    trait keyword list. Used to determine relevance of the entire article.

    args: df - The dataframe; it must have a TKMC column. By default,
        sentsdf_top.head(), evaluated once when this function is defined.
    return: The numeric approximation of the relevance of the provided
        dataframe: the mean TKMC per row multiplied by 10, or 0.0 for an empty
        dataframe.
    '''
    row_count = len(df)
    if row_count == 0:
        # No sentences means no keyword evidence; avoid dividing by zero.
        return 0.0
    return sum(df.TKMC) / row_count * 10.000

In [None]:
calculateRelevance()

In [None]:
# Sentences matched by sentencesWith() with its default arguments, narrowed to
# those that also contain at least one cardinal entity.
df1 = sentencesWith()
df1.loc[df1.Cardinality > 0]

In [None]:
# Maps a keyword as it may appear in the text to the parameter it stands for;
# several keywords can share one parameter ('latency' and 'latent').
dict_map = {
    "infectious":"infectious", 
    "contact":"contact",
    "latency":"latency",
    "latent":"latency",
    "reproduction":"reproduction"
}
# Empty results table with one row per distinct parameter. The parameters are
# sorted because a set has no guaranteed order, which would make the row order
# differ from run to run.
stats_df = pd.DataFrame(index = sorted(set(dict_map.values())), columns=['Estimates','Citation','Rule'])
stats_df.index.name = 'Parameter'
stats_df

In [None]:
# Rule-based matcher built on the loaded pipeline's vocabulary
matcher = Matcher(nlp.vocab)
# Label of the sentence currently being matched. It is assigned by the loop
# that feeds sentences to the matcher and read by funnel_values, which stores
# it in the 'Citation' column.
idx = None
        

def funnel_values(estimates, parameter, rule_name):
    '''
    Records an extracted estimate in the module-level stats_df.

    The module-level idx is read and stored as the 'Citation' of the row.

    args: estimates - value stored in the 'Estimates' column.
        parameter - label of the stats_df row to fill, or a falsy value when no
            parameter was identified.
        rule_name - value stored in the 'Rule' column.
    return: None; stats_df is modified in place.
    '''
    global stats_df
    if parameter:
        # Known parameter: overwrite the three columns of that parameter's row.
        stats_df.at[parameter,'Estimates'] = estimates
        stats_df.at[parameter,'Citation'] = idx
        stats_df.at[parameter,'Rule'] = rule_name
    elif idx not in list(stats_df.Citation) and\
        estimates not in list(stats_df.Estimates.loc[stats_df.Citation == idx]) :
        # No parameter: append a row labelled with the current row count, but
        # only when idx has not been stored as a citation yet.
        # NOTE(review): once idx is absent from Citation, the second test only
        # looks at rows where Citation == idx, i.e. at no rows - it appears to
        # be redundant. Confirm whether 'or' (one row per distinct estimate of a
        # sentence) was intended.
        stats_df.loc[len(stats_df)] = [estimates, idx, rule_name]

def pm_map(matcher, doc, id, matches):
    '''
    Matcher callback for "<number> ± <number>" spans.

    Reads the mean and the margin of error from the matched span, turns them
    into a (mean - margin, mean + margin) range and passes that range to
    funnel_values under the rule name "pm_map".

    args: matcher - the matcher that produced the match.
        doc - the document the matcher was run on.
        id - position in matches of the match this call is for.
        matches - all (match_id, start, end) tuples found in doc.
    return: None.
    '''
    # The callback runs once per match and is handed the full match list, which
    # also contains the matches of other rules. Only the match this call was
    # made for is processed; looping over everything would parse other rules'
    # spans with this rule's logic.
    match_id, start, end = matches[id]
    # Token texts instead of whitespace splitting, so spans written without
    # spaces ("5±1") still yield separate number and sign entries.
    span_tokens = [token.text for token in doc[start:end]]

    try:
        avg = round(float(span_tokens[-3]),2)
        moe = round(float(span_tokens[-1]),2)
    except ValueError:
        # Number-like tokens that float() cannot parse (e.g. "1,000" or "ten")
        # are skipped rather than aborting the whole matcher run.
        return

    estimates = (avg-moe,avg+moe)
    parameter = dict_map.get(span_tokens[0])

    funnel_values(estimates, parameter, "pm_map")

        
def bw_map(matcher, doc, id, matches):
    '''
    Matcher callback for "<number> – <number>" spans (a range between two values).

    Reads the lower and upper bound from the matched span and passes them as a
    (lower, upper) tuple to funnel_values under the rule name "bw_map".

    args: matcher - the matcher that produced the match.
        doc - the document the matcher was run on.
        id - position in matches of the match this call is for.
        matches - all (match_id, start, end) tuples found in doc.
    return: None.
    '''
    # The callback runs once per match and is handed the full match list, which
    # also contains the matches of other rules. Only the match this call was
    # made for is processed; a "±" span from another rule has no dash to split
    # on and would fail to parse here.
    match_id, start, end = matches[id]
    span_tokens = [token.text for token in doc[start:end]]

    try:
        lower = round(float(span_tokens[0]),2)
        upper = round(float(span_tokens[-1]),2)
    except ValueError:
        # Number-like tokens that float() cannot parse (e.g. "1,000" or "ten")
        # are skipped rather than aborting the whole matcher run.
        return

    estimates = (lower,upper)
    parameter = dict_map.get(span_tokens[0])

    funnel_values(estimates, parameter, "bw_map")

In [None]:
# Token patterns: a number-like token, a literal separator token, and another
# number-like token.
pm_rule = [{"LIKE_NUM":True}, {"TEXT":"±"}, {"LIKE_NUM":True}]  # value ± margin
bw_rule = [{"LIKE_NUM":True}, {"TEXT":"–"}, {"LIKE_NUM":True}]  # lower – upper

# Register each pattern together with the callback that handles its matches.
# NOTE(review): the (key, callback, pattern) argument order presumably targets
# the spaCy 2.x Matcher.add signature - confirm against the installed version.
matcher.add("pm_rule", pm_map, pm_rule)
matcher.add("bw_rule", bw_map, bw_rule)

In [None]:
# Start from an empty results table so that re-running this cell does not keep
# rows from an earlier run. The parameters are sorted because a set has no
# guaranteed order, which would make the row order differ from run to run.
stats_df = pd.DataFrame(index = sorted(set(dict_map.values())), columns=['Estimates','Citation','Rule'])
stats_df.index.name = 'Parameter'

# Run the matcher over every sentence that mentions a specific keyword. The
# rule callbacks write into stats_df, and funnel_values reads the module-level
# loop variable idx as the citation of each recorded estimate.
sents_filt = sentencesWith(specific_numerical_keywords)
for idx in sents_filt.index:
    # Explicit label-based lookup of this row's sentence
    sentence = sents_filt.loc[idx, 'Sentence']
    # Pad dashes with spaces - presumably so that "2–5" is split into the three
    # tokens bw_rule expects.
    sentence = sentence.replace("–"," – ")
    doc = nlp(sentence)
    matches = matcher(doc)
stats_df

In [None]:
displaySurroundingText(207)

In [None]:
senten