In [1]:
import spacy
from termcolor import colored
from spacy import displacy                                                 
import pandas as pd
import re
import fnmatch
import pandas as pd
pd.set_option('display.max_colwidth', -1)

  


In [2]:
# Loads NLP model
nlp = spacy.load('en')

In [3]:
# Reads text
file = open('random.txt','r')
text = file.read()
text

'Abstract\nBackground\nBoth vector borne and sexual transmission of Zika virus (ZIKV) involve infection of epithelial cells in the initial stages of infection. Epithelial cells are unique in their ability to form polarized monolayers and their barrier function. Cell polarity induces an asymmetry in the epithelial monolayer, which is maintained by tight junctions and specialized sorting machinery. This differential localization can have a potential impact of virus infection. Asymmetrical distribution of a viral receptor can restrict virus entry to a particular membrane while polarized sorting can lead to a directional release of virions. The present study examined the impact of cell polarity on ZIKV infection and release.\n\nMethods\nA polarized Caco-2 cell model we described previously was used to assess ZIKV infection. Transepithelial resistance (TEER) was used to assess epithelial cell polarity, and virus infection was measured by immunofluorescence microscopy and qRT-PCR. Cell perme

In [None]:
# Convert text into NLP object
doc = nlp(text)

In [21]:
# Named Entity Recognition
displacy.serve(doc, style='ent')




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [5]:
sents = list(str(sent) for sent in list(doc.sents))
sents

['Abstract\nBackground\n',
 'Both vector borne and sexual transmission of Zika virus (ZIKV)',
 'involve infection of epithelial cells in the initial stages of infection.',
 'Epithelial cells are unique in their ability to form polarized monolayers and their barrier function.',
 'Cell polarity induces an asymmetry in the epithelial monolayer, which is maintained by tight junctions and specialized sorting machinery.',
 'This differential localization can have a potential impact of virus infection.',
 'Asymmetrical distribution of a viral receptor can restrict virus entry to a particular membrane while polarized sorting can lead to a directional release of virions.',
 'The present study examined the impact of cell polarity on ZIKV infection and release.\n\n',
 'Methods\n',
 'A polarized Caco-2 cell model we described previously was used to assess ZIKV infection.',
 'Transepithelial resistance (TEER) was used to assess epithelial cell polarity, and virus infection was measured by immunoflu

In [6]:
# Put all sentences in the dataframe
sentsdf = pd.DataFrame(sents, columns = ['Sentence'])
sentsdf

Unnamed: 0,Sentence
0,Abstract\nBackground\n
1,Both vector borne and sexual transmission of Zika virus (ZIKV)
2,involve infection of epithelial cells in the initial stages of infection.
3,Epithelial cells are unique in their ability to form polarized monolayers and their barrier function.
4,"Cell polarity induces an asymmetry in the epithelial monolayer, which is maintained by tight junctions and specialized sorting machinery."
...,...
221,Conclusions\n
222,Our data show that polarized epithelial cells are susceptible to ZIKV infection.
223,The virus enters preferentially through the apical side and buds selectively through the basolateral membrane.
224,"Data from permeability assays and electron microscopy indicate that the virus is actually translocating transcellularly rather than paracellular manner, and ZIKV does not need disruption of TJ proteins to cross the epithelial barrier."


In [7]:
# Listing keywords
general_numerical_keywords = ['time','number*','ratio','proportion','period','±','total*','estimate*','%']
specific_numerical_keywords = ['infections','death*','transmis*','laten*','contact','infectious','incubat*','casualties','mortal*','morbid*','outbreak*']
contextual_keywords = ['GPE','DATE','TIME','PRODUCT']

# Creating a regular expression using keywords for searching and filtering 
trait_keywords = general_numerical_keywords + specific_numerical_keywords
trait_keywords_regex = '|'.join(trait_keywords)
trait_keywords_regex

'time|number*|ratio|proportion|period|±|total*|estimate*|%|infections|death*|transmis*|laten*|contact|infectious|incubat*|casualties|mortal*|morbid*|outbreak*'

In [8]:
'''
Parses a sentence, looking for trait-related keywords.

args: sentence - string of sentence.
return: trait_keyword_match_count - the number how many trait-related keywords the sentence contains.
'''
def countTraitKeywords(sentence):
    sentence_obj = nlp(sentence)
    trait_keyword_match_count = len(re.findall(trait_keywords_regex, str(sentence_obj)))
    return trait_keyword_match_count

'''
Parses a sentence, looking for context-related keywords.

args: sentence - string of sentence.
return: trait_keyword_match_count - the number how many context-related keywords the sentence contains.
'''
def countContextualKeywords(sentence):
    sentence_obj = nlp(sentence)
    ent_labels = [ent.label_ for ent in sentence_obj.ents]
    contextual_keyword_match_count = len([label for label in ent_labels if label in contextual_keywords])
    return contextual_keyword_match_count

'''
Parses a sentence, counting occurences of cardinal elements.

args: sentence - string of sentence.
return: numericness - how many numbers the sentence contains.
'''
def calculateNumericness(sentence):
    sentence_obj = nlp(sentence)
    ent_labels = [ent.label_ for ent in sentence_obj.ents]
    numericness = len([label for label in ent_labels if label == 'CARDINAL'])
    return numericness

In [9]:
sentsdf['TKMC'] = sentsdf.Sentence.map(countTraitKeywords) # Trait Keyword Match Count

In [10]:
sentsdf['CKMC'] = sentsdf.Sentence.map(countContextualKeywords) #Contextual Keyword Match Count

In [11]:
sentsdf['SKMC'] = sentsdf.TKMC + sentsdf.CKMC # Sum total of above two

In [12]:
sentsdf['Numericness'] = sentsdf.Sentence.map(calculateNumericness) # Counts how many times the CARDINAL label shows up

In [13]:
sentsdf['Relevance'] = sentsdf.TKMC * sentsdf.Numericness # My attempt at guessing how many 'numeric' is sentence
# Don't confuse this with the function below. They have nothing to do with each other.
# I just couldn't think of names.

In [14]:
sentsdf

Unnamed: 0,Sentence,TKMC,CKMC,SKMC,Numericness,Relevance
0,Abstract\nBackground\n,0,0,0,0,0
1,Both vector borne and sexual transmission of Zika virus (ZIKV),1,0,1,0,0
2,involve infection of epithelial cells in the initial stages of infection.,0,0,0,0,0
3,Epithelial cells are unique in their ability to form polarized monolayers and their barrier function.,0,0,0,0,0
4,"Cell polarity induces an asymmetry in the epithelial monolayer, which is maintained by tight junctions and specialized sorting machinery.",0,0,0,0,0
...,...,...,...,...,...,...
221,Conclusions\n,0,0,0,0,0
222,Our data show that polarized epithelial cells are susceptible to ZIKV infection.,0,0,0,0,0
223,The virus enters preferentially through the apical side and buds selectively through the basolateral membrane.,1,0,1,0,0
224,"Data from permeability assays and electron microscopy indicate that the virus is actually translocating transcellularly rather than paracellular manner, and ZIKV does not need disruption of TJ proteins to cross the epithelial barrier.",0,0,0,0,0


In [15]:
# An example of filtering
sentsdf_filt = sentsdf.loc[(sentsdf.SKMC >= 3) & (sentsdf.Numericness >=2)]
print(len(sentsdf_filt))
sentsdf_filt

4


Unnamed: 0,Sentence,TKMC,CKMC,SKMC,Numericness,Relevance
21,"While mosquito-borne transmission is the most common, other routes of transmission, including sexual transmission, have been reported [5, 6].",3,0,3,2,6
58,"Cells were equilibriated with IMF buffer (20 mM HEPES, pH 7.5, 0.1% Triton-X-100, 150 mM sodium chloride, 5 mM EDTA and 0.02% sodium azide as a preservative) for 5 min at room temperature (RT) followed by overnight incubation with anti-E-cadherin or mouse polyclonal sera against ZIKV at 4",3,1,4,3,9
76,"50 μl of ZIKV suspension at a concentration of 3 pfu/ cell was added either apically or basolaterally, and incubated for 1 h at 37 °C, and washed three times with PBS, followed by addition of 2% fetal bovine serum medium and incubation at 37 °C.",6,1,7,3,18
79,"Polarized Caco-2 cells (Day 6 post seeding) were infected with ZIKV-PR (3 pfu/ cell) either apically or basolaterally, and incubated for 1 h at 37 °C, and washed three times with PBS, followed by addition of 2% fetal bovine serum medium and incubation at 37 °C.",5,1,6,3,15


In [16]:
# Sentences with most relevance
sentsdf_top = sentsdf.sort_values(by=['TKMC'],ascending=False)
sentsdf_top.head()

Unnamed: 0,Sentence,TKMC,CKMC,SKMC,Numericness,Relevance
76,"50 μl of ZIKV suspension at a concentration of 3 pfu/ cell was added either apically or basolaterally, and incubated for 1 h at 37 °C, and washed three times with PBS, followed by addition of 2% fetal bovine serum medium and incubation at 37 °C.",6,1,7,3,18
79,"Polarized Caco-2 cells (Day 6 post seeding) were infected with ZIKV-PR (3 pfu/ cell) either apically or basolaterally, and incubated for 1 h at 37 °C, and washed three times with PBS, followed by addition of 2% fetal bovine serum medium and incubation at 37 °C.",5,1,6,3,15
140,ZIKV egress occurs through the basolateral route\nViruses which bud apically tend to cause localized infections while basolaterally budding viruses are more likely to cause systemic infections [35].,4,0,4,1,4
84,"The ZIKV supernatant was added onto the cells and incubated for 1 h at 37 °C, followed by removal of inoculum and addition of 2% fetal bovine serum medium and incubated at 37 °C.",3,0,3,1,3
58,"Cells were equilibriated with IMF buffer (20 mM HEPES, pH 7.5, 0.1% Triton-X-100, 150 mM sodium chloride, 5 mM EDTA and 0.02% sodium azide as a preservative) for 5 min at room temperature (RT) followed by overnight incubation with anti-E-cadherin or mouse polyclonal sera against ZIKV at 4",3,1,4,3,9


In [22]:
'''
Displays the text surrounding the sentence with the provided index. To refer to the context, perhaps.
args: idx - index of sentence in question.
    df = Dataframe, default is sentsdf.
    pm = plus or minus for the indices of surrounding sentences.
'''
def displaySurroundingText(idx, df = sentsdf, pm = 5):
    n = len(df)
    if (idx-pm) < 0 and (idx+pm) > n:
        display(df)
        return
    elif (idx-pm) < 0:
        display(df.loc[0:idx+pm,:])
        return
    elif (idx-pm) > n:
        display(df.loc[idx-pm:n,:])
        return
    display(df.loc[idx-pm:idx+pm,:])
    return

from collections import Counter

'''
Print entity labels and their occurences within a given sentence.
args: sentence - The sentence in question.
    from_keywords - Whether or not to only include info on labels in the contextual keywords. Does not by default.
return: list containing tuples of entity labels and their occurences.
'''
def printEntityLabels(sentence, from_keywords = False):
    sentence_obj = nlp(sentence)
    ent_labels = [ent.label_ for ent in sentence_obj.ents if (not from_keywords or ent.label_ in contextual_keywords)]
    labels = Counter(ent_labels).keys()
    counts = Counter(ent_labels).values()
    return list(zip(labels, counts))

'''
Prints all sentences from Dataframe with provided keywords.
args: filter_words: Words to filter results by. By default, the trait keywords.
    df - The Dataframe to filter. By default, sentsdf.
return: Returns Dataframe with only sentences including keywords from the list.
'''
def sentencesWith(filter_words=trait_keywords, df=sentsdf):
    if isinstance(filter_words, str):
        filter_regex = filter_words
    elif isinstance(filter_words, list):
        filter_regex = '|'.join(filter_words)
    return df[df.Sentence.str.lower().str.contains(filter_regex)]

'''
Calculates the relevance of a given dataframe, based on matches in the trait keyword  list. 
Used to determine relevance of the entire article.
args: df - The dataframe. By default, sentsdf.
return: The numeric approximation of the relevance of the provided dataframe.
'''
def calculateRelevance(df = sentsdf):
    return sum(df.TKMC) / len(df) * 10.000

In [32]:
calculateRelevance(sentsdf_top.head())

42.0

In [29]:
sentencesWith(trait_keywords).sort_values(by='TKMC',ascending=False)

Unnamed: 0,Sentence,TKMC,CKMC,SKMC,Numericness,Relevance
76,"50 μl of ZIKV suspension at a concentration of 3 pfu/ cell was added either apically or basolaterally, and incubated for 1 h at 37 °C, and washed three times with PBS, followed by addition of 2% fetal bovine serum medium and incubation at 37 °C.",6,1,7,3,18
79,"Polarized Caco-2 cells (Day 6 post seeding) were infected with ZIKV-PR (3 pfu/ cell) either apically or basolaterally, and incubated for 1 h at 37 °C, and washed three times with PBS, followed by addition of 2% fetal bovine serum medium and incubation at 37 °C.",5,1,6,3,15
140,ZIKV egress occurs through the basolateral route\nViruses which bud apically tend to cause localized infections while basolaterally budding viruses are more likely to cause systemic infections [35].,4,0,4,1,4
84,"The ZIKV supernatant was added onto the cells and incubated for 1 h at 37 °C, followed by removal of inoculum and addition of 2% fetal bovine serum medium and incubated at 37 °C.",3,0,3,1,3
21,"While mosquito-borne transmission is the most common, other routes of transmission, including sexual transmission, have been reported [5, 6].",3,0,3,2,6
...,...,...,...,...,...,...
100,Caco-2 cells seeded for 6 days showing around 100 Ω resistance were infected with ZIKV-PR at a concentration of 3pfu/ cell.,1,2,3,1,1
106,"a Polarized Caco-2 cells were infected with 3 pfu/cell ZIKV-PR, and analyzed for cellular ZIKV RNA levels by qRT-PCR at indicated times (b)",1,1,2,1,1
109,c Caco-2 monolayers were infected apically with ZIKV and fixed in 10% neutral buffered formalin solution and stained for E-cadherin (red) and ZIKV (green).,1,1,2,0,0
12,Statistical significance was calculated using one-way ANOVA and significance was set at p < 0.05.\n\n,1,1,2,1,1
