## Data Set

In [2]:
import pandas as pd
import glob
import time
import networkx as nx
import matplotlib.pyplot as plt
import re
%matplotlib inline

In [3]:
path = '/Volumes/GoogleDrive/My Drive/_My Data Analytics Exercise/Exercise/Biopython/CE/'
frame = pd.read_csv(path+'all_interactions.csv')

In [4]:
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1124 entries, 0 to 1123
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   interactionId  1124 non-null   object
 1   sentence       1124 non-null   object
 2   entities       1124 non-null   object
 3   interaction    1124 non-null   object
 4   relationship   1124 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 44.0+ KB


In [6]:
# the positive relationship is based on the what indicated in the original data set.
round(frame['relationship'].value_counts()/1124*100,2)

1    73.4
0    26.6
Name: relationship, dtype: float64

## Exploration

In [7]:
have_relationship = frame[frame['relationship']==1]
have_no_relationship = frame[frame['relationship']==0]

In [8]:
have_relationship.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 825 entries, 1 to 1122
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   interactionId  825 non-null    object
 1   sentence       825 non-null    object
 2   entities       825 non-null    object
 3   interaction    825 non-null    object
 4   relationship   825 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 38.7+ KB


In [9]:
have_relationship.head()

Unnamed: 0,interactionId,sentence,entities,interaction,relationship
1,BioInfer_d682,The recombinant material is similar to authent...,"[('T1', 'Individual_protein', 'skeletal actin'...","{'R1': ('T1', 'T3')}",1
3,BioInfer_d595,The common denominator is impaired beta-cateni...,"[('T1', 'Individual_protein', 'beta-catenin', ...","{'R1': ('T1', 'T2'), 'R2': ('T1', 'T3')}",1
8,BioInfer_d538,"Significantly, those actin mutants exhibiting ...","[('T4', 'Gene/protein/RNA', 'actin', (21, 26))...","{'R1': ('T1', 'T2'), 'R2': ('T1', 'T3')}",1
11,BioInfer_d54,Analyses of dynamic light scattering data by s...,"[('T6', 'Individual_protein', 'actin', (129, 1...","{'R1': ('T1', 'T3'), 'R2': ('T2', 'T4'), 'R3':...",1
12,BioInfer_d777,Treatment with HNE resulted in activation of e...,"[('T6', 'Protein_family_or_group', 'extracellu...","{'R1': ('T1', 'T6'), 'R2': ('T2', 'T6')}",1


In [10]:
# Retrieve one of the 825 sample sentences for detailed examination
entities = have_relationship.loc[11, 'entities']
interaction = have_relationship.loc[11, 'interaction']

In [16]:
entities # in string and not ready for analysis

"[('T6', 'Individual_protein', 'actin', (129, 134)), ('T4', 'Individual_protein', 'actin', (157, 162)), ('T5', 'Individual_protein', 'vinculin', (258, 266)), ('T1', 'Individual_protein', 'actin', (248, 253)), ('T3', 'Individual_protein', 'talin', (233, 238)), ('T2', 'Individual_protein', 'vinculin', (81, 89))]"

In [17]:
interaction # in string and not ready for analysis

"{'R1': ('T1', 'T3'), 'R2': ('T2', 'T4'), 'R3': ('T2', 'T6'), 'R4': ('T3', 'T5')}"

In [18]:
# Helper function to convert the entity string into individual components
def extractEntitiesfmDF(txt):
    identity = []
    txt = txt.replace("[","").replace("]","").replace("'","").replace("(","")
    entities = txt.split("), ")
    for entry in entities:
        atoms = entry.replace(")",'').split(", ")
        identity.append((atoms[0], atoms[1], atoms[2], int(atoms[3]), int(atoms[4])))
    return identity
e = extractEntitiesfmDF(entities)
e

[('T6', 'Individual_protein', 'actin', 129, 134),
 ('T4', 'Individual_protein', 'actin', 157, 162),
 ('T5', 'Individual_protein', 'vinculin', 258, 266),
 ('T1', 'Individual_protein', 'actin', 248, 253),
 ('T3', 'Individual_protein', 'talin', 233, 238),
 ('T2', 'Individual_protein', 'vinculin', 81, 89)]

In [19]:
# Helper function to convert the relation string into individual components
from collections import OrderedDict
def extractInteractionfmDF(txt):
    txt = txt.replace("'",'').replace('{','').replace('}','')
    s = [match.start() for match in re.finditer('R', txt)]
    s.append(len(txt)+2) # add the terminal position of the last interaction
    length = len(s) # number of position in the position list
    i = OrderedDict()
    for index in range(length-1):
        start = s[index]
        end = s[index+1]-2
        #print (start, end)
        relationId, pair = txt[start:end].split(': ')
        head, tail = pair.replace('(','').replace(')','').split(', ')
        i[relationId] = (head, tail)
    return i
relations = extractInteractionfmDF(interaction)
relations

OrderedDict([('R1', ('T1', 'T3')),
             ('R2', ('T2', 'T4')),
             ('R3', ('T2', 'T6')),
             ('R4', ('T3', 'T5'))])

In [20]:
sent = have_relationship.loc[11, 'sentence'].replace(',',' ,').replace('.',' .')
print (sent)

Analyses of dynamic light scattering data by stretched exponential fit show that vinculin has a negligible influence on internal actin filament dynamics and actin bending stiffness which contrasts with our previous observations with talin , another actin and vinculin-binding protein from focal adhesions .


In [21]:
import stanza
nlp = stanza.Pipeline('en', package='craft', processors={'ner': 'BioNLP13CG'})




2022-01-18 14:31:42 INFO: Loading these models for language: en (English):
| Processor | Package    |
--------------------------
| tokenize  | craft      |
| pos       | craft      |
| lemma     | craft      |
| depparse  | craft      |
| ner       | bionlp13cg |

2022-01-18 14:31:42 INFO: Use device: cpu
2022-01-18 14:31:42 INFO: Loading: tokenize
2022-01-18 14:31:42 INFO: Loading: pos
2022-01-18 14:31:42 INFO: Loading: lemma
2022-01-18 14:31:43 INFO: Loading: depparse
2022-01-18 14:31:43 INFO: Loading: ner
2022-01-18 14:31:43 INFO: Done loading processors!


In [23]:
doc = nlp(sent)
print(*[f'entity: {ent.text.ljust(18, " ")}\ttype: {ent.type}' for sent in doc.sentences for ent in sent.ents], sep='\n')
# These entities correspond exactly to the original annotated items except the last entry 'focual adhesion'

entity: vinculin          	type: GENE_OR_GENE_PRODUCT
entity: actin             	type: GENE_OR_GENE_PRODUCT
entity: actin             	type: GENE_OR_GENE_PRODUCT
entity: talin             	type: GENE_OR_GENE_PRODUCT
entity: actin             	type: GENE_OR_GENE_PRODUCT
entity: vinculin          	type: GENE_OR_GENE_PRODUCT
entity: focal adhesions   	type: CELLULAR_COMPONENT


In [113]:
print(*[f'id: {word.id}\tword: {word.text.ljust(10, " ")}\tupos: {word.upos.ljust(5, " ")}\thead id: {word.head}\thead: {sent.words[word.head-1].text.ljust(10, " ") if word.head > 0 else "root"}\tdeprel: {word.deprel}' for sent in doc.sentences for word in sent.words], sep='\n')


id: 1	word: Analyses  	upos: NOUN 	head id: 11	head: show      	deprel: nsubj
id: 2	word: of        	upos: ADP  	head id: 6	head: data      	deprel: case
id: 3	word: dynamic   	upos: ADJ  	head id: 6	head: data      	deprel: amod
id: 4	word: light     	upos: NOUN 	head id: 6	head: data      	deprel: compound
id: 5	word: scattering	upos: NOUN 	head id: 6	head: data      	deprel: compound
id: 6	word: data      	upos: NOUN 	head id: 1	head: Analyses  	deprel: nmod
id: 7	word: by        	upos: ADP  	head id: 10	head: fit       	deprel: case
id: 8	word: stretched 	upos: VERB 	head id: 10	head: fit       	deprel: amod
id: 9	word: exponential	upos: ADJ  	head id: 10	head: fit       	deprel: amod
id: 10	word: fit       	upos: NOUN 	head id: 1	head: Analyses  	deprel: nmod
id: 11	word: show      	upos: VERB 	head id: 0	head: root	deprel: root
id: 12	word: that      	upos: SCONJ	head id: 14	head: has       	deprel: mark
id: 13	word: vinculin  	upos: NOUN 	head id: 14	head: has       	deprel: nsu

#### This shows the POS and the dependency of indiviudal word processed by default stanza nlp processor.  However, there are words that should be combined to provde better meaning, which it will facilitate the extraction of meaning that makes sense to a human reader. 
> E.g. 'dynamic light scattering data', 'stretched exponential fit', 'internal actin filament dynamics', etc.

#### Some way to re-tokenize the sentence that makes better sense of given context

In [26]:
for sent in doc.sentences:
    for i in range(len(sent.words)-1):
        if sent.words[i].upos == 'ADJ' and sent.words[i+1].upos=='NOUN': # the one of many heuristics to recombine tokens
            print (sent.words[i].text, sent.words[i+1].text)

dynamic light
exponential fit
negligible influence
internal actin
previous observations
focal adhesions


In [24]:
# Helper functions to help re-tokenize the text

def firstPass(doc):
    """ this works on doc that is tokenized by simple split statement and parsed by default stanza nlp processor"""
    retokenized_txt = []
    for sent in doc.sentences:
        count = 0
        while count < len(sent.words):
            if sent.words[count].upos == 'ADJ' and sent.words[count+1].upos=='NOUN':
                #print (sent.words[count].text, sent.words[count].upos, sent.words[count+1].text, sent.words[count+1].upos)
                retokenized_txt.append(' '.join([sent.words[count].text, sent.words[count+1].text]))
                count +=2
            elif sent.words[count].upos == 'NOUN' and sent.words[count+1].upos=='NOUN':
                #print (sent.words[count].text, sent.words[count].upos, sent.words[count+1].text, sent.words[count+1].upos)
                retokenized_txt.append(' '.join([sent.words[count].text, sent.words[count+1].text]))
                count +=2
            else:
                #print (sent.words[count].text)
                retokenized_txt.append(sent.words[count].text)
                count +=1
    return retokenized_txt

def secondPass(doc):
    """ this works on doc that is already re-tokenized and parsed by default stanza nlp processor"""
    retokenized_txt = []
    for sent in doc.sentences:
        count = 0
        while count < len(sent.words):
            if sent.words[count].upos == 'ADJ' and sent.words[count+1].upos == 'NOUN':
                #print (count, sent.words[count].text, sent.words[count+1].text)
                retokenized_txt.append(' '.join([sent.words[count].text, sent.words[count+1].text]))
                count +=2
            elif sent.words[count].upos == 'NOUN' and sent.words[count+1].upos == 'NOUN':
                #print (count, sent.words[count].text, sent.words[count+1].text)
                retokenized_txt.append(' '.join([sent.words[count].text, sent.words[count+1].text]))
                count +=2
            elif (sent.words[count].upos == 'VERB' and (sent.words[count].xpos == 'VBG' or sent.words[count].xpos == 'VBN')) and sent.words[count+1].upos == 'NOUN':
                retokenized_txt.append(' '.join([sent.words[count].text, sent.words[count+1].text]))
                #print (count, sent.words[count].text, sent.words[count+1].text)
                count +=2
            else:
                retokenized_txt.append(sent.words[count].text)
                #print (count, sent.words[count].text)
                count +=1
    return retokenized_txt

#### Full run on the original sentence through two passess that will consolidate words/phrases to a level that give better context to analyse the relationship between 
> 'talin',
> 'actin' and 
> 'vinculin'.

In [28]:
%%time

pretokenized_nlp = stanza.Pipeline(lang='en', package='craft', processor='tokenize', tokenize_pretokenized=True)

pretokenized_sent = have_relationship.loc[11, 'sentence'].replace(',',' ,').replace('.',' .').split(' ')
doc = pretokenized_nlp([pretokenized_sent])

pretokenized_firstPass_sent = firstPass(doc)
after_firstPass_doc = pretokenized_nlp([pretokenized_firstPass_sent])

pretokenized_SecondPass_sent = secondPass(after_firstPass_doc)
after_secondPass_doc = pretokenized_nlp([pretokenized_SecondPass_sent])

print(*[f'id: {word.id}\tword: {word.text.ljust(18, " ")}\tupos: {word.upos.ljust(6, " ")}\thead: {sent.words[word.head-1].text.ljust(20, " ") if word.head > 0 else "root"}\tdeprel: {word.deprel}' for sent in after_secondPass_doc.sentences for word in sent.words], sep='\n')


2022-01-18 15:10:25 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | craft   |
| pos       | craft   |
| lemma     | craft   |
| depparse  | craft   |

2022-01-18 15:10:25 INFO: Use device: cpu
2022-01-18 15:10:25 INFO: Loading: tokenize
2022-01-18 15:10:25 INFO: Loading: pos
2022-01-18 15:10:25 INFO: Loading: lemma
2022-01-18 15:10:25 INFO: Loading: depparse
2022-01-18 15:10:25 INFO: Done loading processors!


id: 1	word: Analyses          	upos: NOUN  	head: show                	deprel: nsubj
id: 2	word: of                	upos: ADP   	head: dynamic light scattering data	deprel: case
id: 3	word: dynamic light scattering data	upos: NOUN  	head: Analyses            	deprel: nmod
id: 4	word: by                	upos: ADP   	head: stretched exponential fit	deprel: case
id: 5	word: stretched exponential fit	upos: NOUN  	head: Analyses            	deprel: nmod
id: 6	word: show              	upos: VERB  	head: root	deprel: root
id: 7	word: that              	upos: SCONJ 	head: has                 	deprel: mark
id: 8	word: vinculin          	upos: NOUN  	head: has                 	deprel: nsubj
id: 9	word: has               	upos: VERB  	head: show                	deprel: ccomp
id: 10	word: a                 	upos: DET   	head: negligible influence	deprel: det
id: 11	word: negligible influence	upos: NOUN  	head: has                 	deprel: obj
id: 12	word: on                	upos: ADP   	head: inte

In [31]:
# The connection between individual token based on dependency parsing result
for sent in after_secondPass_doc.sentences:
    for word in sent.words:
        print (word.text, ' -> ', sent.words[word.head-1].text)

Analyses  ->  show
of  ->  dynamic light scattering data
dynamic light scattering data  ->  Analyses
by  ->  stretched exponential fit
stretched exponential fit  ->  Analyses
show  ->  .
that  ->  has
vinculin  ->  has
has  ->  show
a  ->  negligible influence
negligible influence  ->  has
on  ->  internal actin filament dynamics
internal actin filament dynamics  ->  negligible influence
and  ->  actin bending stiffness
actin bending stiffness  ->  internal actin filament dynamics
which  ->  contrasts
contrasts  ->  negligible influence
with  ->  previous observations
our  ->  previous observations
previous observations  ->  contrasts
with  ->  talin
talin  ->  previous observations
,  ->  talin
another  ->  actin
actin  ->  talin
and  ->  vinculin-binding protein
vinculin-binding protein  ->  talin
from  ->  focal adhesions
focal adhesions  ->  actin
.  ->  show


#### Next step 
To analyse how the three annotated entities (i.e. talin', 'actin' and 'vinculin') are related and the reason they are related

### Follow-up: Consolidate those Functions and Procedures into a Python Script

In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import re
from collections import OrderedDict

def extractEntitiesfmDF(txt):
    """ 
    txt = the string from a pandas dataframe that containes entities identified in original data set
    """
    identity = []
    txt = txt.replace("[","").replace("]","").replace("'","").replace("(","")
    entities = txt.split("), ")
    for entry in entities:
        atoms = entry.replace(")",'').split(", ")
        identity.append((atoms[0], atoms[1], atoms[2], int(atoms[3]), int(atoms[4])))
    return identity

def extractInteractionfmDF(txt):
    """ 
    txt = the string from a pandas dataframe that containes relations(s) identified in original data set
    """
    txt = txt.replace("'",'').replace('{','').replace('}','')
    s = [match.start() for match in re.finditer('R', txt)]
    s.append(len(txt)+2) # add the terminal position of the last interaction
    length = len(s) # number of position in the position list
    interaction_dict = OrderedDict()
    for index in range(length-1):
        start = s[index]
        end = s[index+1]-2
        #print (start, end)
        relationId, pair = txt[start:end].split(': ')
        head, tail = pair.replace('(','').replace(')','').split(', ')
        interaction_dict[relationId] = (head, tail)
    return interaction_dict

def firstPass(doc):
    """ this works on doc that is tokenized by simple split statement and parsed by default stanza nlp processor"""
    retokenized_txt = []
    for sent in doc.sentences:
        count = 0
        while count < len(sent.words):
            if sent.words[count].upos == 'ADJ' and sent.words[count+1].upos=='NOUN':
                #print (sent.words[count].text, sent.words[count].upos, sent.words[count+1].text, sent.words[count+1].upos)
                retokenized_txt.append(' '.join([sent.words[count].text, sent.words[count+1].text]))
                count +=2
            elif sent.words[count].upos == 'NOUN' and sent.words[count+1].upos=='NOUN':
                #print (sent.words[count].text, sent.words[count].upos, sent.words[count+1].text, sent.words[count+1].upos)
                retokenized_txt.append(' '.join([sent.words[count].text, sent.words[count+1].text]))
                count +=2
            else:
                #print (sent.words[count].text)
                retokenized_txt.append(sent.words[count].text)
                count +=1
    return retokenized_txt

def secondPass(doc):
    """ this works on doc that is already re-tokenized and parsed by default stanza nlp processor"""
    retokenized_txt = []
    for sent in doc.sentences:
        count = 0
        while count < len(sent.words):
            if sent.words[count].upos == 'ADJ' and sent.words[count+1].upos == 'NOUN':
                #print (count, sent.words[count].text, sent.words[count+1].text)
                retokenized_txt.append(' '.join([sent.words[count].text, sent.words[count+1].text]))
                count +=2
            elif sent.words[count].upos == 'NOUN' and sent.words[count+1].upos == 'NOUN':
                #print (count, sent.words[count].text, sent.words[count+1].text)
                retokenized_txt.append(' '.join([sent.words[count].text, sent.words[count+1].text]))
                count +=2
            elif (sent.words[count].upos == 'VERB' and (sent.words[count].xpos == 'VBG' or sent.words[count].xpos == 'VBN')) and sent.words[count+1].upos == 'NOUN':
                retokenized_txt.append(' '.join([sent.words[count].text, sent.words[count+1].text]))
                #print (count, sent.words[count].text, sent.words[count+1].text)
                count +=2
            else:
                retokenized_txt.append(sent.words[count].text)
                #print (count, sent.words[count].text)
                count +=1
    return retokenized_txt

def tokenizer(nlp, text):
    pretokenized_sent = text.replace(',',' ,').replace('.',' .').split(' ')
    doc = nlp([pretokenized_sent])
    pretokenized_firstPass_sent = firstPass(doc)
    after_firstPass_doc = nlp([pretokenized_firstPass_sent])
    pretokenized_SecondPass_sent = secondPass(after_firstPass_doc)
    after_secondPass_doc = nlp([pretokenized_SecondPass_sent])
    return after_secondPass_doc

def plot_network(graph):
    plt.figure(figsize=(20,20))
    pos = nx.spring_layout(graph)
    nx.draw_networkx_nodes(graph, pos, node_color='r')
    nx.draw_networkx_labels(graph, pos, font_size = 14)
    nx.draw_networkx_edges(graph, pos, edge_color='b')
    plt.show()
        