In [67]:
import sys
sys.path.append('../lib/BioInfer_software_1.0.1_Python3/')
from BIParser import BIParser

import os
os.environ['DGL_BACKEND'] = "pytorch" 
import dgl
import torch as th

from collections import OrderedDict

In [41]:
parser = BIParser()

In [42]:
with open('../data/BioInfer_corpus_1.1.1.xml','r') as f:
    parser.parse(f)

Experimentation with DGL, can ignore:

In [43]:
# graph_data = {
#    ('drug', 'interacts', 'drug'): (th.tensor([0, 1]), th.tensor([1, 2])),
#    ('drug', 'interacts', 'gene'): (th.tensor([0, 1]), th.tensor([2, 3])),
#    ('drug', 'treats', 'disease'): (th.tensor([1]), th.tensor([2]))
# }

# graph = dgl.heterograph(graph_data)

# create graph for each example
# types of edges are

# graph_data = {
#    ('drug', 'INHIBITS', 'gene'): (th.tensor([0, 1]), th.tensor([1, 2])),
#    ('drug', 'PROMOTES', 'gene'): (th.tensor([0, 1]), th.tensor([2, 3])),
#    ('gene', 'PROMOTES', 'gene'): (th.tensor([1]), th.tensor([2]))
# }

# graph

#### Extract relevant info from parsed BioInfer data

In [73]:
texts = []
BIO_labels = []
tuple_labels = []


sentences = parser.bioinfer.sentences.sentences
for s in sentences:
    tokens = [t.getText() for t in s.tokens]
    
    # initialize Beginning Inside Outside (BIO) encoding for entities, maybe useful if we want to add NER as joint task
    NER_labels = ['O' for t in s.tokens] 
    
    # GET DISJOINT SETS OF NESTED AND TOP-LEVEL ENTITIES
    #  - I don't think there is an e.isNested() type of 
    # method so this is how I ended up checking
    nested_entities = set()
    for e in s.entities:
        for n in e.nestedEntities:
            nested_entities.add(n)

    top_level_entities = set()
    for e in s.entities:
        if e not in nested_entities:
            top_level_entities.add(e)
    
    #check that sets are disjoint and cover the full set of entities in the sentence
    disjoint = len(nested_entities.intersection(top_level_entities)) == 0
    matching_len = len(nested_entities) + len(top_level_entities) == len(s.entities)
    assert disjoint and matching_len
    
    
    
    # CREATE TUPLES OF (<entity_class>, (<entity_token_1>,<entity_token_2>,...)
    # note that this ignores labels starting with "RELATIONSHIP_"
    entity_tuples = []
    
    for e in nested_entities:
        entity_type = e.type.name
        if 'RELATIONSHIP' not in entity_type:
            entity_tuples.append((entity_type,tuple([st.token.sequence for st in e.subTokens])))
            
    for e in top_level_entities:
        entity_type = e.type.name
        if 'RELATIONSHIP' not in entity_type:
            entity_tuples.append((entity_type,tuple([st.token.sequence for st in e.subTokens])))    
    
    # PERFORM BIO ENCODING FOR TOP-LEVEL ENTITIES
    for e in top_level_entities:
        prefix = 'B-'
        entity_type = e.type.name
        if 'RELATIONSHIP' not in entity_type:
            for t in list(OrderedDict.fromkeys([st.token for st in e.subTokens])):
                NER_labels[t.sequence] = prefix + entity_type
                prefix = 'I-'
                
    texts.append(tokens)
    BIO_labels.append(NER_labels)
    tuple_labels.append(entity_tuples)

#### inspect results

In [75]:
texts[0]

['alpha-catenin',
 'inhibits',
 'beta-catenin',
 'signaling',
 'by',
 'preventing',
 'formation',
 'of',
 'a',
 'beta-catenin*T-cell',
 'factor*DNA',
 'complex',
 '.']

In [76]:
tuple_labels[0]

[('Individual_protein', (9, 10)),
 ('Individual_protein', (9,)),
 ('Individual_protein', (2,)),
 ('Protein_complex', (9, 9, 9, 10, 10, 11)),
 ('Function_property', (2, 3)),
 ('Individual_protein', (0,))]

In [77]:
BIO_labels[0]

['B-Individual_protein',
 'O',
 'B-Function_property',
 'I-Function_property',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-Protein_complex',
 'I-Protein_complex',
 'I-Protein_complex',
 'O']

In [90]:
for s in sentences:
    for f in s.formulas:
        print(f.rootNode.arguments)

[<BasicClasses.EntityNode object at 0x7fd0ab087110>, <BasicClasses.EntityNode object at 0x7fd0ab087150>]
[<BasicClasses.RelNode object at 0x7fd0ab087210>, <BasicClasses.EntityNode object at 0x7fd0ab087350>]
[<BasicClasses.RelNode object at 0x7fd0ab087410>, <BasicClasses.EntityNode object at 0x7fd0ab087550>]
[<BasicClasses.EntityNode object at 0x7fd0ab0a8850>, <BasicClasses.EntityNode object at 0x7fd0ab0a8890>]
[<BasicClasses.EntityNode object at 0x7fd0ab0a8950>, <BasicClasses.EntityNode object at 0x7fd0ab0a8990>]
[<BasicClasses.EntityNode object at 0x7fd0aaf3f290>, <BasicClasses.EntityNode object at 0x7fd0aaf3f2d0>]
[<BasicClasses.EntityNode object at 0x7fd0aaf2c310>, <BasicClasses.EntityNode object at 0x7fd0aaf2c350>]
[<BasicClasses.RelNode object at 0x7fd0aaf2c410>]
[<BasicClasses.RelNode object at 0x7fd0aaf2c550>]
[<BasicClasses.RelNode object at 0x7fd0aaf2c690>]
[<BasicClasses.RelNode object at 0x7fd0aaf2c7d0>]
[<BasicClasses.RelNode object at 0x7fd0aaf2c910>]
[<BasicClasses.RelNod

[<BasicClasses.RelNode object at 0x7fd0b18dfe50>]
[<BasicClasses.EntityNode object at 0x7fd0b18f2450>, <BasicClasses.EntityNode object at 0x7fd0b18f2490>]
[<BasicClasses.EntityNode object at 0x7fd0b18f2550>, <BasicClasses.EntityNode object at 0x7fd0b18f2590>]
[<BasicClasses.EntityNode object at 0x7fd0b18f2650>, <BasicClasses.EntityNode object at 0x7fd0b18f2690>]
[<BasicClasses.RelNode object at 0x7fd0b18f2750>]
[<BasicClasses.EntityNode object at 0x7fd0b19052d0>, <BasicClasses.EntityNode object at 0x7fd0b1905310>]
[<BasicClasses.EntityNode object at 0x7fd0b19053d0>, <BasicClasses.EntityNode object at 0x7fd0b1905410>]
[<BasicClasses.EntityNode object at 0x7fd0b19054d0>, <BasicClasses.EntityNode object at 0x7fd0b1905510>]
[<BasicClasses.EntityNode object at 0x7fd0b19055d0>, <BasicClasses.EntityNode object at 0x7fd0b1905610>]
[<BasicClasses.EntityNode object at 0x7fd0b19056d0>, <BasicClasses.EntityNode object at 0x7fd0b1905710>]
[<BasicClasses.EntityNode object at 0x7fd0b19057d0>, <BasicC

[<BasicClasses.EntityNode object at 0x7fd093f84210>, <BasicClasses.EntityNode object at 0x7fd093f84250>]
[<BasicClasses.EntityNode object at 0x7fd093f8e750>, <BasicClasses.EntityNode object at 0x7fd093f8e790>]
[<BasicClasses.EntityNode object at 0x7fd093fa0590>, <BasicClasses.EntityNode object at 0x7fd093fa05d0>]
[<BasicClasses.EntityNode object at 0x7fd093fa0690>, <BasicClasses.EntityNode object at 0x7fd093fa06d0>]
[<BasicClasses.EntityNode object at 0x7fd093fa0790>, <BasicClasses.RelNode object at 0x7fd093fa07d0>]
[<BasicClasses.EntityNode object at 0x7fd093fe5a50>, <BasicClasses.EntityNode object at 0x7fd093fe5a90>]
[<BasicClasses.EntityNode object at 0x7fd093fe5b50>, <BasicClasses.EntityNode object at 0x7fd093fe5b90>]
[<BasicClasses.EntityNode object at 0x7fd093fe5c50>, <BasicClasses.EntityNode object at 0x7fd093fe5c90>]
[<BasicClasses.EntityNode object at 0x7fd093fe5d50>, <BasicClasses.EntityNode object at 0x7fd093fe5d90>]
[<BasicClasses.EntityNode object at 0x7fd094000310>, <Basi

In [93]:
e.formulaNodesUsingMe

[<BasicClasses.RelNode at 0x7fd0945bc110>]