In [1]:
from indra_cogex.sources.odinson.grammars import Rule
from indra_cogex.sources.odinson.client import process_rules
import gilda
import pandas as pd
from collections import defaultdict
from gilda.process import normalize
from tqdm.auto import tqdm
from pyobo.gilda_utils import get_gilda_terms
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import textwrap
import random
import difflib

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sangeethavempati/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#reformat terms
df = pd.read_csv('spine.tsv', sep = '\t', header = None, names = ['name', 'id'])
#for each id, make a list of the corresponding brain regions
id_to_names = defaultdict(list)
for name,identifier in df.values:
    id_to_names[identifier].append(name)
terms = []
#label terms with the same id as synonyms
for identifier, names in id_to_names.items():
    #label everything except the first as synonyms
    name,*synonyms = names
    term = gilda.term.Term(
                norm_text=normalize(name),
                text=name,
                db="spine",
                id=identifier,
                entry_name=name,
                status="name",
                source="spine",
            )
    terms.append(term)
    for synonym in synonyms:
        term = gilda.term.Term(
                norm_text=normalize(synonym),
                text=synonym,
                db="spine",
                id=identifier,
                entry_name=name,
                status="synonym",
                source="spine",
            )
        terms.append(term)
terms.extend(get_gilda_terms('UBERON'))
terms.extend(get_gilda_terms('fma'))
#terms.extend(get_gilda_terms('ncit'))
grounder = gilda.Grounder(terms)
grounder

[UBERON] mapping: 100%|██████████████████| 14.5k/14.5k [00:00<00:00, 222kname/s]
[UBERON] mapping: 100%|██████████████| 9.84k/9.84k [00:00<00:00, 70.9ksynonym/s]
[fma] mapping: 100%|█████████████████████| 79.0k/79.0k [00:00<00:00, 249kname/s]
[fma] mapping: 100%|█████████████████| 29.8k/29.8k [00:00<00:00, 87.9ksynonym/s]


<gilda.grounder.Grounder at 0x2acf17350>

In [3]:
from itertools import product

#adding in adverbs and directions
'''
noun_inputs = ["medial","lateral","superior","posterior","dorsal","ipsilateral","efferent","outer","central","caudal","afferent","contralateral",
              "ventral","frontal","terminal","rostral","inner","anterior","ascending","peripheral","Descending","adjacent",
              "secondary afferent","auditory ascending","Afferent","cortical","spinal","primary","sensory","motor","projection","visual","auditory","sensorimotor","basal","periaqueductal","limbic"
              "tectal","dentate","entorhinal","subcortical","somatosensory","olfactory","isthmic","cingulate","orbitofrontal",
              "Intrahemispheric","geniculocortical","associational","primary mechanosensory","organum","septal","Tectal","dense cortical",
              "Cortical","mesopontine tegmental","primary trigeminal","hypoglossal","occipital","parahippocampal","cerebellar","major",
              "intramedullary","corticofugal","suprageniculate","parvocellular","paraventricular","cortico/-/cortical","temporal","lateral nuclei",
              "centralis","sympatho/-/excitatory","reticulospinal","vestibulospinal","neocortical","reticulospinal","retinofugal","subthalamic",
              "contralateral homologous","neural","trigeminal","vagus","glossopharyngeal","preganglionic","ophthalmic","vestibular primary",
              "perirhinal","maxillar","postrhinal","intrahemispheric","pretectal","hypothalamic","Auditory","rubro/-/spinal","posterolateral"
              "reticulata","pre/-/frontal","preoptic","Pretectal","rubral","cortico/-/","mossy","Spinal","Retinal","amygdalostriatal",
              "interhemispheric","intercollicular","locus","Associational","lateralis","vestibular","caudolateral","multipolar",
              "retinogeniculocortical"]
'''

#adding in adverbs and directions
directions = ("medial|lateral|superior|posterior|dorsal|ipsilateral|efferent|outer|central|caudal|afferent|contralateral|ventral|frontal|"
              "terminal|rostral|inner|anterior|ascending|peripheral|Descending|adjacent|secondary afferent|auditory ascending|Afferent")

advb = ("cortical|spinal|primary|sensory|motor|projection|visual|auditory|sensorimotor|basal|periaqueductal|limbic|tectal|dentate|"
        "entorhinal|subcortical|somatosensory|olfactory|isthmic|cingulate|orbitofrontal|Intrahemispheric|geniculocortical|associational|"
        "primary mechanosensory|organum|septal|Tectal|dense cortical|Cortical|mesopontine tegmental|primary trigeminal|hypoglossal|"
        "occipital|parahippocampal|cerebellar|major|intramedullary|corticofugal|suprageniculate|parvocellular|paraventricular|"
        "cortico/-/cortical|temporal|lateral nuclei|centralis|sympatho/-/excitatory reticulospinal|vestibulospinal|neocortical|"
        "reticulospinal|retinofugal|subthalamic|contralateral homologous|neural|trigeminal|vagus|glossopharyngeal|preganglionic|"
        "ophthalmic|vestibular primary|perirhinal|maxillar|postrhinal|intrahemispheric|pretectal|hypothalamic|Auditory|rubro/-/spinal|"
        "posterolateral|reticulata|pre/-/frontal|preoptic|Pretectal|rubral|cortico/-/|mossy|Spinal|Retinal|amygdalostriatal|"
        "interhemispheric|intercollicular|locus|Associational|lateralis|vestibular|caudolateral|multipolar|retinogeniculocortical")

#add all noun cases to set, loop through, do either nouncase with np, np with nouncase, or nouncase with nouncase
#{}* ensures that advb or direction inserted is optional, includes cases where neither is found
noun_case_f = ["{}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])",
              "both {}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP]) and {}* ([chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])",
              "both {}* ([chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP]) and {}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])",
              "{}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP]) and {}* ([chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])",
              "{}* ([chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP]) and {}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])",
              "{}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])/,/ {}* ([chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])",
              "{}* ([chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])/,/ {}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])",
              "{}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])/,/ and {}* ([chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])",
              "{}* ([chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])/,/ and {}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])"]

#namedx_np_f = "{}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])"
lemmas = ["project", "connect", "pathway"]
fromto = [">nmod_from",">nmod_of",">nmod_to"]
#rule_combos = [namedx_np_f,noun_case_f]
noun_inputs = [directions,advb]


#rule generation
#binary_rules = []

def create_rules(noun_type1,noun_input1,lemma,word,noun_type2,noun_input2):

    #fromto_portion = [">nmod_{}".format(word) for word in fromto]
    #lemma_portion = ["[lemma={}]".format(term) for term in lemmas]
    #entity_types = ['namedx','case']
    #create two word cases for BRs with noun_input of either advb or direction
    if noun_type1 == "{}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])":
        entity1 = noun_type1.format(noun_input1)
    else:
        entity1 = noun_type1.format(noun_input1,noun_input1)
        
    if noun_type2 == "{}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])":
        entity2 = noun_type2.format(noun_input2)
    else:
        entity2 = noun_type2.format(noun_input2,noun_input2)

    #create rules with whatever the current case is of each entity
    rule_set = ["{} [lemma={}] {} {}".format(entity1,lemma,word,entity2),
                "{} [] [] [] [lemma={}] {} {}".format(entity1,lemma,word,entity2),
                "[lemma={}] >nmod_from {} >nmod_to {}".format(lemma,entity1,entity2),
                "[lemma={}] >nmod_of {} >nmod_with {}".format(lemma,entity1,entity2),
                "[lemma={}] [] [] [] [] {} >nmod_to {}".format(lemma,entity1,entity2),
                "[lemma={}] >nmod_to {} >nmod_from [] [] {}".format(lemma,entity1,entity2),
                "{} [] [] [] [lemma={}] {} {}".format(entity1,lemma,word,entity2),
                "{} [lemma=afferent] {} {}".format(entity1,word,entity2),
                "{} which [] [] [lemma={}] >nmod_to {}".format(entity1,lemma,entity2)]
    return rule_set
    
    
binary_rules = [create_rules(noun_type1,noun_input1,lemma,word,noun_type2,noun_input2) for noun_type1,noun_input1,lemma,word,noun_type2,noun_input2 in product(noun_case_f,noun_inputs,lemmas,fromto,noun_case_f,noun_inputs)]   

len(binary_rules)
   


2916

In [3]:

#adding in adverbs and directions
directions = ("medial|lateral|superior|posterior|dorsal|ipsilateral|efferent|outer|central|caudal|afferent|contralateral|ventral|frontal|"
              "terminal|rostral|inner|anterior|ascending|peripheral|Descending|adjacent|secondary afferent|auditory ascending|Afferent")

advb = ("cortical|spinal|primary|sensory|motor|projection|visual|auditory|sensorimotor|basal|periaqueductal|limbic|tectal|dentate|"
        "entorhinal|subcortical|somatosensory|olfactory|isthmic|cingulate|orbitofrontal|Intrahemispheric|geniculocortical|associational|"
        "primary mechanosensory|organum|septal|Tectal|dense cortical|Cortical|mesopontine tegmental|primary trigeminal|hypoglossal|"
        "occipital|parahippocampal|cerebellar|major|intramedullary|corticofugal|suprageniculate|parvocellular|paraventricular|"
        "cortico/-/cortical|temporal|lateral nuclei|centralis|sympatho/-/excitatory reticulospinal|vestibulospinal|neocortical|"
        "reticulospinal|retinofugal|subthalamic|contralateral homologous|neural|trigeminal|vagus|glossopharyngeal|preganglionic|"
        "ophthalmic|vestibular primary|perirhinal|maxillar|postrhinal|intrahemispheric|pretectal|hypothalamic|Auditory|rubro/-/spinal|"
        "posterolateral|reticulata|pre/-/frontal|preoptic|Pretectal|rubral|cortico/-/|mossy|Spinal|Retinal|amygdalostriatal|"
        "interhemispheric|intercollicular|locus|Associational|lateralis|vestibular|caudolateral|multipolar|retinogeniculocortical")

#add all noun cases to set, loop through, do either nouncase with np, np with nouncase, or nouncase with nouncase
noun_case_f = ["both {}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP]) and {}* ([chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])",
              "both {}* ([chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP]) and {}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])",
              "{}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP]) and {}* ([chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])",
              "{}* ([chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP]) and {}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])",
              "{}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])/,/ {}* ([chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])",
              "{}* ([chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])/,/ {}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])",
              "{}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])/,/ and {}* ([chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])",
              "{}* ([chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])/,/ and {}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])"]

namedx_np_f = "{}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])"
#projection = "[lemma=/project|connect|pathway/]"
#projection = "[lemma=project|lemma=connect|lemma=pathway]"
#projection = "[lemma=project|lemma=connect|lemma=pathway]"
#fromto = "from|of|to"

'''
projection_lemmas = ["project", "connect", "pathway"]
projection = "[lemma=/{}/]".format("|".join(projection_lemmas))
print(projection)
'''
lemmas = ["project", "connect", "pathway"]

#not included
region_abbrev = ""

#rule generation
binary_rules = []
for fromto in ["from", "of", "to"]:
#    for projection in ["[lemma=project]", "[lemma=connect]", "[lemma=pathway]"]:
        for case in noun_case_f:
            rules = ["{} {} >nmod_{} {}".format(namedx_np_f.format(advb),projection,fromto,namedx_np_f.format(advb)),
                     "{} {} >nmod_{} {}".format(namedx_np_f.format(directions),projection,fromto,namedx_np_f.format(directions)),                         
                     "{} {} >nmod_{} {}".format(case.format(advb,advb),projection,fromto,namedx_np_f.format(advb)),
                     "{} {} >nmod_{} {}".format(case.format(directions,directions),projection,fromto,namedx_np_f.format(directions)),                         
                     "{} {} >nmod_{} {}".format(namedx_np_f.format(advb),projection,fromto,case.format(advb,advb)),
                     "{} {} >nmod_{} {}".format(namedx_np_f.format(directions),projection,fromto,case.format(directions,directions)),                         
                     "{} {} >nmod_{} {}".format(case.format(advb,advb),projection,fromto,case.format(advb,advb)),
                     "{} {} >nmod_{} {}".format(case.format(directions,directions),projection,fromto,case.format(directions,directions)),                         
                     "{} [] [] [] {} >nmod_{} {}".format(namedx_np_f.format(advb),projection,fromto,namedx_np_f.format(advb)),
                     "{} [] [] [] {} >nmod_{} {}".format(namedx_np_f.format(directions),projection,fromto,namedx_np_f.format(directions)),
                     "{} [] [] [] {} >nmod_{} {}".format(case.format(advb,advb),projection,fromto,namedx_np_f.format(advb)),
                     "{} [] [] [] {} >nmod_{} {}".format(case.format(directions,directions),projection,fromto,namedx_np_f.format(directions)),
                     "{} [] [] [] {} >nmod_{} {}".format(namedx_np_f.format(advb),projection,fromto,case.format(advb,advb)),
                     "{} [] [] [] {} >nmod_{} {}".format(namedx_np_f.format(directions),projection,fromto,case.format(directions,directions)),
                     "{} [] [] [] {} >nmod_{} {}".format(case.format(advb,advb),projection,fromto,case.format(advb,advb)),
                     "{} [] [] [] {} >nmod_{} {}".format(case.format(directions,directions),projection,fromto,case.format(directions,directions)),
                     "{} >nmod_from {} >nmod_to {}".format(projection,namedx_np_f.format(advb),namedx_np_f.format(advb)),
                     "{} >nmod_from {} >nmod_to {}".format(projection,namedx_np_f.format(directions),namedx_np_f.format(directions)),
                     "{} >nmod_from {} >nmod_to {}".format(projection,case.format(advb,advb),namedx_np_f.format(advb)),
                     "{} >nmod_from {} >nmod_to {}".format(projection,case.format(directions,directions),namedx_np_f.format(directions)),
                     "{} >nmod_from {} >nmod_to {}".format(projection,namedx_np_f.format(advb),case.format(advb,advb)),
                     "{} >nmod_from {} >nmod_to {}".format(projection,namedx_np_f.format(directions),case.format(directions,directions)),
                     "{} >nmod_from {} >nmod_to {}".format(projection,case.format(advb,advb),case.format(advb,advb)),
                     "{} >nmod_from {} >nmod_to {}".format(projection,case.format(directions,directions),case.format(directions,directions)),                                        
                     "{} >nmod_of {} >nmod_with {}".format(projection,namedx_np_f.format(advb),namedx_np_f.format(advb)),
                     "{} >nmod_of {} >nmod_with {}".format(projection,namedx_np_f.format(directions),namedx_np_f.format(directions)),
                     "{} >nmod_of {} >nmod_with {}".format(projection,case.format(advb,advb),namedx_np_f.format(advb)),
                     "{} >nmod_of {} >nmod_with {}".format(projection,case.format(directions,directions),namedx_np_f.format(directions)),
                     "{} >nmod_of {} >nmod_with {}".format(projection,namedx_np_f.format(advb),case.format(advb,advb)),
                     "{} >nmod_of {} >nmod_with {}".format(projection,namedx_np_f.format(directions),case.format(directions,directions)),
                     "{} >nmod_of {} >nmod_with {}".format(projection,case.format(advb,advb),case.format(advb,advb)),
                     "{} >nmod_of {} >nmod_with {}".format(projection,case.format(directions,directions),case.format(directions,directions)),    
                     "{} [] [] [] [] {} >nmod_to {}".format(projection,namedx_np_f.format(advb),namedx_np_f.format(advb)),
                     "{} [] [] [] [] {} >nmod_to {}".format(projection,namedx_np_f.format(directions),namedx_np_f.format(directions)),
                     "{} [] [] [] [] {} >nmod_to {}".format(projection,case.format(advb,advb),namedx_np_f.format(advb)),
                     "{} [] [] [] [] {} >nmod_to {}".format(projection,case.format(directions,directions),namedx_np_f.format(directions)),
                     "{} [] [] [] [] {} >nmod_to {}".format(projection,namedx_np_f.format(advb),case.format(advb,advb)),
                     "{} [] [] [] [] {} >nmod_to {}".format(projection,namedx_np_f.format(directions),case.format(directions,directions)),
                     "{} [] [] [] [] {} >nmod_to {}".format(projection,case.format(advb,advb),case.format(advb,advb)),
                     "{} [] [] [] [] {} >nmod_to {}".format(projection,case.format(directions,directions),case.format(directions,directions)),               
                     "{} >nmod_to {} >nmod_from [] [] {}".format(projection,namedx_np_f.format(advb),namedx_np_f.format(advb)),
                     "{} >nmod_to {} >nmod_from [] [] {}".format(projection,namedx_np_f.format(directions),namedx_np_f.format(directions)),
                     "{} >nmod_to {} >nmod_from [] [] {}".format(projection,case.format(advb,advb),namedx_np_f.format(advb)),
                     "{} >nmod_to {} >nmod_from [] [] {}".format(projection,case.format(directions,directions),namedx_np_f.format(directions)),
                     "{} >nmod_to {} >nmod_from [] [] {}".format(projection,namedx_np_f.format(advb),case.format(advb,advb)),
                     "{} >nmod_to {} >nmod_from [] [] {}".format(projection,namedx_np_f.format(directions),case.format(directions,directions)),
                     "{} >nmod_to {} >nmod_from [] [] {}".format(projection,case.format(advb,advb),case.format(advb,advb)),
                     "{} >nmod_to {} >nmod_from [] [] {}".format(projection,case.format(directions,directions),case.format(directions,directions)),                                      
                     "{} [] [] [] {} >nmod_{} {}".format(namedx_np_f.format(advb),projection,fromto,namedx_np_f.format(advb)),
                     "{} [] [] [] {} >nmod_{} {}".format(namedx_np_f.format(directions),projection,fromto,namedx_np_f.format(directions)),
                     "{} [] [] [] {} >nmod_{} {}".format(case.format(advb,advb),projection,fromto,namedx_np_f.format(advb)),
                     "{} [] [] [] {} >nmod_{} {}".format(case.format(directions,directions),projection,fromto,namedx_np_f.format(directions)),
                     "{} [] [] [] {} >nmod_{} {}".format(namedx_np_f.format(advb),projection,fromto,case.format(advb,advb)),
                     "{} [] [] [] {} >nmod_{} {}".format(namedx_np_f.format(directions),projection,fromto,case.format(directions,directions)),
                     "{} [] [] [] {} >nmod_{} {}".format(case.format(advb,advb),projection,fromto,case.format(advb,advb)),
                     "{} [] [] [] {} >nmod_{} {}".format(case.format(directions,directions),projection,fromto,case.format(directions,directions)),                                     
                     "{} [lemma=afferent] >nmod_{} {}".format(namedx_np_f.format(advb),fromto,namedx_np_f.format(advb)),
                     "{} [lemma=afferent] >nmod_{} {}".format(namedx_np_f.format(directions),fromto,namedx_np_f.format(directions)),
                     "{} [lemma=afferent] >nmod_{} {}".format(case.format(advb,advb),fromto,namedx_np_f.format(advb)),
                     "{} [lemma=afferent] >nmod_{} {}".format(case.format(directions,directions),fromto,namedx_np_f.format(directions)),
                     "{} [lemma=afferent] >nmod_{} {}".format(namedx_np_f.format(advb),fromto,case.format(advb,advb)),
                     "{} [lemma=afferent] >nmod_{} {}".format(namedx_np_f.format(directions),fromto,case.format(directions,directions)),
                     "{} [lemma=afferent] >nmod_{} {}".format(case.format(advb,advb),fromto,case.format(advb,advb)),
                     "{} [lemma=afferent] >nmod_{} {}".format(case.format(directions,directions),fromto,case.format(directions,directions)),                                                      
                     "{} which [] [] {} >nmod_to {}".format(namedx_np_f.format(advb),projection,namedx_np_f.format(advb)),
                     "{} which [] [] {} >nmod_to {}".format(namedx_np_f.format(directions),projection,namedx_np_f.format(directions)),                   
                     "{} which [] [] {} >nmod_to {}".format(case.format(advb,advb),projection,namedx_np_f.format(advb)),
                     "{} which [] [] {} >nmod_to {}".format(case.format(directions,directions),projection,namedx_np_f.format(directions)),                      
                     "{} which [] [] {} >nmod_to {}".format(namedx_np_f.format(advb),projection,case.format(advb,advb)),
                     "{} which [] [] {} >nmod_to {}".format(namedx_np_f.format(directions),projection,case.format(directions,directions)),                     
                     "{} which [] [] {} >nmod_to {}".format(case.format(advb,advb),projection,case.format(advb,advb)),
                     "{} which [] [] {} >nmod_to {}".format(case.format(directions,directions),projection,case.format(directions,directions))]
            for rule in rules:
                addrule = rule
                binary_rules.append(addrule)


print(len(binary_rules))

'''
for rule in binary_rules:
    print(rule)
'''

 

[lemma=/project|connect|pathway/]
1728


'\nfor rule in binary_rules:\n    print(rule)\n'

In [6]:

projection_lemmas = ["project", "connect", "pathway"]

# Construct individual rules for each lemma
projection_rules = ["[lemma={}] >nmod_from {}".format(lemma, namedx_np_f) for lemma in projection_lemmas]
print(projection_rules)

# Combine the rules with logical OR
combined_rule = " | ".join(projection_rules)

# Complete Odinson rule
odinson_rule = "{} {}".format(namedx_np_f, combined_rule)


['[lemma=project] >nmod_from {}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])', '[lemma=connect] >nmod_from {}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])', '[lemma=pathway] >nmod_from {}* (?<region> [chunk=B-NP]|[chunk=I-NP]|[chunk=B-NP][chunk=I-NP])']


In [4]:
#add in stopwords
sw_nltk = stopwords.words('english')
#print(sw_nltk)


In [5]:
relations = []
readable_sentences = []
#go through each rule and make it a rule object
for rule_text in tqdm(binary_rules):
    #print(rule_text)
    rule = Rule("anatomical connection", "Exp", "basic", rule_text)
    #make sure it is a functional Odinson rule
    try:
        rule_output = process_rules([rule],"http://localhost:9000")
        #print(rule_output)
    except Exception as e:
        print('failed', rule)
        print(e)
        continue
    #get the start and end characters for each term pulled out by the rule
    for sentence in rule_output['mentions']:
        #print(sentence)
        relation = ()
        words = sentence['words']
        string_words = ' '.join(words)
        readable_sentences += [string_words]
        for element in sentence['match']:  
            for entity in element['namedCaptures']:
                #print(entity)
                start = entity['capturedMatch']['start']
                end = entity['capturedMatch']['end']
                #remove stop words
                processed_term = [word for word in words[start:end] if word.lower() not in sw_nltk]
                word = ' '.join(processed_term)
                #create tuples with curies for terms that can be grounded
                spine_scored_match = grounder.ground(word)
                gilda_scored_match = gilda.ground(word)
                if len(gilda_scored_match)>0:
                    best_curie = gilda_scored_match[0].term.get_curie()
                elif len(spine_scored_match)>0:
                    best_curie = spine_scored_match[0].term.get_curie()
                else:
                    best_curie = None
                relation += ((best_curie, word),)  
        if len(relation) > 1:
            relations.append(relation)

print(len(relations))
#print(readable_sentences)

  7%|██▊                                   | 216/2916 [00:00<00:02, 1145.49it/s]IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

 17%|██████▎                               | 485/2916 [00:00<00:01, 1295.62it/s]IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

 26%|█████████▉                            | 760/2916 [00:00<00:01, 1287.16it/s]IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid cras

In [13]:
'''
set_1728 = []
for sentence in readable_sentences:
    set_1728 += [sentence]

with open('all_sentences_1728.txt', 'a') as dfbig:
    for sentence in readable_sentences:
        dfbig.write(sentence + '\n')
'''
set_5184 = []
for sentence in readable_sentences:
    set_5184 += [sentence]

with open('all_sentences_5184.txt', 'a') as dfbig:
    for sentence in readable_sentences:
        dfbig.write(sentence + '\n')



In [14]:
not_in = []
for sentence in set_5184:
    if sentence not in set_1728:
        not_in += [sentence]

with open('not_in_big.txt', 'a') as file:
    for sentence in not_in:
        file.write(sentence + '\n')

In [None]:
'''
#dfbig = pd.read_csv('all_sentencesbig.txt')
#dfsmall = pd.read_csv('all_sentences_1728.txt')

with open('all_sentencesbig.txt', 'r') as dfbig:
    dfbig_text = dfbig.readlines()
with open('all_sentences_1728.txt', 'r') as dfsmall:
    dfsmall_text = dfsmall.readlines()    

'''

In [None]:
sample_set = random.sample(readable_sentences,20)

'''
for sentence in sample_set:
    print(sentence)
'''

with open('sample_20.txt', 'w') as f:
    for sentence in sample_set:
        f.write(sentence + '\n')



In [None]:
#create a ranked list of terms
import csv
'''with open('relations.csv', 'w') as f:
    write = csv.writer(f)
    write.writerows(relations)'''
terms = []
for set in relations:
    for term in set:
        terms.append(term[1])

        
new_relations = np.array(terms)

ranked = pd.value_counts(new_relations)

value_counts_df = ranked.reset_index()
value_counts_df.columns = ['Value', 'Count']

with open('relations.csv', 'w') as f:
    value_counts_df.to_csv(f, index=False)

In [None]:

with open('relation_full.csv', 'w', newline='') as f:    
    csv_writer = csv.writer(f)

    for relation1, relation2 in relations:
        term_of_inner_tuple1 = relation1[1]
        term_of_inner_tuple2 = relation2[1]
        print(term_of_inner_tuple1,term_of_inner_tuple2)
        csv_writer.writerow([term_of_inner_tuple1, term_of_inner_tuple2])


In [None]:
#create an interaction map of relationships between brain region terms
#!pip install matplotlib
import networkx as nx
import pygraphviz as pgv
import matplotlib.pyplot as plt
G = nx.Graph()
plt.figure(figsize=(20,20))
G.add_edges_from(relations, len=4)

pos = nx.nx_agraph.graphviz_layout(G, prog='neato')
#labels = {n: n[1] for n in G.nodes()}
labels = {}
for k in pos.keys():
    labels[k] = k[1]

'''
for label in labels.values():
    textwrap.wrap(label, width = 10)
    print(label)
'''
#G = nx.relabel_nodes(G, labels)
#nx.draw_networkx_labels(G, pos, labels, font_size=22, font_color="black")
nx.draw_networkx_nodes(G, pos, node_size=100, node_color='white', node_shape='o')
nx.draw_networkx_edges(G, pos, width=1.0, edge_color='grey', style='solid')
labels = nx.draw_networkx_labels(G, pos, labels = labels, font_size=8, font_color='k', font_family='sans-serif', font_weight='normal')
print()

In [None]:
scoredmatches = gilda.ground('ER')
scoredmatches[0].term.get_curie()

In [None]:
benchmark_url = ('https://docs.google.com/spreadsheets/d/e/2PACX-1vS6uvih2Hi7dIo9Nabk5gv2kz67avmHpiWvqtNOKxrr43WhxSCBwzyq'
'lLvi841Vx3f1LoF7GF_5Cff3/pub?output=tsv')
benchmark_df = pd.read_csv(benchmark_url, sep='\t')
subject = gilda.ground_df(benchmark_df, 'subject', grounder=grounder)
object = gilda.ground_df(benchmark_df, 'object', grounder=grounder)

benchmark_df
