In [1]:
from indra_cogex.sources.odinson.grammars import Rule
from indra_cogex.sources.odinson.client import process_rules
import gilda
import pandas as pd
from collections import defaultdict
from gilda.process import normalize
from tqdm.auto import tqdm
from pyobo.gilda_utils import get_gilda_terms
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import textwrap
import random
import difflib

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sangeethavempati/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the project-local spine_ner module and reuse the grounder object it
# exposes; later cells use `grounder` as a fallback grounder and for ground_df.
# NOTE(review): the import appears to build the grounder from UBERON and fma
# names/synonyms (see the mapping progress output below) — confirm in spine_ner.
import spine_ner
grounder = spine_ner.grounder


[UBERON] mapping: 100%|██████████████████| 14.5k/14.5k [00:00<00:00, 239kname/s]
[UBERON] mapping: 100%|██████████████| 9.84k/9.84k [00:00<00:00, 72.3ksynonym/s]
[fma] mapping: 100%|█████████████████████| 79.0k/79.0k [00:00<00:00, 252kname/s]
[fma] mapping: 100%|█████████████████| 29.8k/29.8k [00:00<00:00, 90.4ksynonym/s]


In [3]:
from itertools import chain, product
import rule_gen

# Vocabulary lists exported by rule_gen; every combination of them is turned
# into a rule below.
directions = rule_gen.directions
advb = rule_gen.advb
noun_case_f = rule_gen.noun_case_f
lemmas_br_br = rule_gen.lemmas_br_br
lemmas_br_ph = rule_gen.lemmas_br_ph
fromto = rule_gen.fromto
from_to_ph = rule_gen.from_to_ph
damage = rule_gen.damage
noun_inputs = rule_gen.noun_inputs
phenotype_f = rule_gen.phenotype_f

# Number of rules of each kind that are kept for the downstream cells.
MAX_RULES = 200

# Each create_* call returns an iterable of rules for one combination of
# inputs, so both lists below are lists of rule sets.
br_br_rules = [
    rule_gen.create_br_br_rules(noun_type1, noun_input1, lemma, word, noun_type2, noun_input2)
    for noun_type1, noun_input1, lemma, word, noun_type2, noun_input2 in product(
        noun_case_f, noun_inputs, lemmas_br_br, fromto, noun_case_f, noun_inputs
    )
]

# The loop variable is named damage_term so it does not shadow the `damage`
# list that feeds product().
br_ph_rules = [
    rule_gen.create_br_ph_rules(region, noun_input, lemma, damage_term, word, phenotype)
    for region, noun_input, lemma, damage_term, word, phenotype in product(
        noun_case_f, noun_inputs, lemmas_br_ph, damage, from_to_ph, phenotype_f
    )
]

# Flatten the rule sets into flat lists of individual rules.
binary_rules = list(chain.from_iterable(br_br_rules))
ph_rules = list(chain.from_iterable(br_ph_rules))

# Report the full counts before truncating.
print(len(binary_rules))
print(len(ph_rules))

# Keep only the first MAX_RULES of each kind. The original assigned
# binary_rules[0:200] to ph_rules, which discarded every phenotype rule.
binary_rules = binary_rules[:MAX_RULES]
ph_rules = ph_rules[:MAX_RULES]


26244
170100


In [4]:
import stop_words

# Bring the three filter vocabularies from the local stop_words module into
# the notebook namespace; the extraction cell drops any captured token whose
# lower-cased form appears in one of them.
sw_nltk, false_phrases, exclude_words = (
    stop_words.sw_nltk,
    stop_words.false_phrases,
    stop_words.exclude_words,
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sangeethavempati/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [107]:
# Run every binary rule through Odinson and turn the named captures of each
# mention into a relation: a tuple of (curie, term_text) pairs.

# Endpoint passed to process_rules for every rule.
ODINSON_URL = "http://localhost:9000"
# A captured token is dropped when its lower-cased form is in any of these.
filtered_vocabularies = (sw_nltk, false_phrases, exclude_words)

relations = []
# Plain-text form of every sentence that produced a mention; the sampling
# cell further down reads this list.
readable_sentences = []

for rule_text in tqdm(binary_rules):
    rule = Rule("anatomical connection", "Exp", "basic", rule_text)
    try:
        rule_output = process_rules([rule], ODINSON_URL)
    except Exception as e:
        # Report the rule that failed and move on. Without the `continue`
        # the loop below would hit an undefined `rule_output` on the first
        # iteration or re-process the previous rule's mentions afterwards.
        print('failed', rule)
        print(e)
        continue

    for sentence in rule_output['mentions']:
        relation = ()
        words = sentence['words']
        readable_sentences.append(' '.join(words))
        for element in sentence['match']:
            for entity in element['namedCaptures']:
                # start/end are used as slice bounds into the sentence's word list
                start = entity['capturedMatch']['start']
                end = entity['capturedMatch']['end']
                # remove stop words and other excluded tokens
                processed_term = [
                    token for token in words[start:end]
                    if not any(token.lower() in vocabulary for vocabulary in filtered_vocabularies)
                ]
                term_text = ' '.join(processed_term)
                # Prefer the default Gilda grounding; consult the spine
                # grounder only when Gilda returns no match.
                scored_matches = gilda.ground(term_text) or grounder.ground(term_text)
                best_curie = scored_matches[0].term.get_curie() if scored_matches else None
                relation += ((best_curie, term_text),)
        # A relation needs at least two captured terms to be kept.
        if len(relation) > 1:
            relations.append(relation)

print(len(relations))

100%|█████████████████████████████████████████| 200/200 [00:16<00:00, 12.41it/s]

854





In [None]:
# Create a sample set of up to 20 random sentences and write them to
# sample_20.txt, one sentence per line.
# random.sample raises ValueError when asked for more items than the
# population holds, so the sample size is clamped to what is available.
sample_size = min(20, len(readable_sentences))
sample_set = random.sample(readable_sentences, sample_size)

# Explicit UTF-8 so sentences with non-ASCII characters can be written
# regardless of the platform's default encoding.
with open('sample_20.txt', 'w', encoding='utf-8') as f:
    for sentence in sample_set:
        f.write(sentence + '\n')

In [83]:
# Create a ranked list of terms: count how often each term text occurs
# across all relations and write the counts to relations.csv.
import csv
import numpy as np

# Every relation is a tuple of (curie, term_text) pairs; collect the term text.
# The loop variable is not called `set`, which would shadow the builtin for
# every cell executed afterwards.
terms = [term[1] for relation_terms in relations for term in relation_terms]

new_relations = np.array(terms)

# Series.value_counts replaces the deprecated top-level pd.value_counts and
# sorts by descending count by default.
ranked = pd.Series(new_relations).value_counts()

value_counts_df = ranked.reset_index()
value_counts_df.columns = ['Value', 'Count']

value_counts_df.to_csv('relations.csv', index=False)

In [None]:
# Create a csv file of all relations: one row per relation, one column per
# captured term.
# csv is imported here so the cell does not rely on an earlier cell having
# been executed first.
import csv

with open('relation_full.csv', 'w', newline='') as f:
    csv_writer = csv.writer(f)

    for relation_terms in relations:
        # Relations are kept whenever they hold more than one (curie, term)
        # pair, so they may have more than two entries; unpacking exactly two
        # would raise ValueError on those. Write every term instead.
        row = [term_text for _curie, term_text in relation_terms]
        print(*row)
        csv_writer.writerow(row)


In [None]:
# Draw an interaction map of the relationships between brain region terms.
#!pip install matplotlib
import networkx as nx
import pygraphviz as pgv
import matplotlib.pyplot as plt

plt.figure(figsize=(20, 20))

# Every relation becomes an edge; the nodes are the (curie, term_text) tuples.
# len=4 is attached to each edge as an attribute.
# NOTE(review): presumably meant as Graphviz's preferred edge length for
# neato — confirm.
# NOTE(review): add_edges_from expects 2- or 3-element edge tuples, so a
# relation with more than two captured terms is not a plain edge — confirm
# that `relations` only holds pairs here.
G = nx.Graph()
G.add_edges_from(relations, len=4)

pos = nx.nx_agraph.graphviz_layout(G, prog='neato')

# Label each node with its term text (second element of the node tuple).
labels = {node: node[1] for node in pos}

nx.draw_networkx_nodes(
    G,
    pos,
    node_size=100,
    node_color='white',
    node_shape='o',
)
nx.draw_networkx_edges(
    G,
    pos,
    width=1.0,
    edge_color='grey',
    style='solid',
)
labels = nx.draw_networkx_labels(
    G,
    pos,
    labels=labels,
    font_size=8,
    font_color='k',
    font_family='sans-serif',
    font_weight='normal',
)
# Final statement is a bare print so the cell does not echo a return value.
print()

In [None]:
# Spot check: ground the string 'ER' with the default Gilda grounder and
# display the CURIE of the first returned match.
scoredmatches = gilda.ground('ER')
top_match = scoredmatches[0]
top_match.term.get_curie()

In [None]:
# Load the benchmark sheet (published as TSV) and ground its 'subject' and
# 'object' columns with the spine grounder.
benchmark_url = (
    'https://docs.google.com/spreadsheets/d/e/2PACX-1vS6uvih2Hi7dIo9Nabk5gv2kz67avmHpiWvqtNOKxrr43WhxSCBwzyq'
    'lLvi841Vx3f1LoF7GF_5Cff3/pub?output=tsv'
)
benchmark_df = pd.read_csv(benchmark_url, sep='\t')

# NOTE(review): ground_df may modify benchmark_df in place and return
# nothing — confirm whether these return values are meaningful.
subject = gilda.ground_df(benchmark_df, 'subject', grounder=grounder)
# Named object_ rather than object so the builtin `object` is not shadowed
# for every cell executed afterwards.
object_ = gilda.ground_df(benchmark_df, 'object', grounder=grounder)

benchmark_df
