In [4]:
import pandas as pd
from tqdm import tqdm
import rdflib
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network


def get_edges(graph, name):
    """Return the edges of ``graph`` that start at the node ``name``.

    Parameters
    ----------
    graph
        Any object exposing ``edges(nbunch)``; the notebook passes an
        ``nx.DiGraph``.
    name
        Identifier of the node whose edges are requested.

    Returns
    -------
    The result of ``graph.edges([name])``, unchanged. Callers in this
    notebook take ``len()`` of it and read element ``[1]`` of each edge.
    """
    # Query the graph that was passed in. The previous version ignored the
    # argument and always read the module-level ``G``.
    return graph.edges([name])

# Recursive downfill: record a drug/disease row, then repeat for every disease reachable from it in the graph.
def downfill(graph: nx.DiGraph, drug: str, disease: str, curr: pd.DataFrame, mondoNodes, inheritanceString: str):
    """Append a drug/disease row to ``curr`` and recurse into the disease's children.

    Parameters
    ----------
    graph
        Directed graph whose out-edges from a node lead to its children.
    drug
        Value matched against ``curr['drug ID']``.
    disease
        Node identifier of the disease receiving the new row.
    curr
        DataFrame that is mutated in place: one row is appended per visited
        disease. The row is written positionally with seven values, so
        ``curr`` is assumed to have exactly seven columns in the matching
        order and a default 0..n-1 index -- TODO confirm against the TSV.
    mondoNodes
        DataFrame with ``id`` and ``name`` columns used to resolve names.
    inheritanceString
        Path of disease names leading to ``disease``, joined by ``"-->"``.

    Returns
    -------
    None. The result is the side effect on ``curr``.

    Raises
    ------
    IndexError
        If no row of ``curr`` has ``'drug ID' == drug``.
    """
    # The first row recorded for this drug supplies the label and ingredients.
    drugInfo = curr[curr['drug ID'] == drug].iloc[0]

    # Resolve the name once; it is needed for the new row and for the path
    # handed to every child. An empty lookup would otherwise render as the
    # pandas placeholder "Series([], )", so fall back to the identifier.
    nameMatches = mondoNodes[mondoNodes.id == disease]['name']
    diseaseName = nameMatches.to_string(index=False) if len(nameMatches) > 0 else disease

    curr.loc[len(curr.index)] = [
        drug + "|" + disease,
        diseaseName,
        drugInfo['drug ID Label'],
        drug,
        disease,
        drugInfo['active ingredients in therapy'],
        inheritanceString,
    ]

    childPath = inheritanceString + "-->" + diseaseName
    # Each edge is (parent, child); a leaf has no edges and ends the recursion.
    for edge in list(get_edges(graph, disease)):
        # Pass the caller's node table down. The previous version switched to
        # the global ``mondo_nodes`` here, ignoring the ``mondoNodes`` argument.
        downfill(graph, drug, edge[1], curr, mondoNodes, childPath)

# Only diseases with between 1 and this many direct children are downfilled;
# terms with more children are skipped.
MAX_DIRECT_CHILDREN = 5

print("importing indication list...")
indication_list = pd.read_csv("../merge_lists/indicationList.tsv", sep='\t')

print("importing mondo content and filtering nodes...")
mondo_edges = pd.read_csv("mondo_edges.tsv", sep="\t")
mondo_nodes = pd.read_csv("mondo_nodes.tsv", sep="\t")

disease_nodes = mondo_nodes[mondo_nodes.category=='biolink:Disease']

# Keep only subclass_of edges whose subject and object both mention MONDO.
# na=False makes rows with a missing subject/object count as non-matching
# instead of putting NaN into the boolean mask.
is_mondo_subclass_edge = (
    mondo_edges['subject'].str.contains("MONDO", na=False)
    & mondo_edges['object'].str.contains("MONDO", na=False)
    & (mondo_edges['predicate'] == "biolink:subclass_of")
)
disease_edges = mondo_edges[is_mondo_subclass_edge]

disease_nodes.to_excel("mondo_disease_nodes.xlsx")
disease_edges.to_excel("mondo_disease_edges.xlsx")


G = nx.DiGraph()
print("building graph...")
# Edges point from object (superclass) to subject (subclass), so following
# out-edges walks down the hierarchy. total= lets tqdm show real progress.
for parent, child in tqdm(zip(disease_edges['object'], disease_edges['subject']),
                          total=len(disease_edges)):
    G.add_edge(parent, child)

# now we have a directed graph originating at "Disease". Designate "MONDO:0000001" as HEAD.
# This lookup raises KeyError if the root term is not in the graph; ``head``
# itself is not used again in the cells visible here.
head = G.nodes['MONDO:0000001']

# Snapshot both columns as lists before the loop: downfill() appends rows to
# indication_list, and those new rows must not be iterated again.
list_drug_nodes = list(indication_list['drug ID'])
list_disease_nodes = list(indication_list['disease IDs'])


# NOTE: child refers to being downhill in the MONDO tree, not to human children.
print("Downfilling based on MONDO hierarchy...")
# enumerate(tqdm(...)) rather than tqdm(enumerate(...)) so the bar knows the total.
for idx, disease in enumerate(tqdm(list_disease_nodes)):
    children = get_edges(G, disease)
    if 0 < len(children) <= MAX_DIRECT_CHILDREN:
        child_diseases = [x[1] for x in list(children)]
        # The starting path is the same for every child, so resolve it once.
        root_name = disease_nodes[disease_nodes.id==disease]['name'].to_string(index=False)
        for K in child_diseases:
            downfill(G, list_drug_nodes[idx], K, indication_list, disease_nodes, root_name)

importing indication list...
importing mondo content and filtering nodes...
building graph...


38242it [00:00, 45598.50it/s]


Downfilling based on MONDO hierarchy...


15744it [16:27, 15.94it/s] 


In [23]:
# Sanity check: report the container type of disease_nodes, then the name
# stored for the MONDO root term.
print(type(disease_nodes))
root_rows = disease_nodes[disease_nodes.id=="MONDO:0000001"]
print(root_rows['name'].to_string(index=False))


<class 'pandas.core.frame.DataFrame'>
disease


In [3]:
# Write the current contents of indication_list to an Excel workbook.
# NOTE(review): run this after the downfill cell so the appended rows are
# included; the execution counts in this notebook are out of order.
indication_list.to_excel(excel_writer="indicationList_downfilled.xlsx")


In [11]:
# Show the outgoing edges of the MONDO root term, i.e. its direct children in G.
root_edges = G.edges('MONDO:0000001')
print(root_edges)


[('MONDO:0000001', 'MONDO:0005583'), ('MONDO:0000001', 'MONDO:0700096')]


In [14]:
# Materialise the edge view of G into a list and report how many edges it holds.
E = [edge for edge in G.edges]
print(len(E))

38242
