In [13]:
import pandas as pd
from tqdm import tqdm
import rdflib
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network

def get_edges(graph, name):
    return(G.edges([name]))

# Recursive downfill
def downfill(graph: nx.DiGraph, drug: str, disease: str, curr: pd.DataFrame, mondoNodes, inheritanceString: str):
    #print("downfilling treats edge between ", drug, " and ", disease)
    drugInfo = curr[curr['drug ID']==drug].iloc[0]
    
    curr.loc[len(curr.index)] = [drug + "|" + disease, 
                                 mondoNodes[mondoNodes.id==disease]['name'].to_string(index=False), 
                                 drugInfo['drug ID Label'],
                                 drug, 
                                 disease, 
                                 drugInfo['active ingredients in therapy'], 
                                 inheritanceString,
                                 True] 
    children = get_edges(graph, disease)
    if len(children)==0:
        return
    else:
        child_diseases = [x[1] for x in list(children)]
        for d in child_diseases:
            downfill(graph, drug, d, curr, mondo_nodes, inheritanceString + "-->" + mondoNodes[mondoNodes.id==disease]['name'].to_string(index=False))

print("importing indication list...")
indication_list = pd.read_csv("../merge_lists/indicationList.tsv", sep='\t')

K = (False for idx, row in indication_list.iterrows())

indication_list['inferred_from_mondo'] = list(K)

print("importing mondo content and filtering nodes...")
mondo_edges = pd.read_csv("mondo_edges.tsv", sep="\t")
mondo_nodes = pd.read_csv("mondo_nodes.tsv", sep="\t")

disease_nodes = mondo_nodes[mondo_nodes.category=='biolink:Disease']

disease_edges = mondo_edges[mondo_edges['subject'].str.contains("MONDO")]
disease_edges = disease_edges[disease_edges['object'].str.contains("MONDO")]
disease_edges = disease_edges[disease_edges['predicate']=="biolink:subclass_of"]

disease_nodes.to_excel("mondo_disease_nodes.xlsx")
disease_edges.to_excel("mondo_disease_edges.xlsx")


G = nx.DiGraph()
print("building graph...")
for idx, row in tqdm(disease_edges.iterrows()):
    G.add_edge(row['object'], row['subject'])

# now we have a directed graph originating at "Disease". Designate "MONDO:0000001" as HEAD.
head = G.nodes['MONDO:0000001']
list_drug_nodes = list(i for i in indication_list['drug ID'])
list_disease_nodes = list(i for i in indication_list['disease IDs'])

# NOTE: child refers to being downhill in the MONDO tree, not to human children.
print("Downfilling based on MONDO hierarchy...")
for idx, disease in tqdm(enumerate(list_disease_nodes), total=len(list_disease_nodes)):
    store_dict = {}
    children = get_edges(G, disease)
    if len(children)>0 and len(children) <= 3:
        child_diseases = [x[1] for x in list(children)]
        #print(disease, " has ", len(child_diseases) ," downhill children: ", child_diseases)
        for K in child_diseases:
            downfill(G, list_drug_nodes[idx], K, indication_list, disease_nodes, disease_nodes[disease_nodes.id==disease]['name'].to_string(index=False))

indication_list.to_excel("indicationList_downfilled.xlsx")

importing indication list...
importing mondo content and filtering nodes...
building graph...


38242it [00:00, 46696.08it/s]


Downfilling based on MONDO hierarchy...


100%|█████████████████████████████████████| 15744/15744 [03:03<00:00, 85.76it/s]


In [14]:
print(len(set(list(indication_list['drug|disease']))))
print(len(set(list(indication_list['drug ID']))))
print(len(set(list(indication_list['disease IDs']))))

34335
2353
6540


In [23]:
# Filter by diseases in disease list
diseaseList = pd.read_csv('matrix-disease-list.tsv', sep='\t')
diseaseListItems = list(diseaseList['category_class'])

in_disease_list = []

for idx, row in tqdm(indication_list.iterrows(), total=len(indication_list)):
    if row['disease IDs'] not in diseaseListItems:
        in_disease_list.append(False)
    else:
        in_disease_list.append(True)

indication_list['in_disease_list'] = in_disease_list
indication_list.columns = ['drug|disease', 'disease_label', 'drug_label', 'drug_ID', 'disease_ID', 'active_ingredients', 'source_text', 'inferred_from_mondo','in_disease_list']
# indication_list.rename("disease ID labels":"disease_label")
# indication_list.rename("disease IDs":"disease_ID")
# indication_list.rename("list of diseases":"source_text")
# indication_list.rename("active ingredients in therapy":"active_ingredients")
# indication_list.rename("drug ID label":"drug_label")
# indication_list.rename("drug ID":"drug_ID")


indication_list.to_excel("indicationList_downfilled.xlsx")

indication_list.to_csv("indicationList.tsv", sep="\t")

  0%|                                                 | 0/37073 [00:00<?, ?it/s]


KeyError: 'disease IDs'

In [24]:
indication_list.to_csv("indicationList.tsv", sep="\t")