In [16]:
import pandas as pd
from tqdm import tqdm
import rdflib
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network


def get_edges(graph, name):
    """Return the edges of ``graph`` incident to the node ``name``.

    Parameters
    ----------
    graph : object exposing ``edges(nbunch)`` (a networkx graph in this notebook)
        The graph to query. Previously this argument was ignored and the
        module-level ``G`` was used instead, so the function only worked when
        a global ``G`` happened to exist and always queried that graph.
    name : hashable
        Node identifier whose edges are requested.

    Returns
    -------
    Whatever ``graph.edges([name])`` returns; callers in this file take
    ``len()`` of it and read element ``[1]`` of each edge as the child node.
    """
    return graph.edges([name])

# Recursive downfill
def downfill(graph: "nx.DiGraph", drug: str, disease: str, curr: pd.DataFrame, mondoNodes, inheritanceString: str):
    """Append a drug-disease row for ``disease`` and for every descendant of it.

    The row for ``disease`` is appended to ``curr`` in place, then the function
    recurses into each node reachable through an outgoing edge of ``disease``
    in ``graph``.

    Parameters
    ----------
    graph : nx.DiGraph
        Hierarchy whose edges point from a disease to its children.
    drug : str
        Drug identifier; must already appear in ``curr['drug ID']`` because the
        drug label and active ingredients are copied from its first row.
    disease : str
        Disease identifier receiving the new row.
    curr : pd.DataFrame
        Table that is extended in place. The new row is written positionally,
        so the seven columns are assumed to be ordered: pair id, disease label,
        drug label, drug id, disease id, active ingredients, inheritance path
        (TODO confirm against the input file).
    mondoNodes : pd.DataFrame
        Node table with ``id`` and ``name`` columns used to resolve labels.
    inheritanceString : str
        ``"-->"``-joined chain of ancestor names recorded on the new row.

    Notes
    -----
    * The recursion now forwards ``mondoNodes`` rather than the global
      ``mondo_nodes``, so every level uses the table the caller passed in.
    * Children are read from ``graph`` directly, so the ``graph`` argument is
      honoured regardless of any module-level graph.
    * The new row is stored under label ``len(curr.index)``; this assumes that
      label is not already in use (true for a default 0..n-1 index).
    * A node reachable via several parents is visited once per path.
    """
    drugInfo = curr[curr['drug ID'] == drug].iloc[0]
    # Resolve the label once; it is used for this row and for the children's path.
    disease_name = mondoNodes[mondoNodes.id == disease]['name'].to_string(index=False)

    curr.loc[len(curr.index)] = [drug + "|" + disease,
                                 disease_name,
                                 drugInfo['drug ID Label'],
                                 drug,
                                 disease,
                                 drugInfo['active ingredients in therapy'],
                                 inheritanceString]

    # Each edge is (parent, child); materialise the children before recursing.
    child_diseases = [edge[1] for edge in graph.edges([disease])]
    child_path = inheritanceString + "-->" + disease_name
    for d in child_diseases:
        downfill(graph, drug, d, curr, mondoNodes, child_path)

# --- Load the curated indication list -------------------------------------
print("importing indication list...")
indication_list = pd.read_csv("../merge_lists/indicationList.tsv", sep="\t")

# --- Load the MONDO node/edge tables and keep only the disease hierarchy ---
print("importing mondo content and filtering nodes...")
mondo_edges = pd.read_csv("mondo_edges.tsv", sep="\t")
mondo_nodes = pd.read_csv("mondo_nodes.tsv", sep="\t")

# Nodes: only rows categorised as biolink:Disease.
disease_nodes = mondo_nodes.loc[mondo_nodes['category'] == 'biolink:Disease']

# Edges: MONDO-to-MONDO subclass_of relations, applied as three successive filters.
disease_edges = (
    mondo_edges
    .loc[lambda edges: edges['subject'].str.contains("MONDO")]
    .loc[lambda edges: edges['object'].str.contains("MONDO")]
    .loc[lambda edges: edges['predicate'] == "biolink:subclass_of"]
)

# Persist both filtered tables for inspection outside the notebook.
disease_nodes.to_excel("mondo_disease_nodes.xlsx")
disease_edges.to_excel("mondo_disease_edges.xlsx")


G = nx.DiGraph()
print("building graph...")
# Each subclass_of row becomes an edge object -> subject, i.e. the edge points
# from the superclass down to the subclass.
for subclass_id, superclass_id in tqdm(zip(disease_edges['subject'], disease_edges['object'])):
    G.add_edge(superclass_id, subclass_id)

# The directed graph originates at "Disease"; "MONDO:0000001" is designated HEAD.
# This lookup also fails loudly if that node never made it into the graph.
head = G.nodes['MONDO:0000001']

# Parallel lists: position i holds the drug and the disease of indication row i.
list_drug_nodes = list(indication_list['drug ID'])
list_disease_nodes = list(indication_list['disease IDs'])


# NOTE: child refers to being downhill in the MONDO tree, not to human children.
# Diseases with more direct children than this are skipped ("too many children").
MAX_DIRECT_CHILDREN = 3

print("Downfilling based on MONDO hierarchy...")
# list_drug_nodes / list_disease_nodes were snapshotted before this loop, so rows
# appended to indication_list by downfill() are not themselves re-processed.
for drug, disease in tqdm(zip(list_drug_nodes, list_disease_nodes), total=len(list_disease_nodes)):
    # Each edge is (parent, child); keep only the child end.
    child_diseases = [edge[1] for edge in get_edges(G, disease)]
    if not 0 < len(child_diseases) <= MAX_DIRECT_CHILDREN:
        continue

    # The indicated disease's own name seeds the inheritance path; look it up once.
    root_label = disease_nodes[disease_nodes.id == disease]['name'].to_string(index=False)
    for child in child_diseases:
        downfill(G, drug, child, indication_list, disease_nodes, root_label)

indication_list.to_excel("indicationList_downfilled.xlsx")

importing indication list...
importing mondo content and filtering nodes...
building graph...


38242it [00:00, 46449.31it/s]


Downfilling based on MONDO hierarchy...


15744it [02:56, 89.02it/s] 


In [17]:
# Dump the table, then report the number of distinct values in each key column.
print(indication_list)
for column in ('drug|disease', 'drug ID', 'disease IDs'):
    print(len(set(indication_list[column])))

                      drug|disease  \
0            DOID:10017|CHEBI:7915   
1            DOID:12177|CHEBI:4911   
2      DOID:12177|DRUGBANK:DB00028   
3           DOID:14275|CHEBI:16469   
4           DOID:14275|CHEBI:28689   
...                            ...   
37068    CHEBI:30769|MONDO:0958191   
37069    CHEBI:40279|MONDO:0020722   
37070    CHEBI:40279|MONDO:0958191   
37071     CHEBI:6446|MONDO:0020722   
37072     CHEBI:6446|MONDO:0958191   

                                       disease ID labels  \
0                    multiple endocrine neoplasia type 1   
1                       common variable immunodeficiency   
2                       common variable immunodeficiency   
3                                         atrophic vulva   
4                                         atrophic vulva   
...                                                  ...   
37068  nephrolithiasis, calcium oxalate, 2, with or w...   
37069   nephrolithiasis susceptibility caused by SLC26A1   
370

In [12]:
# Filter by diseases in disease list

diseaseList = pd.read_csv('matrix-disease-list.tsv', sep='\t')

diseaseListItems = list(diseaseList['category_class'])
print(diseaseList)

# A set gives constant-time membership tests; scanning the list for every row
# made this step proportional to (indication rows x disease-list rows).
allowed_disease_ids = set(diseaseListItems)

# Index labels of every indication whose disease is absent from the disease list.
to_drop = [
    idx
    for idx, disease_id in tqdm(indication_list['disease IDs'].items(),
                                total=len(indication_list))
    if disease_id not in allowed_disease_ids
]




      category_class                                   label  \
0      MONDO:0017545                      zygodactyly type 4   
1      MONDO:0017544                      zygodactyly type 3   
2      MONDO:0017543                      zygodactyly type 2   
3      MONDO:0012351                      zygodactyly type 1   
4      MONDO:0043988                     zoster sine herpete   
...              ...                                     ...   
18535  MONDO:0019784            12q14 microdeletion syndrome   
18536  MONDO:0017781          12p12.1 microdeletion syndrome   
18537  MONDO:0018632     11q22.2q22.3 microdeletion syndrome   
18538  MONDO:0017580       11p15.4 microduplication syndrome   
18539  MONDO:0017180  10q22.3q23.3 microduplication syndrome   

                                              definition  \
0                                                    NaN   
1                                                    NaN   
2                                                  

37073it [00:05, 6351.09it/s]


In [13]:
# Remove the rows flagged in to_drop. errors="ignore" keeps this cell re-runnable:
# without it a second execution raises KeyError because the labels are already gone.
indication_list.drop(index=to_drop, inplace=True, errors="ignore")


In [14]:
# Print the current state of indication_list for a visual check.
print(indication_list)

                        drug|disease  \
1513   MONDO:0000050|UNII:6D848RA61B   
1514   MONDO:0000050|UNII:8FOJ430U94   
1515   MONDO:0000050|UNII:NQX9KB6PCL   
1516   MONDO:0000050|UNII:OP35X9610Y   
1518        MONDO:0000082|CHEBI:3746   
...                              ...   
37068      CHEBI:30769|MONDO:0958191   
37069      CHEBI:40279|MONDO:0020722   
37070      CHEBI:40279|MONDO:0958191   
37071       CHEBI:6446|MONDO:0020722   
37072       CHEBI:6446|MONDO:0958191   

                                       disease ID labels  \
1513       isolated congenital growth hormone deficiency   
1514       isolated congenital growth hormone deficiency   
1515       isolated congenital growth hormone deficiency   
1516       isolated congenital growth hormone deficiency   
1518                               pelvic organ prolapse   
...                                                  ...   
37068  nephrolithiasis, calcium oxalate, 2, with or w...   
37069   nephrolithiasis susceptibility 

In [15]:
# Number of distinct drug-disease pairs, drugs, and diseases, in that order.
for column in ('drug|disease', 'drug ID', 'disease IDs'):
    print(len(set(indication_list[column])))

22980
1948
4050
