# Subgraph Analysis (1/2)

This notebook contains the creation of all chemical-disease pairs in our subgraphs.

# Pre-requirements

1. Installation of drug2ways
1. Run the earlier notebooks (notebooks 2, 3, and 4)

# Imports

In [1]:
import pandas as pd
import os
import json
import logging
from tqdm import tqdm
from itertools import product
from networkx import DiGraph

from utils import DATA_DIR, get_paths, filter_dataset, KG_DATA_PATH

In [2]:
# Only CRITICAL records are let through: the root configuration and the
# 'drug2ways' logger are both raised to that level.
logging.basicConfig(level=logging.CRITICAL)
logging.getLogger('drug2ways').setLevel(logging.CRITICAL)

# Logger named after this module / notebook.
logger = logging.getLogger(__name__)

# Load dataset-generated network dataframe

In [3]:
# Both normalized knowledge graphs are stored as tab-separated files in the
# same 'normalized' directory under KG_DATA_PATH.
normalized_kg_dir = os.path.join(KG_DATA_PATH, 'normalized')
openbiolink_path = os.path.join(normalized_kg_dir, 'openbiolink_kg_normalized.tsv')
custom_path = os.path.join(normalized_kg_dir, 'custom_kg_normalized.tsv')

# Read each graph file into its own dataframe (OpenBioLink first, then custom).
openbiolink_df, custom_df = (
    pd.read_csv(kg_path, sep='\t')
    for kg_path in (openbiolink_path, custom_path)
)

# Load datasets

In [4]:
def _load_harmonized_expression(dataset_name):
    """Read ``<DATA_DIR>/<dataset_name>/normalized/harmonized_expression.json``.

    :param dataset_name: name of the dataset sub-directory inside ``DATA_DIR``
    :return: the parsed JSON content of the file
    """
    json_path = os.path.join(DATA_DIR, dataset_name, 'normalized', 'harmonized_expression.json')
    # JSON is UTF-8 by specification; without an explicit encoding ``open`` falls
    # back to the locale's preferred encoding, which is not UTF-8 on every platform.
    with open(json_path, encoding='utf-8') as json_file:
        return json.load(json_file)


# One harmonized expression file per dataset, all loaded the same way.
creed_dict = _load_harmonized_expression('creeds')
geo_dict = _load_harmonized_expression('geo')
l1000_dict = _load_harmonized_expression('l1000')
open_target_dict = _load_harmonized_expression('open_targets')

# Filtering datasets based on network

In [5]:
def _filter_on_both_graphs(dataset):
    """Run ``filter_dataset`` on ``dataset`` for each graph dataframe.

    :param dataset: dataset to pass to ``filter_dataset``
    :return: tuple ``(filtered against openbiolink_df, filtered against custom_df)``
    """
    return (
        filter_dataset(dataset=dataset, graph_df=openbiolink_df),
        filter_dataset(dataset=dataset, graph_df=custom_df),
    )


# NOTE: each ``*_dict`` name below is rebound from the loaded dataset to a
# {'openbio': ..., 'custom': ...} mapping. Re-running this cell without first
# re-running the loading cell would feed that mapping back into filter_dataset.

# CREED
creed_openbio, creed_custom = _filter_on_both_graphs(creed_dict)
creed_dict = dict(openbio=creed_openbio, custom=creed_custom)

# GEO
geo_openbio, geo_custom = _filter_on_both_graphs(geo_dict)
geo_dict = dict(openbio=geo_openbio, custom=geo_custom)

# OpenTarget
target_openbio, target_custom = _filter_on_both_graphs(open_target_dict)
open_target_dict = dict(openbio=target_openbio, custom=target_custom)

# L1000
l1000_openbio, l1000_custom = _filter_on_both_graphs(l1000_dict)
l1000_dict = dict(openbio=l1000_openbio, custom=l1000_custom)


# Creating information dict for each chemical-disease pair

In [6]:
# Lookup table from a short dataset name to that dataset's dictionary.
MAP = dict(
    creed=creed_dict,
    target=open_target_dict,
    geo=geo_dict,
    l1000=l1000_dict,
)

In [7]:
# Every dataset passed as ``chemical_dict`` is paired with every dataset passed
# as ``disease_dict``; get_paths is called once per pair, using the 'openbio'
# entries and the OpenBioLink graph dataframe only.
chemical_sources = ['l1000', 'creed']
disease_sources = ['geo', 'target']

for c, d in product(chemical_sources, disease_sources):
    print(f'### {c}-{d} ###')
    print('Openbiolink')

    get_paths(
        graph_df=openbiolink_df,
        disease_dict=MAP[d]['openbio'],
        chemical_dict=MAP[c]['openbio'],
        graph_name='_'.join((c, d)),
        openbio=True,
    )

### l1000-geo ###
Openbiolink


100%|██████████| 14472/14472 [1:28:13<00:00,  2.73it/s]   


Skipped 14048 nodes from 14472
### l1000-target ###
Openbiolink


100%|██████████| 37788/37788 [3:23:04<00:00,  3.10it/s]     


Skipped 37156 nodes from 37788
### creed-geo ###
Openbiolink


100%|██████████| 1728/1728 [41:31<00:00,  1.44s/it]  


Skipped 1532 nodes from 1728
### creed-target ###
Openbiolink


100%|██████████| 4512/4512 [59:51<00:00,  1.26it/s]   


Skipped 4211 nodes from 4512
