# Subgraph Analysis (2/2)

This notebook contains analysis of each chemical-disease pair based on the subgraph created and the lmax values.

# Pre-requirements

1. Installation of drug2ways
1. Running notebook 5 - Creates lmax paths between the pairs
1. Running 5.1 analysis notebook - Saves the lmax specific df into files

# Imports

In [1]:
import pandas as pd
import ast
import os
import json
import logging
from tqdm import tqdm
from collections import Counter
from itertools import product

from networkx import DiGraph, connected_components

from utils import DATA_DIR, filter_dataset, get_validated_paths, create_graph_from_df, KG_DATA_PATH

In [2]:
logger = logging.getLogger(__name__)
logging.getLogger('drug2ways').setLevel(logging.CRITICAL)
logging.basicConfig(level=logging.CRITICAL)

# Load dataset-generated network dataframe

In [3]:
openbiolink_df = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'normalized', 'openbiolink_kg_normalized.tsv'),
    sep='\t'
)
openbiolink_df.rename(columns={'relation': 'polarity'}, inplace=True)

custom_df = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'normalized', 'custom_kg_normalized.tsv'),
    sep='\t'
)
custom_df.rename(columns={'relation': 'polarity'}, inplace=True)

# Load datasets

In [4]:
with open(os.path.join(DATA_DIR, 'creeds', 'normalized', 'harmonized_expression-new1.json')) as file:
    creed_dict = json.load(file)
    
with open(os.path.join(DATA_DIR, 'geo', 'normalized', 'harmonized_expression-new1.json')) as file2:
    geo_dict = json.load(file2)
    
with open(os.path.join(DATA_DIR, 'l1000', 'normalized', 'harmonized_expression-new1.json')) as file3:
    l1000_dict = json.load(file3)
    
with open(os.path.join(DATA_DIR, 'open_targets', 'normalized', 'harmonized_expression-new1.json')) as file4:
    open_target_dict = json.load(file4)

# Filterting dataset based on network

In [5]:
# CREED
creed_openbio = filter_dataset(dataset=creed_dict, graph_df=openbiolink_df)
creed_custom = filter_dataset(dataset=creed_dict, graph_df=custom_df)

creed_dict = {'openbio': creed_openbio, 'custom': creed_custom}

# GEO
geo_openbio = filter_dataset(dataset=geo_dict, graph_df=openbiolink_df)
geo_custom = filter_dataset(dataset=geo_dict, graph_df=custom_df)

geo_dict = {'openbio': geo_openbio, 'custom': geo_custom}

# OpenTarget
target_openbio = filter_dataset(dataset=open_target_dict, graph_df=openbiolink_df)
target_custom = filter_dataset(dataset=open_target_dict, graph_df=custom_df)

open_target_dict = {'openbio': target_openbio, 'custom': target_custom}

# L1000
l1000_openbio = filter_dataset(dataset=l1000_dict, graph_df=openbiolink_df)
l1000_custom = filter_dataset(dataset=l1000_dict, graph_df=custom_df)

l1000_dict = {'openbio': l1000_openbio, 'custom': l1000_custom}


# Load clinical and FDA data

In [6]:
with open(os.path.join(DATA_DIR, 'gold-standard', 'clinical_pairs.json')) as file:
    clinical_pair_dict = json.load(file).keys()

In [7]:
with open(os.path.join(DATA_DIR, 'gold-standard', 'fda_pairs.json')) as file1:
    fda_list = json.load(file1)

fda_pairs = set()
for el in fda_list:
    fda_pairs.add(el[0] + '_' + el[1])

# Load the data files

In [8]:
lmax_table = pd.DataFrame(columns=[
    'source', 
    'target', 
    'number_of_paths', 
    'subgraph_name',
    'paths',
])

for file in tqdm(os.listdir(os.path.join(DATA_DIR, 'lmax-dfs'))):
    
    if '5' in file or '6' in file:
        lmax_df = pd.read_csv(
            os.path.join(DATA_DIR, 'lmax-dfs', file), 
            sep='\t'
        )

        lmax_val = file.split('-')[0].split('_')[1]

        lmax_df = lmax_df[lmax_df['number_of_paths'] != 0]
        lmax_df.drop('signs', axis=1, inplace=True)
        lmax_df.drop_duplicates(inplace=True)
        lmax_df['lmax'] = lmax_val
        lmax_table = pd.concat([lmax_table, lmax_df], ignore_index=True)
lmax_table

100%|██████████| 4/4 [00:05<00:00,  1.47s/it]


Unnamed: 0,source,target,number_of_paths,subgraph_name,paths,lmax
0,pubchem.compound:445154,mondo:0018874,38,custom,"[['pubchem.compound:445154', 'ncbigene:207', '...",5
1,pubchem.compound:445154,mondo:0006639,12,custom,"[['pubchem.compound:445154', 'ncbigene:207', '...",5
2,pubchem.compound:445154,mondo:0004979,44,custom,"[['pubchem.compound:445154', 'ncbigene:207', '...",5
3,pubchem.compound:445154,mondo:0004985,18,custom,"[['pubchem.compound:445154', 'ncbigene:207', '...",5
4,pubchem.compound:445154,mondo:0004989,80,custom,"[['pubchem.compound:445154', 'ncbigene:207', '...",5
...,...,...,...,...,...,...
37822,pubchem.compound:5328940,mondo:0007254,9601,custom,"[['pubchem.compound:5328940', 'ncbigene:25', '...",6
37823,pubchem.compound:5328940,mondo:0008170,5072,custom,"[['pubchem.compound:5328940', 'ncbigene:3055',...",6
37824,pubchem.compound:5328940,mondo:0008903,5453,custom,"[['pubchem.compound:5328940', 'ncbigene:818', ...",6
37825,pubchem.compound:5328940,mondo:0008315,1533,custom,"[['pubchem.compound:5328940', 'ncbigene:818', ...",6


# Creating information df for each chemical-disease pair

In [9]:
MAP = {
    'creed' : creed_dict,
    'target': open_target_dict,
    'geo': geo_dict,
    'l1000': l1000_dict,
}

In [10]:
for c, d in product(['creed', 'l1000'], ['geo', 'target']):
    c_set = MAP[c]
    d_set = MAP[d]
    graph_name = c + '_' + d
    
    print(f'### {c}-{d} ###')
    
    df = pd.DataFrame(columns=[
        'source',
        'target',
        'number_of_paths',
        'number_of_concordant_paths',
        'in_clinical_trial',
        'in_fda',
        'number_of_concordant_activatory_paths',
        'number_of_concordant_inhibitory_paths',
        'subgraph_size',
        'number_of_unique_nodes',
        'lmax',
        'subgraph_name',
    ])
    
    if not os.path.exists(os.path.join(DATA_DIR, 'concordant_paths')):
        os.mkdir(os.path.join(DATA_DIR, 'concordant_paths'))
    
    NAME = f'{graph_name}-lmax-threshold-1.0.tsv'
    
    if os.path.exists(NAME):
        continue
        
    # Iterating different chemical-disease pair
    
    for gname in ['openbio', 'custom']:
        print(f'### {gname} ###')
        for chemical, disease in tqdm(
            product(c_set[gname], d_set[gname]),
            total=len(c_set[gname]) * len(d_set[gname])
        ):
            if gname == 'openbio':
                graph_copy = create_graph_from_df(openbiolink_df)
            else:
                graph_copy = create_graph_from_df(custom_df)
            graph = graph_copy.copy()

            final_df = lmax_table[(lmax_table[['source','target']].values == [chemical, disease]).all(axis=1)]

            if not final_df.empty:
                for source, target, _, subgraph_name, paths, lmax in final_df.values:
                    if subgraph_name == gname:
                        results = get_validated_paths(
                            directed_graph=graph,
                            source=source,
                            target=target,
                            all_paths=ast.literal_eval(paths),
                            drug_dict=c_set[gname][source],
                            disease_dict=d_set[gname][target],
                            clinical_pair_dict=clinical_pair_dict,
                            fda_pairs=fda_pairs
                        )

                        if results['number_of_concordant_paths'] != 0:
                            results['lmax'] = lmax
                            
                            tmp_df = pd.DataFrame(results, index=[0])
                            tmp_df['subgraph_name'] = gname
                            df = pd.concat([df, tmp_df], ignore_index=True)

    file_path = os.path.join(DATA_DIR, 'concordant_paths', NAME)

    df.to_csv(file_path, sep='\t', index=False)

  0%|          | 0/1728 [00:00<?, ?it/s]

### creed-geo ###
### openbio ###


100%|██████████| 1728/1728 [13:03<00:00,  2.21it/s]
  0%|          | 0/935 [00:00<?, ?it/s]

### custom ###


100%|██████████| 935/935 [08:57<00:00,  1.74it/s]
  0%|          | 0/4512 [00:00<?, ?it/s]

### creed-target ###
### openbio ###


100%|██████████| 4512/4512 [36:29<00:00,  2.06it/s]
  0%|          | 0/1925 [00:00<?, ?it/s]

### custom ###


100%|██████████| 1925/1925 [19:30<00:00,  1.64it/s]
  0%|          | 0/14472 [00:00<?, ?it/s]

### l1000-geo ###
### openbio ###


100%|██████████| 14472/14472 [1:57:33<00:00,  2.05it/s] 
  0%|          | 0/4964 [00:00<?, ?it/s]

### custom ###


100%|██████████| 4964/4964 [54:46<00:00,  1.51it/s] 
  0%|          | 0/37788 [00:00<?, ?it/s]

### l1000-target ###
### openbio ###


100%|██████████| 37788/37788 [5:03:35<00:00,  2.07it/s]  
  0%|          | 0/10220 [00:00<?, ?it/s]

### custom ###


100%|██████████| 10220/10220 [1:37:44<00:00,  1.74it/s]
