# Subgraph Full Analysis

This notebook contains the selection of concordant paths, based on both drug and disease transcriptomic data, for all chemical-disease pairs in our subgraphs.

# Imports

In [1]:
import pandas as pd
import os
import json
import logging
from tqdm import tqdm
from itertools import product
from networkx import DiGraph

from utils import (get_paths, filter_dataset, get_validated_paths, create_graph_from_df,
                   get_path_count, DATA_DIR, KG_DATA_PATH)

In [2]:
# Keep the notebook output quiet: only CRITICAL records get through, both for
# the root configuration and for the 'drug2ways' logger specifically
# (presumably the library used by the path helpers -- TODO confirm).
logging.basicConfig(level=logging.CRITICAL)
logging.getLogger('drug2ways').setLevel(logging.CRITICAL)

# Notebook-level logger.
logger = logging.getLogger(__name__)

# Load KG

In [3]:
# Both knowledge graphs are read from tab-separated files under KG_DATA_PATH.
# Their "relation" column is renamed to "polarity" right after loading.
openbiolink_path = os.path.join(KG_DATA_PATH, 'openbiolink_filtered_kg.tsv')
openbiolink_df = (
    pd.read_csv(openbiolink_path, sep='\t')
    .rename(columns={"relation": "polarity"})
)

custom_path = os.path.join(KG_DATA_PATH, 'custom_filtered_kg.tsv')
custom_df = (
    pd.read_csv(custom_path, sep='\t')
    .rename(columns={"relation": "polarity"})
)

# Load datasets

In [4]:
# All harmonized expression dumps live in the same sub-folder of DATA_DIR.
transcriptomics_dir = os.path.join(DATA_DIR, 'transcriptomics')

with open(os.path.join(transcriptomics_dir, 'creed_harmonized_expression.json')) as creed_file:
    creed_dict = json.load(creed_file)

with open(os.path.join(transcriptomics_dir, 'geo_harmonized_expression.json')) as geo_file:
    geo_dict = json.load(geo_file)

with open(os.path.join(transcriptomics_dir, 'lc1000_harmonized_expression.json')) as lc1000_file:
    lc1000_dict = json.load(lc1000_file)

with open(os.path.join(transcriptomics_dir, 'target_harmonized_expression.json')) as target_file:
    open_target_dict = json.load(target_file)

# Filtering datasets based on the network

In [5]:
# CREED
creed_openbio = filter_dataset(dataset=creed_dict, graph_df=openbiolink_df)
creed_custom = filter_dataset(dataset=creed_dict, graph_df=custom_df)

creed_dict = {'openbio': creed_openbio, 'custom': creed_custom}

# GEO
geo_openbio = filter_dataset(dataset=geo_dict, graph_df=openbiolink_df)
geo_custom = filter_dataset(dataset=geo_dict, graph_df=custom_df)

geo_dict = {'openbio': geo_openbio, 'custom': geo_custom}

# OpenTarget
target_openbio = filter_dataset(dataset=open_target_dict, graph_df=openbiolink_df)
target_custom = filter_dataset(dataset=open_target_dict, graph_df=custom_df)

open_target_dict = {'openbio': target_openbio, 'custom': target_custom}

# Lc1000
lc1000_openbio = filter_dataset(dataset=lc1000_dict, graph_df=openbiolink_df)
lc1000_custom = filter_dataset(dataset=lc1000_dict, graph_df=custom_df)

lc1000_dict = {'openbio': lc1000_openbio, 'custom': lc1000_custom}


# Load clinical and drug-indication data

In [6]:
clinical_trial_path = os.path.join(DATA_DIR, 'gold-standard', 'clinical-trial.json')

with open(clinical_trial_path) as clinical_file:
    # Only the top-level keys of the JSON object are kept, so despite its name
    # `clinical_dict` is a keys view rather than a dict.
    clinical_dict = json.load(clinical_file).keys()

# Creating information dict for each chemical-disease pair

In [7]:
# Lookup from dataset short name to its per-knowledge-graph data, so a dataset
# can be selected by name as MAP[<dataset>][<kg name>].
MAP = dict(
    creed=creed_dict,
    target=open_target_dict,
    geo=geo_dict,
    lc1000=lc1000_dict,
)

In [8]:
# Column layout of every '<chemical>_<disease>-results.tsv' file written below.
RESULT_COLUMNS = [
    'source',
    'target',
    'number_of_paths',
    'number_of_concordant_paths',
    'in_clinical_trial',
    'number_of_concordant_activatory_paths',
    'number_of_concordant_inhibitory_paths',
    'subgraph_size',
    'number_of_unique_nodes',
    'lmax',
    'subgraph_name',
]

# Keys copied verbatim from the dict returned by get_validated_paths(); the
# last two columns ('lmax', 'subgraph_name') are filled in from the loop itself.
VALIDATED_KEYS = RESULT_COLUMNS[:-2]

# Relation markers that appear between node names in the paths from get_paths().
RELATION_TOKENS = ('-|', '->')

# Knowledge-graph edge lists keyed by the subgraph name used in the dataset dicts.
KG_FRAMES = {'openbio': openbiolink_df, 'custom': custom_df}

# Create the output folder once, up front; exist_ok makes re-runs safe and
# makedirs also creates missing parent directories.
output_dir = os.path.join(DATA_DIR, 'concordant_paths')
os.makedirs(output_dir, exist_ok=True)

# One results file per (chemical dataset, disease dataset) combination.
for c, d in product(['creed', 'lc1000'], ['target', 'geo']):
    graph_name = c + '_' + d

    print(f'### {c}-{d} ###')

    # Rows are collected as plain dicts and turned into a DataFrame once per
    # file, instead of growing a DataFrame with pd.concat on every hit.
    records = []

    for gname, kg_df in KG_FRAMES.items():
        graph = create_graph_from_df(kg_df)
        # Copy kept from the original code; presumably it protects the freshly
        # built graph from mutation by the helpers -- TODO confirm it is needed.
        kg = graph.copy()

        paths = get_paths(
            graph=kg,
            disease_dict=MAP[d][gname],
            chemical_dict=MAP[c][gname],
        )

        # `paths` maps each lmax value to a list of per-pair entries holding
        # 'source', 'target' and 'paths'.
        for lmax, p_dict in tqdm(paths.items(), desc='Calculating concordance'):
            for p in p_dict:
                # Pairs without any connecting path have nothing to validate.
                if not p['paths']:
                    continue

                # Keep only the nodes of each path, dropping relation markers.
                tmp_paths = [
                    [step for step in path if step not in RELATION_TOKENS]
                    for path in p['paths'].values()
                ]

                chemical = p['source']
                disease = p['target']

                results = get_validated_paths(
                    directed_graph=kg,
                    source=chemical,
                    target=disease,
                    all_paths=tmp_paths,
                    drug_dict=MAP[c][gname][chemical],
                    disease_dict=MAP[d][gname][disease],
                    clinical_pair_dict=clinical_dict,
                )

                # Only pairs with at least one concordant path are reported.
                if results['number_of_concordant_paths'] == 0:
                    continue

                record = {key: results[key] for key in VALIDATED_KEYS}
                record['lmax'] = lmax
                record['subgraph_name'] = gname
                records.append(record)

    # Passing `columns` fixes the column order and still yields a header-only
    # file when no pair had concordant paths.
    df = pd.DataFrame(records, columns=RESULT_COLUMNS)

    n_file_path = os.path.join(output_dir, f'{graph_name}-results.tsv')
    df.to_csv(n_file_path, sep='\t', index=False)

### creed-target ###


Getting paths: 100%|██████████| 620/620 [01:02<00:00,  9.96it/s]
Calculating concordance: 100%|██████████| 3/3 [00:01<00:00,  1.67it/s]
Getting paths: 100%|██████████| 1170/1170 [04:40<00:00,  4.16it/s]
Calculating concordance: 100%|██████████| 3/3 [00:12<00:00,  4.11s/it]


### creed-geo ###


Getting paths: 100%|██████████| 310/310 [00:43<00:00,  7.06it/s]
Calculating concordance: 100%|██████████| 3/3 [00:01<00:00,  2.01it/s]
Getting paths: 100%|██████████| 510/510 [02:41<00:00,  3.16it/s]
Calculating concordance: 100%|██████████| 3/3 [00:11<00:00,  3.85s/it]


### lc1000-target ###


Getting paths: 100%|██████████| 3820/3820 [03:56<00:00, 16.16it/s]
Calculating concordance: 100%|██████████| 3/3 [00:04<00:00,  1.45s/it]
Getting paths: 100%|██████████| 7722/7722 [38:23<00:00,  3.35it/s]  
Calculating concordance: 100%|██████████| 3/3 [01:32<00:00, 30.91s/it]


### lc1000-geo ###


Getting paths: 100%|██████████| 1910/1910 [01:58<00:00, 16.07it/s]
Calculating concordance: 100%|██████████| 3/3 [00:04<00:00,  1.53s/it]
Getting paths: 100%|██████████| 3366/3366 [17:59<00:00,  3.12it/s]  
Calculating concordance: 100%|██████████| 3/3 [01:53<00:00, 37.77s/it]
