# Transcriptomics-specific Analysis

This notebook computes, for every chemical-disease pair in our subgraphs, the paths that are concordant with the dataset-specific transcriptomic data.

# Pre-requirements

1. Installation of drug2ways
1. Run the earlier notebooks (notebooks 2, 3, and 5) first

# Imports

In [1]:
import pandas as pd
import os
import json
import logging
from tqdm import tqdm
from itertools import product
from networkx import DiGraph

from utils import (get_paths, filter_dataset, 
                   get_transcriptomic_paths, create_graph_from_df,
                   get_path_count, DATA_DIR)

In [2]:
# Restrict the root logger and the 'drug2ways' logger to CRITICAL records.
logging.basicConfig(level=logging.CRITICAL)

drug2ways_logger = logging.getLogger('drug2ways')
drug2ways_logger.setLevel(logging.CRITICAL)

# Logger for this notebook's own messages.
logger = logging.getLogger(__name__)

# Load dataset-generated network dataframe

In [3]:
# Both normalized network tables live in the same directory.
kg_dir = os.path.join(DATA_DIR, 'kg', 'normalized')
openbiolink_path = os.path.join(kg_dir, 'openbiolink_kg_normalized.tsv')
custom_path = os.path.join(kg_dir, 'custom_kg_normalized.tsv')

# Read each tab-separated table and expose its 'relation' column as 'polarity'.
polarity_column = {'relation': 'polarity'}
openbiolink_df = pd.read_csv(openbiolink_path, sep='\t').rename(columns=polarity_column)
custom_df = pd.read_csv(custom_path, sep='\t').rename(columns=polarity_column)

# Load datasets

In [4]:
def _load_harmonized_expression(source):
    """Load ``<DATA_DIR>/<source>/normalized/harmonized_expression.json``.

    :param source: name of the dataset sub-directory inside ``DATA_DIR``
    :return: the parsed JSON content
    """
    json_path = os.path.join(DATA_DIR, source, 'normalized', 'harmonized_expression.json')
    with open(json_path) as handle:
        return json.load(handle)


creed_dict = _load_harmonized_expression('creeds')
geo_dict = _load_harmonized_expression('geo')
l1000_dict = _load_harmonized_expression('l1000')
open_target_dict = _load_harmonized_expression('open_targets')

# Filtering datasets based on network

In [5]:
def _filter_for_each_graph(dataset):
    """Run ``filter_dataset`` on ``dataset`` against both network dataframes.

    :param dataset: one of the expression datasets loaded above
    :return: dict with the OpenBioLink result under 'openbio' and the
        custom-network result under 'custom'
    """
    return {
        'openbio': filter_dataset(dataset=dataset, graph_df=openbiolink_df),
        'custom': filter_dataset(dataset=dataset, graph_df=custom_df),
    }


# NOTE: each name below is rebound from the raw dataset to its per-network
# filtered version, so re-running this cell requires re-running the loading
# cell first.
creed_dict = _filter_for_each_graph(creed_dict)         # CREED
geo_dict = _filter_for_each_graph(geo_dict)             # GEO
open_target_dict = _filter_for_each_graph(open_target_dict)  # OpenTarget
l1000_dict = _filter_for_each_graph(l1000_dict)         # L1000


# Load clinical and drug-indication data

In [6]:
# Only the keys of the clinical-pairs JSON object are kept (as a keys view).
clinical_pairs_path = os.path.join(DATA_DIR, 'gold-standard', 'filtered-clinical-pairs.json')
with open(clinical_pairs_path) as clinical_handle:
    clinical_pair_dict = json.load(clinical_handle).keys()

In [7]:
# Only the keys of the indications JSON object are kept (as a keys view).
indications_path = os.path.join(DATA_DIR, 'gold-standard', 'filtered-indications.json')
with open(indications_path) as indications_handle:
    indication_pair_dict = json.load(indications_handle).keys()

# Path analysis for each chemical-disease dataset combination

# Creating information dict for each chemical-disease pair

In [8]:
# Short dataset label -> per-network filtered dataset ('openbio' / 'custom').
MAP = dict(
    creed=creed_dict,
    target=open_target_dict,
    geo=geo_dict,
    l1000=l1000_dict,
)

In [9]:
# For every (chemical dataset, disease dataset) combination and for both
# networks, collect the chemical-disease paths returned by `get_paths`, keep the
# ones `get_transcriptomic_paths` reports under 'drug_paths' / 'disease_paths',
# and write one summary row per pair and lmax to
# <DATA_DIR>/concordant_paths/<graph_name>-drug.tsv and -disease.tsv.

# Column order of both output tables.
RESULT_COLUMNS = [
    'source',
    'target',
    'number_of_paths',
    'number_of_concordant_paths',
    'in_clinical_trial',
    'in_drug_indication',
    'number_of_concordant_activatory_paths',
    'number_of_concordant_inhibitory_paths',
    'subgraph_size',
    'number_of_unique_nodes',
    'lmax',
    'subgraph_name',
]

# Tokens dropped from each path so that only the remaining entries are passed
# on (presumably edge symbols interleaved with node names - TODO confirm).
EDGE_SYMBOLS = ('-|', '->')

# Key in the `get_transcriptomic_paths` result -> suffix of the output file.
RESULT_KINDS = (('drug_paths', 'drug'), ('disease_paths', 'disease'))

output_dir = os.path.join(DATA_DIR, 'concordant_paths')
os.makedirs(output_dir, exist_ok=True)

for c, d in product(['creed', 'l1000'], ['target', 'geo']):
    c_set = MAP[c]
    d_set = MAP[d]
    graph_name = c + '_' + d
    print(graph_name)

    print(f'### {c}-{d} ###')

    output_paths = {
        kind: os.path.join(output_dir, f'{graph_name}-{kind}.tsv')
        for _, kind in RESULT_KINDS
    }

    # Skip the expensive computation only when BOTH result files are already
    # on disk. (The previous guard looked for '<graph_name>-.tsv', a file that
    # is never written, so nothing was ever skipped.)
    if all(os.path.exists(path) for path in output_paths.values()):
        continue

    # Rows are gathered as plain dicts and turned into a DataFrame once at the
    # end, instead of concatenating a one-row frame per result.
    records = {kind: [] for _, kind in RESULT_KINDS}

    for gname in ['openbio', 'custom']:
        is_openbio = gname == 'openbio'
        kg_df = openbiolink_df if is_openbio else custom_df

        paths = get_paths(
            graph_df=kg_df,
            disease_dict=d_set[gname],
            chemical_dict=c_set[gname],
            graph_name=graph_name,
            openbio=is_openbio,
        )

        # Nothing to analyse for this network.
        if paths is None:
            continue

        graph = create_graph_from_df(kg_df)

        for lmax, p_dict in tqdm(paths.items(), desc='Calculating concordance'):
            for p in p_dict:
                if len(p['paths']) == 0:
                    continue

                # Keep every path entry except the edge symbols.
                node_paths = [
                    [token for token in path if token not in EDGE_SYMBOLS]
                    for path in p['paths'].values()
                ]

                chemical = p['source']
                disease = p['target']

                results = get_transcriptomic_paths(
                    directed_graph=graph,
                    source=chemical,
                    target=disease,
                    all_paths=node_paths,
                    drug_dict=c_set[gname][chemical],
                    disease_dict=d_set[gname][disease],
                    clinical_pair_dict=clinical_pair_dict,
                    drug_indication_dict=indication_pair_dict,
                )

                for result_key, kind in RESULT_KINDS:
                    concordant_paths = results[result_key]
                    concordant_num = len(concordant_paths)

                    # Pairs without any concordant path produce no row.
                    if concordant_num == 0:
                        continue

                    activated_paths, inhibited_paths = get_path_count(
                        directed_graph=graph,
                        filtered_paths=concordant_paths,
                    )

                    records[kind].append({
                        'source': results['source'],
                        'target': results['target'],
                        'number_of_paths': results['number_of_paths'],
                        'number_of_concordant_paths': concordant_num,
                        'in_clinical_trial': results['in_clinical_trial'],
                        'in_drug_indication': results['in_drug_indication'],
                        'number_of_concordant_activatory_paths': activated_paths,
                        'number_of_concordant_inhibitory_paths': inhibited_paths,
                        'subgraph_size': results['subgraph_size'],
                        'number_of_unique_nodes': results['number_of_unique_nodes'],
                        'lmax': lmax,
                        'subgraph_name': gname,
                    })

    # `columns=` keeps the header (and its order) even when no rows were found.
    for kind, output_path in output_paths.items():
        result_df = pd.DataFrame(records[kind], columns=RESULT_COLUMNS)
        result_df.to_csv(output_path, sep='\t', index=False)

Loading graph: 100%|██████████| 48878/48878 [00:00<00:00, 385178.96it/s]

creed_target
### creed-target ###



100%|██████████| 4512/4512 [06:53<00:00, 10.90it/s]
Calculating concordance: 100%|██████████| 5/5 [00:07<00:00,  1.57s/it]
Loading graph: 100%|██████████| 52182/52182 [00:00<00:00, 376019.08it/s]
100%|██████████| 1925/1925 [08:20<00:00,  3.84it/s]
Calculating concordance: 100%|██████████| 5/5 [00:24<00:00,  4.81s/it]
Loading graph: 100%|██████████| 48878/48878 [00:00<00:00, 424284.52it/s]

creed_geo
### creed-geo ###



100%|██████████| 1728/1728 [02:50<00:00, 10.12it/s]
Calculating concordance: 100%|██████████| 5/5 [00:13<00:00,  2.72s/it]
Loading graph: 100%|██████████| 52182/52182 [00:00<00:00, 392322.96it/s]
100%|██████████| 935/935 [05:23<00:00,  2.89it/s]
Calculating concordance: 100%|██████████| 5/5 [00:49<00:00,  9.87s/it]
Loading graph: 100%|██████████| 48878/48878 [00:00<00:00, 416777.68it/s]


l1000_target
### l1000-target ###


100%|██████████| 37788/37788 [30:02<00:00, 20.96it/s]  
Calculating concordance: 100%|██████████| 5/5 [00:22<00:00,  4.51s/it]
Loading graph: 100%|██████████| 52182/52182 [00:00<00:00, 358597.00it/s]
100%|██████████| 10220/10220 [49:03<00:00,  3.47it/s] 
Calculating concordance: 100%|██████████| 5/5 [03:01<00:00, 36.26s/it]
Loading graph: 100%|██████████| 48878/48878 [00:00<00:00, 345137.39it/s]


l1000_geo
### l1000-geo ###


100%|██████████| 14472/14472 [13:36<00:00, 17.73it/s] 
Calculating concordance: 100%|██████████| 5/5 [00:56<00:00, 11.31s/it]
Loading graph: 100%|██████████| 52182/52182 [00:00<00:00, 398025.71it/s]
100%|██████████| 4964/4964 [28:39<00:00,  2.89it/s]  
Calculating concordance: 100%|██████████| 5/5 [05:00<00:00, 60.17s/it]
