# Transcriptomics-specific Analysis

This notebook creates all chemical-disease pairs in our subgraphs based on dataset-specific transcriptomic data.

# Pre-requirements

1. Installation of drug2ways
1. Running the earlier notebooks (notebooks 2, 3, and 5)

# Imports

In [None]:
import pandas as pd
import os
import json
import logging
from tqdm import tqdm
from itertools import product
from networkx import DiGraph

from utils import (get_paths, filter_dataset, 
                   get_transcriptomic_paths, create_graph_from_df,
                   get_path_count, DATA_DIR, KG_DATA_PATH)

In [None]:
# Only CRITICAL records are let through, both for the root configuration and
# for the 'drug2ways' logger specifically.
logging.basicConfig(level=logging.CRITICAL)
logging.getLogger('drug2ways').setLevel(logging.CRITICAL)

# Logger of this notebook/module
logger = logging.getLogger(__name__)

# Load dataset-generated network dataframe

In [None]:
# Read both filtered knowledge-graph edge tables (tab-separated) from
# KG_DATA_PATH: first the OpenBioLink one, then the custom one.
openbiolink_df, custom_df = (
    pd.read_csv(os.path.join(KG_DATA_PATH, kg_file), sep='\t')
    for kg_file in ('openbiolink_filtered_kg.tsv', 'custom_filtered_kg.tsv')
)

# Load datasets

In [None]:
def _load_expression_json(path):
    """Parse the JSON document stored at *path* and return it."""
    with open(path) as handle:
        return json.load(handle)


# NOTE(review): all four datasets are read from the very same file
# (transcriptomics/harmonized_expression.json). Each dataset presumably has
# its own file -- confirm the intended paths.
_expression_path = os.path.join(DATA_DIR, 'transcriptomics', 'harmonized_expression.json')

# The file is parsed once per dataset so that every variable holds its own
# independent object.
creed_dict = _load_expression_json(_expression_path)
geo_dict = _load_expression_json(_expression_path)
lc1000_dict = _load_expression_json(_expression_path)
open_target_dict = _load_expression_json(_expression_path)

# Filtering dataset based on network

In [None]:
# CREED
creed_openbio = filter_dataset(dataset=creed_dict, graph_df=openbiolink_df)
creed_custom = filter_dataset(dataset=creed_dict, graph_df=custom_df)

creed_dict = {'openbio': creed_openbio, 'custom': creed_custom}

# GEO
geo_openbio = filter_dataset(dataset=geo_dict, graph_df=openbiolink_df)
geo_custom = filter_dataset(dataset=geo_dict, graph_df=custom_df)

geo_dict = {'openbio': geo_openbio, 'custom': geo_custom}

# OpenTarget
target_openbio = filter_dataset(dataset=open_target_dict, graph_df=openbiolink_df)
target_custom = filter_dataset(dataset=open_target_dict, graph_df=custom_df)

open_target_dict = {'openbio': target_openbio, 'custom': target_custom}

# L1000
lc1000_openbio = filter_dataset(dataset=lc1000_dict, graph_df=openbiolink_df)
lc1000_custom = filter_dataset(dataset=lc1000_dict, graph_df=custom_df)

lc1000_dict = {'openbio': lc1000_openbio, 'custom': lc1000_custom}


# Load clinical and drug-indication data

In [None]:
# NOTE(review): the concordance cell further down also reads
# `indication_pair_dict`, which is not loaded here or anywhere else in the
# visible notebook -- confirm where the drug-indication pairs come from.
_clinical_pairs_path = os.path.join(DATA_DIR, 'gold-standard', 'filtered-clinical-pairs.json')

with open(_clinical_pairs_path) as file:
    clinical_pairs = json.load(file)

# Only the keys of the JSON object are kept (as a dict view).
clinical_pair_dict = clinical_pairs.keys()

# Creating information dict for each chemical-disease pair

In [None]:
# Lookup from the short dataset name used in the loops below to the
# per-subgraph filtered dataset ({'openbio': ..., 'custom': ...}).
MAP = dict(
    creed=creed_dict,
    target=open_target_dict,
    geo=geo_dict,
    lc1000=lc1000_dict,
)

In [None]:
# For every (chemical dataset, disease dataset) combination and for both
# knowledge graphs, collect per chemical-disease pair the number of paths that
# are concordant with the drug / disease transcriptomic data, and write one
# TSV per combination and per kind ('drug', 'disease') to
# DATA_DIR/concordant_paths.

# Columns of every output table, in file order
RESULT_COLUMNS = [
    'source',
    'target',
    'number_of_paths',
    'number_of_concordant_paths',
    'in_clinical_trial',
    'number_of_concordant_activatory_paths',
    'number_of_concordant_inhibitory_paths',
    'subgraph_size',
    'number_of_unique_nodes',
    'lmax',
    'subgraph_name',
]

# Edge markers interleaved with the node names inside each path; they are
# dropped so that only the nodes remain.
EDGE_SYMBOLS = ('-|', '->')

# Output table kind -> key of the matching path list in the results dict
RESULT_KEYS = (('drug', 'drug_paths'), ('disease', 'disease_paths'))

output_dir = os.path.join(DATA_DIR, 'concordant_paths')
os.makedirs(output_dir, exist_ok=True)

for c, d in product(['creed', 'lc1000'], ['target', 'geo']):
    c_set = MAP[c]
    d_set = MAP[d]
    graph_name = c + '_' + d
    print(graph_name)

    print(f'### {c}-{d} ###')

    output_paths = {
        kind: os.path.join(output_dir, f'{graph_name}-{kind}.tsv')
        for kind, _ in RESULT_KEYS
    }

    # Skip combinations that were already computed. The check has to look at
    # the files that are actually written below ('<name>-drug.tsv' and
    # '<name>-disease.tsv'); testing for '<name>-.tsv' never matches.
    if all(os.path.exists(path) for path in output_paths.values()):
        continue

    # Result rows are gathered in plain lists and turned into a DataFrame once
    # at the end, instead of re-concatenating a DataFrame for every row.
    rows = {kind: [] for kind, _ in RESULT_KEYS}

    for gname in ['openbio', 'custom']:
        if gname == 'openbio':
            flag = True
            kg_df = openbiolink_df
        else:
            flag = False
            kg_df = custom_df

        paths = get_paths(
            graph_df=kg_df,
            disease_dict=d_set[gname],
            chemical_dict=c_set[gname],
            graph_name=graph_name,
            openbio=flag
        )

        if paths is None:
            continue

        graph = create_graph_from_df(kg_df)

        for lmax, p_dict in tqdm(paths.items(), desc='Calculating concordance'):
            for p in p_dict:
                if not p['paths']:
                    continue

                # Just get the nodes from the path
                tmp_paths = [
                    [step for step in steps if step not in EDGE_SYMBOLS]
                    for steps in p['paths'].values()
                ]

                chemical = p['source']
                disease = p['target']

                # NOTE(review): `indication_pair_dict` is not defined in any
                # cell visible in this notebook; it must exist before this
                # cell runs, otherwise this call raises NameError.
                results = get_transcriptomic_paths(
                    directed_graph=graph,
                    source=chemical,
                    target=disease,
                    all_paths=tmp_paths,
                    drug_dict=c_set[gname][chemical],
                    disease_dict=d_set[gname][disease],
                    clinical_pair_dict=clinical_pair_dict,
                    drug_indication_dict=indication_pair_dict,
                )

                for kind, result_key in RESULT_KEYS:
                    concordant_num = len(results[result_key])
                    # Pairs without any concordant path are not reported
                    if concordant_num == 0:
                        continue

                    activated_paths, inhibited_paths = get_path_count(
                        directed_graph=graph,
                        filtered_paths=results[result_key]
                    )

                    rows[kind].append({
                        'source': results['source'],
                        'target': results['target'],
                        'number_of_paths': results['number_of_paths'],
                        'number_of_concordant_paths': concordant_num,
                        'in_clinical_trial': results['in_clinical_trial'],
                        'number_of_concordant_activatory_paths': activated_paths,
                        'number_of_concordant_inhibitory_paths': inhibited_paths,
                        'subgraph_size': results['subgraph_size'],
                        'number_of_unique_nodes': results['number_of_unique_nodes'],
                        'lmax': lmax,
                        'subgraph_name': gname,
                    })

    # A table is written even when it has no rows (header only)
    for kind, n_file_path in output_paths.items():
        pd.DataFrame(rows[kind], columns=RESULT_COLUMNS).to_csv(
            n_file_path, sep='\t', index=False
        )