# Transcriptomics-specific Analysis

This notebook creates all chemical-disease pairs in our subgraphs based on transcriptomics-specific data.

# Pre-requirements

1. Installation of drug2ways
1. Running the earlier notebooks (notebooks 2, 3, and 5)

# Imports

In [None]:
import pandas as pd
import os
import json
import logging
from tqdm import tqdm
from itertools import product
from networkx import DiGraph

from utils import (get_paths, filter_dataset, 
                   get_transcriptomic_paths, create_graph_from_df,
                   get_path_count, DATA_DIR, KG_DATA_PATH)

In [None]:
# Only CRITICAL messages are let through, both for the root configuration
# and for the 'drug2ways' logger, so the cells below stay quiet.
logging.basicConfig(level=logging.CRITICAL)
logging.getLogger('drug2ways').setLevel(logging.CRITICAL)

# Logger for this notebook/module.
logger = logging.getLogger(__name__)

# Load dataset-generated network dataframe

In [None]:
# Read both filtered knowledge-graph edge tables (tab-separated) from
# KG_DATA_PATH, OpenBioLink first and the custom graph second.
openbiolink_df, custom_df = (
    pd.read_csv(os.path.join(KG_DATA_PATH, kg_file), sep='\t')
    for kg_file in ('openbiolink_filtered_kg.tsv', 'custom_filtered_kg.tsv')
)

# Load disease datasets

In [None]:
# Harmonized expression data from the GEO JSON file.
with open(
    os.path.join(DATA_DIR, 'transcriptomics', 'geo_harmonized_expression.json')
) as geo_handle:
    geo_dict = json.loads(geo_handle.read())

# Harmonized expression data from the "target" JSON file.
with open(
    os.path.join(DATA_DIR, 'transcriptomics', 'target_harmonized_expression.json')
) as target_handle:
    open_target_dict = json.loads(target_handle.read())

# Filtering disease dataset based on network

In [None]:
# Each disease dataset is filtered once per knowledge graph (OpenBioLink
# first, then custom) and the two filtered versions are regrouped under the
# graph names 'openbio' and 'custom'.

# GEO
geo_openbio, geo_custom = (
    filter_dataset(dataset=geo_dict, graph_df=kg_df)
    for kg_df in (openbiolink_df, custom_df)
)
geo_dict = dict(openbio=geo_openbio, custom=geo_custom)

# OpenTarget
target_openbio, target_custom = (
    filter_dataset(dataset=open_target_dict, graph_df=kg_df)
    for kg_df in (openbiolink_df, custom_df)
)
open_target_dict = dict(openbio=target_openbio, custom=target_custom)

# Load clinical and drug-indication data

In [None]:
# Only the keys of the filtered clinical-pairs JSON are kept (as a dict
# keys view); the values are not used.
with open(
    os.path.join(DATA_DIR, 'gold-standard', 'filtered-clinical-pairs.json')
) as clinical_handle:
    clinical_pair_dict = json.loads(clinical_handle.read()).keys()

# Creating information dict for each chemical-disease pair

In [None]:
# Dataset name -> per-graph filtered disease data
# (each value is keyed by 'openbio' / 'custom').
MAP = dict(
    target=open_target_dict,
    geo=geo_dict,
)

In [None]:
# Relation markers that sit between node names in the raw path lists.
_EDGE_SYMBOLS = ('-|', '->')

# Column order of the result tables written by main().
# TODO: If changing lmax to a range, add an 'lmax' column here too.
_RESULT_COLUMNS = [
    'source',
    'target',
    'number_of_paths',
    'number_of_concordant_paths',
    'in_clinical_trial',
    'number_of_concordant_activatory_paths',
    'number_of_concordant_inhibitory_paths',
    'subgraph_size',
    'number_of_unique_nodes',
    'subgraph_name',
]


def _nodes_only(path):
    """Return *path* with the '-|' / '->' relation markers removed."""
    return [step for step in path if step not in _EDGE_SYMBOLS]


def main() -> None:
    """Compute concordant paths for every protein-disease pair and save them.

    For each knowledge graph ('openbio', 'custom') the graph is built from its
    edge dataframe, paths starting at every node whose name contains
    'ncbigene' are requested with ``lmax=4``, and each source/target pair that
    has at least one path is checked against every dataset in ``MAP`` whose
    data for that graph contains the target disease. Pairs with at least one
    concordant path produce one result row.

    One TSV file per (graph, dataset) combination is written to
    ``DATA_DIR/concordant_paths/<graph>-<dataset>.tsv``; a combination without
    any concordant pair still gets a header-only file.

    Returns:
        None. The results are only written to disk.
    """
    # NOTE(review): the key under which get_transcriptomic_paths returns the
    # list of concordant paths is not visible in this file (the previous code
    # indexed the result with an undefined name). 'filtered_paths' is an
    # assumption -- confirm against utils.get_transcriptomic_paths.
    concordant_key = 'filtered_paths'

    output_dir = os.path.join(DATA_DIR, 'concordant_paths')
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(output_dir, exist_ok=True)

    kg_frames = {'openbio': openbiolink_df, 'custom': custom_df}

    for gname, kg_df in kg_frames.items():
        graph = create_graph_from_df(kg_df)

        # Get protein nodes
        protein_nodes = [node for node in graph.nodes() if 'ncbigene' in node]

        # NOTE(review): get_protein_paths is not among the names imported from
        # utils at the top of this file (only get_paths is) -- confirm where
        # it is defined before running.
        paths = get_protein_paths(
            graph=graph,
            protein_list=protein_nodes,
            lmax=4,  # Just accepts one value, change in Utils if needed.
        )

        # Rows are collected in plain lists and turned into a dataframe once,
        # instead of concatenating a one-row dataframe per result.
        rows_by_dataset = {dataset_name: [] for dataset_name in MAP}

        for path_list in tqdm(paths.values(), desc='Calculating concordance'):
            for p in path_list:
                if len(p['paths']) == 0:
                    continue

                # Just get the nodes from the path
                node_paths = [_nodes_only(raw) for raw in p['paths'].values()]

                protein = p['source']
                disease = p['target']

                # Iterate over (name, data) pairs: iterating MAP directly
                # yields only the string keys, which cannot be indexed by
                # graph name.
                for dataset_name, dataset in MAP.items():
                    if disease not in dataset[gname]:
                        continue

                    results = get_transcriptomic_paths(
                        directed_graph=graph,
                        source=protein,
                        target=disease,
                        all_paths=node_paths,
                        disease_dict=dataset[gname][disease],
                        clinical_pair_dict=clinical_pair_dict,
                    )

                    concordant_paths = results[concordant_key]
                    concordant_num = len(concordant_paths)
                    if concordant_num == 0:
                        continue

                    activated_paths, inhibited_paths = get_path_count(
                        directed_graph=graph,
                        filtered_paths=concordant_paths,
                    )

                    rows_by_dataset[dataset_name].append({
                        'source': results['source'],
                        'target': results['target'],
                        'number_of_paths': results['number_of_paths'],
                        'number_of_concordant_paths': concordant_num,
                        'in_clinical_trial': results['in_clinical_trial'],
                        'number_of_concordant_activatory_paths': activated_paths,
                        'number_of_concordant_inhibitory_paths': inhibited_paths,
                        'subgraph_size': results['subgraph_size'],
                        'number_of_unique_nodes': results['number_of_unique_nodes'],
                        'subgraph_name': gname,
                    })

        # NOTE(review): the previous code wrote a single file using undefined
        # names; one file per graph and dataset follows its
        # '<graph>-<name>.tsv' pattern -- confirm this is the intended layout.
        for dataset_name, dataset_rows in rows_by_dataset.items():
            result_df = pd.DataFrame(dataset_rows, columns=_RESULT_COLUMNS)
            n_file_path = os.path.join(output_dir, f'{gname}-{dataset_name}.tsv')
            result_df.to_csv(n_file_path, sep='\t', index=False)