# Path analysis

This notebook contains the statistical analysis of pathways for each lmax in each of the dataset pairs

# Imports

In [2]:
import pandas as pd
import os
import json
import logging
from tqdm import tqdm
from collections import Counter
from itertools import product
import matplotlib.pyplot as plt

from utils import DATA_DIR, filter_dataset

# Convert dict to dataframe

In [3]:
df_dict = {}

In [None]:
for file in tqdm(os.listdir(os.path.join(DATA_DIR, 'lmax-pairs'))):
    print(file)
    if file in ['l1000_geo-lmax_7-openbio.json', 'l1000_target-lmax_7-openbio.json']:
        continue
    
    if 'lmax_8' in file or 'lmax_3' in file or 'lmax_5' in file or 'lmax_4' in file or 'lmax_6' in file:
        continue
    
    with open(os.path.join(DATA_DIR, 'lmax-pairs', file)) as f:
        data_dict = json.load(f)

    if 'lmax_3' in df_dict and 'lmax_3' in file:
        df = df_dict['lmax_3']

    elif 'lmax_4' in df_dict and 'lmax_4' in file:
        df = df_dict['lmax_4']

    elif 'lmax_5' in df_dict and 'lmax_5' in file:
        df = df_dict['lmax_5']

    elif 'lmax_6' in df_dict and 'lmax_6' in file:
        df = df_dict['lmax_6']
        

    elif 'lmax_7' in df_dict and 'lmax_7' in file:
        df = df_dict['lmax_7']

    elif 'lmax_8' in df_dict and 'lmax_8' in file:
        df = df_dict['lmax_8']

    else:
        df = pd.DataFrame(columns=[
            'source', 
            'target', 
            'number_of_paths', 
            'subgraph_name',
            'paths',
            'signs'
        ])

    if 'custom' in file:
        subgraph_name = 'custom'
    elif 'openbio' in file:
        subgraph_name = 'openbio'

    for el in data_dict:
        if el:

            tmp_paths = []
            tmp_signs = []
            
            if len(el['paths']) > 0:
                for l, v in el['paths'].items():
                    p = []
                    ps = []
                    for k in v:
                        if k == '-|':
                            ps.append(-1)
                        elif k == '->':
                            ps.append(1)
                        else:
                            p.append(k)
                    tmp_paths.append(p)
                    tmp_signs.append(ps)

                tmp_dict = {
                    'source': [el['source']],
                    'target': [el['target']], 
                    'number_of_paths': [len(el['paths'])], 
                    'subgraph_name': [subgraph_name],
                    'paths': [tmp_paths],
                    'signs': [tmp_signs]
                }
            else:
                tmp_dict = {
                    'source': [el['source']],
                    'target': [el['target']], 
                    'number_of_paths': [len(el['paths'])], 
                    'subgraph_name': [subgraph_name],
                    'paths': [''],
                    'signs': ['']
                }

            tmp_df = pd.DataFrame.from_dict(tmp_dict)
            df = pd.concat([df, tmp_df], ignore_index=True)

    if 'lmax_3' in file:
        df_dict['lmax_3'] = df
    elif 'lmax_4' in file:
        df_dict['lmax_4'] = df
    elif 'lmax_5' in file:
        df_dict['lmax_5'] = df
    elif 'lmax_6' in file:
        df_dict['lmax_6'] = df
    elif 'lmax_7' in file:
        df_dict['lmax_7'] = df
    else:
        df_dict['lmax_8'] = df

  0%|          | 0/40 [00:00<?, ?it/s]

l1000_target-lmax_6-openbio.json
creed_target-lmax_3-custom.json
l1000_target-lmax_3-custom.json
creed_geo-lmax_3-custom.json
creed_target-lmax_6-openbio.json
l1000_geo-lmax_6-openbio.json
l1000_target-lmax_7-openbio.json
creed_geo-lmax_6-custom.json
l1000_geo-lmax_7-openbio.json
creed_target-lmax_7-openbio.json


 25%|██▌       | 10/40 [01:55<05:46, 11.55s/it]

l1000_target-lmax_6-custom.json
creed_target-lmax_6-custom.json
l1000_geo-lmax_4-custom.json
creed_target-lmax_5-custom.json
l1000_target-lmax_5-custom.json
l1000_geo-lmax_7-custom.json


In [8]:
df_dict

{'lmax_7':                          source         target number_of_paths subgraph_name  \
 0         pubchem.compound:5152  mondo:0018874             278       openbio   
 1         pubchem.compound:5152  mondo:0006639              62       openbio   
 2         pubchem.compound:5152  mondo:0004975               2       openbio   
 3         pubchem.compound:5152  mondo:0010311               0       openbio   
 4         pubchem.compound:5152  mondo:0004985               9       openbio   
 ...                         ...            ...             ...           ...   
 52255  pubchem.compound:5328940  mondo:0021636            1696       openbio   
 52256  pubchem.compound:5328940  mondo:0005105            4467       openbio   
 52257  pubchem.compound:5328940  mondo:0008315             939       openbio   
 52258  pubchem.compound:5328940  mondo:0004948             735       openbio   
 52259  pubchem.compound:5328940  mondo:0005061            2098       openbio   
 
                

# Saving lmax df to files

In [None]:
if not os.path.exists(os.path.join(DATA_DIR, 'lmax-dfs')):
    os.mkdir(os.path.join(DATA_DIR, 'lmax-dfs'))
    
for lmax in df_dict:
    df_dict[lmax].to_csv(
        os.path.join(DATA_DIR, 'lmax-dfs', f'{lmax}-data.tsv'), 
        sep='\t',
        index=False
    )

# Path statistic analysis

In [None]:
count_dict = {
    'lmax_3': {},
    'lmax_4': {}, 
    'lmax_5': {}, 
    'lmax_6': {}, 
    'lmax_7': {}, 
    'lmax_8': {}
}
for lmax in df_dict:  
    k = df_dict[lmax]
    m = k['number_of_paths'].value_counts().to_dict()

    for k, v in m.items():
        if k in count_dict[lmax]:
            count_dict[lmax][k] += v
        else:
            count_dict[lmax][k] = v

In [None]:
for lmax in count_dict:
    if count_dict[lmax]:
        print(f'\n #### {lmax} ####')
        try:
            no_paths = count_dict[lmax][0]
        except KeyError:
            no_paths = 0
            
        pairs = list(count_dict[lmax].values())

        print(f'No path found for {no_paths} pairs')

        print(f'Percentage of no paths - {round((no_paths/sum(pairs)) * 100, 2)}')


        m = {k: v for k, v in sorted(count_dict[lmax].items()) if k != 0}
        print('\nNumber of path statitstics')
        paths = list(m.keys())
        print(f'Maximum - {paths[-1]}, Minimum - {paths[0]}, Mean - {round(sum(paths)/ len(paths), 2)}')

        pairs = list(m.values())
        print('\nNumber of pairs statitstics')
        print(f'Maximum - {max(pairs)}, Minimum - {min(pairs)}, Mean - {round(sum(pairs)/ len(pairs), 2)}')

        max_paths = [k for k,v in m.items() if v == max(pairs)]
        min_paths = [k for k,v in m.items() if v == min(pairs)]

        print(f'Maximum pair count for path - {len(max_paths)}')
        print(f'Minimum pair count for path - {len(min_paths)}')
    
    

 #### lmax_3 ####
No path found for 74658 pairs
Percentage of no paths - 97.54

Number of path statitstics
Maximum - 3, Minimum - 1, Mean - 2.0

Number of pairs statitstics
Maximum - 1779, Minimum - 12, Mean - 628.67
Maximum pair count for path - 1
Minimum pair count for path - 1

 #### lmax_4 ####
No path found for 70004 pairs
Percentage of no paths - 91.46

Number of path statitstics
Maximum - 48, Minimum - 1, Mean - 16.74

Number of pairs statitstics
Maximum - 4018, Minimum - 1, Mean - 210.97
Maximum pair count for path - 1
Minimum pair count for path - 4

#### lmax_5 ####
No path found for 57882 pairs
Percentage of no paths - 75.62

Number of path statitstics
Maximum - 943, Minimum - 1, Mean - 183.39

Number of pairs statitstics
Maximum - 4899, Minimum - 1, Mean - 68.86
Maximum pair count for path - 1
Minimum pair count for path - 69

 #### lmax_6 ####
No path found for 43619 pairs
Percentage of no paths - 56.99

Number of path statitstics
Maximum - 25621, Minimum - 1, Mean - 1681.37

Number of pairs statitstics
Maximum - 3389, Minimum - 1, Mean - 23.65
Maximum pair count for path - 1
Minimum pair count for path - 412
