# Analysis of pairs

This notebook summarizes the statistical analysis of the clinical trial and KG data.

In [1]:
!pip install scipy



In [2]:
import os
import pandas as pd
from scipy import stats

from collections import Counter, defaultdict
from networkx import DiGraph, connected_components

from utils import DATA_DIR, KG_DATA_PATH

In [3]:
# OpenBioLink KG edge list; the 'relation' column is exposed as 'polarity'.
openbiolink_df = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'normalized', 'openbiolink_kg_normalized.tsv'),
    sep='\t',
).rename(columns={'relation': 'polarity'})

# Custom KG edge list, loaded and renamed the same way.
custom_network_df = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'normalized', 'custom_kg_normalized.tsv'),
    sep='\t',
).rename(columns={'relation': 'polarity'})

# Filtering based on presence of pair in 2/3 lmax

In [4]:
def filter_by_lmax(
    df: pd.DataFrame,
    by_lmax: list
):
    """Keep only the rows whose ``lmax`` label is one of the requested values.

    A ``val`` column (``"<source>-<target>"``) is added to a copy of ``df``;
    the input frame itself is not modified.

    :param df: frame with ``source``, ``target`` and ``lmax`` columns, where
        ``lmax`` holds labels of the form ``lmax_<value>``.
    :param by_lmax: values to keep; each one is matched against
        ``f'lmax_{value}'``.
    :return: the matching rows, grouped in the order of ``by_lmax`` and
        re-indexed from 0. An empty ``by_lmax`` yields an empty frame with
        the same columns.
    """
    tmp_df = df.copy(deep=True)
    tmp_df['val'] = tmp_df['source'] + '-' + tmp_df['target']

    # Collect the per-lmax slices first and concatenate once: seeding the
    # result with an empty object-dtype frame can change the column dtypes
    # and re-copies the accumulated rows on every iteration.
    pieces = [
        tmp_df.loc[tmp_df['lmax'] == f'lmax_{lmax_value}']
        for lmax_value in by_lmax
    ]

    if not pieces:
        # pd.concat refuses an empty list; return an empty, same-schema frame.
        return tmp_df.iloc[0:0].reset_index(drop=True)

    return pd.concat(pieces, ignore_index=True)

In [5]:
def filter_by_node_count(
    df: pd.DataFrame,
    by_node_count: int
):
    """Return the rows with more than ``by_node_count`` unique nodes.

    The result is built from a copy of ``df`` that carries an extra ``val``
    column (``"<source>-<target>"``); the input frame is left untouched and
    the original index labels are kept.
    """
    labelled = df.assign(val=df['source'] + '-' + df['target'])
    has_enough_nodes = labelled['number_of_unique_nodes'].gt(by_node_count)
    return labelled[has_enough_nodes]

In [6]:
def filter_by_path_occurence(
    df: pd.DataFrame,
    filter_num: int
):
    """Keep pairs that occur under more than ``filter_num`` distinct lmax values.

    A pair is ``"<source>-<target>"`` and is counted separately for each of
    the ``custom`` and ``openbio`` subgraphs. Rows belonging to any other
    subgraph, or whose pair cannot be built because ``source``/``target`` is
    missing, are dropped.

    :param df: frame with ``source``, ``target``, ``subgraph_name`` and
        ``lmax`` columns.
    :param filter_num: a pair is kept only when its number of distinct
        ``lmax`` values is strictly greater than this.
    :return: the qualifying rows of ``df`` (original columns and index labels).

    Prints the shape before and after filtering.
    """
    if df.empty:
        new_df = df.copy()
    else:
        pair_key = df['source'] + '-' + df['target']

        # Work on a positional helper frame so the selection below does not
        # depend on the index labels of ``df`` being unique.
        keyed = pd.DataFrame({
            'val': pair_key.to_numpy(),
            'subgraph_name': df['subgraph_name'].to_numpy(),
            'lmax': df['lmax'].to_numpy(),
        })

        # Distinct lmax values per (subgraph, pair), broadcast back to rows.
        # dropna=False counts a missing lmax as one value of its own.
        lmax_count = keyed.groupby(['subgraph_name', 'val'])['lmax'].transform(
            'nunique', dropna=False
        )

        keep = (
            keyed['subgraph_name'].isin(['custom', 'openbio'])
            & keyed['val'].notna()
            & (lmax_count > filter_num)
        )

        # Boolean ndarray -> positional row selection on the original frame.
        new_df = df[keep.to_numpy()]

    print(f'Reduced dataframe from {df.shape} to {new_df.shape}')
    return new_df

In [7]:
def filter_df(
    df_list: list, 
    filter_num: int = 0, 
    node_count: int = 0,
    lmax_list: list = None
):     
    """Apply the optional lmax / node-count / path-occurrence filters to each frame.

    A filter is skipped when its argument keeps its "off" value, so calling
    with defaults returns the input frames unchanged.

    :param df_list: data frames to filter.
    :param filter_num: passed to ``filter_by_path_occurence``; ``0`` disables it.
    :param node_count: passed to ``filter_by_node_count``; ``0`` disables it.
    :param lmax_list: passed to ``filter_by_lmax``; ``None`` or an empty
        list disables it.
    :return: list with one (possibly filtered) frame per input frame, in
        the same order.
    """
    # None instead of a mutable [] default; both mean "no lmax filtering".
    lmax_values = list(lmax_list) if lmax_list else []

    filtered_frames = []

    for df in df_list:
        current_df = df
        if lmax_values:
            current_df = filter_by_lmax(current_df, by_lmax=lmax_values)

        if node_count != 0:
            current_df = filter_by_node_count(current_df, by_node_count=node_count)

        if filter_num != 0:
            current_df = filter_by_path_occurence(current_df, filter_num)

        filtered_frames.append(current_df)

    return filtered_frames

In [26]:
def cal_trial_info(df_list, trial_value_dict, data_col):
    """Summarise precision and a one-sided binomial p-value per dataset and subgraph.

    For every non-empty frame one row is produced for each of the
    ``openbio`` and ``custom`` subgraphs.

    :param df_list: frames with ``subgraph_name``, ``dataset`` and
        ``data_col`` columns; empty frames are skipped.
    :param trial_value_dict: ``{subgraph: {dataset: percent}}``; the percent
        is rounded to 3 decimals, reported as ``value by chance`` and, divided
        by 100, used as the null success probability of the binomial test.
    :param data_col: name of the column whose ``True`` entries count as hits.
    :return: frame with columns ``dataset``, ``precision``,
        ``value by chance``, ``pval``, ``pair_count`` and ``subgraph``.
        ``precision`` is the percentage of ``True`` values rounded to
        3 decimals and ``pair_count`` is the number of rows as a string.
        When a subgraph has no rows in a frame, ``precision`` is ``''``,
        ``pval`` is NaN and ``pair_count`` is ``'0'``.
    """
    columns = [
        'dataset',
        'precision',
        'value by chance',
        'pval',
        'pair_count',
        'subgraph'
    ]

    # scipy.stats.binom_test is gone from recent SciPy releases; prefer
    # binomtest and fall back to the old function only when it is missing.
    binomtest = getattr(stats, 'binomtest', None)

    records = []
    for df in df_list:
        if df.empty:
            continue

        for gname in ['openbio', 'custom']:
            subgraph_df = df.loc[df['subgraph_name'] == gname]

            # With no rows for this subgraph the dataset name is taken from
            # the full frame so the chance value can still be looked up.
            name_source = df if subgraph_df.empty else subgraph_df
            data_set = name_source['dataset'].unique().tolist()[0]

            trials = subgraph_df[data_col].tolist()
            true_positive = trials.count(True)
            all_positives = len(trials)

            chance_percent = round(trial_value_dict[gname][data_set], 3)
            trial_val = chance_percent / 100

            if all_positives:
                percent = round(true_positive / all_positives * 100, 3)
                if binomtest is not None:
                    pval = binomtest(
                        true_positive,
                        n=all_positives,
                        p=trial_val,
                        alternative='greater'
                    ).pvalue
                else:
                    pval = stats.binom_test(
                        true_positive,
                        n=all_positives,
                        p=trial_val,
                        alternative='greater'
                    )
            else:
                # Nothing to test: a binomial test needs at least one trial.
                percent = ''
                pval = float('nan')

            records.append({
                'dataset': data_set,
                'precision': percent,
                'value by chance': chance_percent,
                'pval': pval,
                'pair_count': str(all_positives),
                'subgraph': gname,
            })

    # Build the frame once instead of concatenating row by row.
    return pd.DataFrame(records, columns=columns)

# Precision values for each gold standard data

In [27]:
# Chance-level values (in percent) for the clinical-trial gold standard,
# keyed first by subgraph name and then by dataset name.
clinical_trial_value_dict = dict(
    openbio=dict(
        l1000_geo=3.20311,
        creed_geo=9.36533,
        l1000_target=1.73482,
        creed_target=4.44805,
    ),
    custom=dict(
        l1000_geo=7.85035,
        creed_geo=15.9502,
        l1000_target=5.04384,
        creed_target=10,
    ),
)

# Same layout for the drug-indication gold standard.
indication_trial_val_dict = dict(
    openbio=dict(
        l1000_geo=0.21004,
        creed_geo=1.23839,
        l1000_target=0.165221,
        creed_target=0.551948,
    ),
    custom=dict(
        l1000_geo=1.44896,
        creed_geo=2.60181,
        l1000_target=0.866426,
        creed_target=1.31868,
    ),
)

def main():
    """Print trial statistics for both gold-standard columns in every threshold directory.

    For each of the ``0.0``, ``0.5`` and ``1.0`` sub-directories of
    ``DATA_DIR/concordant_paths`` every ``*-both.tsv`` file is read, tagged
    with a ``dataset`` column taken from the file-name prefix (text before
    the first ``-``), passed through ``filter_df`` with default settings,
    and summarised with ``cal_trial_info`` once for ``in_clinical_trial``
    and once for ``in_drug_indication``.

    NOTE(review): a later cell defines another ``main()`` that reports only
    ``in_clinical_trial``; whichever definition is executed last is the one
    that runs.
    """
    for dir_name in ['0.0', '0.5', '1.0']:
        print(f'\n {dir_name}')
        k = []
        for file in os.listdir(os.path.join(DATA_DIR, 'concordant_paths', dir_name)):
            if file.endswith('-both.tsv'):
                df = pd.read_csv(
                    os.path.join(DATA_DIR, 'concordant_paths', dir_name, file),
                    sep='\t'
                )

                data_set = file.split('-')[0]
                df['dataset'] = data_set

                k.append(df)

        filtered_df = filter_df(df_list=k)

        for i in ['in_clinical_trial', 'in_drug_indication']:
            print(f'\n####{i}####')
            # Each gold standard has its own table of chance-level values.
            if i == 'in_clinical_trial':
                trial_dict = clinical_trial_value_dict
            else:
                trial_dict = indication_trial_val_dict
            # Computed inside the loop so both columns are reported.
            m = cal_trial_info(df_list=filtered_df, trial_value_dict=trial_dict, data_col=i)
            print(m)

In [30]:
def main():
    """Print the clinical-trial statistics for every threshold directory.

    Each of the ``0.0``, ``0.5`` and ``1.0`` sub-directories of
    ``DATA_DIR/concordant_paths`` is scanned for ``*-both.tsv`` files. Every
    file is loaded, labelled with a ``dataset`` column (file-name text before
    the first ``-``), run through ``filter_df`` with default settings, and
    the ``cal_trial_info`` table for ``in_clinical_trial`` is printed.
    """
    for dir_name in ['0.0', '0.5', '1.0']:
        print(f'\n {dir_name}')

        paths_dir = os.path.join(DATA_DIR, 'concordant_paths', dir_name)
        frames = []
        for file_name in os.listdir(paths_dir):
            if not file_name.endswith('-both.tsv'):
                continue
            frame = pd.read_csv(os.path.join(paths_dir, file_name), sep='\t')
            frame['dataset'] = file_name.split('-')[0]
            frames.append(frame)

        report = cal_trial_info(
            df_list=filter_df(df_list=frames),
            trial_value_dict=clinical_trial_value_dict,
            data_col='in_clinical_trial',
        )
        print(report)

In [31]:
# Run the analysis; main() prints one summary table per threshold directory.
main()


 0.0


Unnamed: 0,dataset,precision,value by chance,pval,pair_count,subgraph
0,l1000_geo,2.032,3.203,0.993142,1132,openbio
1,l1000_geo,8.381,7.85,0.347994,525,custom
2,creed_target,9.091,4.448,0.097161,55,openbio
3,creed_target,0.0,10.0,1.0,30,custom
4,l1000_target,2.033,1.735,0.201698,1623,openbio
5,l1000_target,8.319,5.044,0.000662,565,custom
6,creed_geo,10.0,9.365,0.508377,50,openbio
7,creed_geo,21.739,15.95,0.300382,23,custom
