# Analysis of pairs

This notebook summarizes the statistical analysis of the clinical trial and KG data.

In [1]:
import os
import pandas as pd
from utils import DATA_DIR, KG_DATA_PATH
from collections import Counter, defaultdict
from networkx import DiGraph, connected_components

In [2]:
# Load the two normalized knowledge graphs (tab-separated) and expose their
# 'relation' column under the name 'polarity'.
openbiolink_df = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'normalized', 'openbiolink_kg_normalized.tsv'),
    sep='\t',
).rename(columns={'relation': 'polarity'})

custom_network_df = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'normalized', 'custom_kg_normalized.tsv'),
    sep='\t',
).rename(columns={'relation': 'polarity'})

In [3]:
# Collect one DataFrame per result file whose name ends in '-0.5.tsv', keeping
# only pairs with at least one concordant path and tagging each row with the
# dataset name (the part of the file name before the first '-').
#
# NOTE(review): the directory is now listed and read from the same place.
# The listing used to point at 'data-1' while the files were read from
# 'data-2'; 'data-2' is kept because it is where the data was actually read
# from -- confirm this is the intended source.
concordant_dir = os.path.join(DATA_DIR, 'concordant_paths', 'data-2')

k = []
# sorted() makes the order of the frames (and of every table derived from
# them) independent of the file system's listing order.
for file in sorted(os.listdir(concordant_dir)):
    if not file.endswith('-0.5.tsv'):
        continue

    df = pd.read_csv(os.path.join(concordant_dir, file), sep='\t')

    # .copy() gives an independent frame, so adding the column below does not
    # write into a slice of the freshly read data (SettingWithCopyWarning).
    df = df[df['number_of_concordant_paths'] > 0].copy()

    data_set = file.split('-')[0]
    df['dataset'] = data_set

    k.append(df)

# Filtering based on presence of pair in 2/3 lmax

In [4]:
def filter_by_lmax(
    df: pd.DataFrame,
    by_lmax: list
):
    """Keep only the rows whose ``lmax`` label is one of the requested values.

    Args:
        df: Frame with at least ``source``, ``target`` and ``lmax`` columns.
            It is not modified.
        by_lmax: Values ``x`` to keep; each one is matched against the label
            ``f'lmax_{x}'`` in the ``lmax`` column.

    Returns:
        A new frame with an extra ``val`` column (``'<source>-<target>'``).
        Rows are grouped in the order the values appear in ``by_lmax`` and
        the index is renumbered from 0. An empty ``by_lmax`` yields an empty
        frame with the same columns.
    """
    tmp_df = df.copy(deep=True)
    tmp_df['val'] = tmp_df['source'] + '-' + tmp_df['target']

    # Collect every selection first and concatenate once: growing a frame
    # with pd.concat inside the loop is quadratic, and seeding it with an
    # empty object-dtype frame can change the dtypes of the result.
    selected = [
        tmp_df.loc[tmp_df['lmax'] == f'lmax_{lmax}']
        for lmax in by_lmax
    ]

    if not selected:
        # pd.concat raises on an empty list; return an empty, same-schema frame.
        return tmp_df.iloc[0:0].reset_index(drop=True)

    return pd.concat(selected, ignore_index=True)

In [5]:
def filter_by_node_count(
    df: pd.DataFrame,
    by_node_count: int
):
    """Return the rows of ``df`` having more than ``by_node_count`` unique nodes.

    The input is left untouched; the returned frame carries an additional
    ``val`` column holding ``'<source>-<target>'`` and keeps the original
    index labels of the surviving rows.
    """
    with_pair = df.assign(val=df['source'] + '-' + df['target'])
    has_enough_nodes = with_pair['number_of_unique_nodes'] > by_node_count
    return with_pair[has_enough_nodes]

In [6]:
def filter_by_path_occurence(
    df: pd.DataFrame,
    filter_num: int
):
    """Keep pairs that occur under more than ``filter_num`` distinct ``lmax`` values.

    A pair is the string ``'<source>-<target>'``. Pairs are counted
    separately inside the ``'custom'`` and the ``'openbio'`` subgraph; rows
    whose ``subgraph_name`` is anything else are always dropped.

    Args:
        df: Frame with ``source``, ``target``, ``subgraph_name`` and ``lmax``
            columns. It is not modified.
        filter_num: A pair is kept only if it is seen with strictly more than
            this many different ``lmax`` values within its subgraph.

    Returns:
        The matching rows of ``df`` (original columns, order and index
        labels). The shape before and after filtering is printed.
    """
    known_subgraphs = ('custom', 'openbio')

    # Work on a positionally indexed copy of just the needed columns, so the
    # final selection is a row mask: duplicate labels in ``df.index`` cannot
    # pull unrelated rows into the result.
    keyed = pd.DataFrame({
        'subgraph_name': df['subgraph_name'].to_numpy(),
        'val': (df['source'] + '-' + df['target']).to_numpy(),
        'lmax': df['lmax'].to_numpy(),
    })
    eligible = keyed[
        keyed['subgraph_name'].isin(known_subgraphs) & keyed['val'].notna()
    ]

    if eligible.empty:
        keep_mask = pd.Series(False, index=keyed.index)
    else:
        # One grouped pass replaces a full-frame scan per pair.
        # dropna=False counts a missing lmax as one distinct value.
        lmax_count = (
            eligible
            .groupby(['subgraph_name', 'val'])['lmax']
            .transform('nunique', dropna=False)
        )
        keep_mask = (lmax_count > filter_num).reindex(
            keyed.index, fill_value=False
        )

    new_df = df[keep_mask.to_numpy(dtype=bool)]

    print(f'Reduced dataframe from {df.shape} to {new_df.shape}')
    return new_df

In [7]:
def filter_df(
    df_list: list,
    filter_num: int = 0,
    node_count: int = 0,
    lmax_list: list = None
):
    """Apply the optional pair filters to every frame in ``df_list``.

    The filters run in a fixed order and each one is skipped when its
    argument is left at the default:

    1. ``filter_by_lmax`` when ``lmax_list`` is non-empty,
    2. ``filter_by_node_count`` when ``node_count`` is not 0,
    3. ``filter_by_path_occurence`` when ``filter_num`` is not 0.

    Args:
        df_list: Frames to filter.
        filter_num: Threshold forwarded to ``filter_by_path_occurence``.
        node_count: Threshold forwarded to ``filter_by_node_count``.
        lmax_list: ``lmax`` values forwarded to ``filter_by_lmax``; ``None``
            (the default) or an empty list disables this filter.

    Returns:
        A list with one frame per input frame, in the same order. With all
        filters disabled the input frames themselves are returned.
    """
    # None instead of a mutable ``[]`` default, which would be shared
    # between calls.
    if lmax_list is None:
        lmax_list = []

    final_df = []

    for df in df_list:
        current_df = df

        if len(lmax_list) > 0:
            current_df = filter_by_lmax(current_df, by_lmax=lmax_list)

        if node_count != 0:
            current_df = filter_by_node_count(current_df, by_node_count=node_count)

        if filter_num != 0:
            current_df = filter_by_path_occurence(current_df, filter_num)

        final_df.append(current_df)

    return final_df

# Percentage in clinical trial

In [8]:
# Per-dataset baseline percentages, one table per knowledge graph. They are
# reported (rounded to 3 decimals) in the 'value by chance' column built by
# `cal_trial_info`.
# NOTE(review): the origin of these numbers is not recorded in this notebook.
openbio_dict = dict(
    l1000_geo=3.203108591,
    creed_geo=9.365325077,
    l1000_target=1.734820322,
    creed_target=4.448051948,
)

custom_dict = dict(
    l1000_geo=7.850346021,
    creed_geo=15.95022624,
    l1000_target=5.043837029,
    creed_target=10,
)

In [9]:
def cal_trial_info(df_list, chance_by_graph=None):
    """Summarise, per dataset and subgraph, how many pairs are in clinical trials.

    Args:
        df_list: Frames with ``subgraph_name``, ``dataset`` and
            ``in_clinical_trial`` columns. Empty frames are skipped.
        chance_by_graph: Mapping ``subgraph name -> {dataset: baseline %}``.
            Defaults to ``{'openbio': openbio_dict, 'custom': custom_dict}``.
            One output row is produced per (frame, subgraph) in the
            mapping's order.

    Returns:
        A frame with the columns ``dataset``, ``trial %``,
        ``value by chance``, ``subgraph`` and ``pair_count``. ``trial %`` is
        the share of rows with ``in_clinical_trial`` equal to ``True``,
        rounded to 3 decimals, or ``''`` when the frame has no rows for that
        subgraph. ``pair_count`` is the row count as a string.

    Raises:
        KeyError: If a dataset has no entry in the subgraph's baseline table.
    """
    if chance_by_graph is None:
        chance_by_graph = {'openbio': openbio_dict, 'custom': custom_dict}

    columns = ['dataset', 'trial %', 'value by chance', 'subgraph', 'pair_count']

    # Gather plain records and build the frame once; concatenating onto an
    # empty frame row by row is quadratic and can change the column dtypes.
    records = []
    for df in df_list:
        if df.empty:
            continue

        for gname, gdict in chance_by_graph.items():
            new_df = df.loc[df['subgraph_name'] == gname]

            if new_df.empty:
                # No pairs for this subgraph: take the dataset name from the
                # full frame and leave the percentage blank.
                data_set = df['dataset'].unique().tolist()[0]
                percent = ''
            else:
                data_set = new_df['dataset'].unique().tolist()[0]
                trials = new_df['in_clinical_trial'].tolist()
                percent = round(trials.count(True) / len(trials) * 100, 3)

            records.append({
                'dataset': data_set,
                'trial %': percent,
                'value by chance': round(gdict[data_set], 3),
                'subgraph': gname,
                'pair_count': str(new_df.shape[0]),
            })

    return pd.DataFrame(records, columns=columns)

In [10]:
filter_df = filter_df(df_list=k)

In [11]:
cal_trial_info(df_list=filter_df)

Unnamed: 0,dataset,trial %,value by chance,subgraph,pair_count
0,creed_geo,10.0,9.365,openbio,20
1,creed_geo,25.0,15.95,custom,8
2,creed_target,8.333,4.448,openbio,24
3,creed_target,0.0,10.0,custom,12
4,l1000_geo,2.033,3.203,openbio,1033
5,l1000_geo,7.397,7.85,custom,365
6,l1000_target,1.93,1.735,openbio,1451
7,l1000_target,7.276,5.044,custom,646
