# Analysis of drug-disease pairs

This notebook summarizes the results presented in the paper and contains several methods to filter the results according to users' needs.

In [1]:
import os
import json
import pandas as pd

from collections import Counter, defaultdict
from networkx import DiGraph, connected_components

from utils import DATA_DIR

#### Filtering functions (not used)

In [2]:
def filter_by_lmax(
    df: pd.DataFrame,
    by_lmax: list
):
    """Filter dataframe by specific lmax. """
    tmp_df = df.copy(deep=True)
    
    new_df = pd.DataFrame(columns=tmp_df.columns)
    
    for l in by_lmax:
        fil_df = tmp_df.loc[tmp_df['lmax'] == f'lmax_{l}']
        new_df = pd.concat([new_df, fil_df], ignore_index=True)
    
    return new_df

In [3]:
def filter_by_path_occurence(
    df: pd.DataFrame,
    filter_num: int
):
    """Filter dataframe based on number of times the pair appears in. """
    tmp_df = df.copy(deep=True)

    tmp_df = tmp_df[['pairs', 'subgraph_name', 'lmax']]

    # Collect data for each KG individually
    custom_data = tmp_df.loc[tmp_df['subgraph_name'] == 'custom'].drop(
        'subgraph_name',
        axis=1,
    )

    custom_pairs = {}

    for pair, lmax in custom_data.values:
        if pair not in custom_pairs:
            custom_pairs[pair] = set()
        custom_pairs[pair].add(lmax)

    opentarget_data = tmp_df.loc[tmp_df['subgraph_name'] == 'openbio'].drop(
        'subgraph_name',
        axis=1
    )

    open_pairs = {}

    for pair, lmax in opentarget_data.values:
        if pair not in open_pairs:
            open_pairs[pair] = set()
        open_pairs[pair].add(lmax)

    # Filter data in DF
    index_list = []

    for key, val in custom_pairs.items():
        if len(val) > filter_num:
            df1 = tmp_df.loc[(tmp_df['pairs'] == key) & (tmp_df['subgraph_name'] == 'custom')]
            index_list.extend(df1.index.tolist())

    for key, val in open_pairs.items():
        if len(val) > filter_num:
            df1 = tmp_df.loc[(tmp_df['pairs'] == key) & (tmp_df['subgraph_name'] == 'openbio')]
            index_list.extend(df1.index.tolist())

    index_to_keep = sorted(list(set(index_list)))

    new_df = df[df.index.isin(index_to_keep)]

    print(f'Reduced dataframe from {df.shape} to {new_df.shape}')
    return new_df

In [4]:
def filter_by_node_count(
    df: pd.DataFrame,
    min_count: int,
    max_count: int = 0
):
    """Filter dataframe according to the number of unique nodes between drug-disease pair. """
    tmp_df = df.copy(deep=True)
    
    if max_count == 0:
        return tmp_df.loc[tmp_df['number_of_unique_nodes'] > min_count]
    else:
        return tmp_df.loc[tmp_df['number_of_unique_nodes'].between(min_count, max_count)]

In [5]:
def filter_by_concordant_path(
    df: pd.DataFrame,
    min_count: int,
    max_count: int = 0
):
    """Filtere dataframe based on number of concordant_paths between drug-disease pair. """
    tmp_df = df.copy(deep=True)

    if max_count == 0:
        return tmp_df.loc[tmp_df['number_of_concordant_paths'] > min_count]
    else:
        return tmp_df.loc[tmp_df['number_of_concordant_paths'].between(min_count, max_count)]

#### Filter function main

In [6]:
def filter_df(
    df_list: list, 
    filter_num: int = 0, 
    min_node_count: int = 0,
    max_node_count: int = 0,
    max_path_count:int = 0,
    min_path_count:int = 0,
    lmax_list: list = []
):     
    final_df = []
    
    for df in df_list:
        current_df = df
        if len(lmax_list) > 0:
            current_df = filter_by_lmax(current_df, by_lmax=lmax_list)
            
        if min_node_count != 0:
            current_df = filter_by_node_count(
                current_df,
                min_count=min_node_count,
                max_count=max_node_count
            )
            
        if min_path_count != 0:
            current_df = filter_by_concordant_path(
                current_df, 
                max_count=max_path_count, 
                min_count=min_path_count
            )
                
        if filter_num != 0:
            current_df = filter_by_path_occurence(current_df, filter_num)
        
        final_df.append(current_df)
            
    return final_df

#### Precision that can be achieved by chance for each gold standard data (From Notebook 5.2)

In [7]:
clinical_dict = {
    'openbio':  {
        'creed_target': 26.1649,
        'creed_geo': 36.7742,
        'lc1000_target': 11.3169,
        'lc1000_geo': 15.6614,
    },
    'custom': {
        'creed_target': 24.359,
        'creed_geo': 33.5294,
        'lc1000_target': 9.23336,
        'lc1000_geo': 12.8342,
    }
}

#### Calculate precision

In [8]:
def cal_trial_info(df_list, trial_value_dict, data_col):
    
    info_df = pd.DataFrame(columns=[
        'dataset',
        'subgraph',
        'precision',
        'value by chance',
        'pair_count'
    ])
    
    for df in df_list:
        for gname in ['openbio', 'custom']:
            if df.empty:
                continue
                
            new_df = df.loc[df['subgraph_name'] == gname]
            
            if new_df.empty:
                data_set = df['dataset'].unique().tolist()[0]
                trial_dict = {True: set(), False: set()}
                percent = ''
            else:
                data_set = new_df['dataset'].unique().tolist()[0]
                
                trial_dict = {True: set(), False: set()}
                for bool_val, pair in new_df[[data_col, 'pairs']].values:
                    trial_dict[bool_val].add(pair)
                
                # Convert back to list
                trial_dict[True] = list(trial_dict[True])
                trial_dict[False] = list(trial_dict[False])
                
                percent = len(trial_dict[True])/(len(trial_dict[True]) + len(trial_dict[False])) * 100
            
            gdict = trial_value_dict[gname]
                        
            # Get binomial p-value
            true_positive = len(trial_dict[True])
            all_positives = (len(trial_dict[True]) + len(trial_dict[False]))
            trial_val = round(gdict[data_set], 3) / 100
            
            tmp = pd.DataFrame({
                'dataset': data_set,
                'subgraph': gname,
                'precision': round(percent, 3) if type(percent) == float else percent, 
                'value by chance': round(gdict[data_set], 3),
                'pair_count': f'{true_positive}/{all_positives}'
            }, index=[0])
            info_df = pd.concat([info_df, tmp], ignore_index=True)
            
    return info_df

In [9]:
def main(file_ending: str, trial_dict: dict, col_name: str):
    k = []
    
    for file in os.listdir(os.path.join(DATA_DIR, 'concordant_paths')):
        if file.endswith(file_ending):
            df = pd.read_csv(
                os.path.join(DATA_DIR, 'concordant_paths', file),
                sep='\t'
            )

            data_set = file.split('-')[0]
            df['dataset'] = data_set

            df['pairs'] = df['source'] + '_' + df['target']

            k.append(df)
            
    filtered_df = filter_df(df_list=k)
    
    m = cal_trial_info(
        df_list=filtered_df, 
        trial_value_dict=trial_dict, 
        data_col=col_name
    )
    return m

In [10]:
m = main(
    file_ending='-results.tsv',
    trial_dict=clinical_dict,
    col_name='in_clinical_trial'
)
m

Unnamed: 0,dataset,subgraph,precision,value by chance,pair_count
0,lc1000_geo,openbio,80.0,15.661,4/5
1,lc1000_geo,custom,66.667,12.834,2/3
2,lc1000_target,openbio,54.545,11.317,6/11
3,lc1000_target,custom,50.0,9.233,2/4
4,creed_geo,openbio,50.0,36.774,1/2
5,creed_geo,custom,50.0,33.529,1/2
6,creed_target,openbio,50.0,26.165,1/2
7,creed_target,custom,0.0,24.359,0/1
