In [None]:
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns

from rdkit import DataStructs
from rdkit.Chem import Crippen
from rdkit.Chem import PandasTools
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import rdmolfiles
from rdkit.Chem.AllChem import GetMorganGenerator


## 0. EDA

In [None]:
# Load the tab-separated input tables, one DataFrame per file.
herg_df = pd.read_csv(<herg_file>, sep="\t")
clogd_df = pd.read_csv(<clogd_file>, sep="\t")
cad_df = pd.read_csv(<cad_file>, sep="\t")
cyp3a4_df = pd.read_csv(<cyp3a4_file>, sep="\t")

lck_df = pd.read_csv(<lck_file>, sep="\t")
jak2_df = pd.read_csv(<jak2_file>, sep="\t")

ppara_df = pd.read_csv(<ppara_file>, sep="\t")
ppard_df = pd.read_csv(<ppard_file>, sep="\t")
pparg_df = pd.read_csv(<pparg_file>, sep="\t")


# Report how many rows of each table carry a 'mol' entry.
# pandas.read_csv stores missing cells as NaN rather than None, so an
# `is not None` test would count every row; Series.count() ignores NaN and None.
tables_by_label = {
    'hERG': herg_df,
    'ppara': ppara_df,
    'ppard': ppard_df,
    'pparg': pparg_df,
    'CAD': cad_df,
    'LCK': lck_df,
    'ChromLogD': clogd_df,
    'JAK2': jak2_df,
    'CYP3A4': cyp3a4_df,
}
for label, table in tables_by_label.items():
    print(f'{label} size:', table['mol'].count())


In [None]:
# Check feature importances
study_obj = optuna.create_study(storage=<optuna_run_db>, study_name=<optuna_study_name>, load_if_exists=True)
optuna.visualization.plot_param_importances(
        study_obj, target=lambda t: t.duration.total_seconds(), target_name="value"
    )


## 1. Pre-process data end points

In [None]:
def _rank_top_bottom(trials_df):
    """Return (top, bottom): the up-to-5 lowest and up-to-5 highest 'value' rows, with rank and category columns added."""
    n = 5
    ordered = trials_df.sort_values('value')

    # Copies, so the new columns are not written onto views of the caller's frame.
    top = ordered[:n].copy()
    bottom = ordered[-n:].copy()

    # Ranks follow the number of rows actually available: 1..k for the lowest
    # values and -k..-1 for the highest, so the largest value always gets -1.
    top["rank"] = list(range(1, len(top) + 1))
    top["category"] = "top5"

    bottom["rank"] = list(range(-len(bottom), 0))
    bottom["category"] = "bottom5"

    return top, bottom


def create_full_endpoint_hparam_df(study_endpoints, database_path, group_by_layer_type=False, with_lr_cutoff=False, no_top5_bottom5=False, lr_cutoff=0.0001):
    """
    Creates DataFrame of top5 and bottom5 trials for each study endpoint given.

    Only trials whose state is "COMPLETE" are considered. "top5" are the five
    trials with the lowest objective 'value' and "bottom5" the five with the
    highest (this presumes a minimised objective such as RMSE; verify against
    the study direction).

    Inputs
    -------
    study_endpoints: dictionary
        Keys are dataset names and values are study strings e.g. {"hERG": "herg_study_new_hparam_test", "ChromLogD": "clogd_study_new_hparam_test"}
    database_path: string
        Storage URL prefix for the per-study SQLite databases e.g. "sqlite:////path/to/studies/".
        The storage used for each study is database_path + study name + ".db".
    group_by_layer_type: bool
        True if top5 and bottom5 per layer type rather than across all parameters.
        Layer types with fewer than five eligible trials are left out.
    with_lr_cutoff: bool
        True if only trials with params_base_lr >= lr_cutoff are eligible for top5 and bottom5
    no_top5_bottom5: bool
        True if only interested in full DataFrame (all completed trials, no rank/category columns)
    lr_cutoff: float
        Learning rate threshold applied when with_lr_cutoff is True

    Returns
    -------
    full_final_df: DataFrame
        Pandas DataFrame containing all studies given for study_endpoints, with a
        "dataset" column holding the dataset name and, unless no_top5_bottom5 is
        True, "rank" and "category" columns.

    """

    frames = []

    for dataset_name, study_name in study_endpoints.items():
        # load_study fails loudly on a missing study; create_study with
        # load_if_exists=True would create an empty study and database instead.
        study_obj = optuna.load_study(study_name=study_name, storage=database_path + study_name + ".db")

        study_df = study_obj.trials_dataframe()

        # .copy() so the "dataset" column is added to an independent frame.
        study_df = study_df[study_df["state"] == "COMPLETE"].copy()
        study_df["dataset"] = dataset_name

        if no_top5_bottom5:
            frames.append(study_df)
            continue

        # Trials eligible for ranking.
        eligible = study_df
        if with_lr_cutoff:
            eligible = eligible[eligible["params_base_lr"] >= lr_cutoff]

        if group_by_layer_type:
            top_parts = []
            bottom_parts = []

            # Iterate layer types in order of first appearance among all completed trials.
            for layer_type in study_df["params_layer_type"].unique():
                top5_per_layer, bottom5_per_layer = _rank_top_bottom(eligible[eligible["params_layer_type"] == layer_type])

                # Keep a layer type only when it has a full top5 and bottom5.
                if len(top5_per_layer) == 5 and len(bottom5_per_layer) == 5:
                    top_parts.append(top5_per_layer)
                    bottom_parts.append(bottom5_per_layer)

            # All top5 rows first, then all bottom5 rows.
            frames.extend(top_parts + bottom_parts)

        else:
            top5, bottom5 = _rank_top_bottom(eligible)
            frames.extend([top5, bottom5])

    if not frames:
        return pd.DataFrame()

    # Single concatenation instead of growing a DataFrame inside the loop.
    full_final_df = pd.concat(frames)

    return full_final_df

full_df_no_top_bottom = create_full_endpoint_hparam_df(study_endpoints={"hERG": <herg_study>, 
                                                "ChromLogD": <clogd_study>,
                                                "CAD": <cad_study>,
                                                "CYP3A4": <cyp3a4_study>,
                                               "LCK": <lck_study>,
                                               "JAK2": <jak2_study>,
                                               "PPAR_A": <ppara_study>,
                                               "PPAR_D": <ppard_study>,
                                               "PPAR_G": <pparg_study>},
                              database_path=<optuna_db_location>,
                                                  no_top5_bottom5=True)
        
full_df_endpoints = create_full_endpoint_hparam_df(study_endpoints={"hERG": <herg_study>, 
                                                "ChromLogD": <clogd_study>,
                                                "CAD": <cad_study>,
                                                "CYP3A4": <cyp3a4_study>,
                                               "LCK": <lck_study>,
                                               "JAK2": <jak2_study>,
                                               "PPAR_A": <ppara_study>,
                                               "PPAR_D": <ppard_study>,
                                               "PPAR_G": <pparg_study>},
                              database_path=<optuna_db_location>,
                                                  group_by_layer_type=False)

full_df_endpoints_with_lr_cutoff_bottom5 = create_full_endpoint_hparam_df(study_endpoints={"hERG": <herg_study>, 
                                                "ChromLogD": <clogd_study>,
                                                "CAD": <cad_study>,
                                                "CYP3A4": <cyp3a4_study>,
                                               "LCK": <lck_study>,
                                               "JAK2": <jak2_study>,
                                               "PPAR_A": <ppara_study>,
                                               "PPAR_D": <ppard_study>,
                                               "PPAR_G": <pparg_study>},
                              database_path=<optuna_db_location>,
                                                  group_by_layer_type=False, with_lr_cutoff=True)

full_df_endpoints_per_layer = create_full_endpoint_hparam_df(study_endpoints={"hERG": <herg_study>, 
                                                "ChromLogD": <clogd_study>,
                                                "CAD": <cad_study>,
                                                "CYP3A4": <cyp3a4_study>,
                                               "LCK": <lck_study>,
                                               "JAK2": <jak2_study>,
                                               "PPAR_A": <ppara_study>,
                                               "PPAR_D": <ppard_study>,
                                               "PPAR_G": <pparg_study>},
                              database_path=<optuna_db_location>,
                                                  group_by_layer_type=True)

full_df_endpoints_per_layer_with_lr_cutoff_bottom5 = create_full_endpoint_hparam_df(study_endpoints={"hERG": <herg_study>, 
                                                "ChromLogD": <clogd_study>,
                                                "CAD": <cad_study>,
                                                "CYP3A4": <cyp3a4_study>,
                                               "LCK": <lck_study>,
                                               "JAK2": <jak2_study>,
                                               "PPAR_A": <ppara_study>,
                                               "PPAR_D": <ppard_study>,
                                               "PPAR_G": <pparg_study>},
                              database_path=<optuna_db_location>,
                                                  group_by_layer_type=True, with_lr_cutoff=True)

In [None]:
# Learning rates at or below 1e-4, split by whether the trial is in the bottom5 or top5.
is_low_lr = full_df_endpoints["params_base_lr"] <= 0.0001
in_bottom5 = full_df_endpoints["category"] == 'bottom5'
in_top5 = full_df_endpoints["category"] == 'top5'

bottom5_data_lr = full_df_endpoints[in_bottom5 & is_low_lr]["params_base_lr"]
top5_data_lr = full_df_endpoints[in_top5 & is_low_lr]["params_base_lr"]

# Equal weights of 1/len make the bar heights of each group sum to 1.
for group_label, lr_values in (('bottom5', bottom5_data_lr), ('top5', top5_data_lr)):
    group_weights = np.ones(len(lr_values)) / len(lr_values)
    lr_values.hist(bins=10, alpha=0.5, label=group_label, weights=group_weights)

plt.legend()
plt.xlabel("Learning rate")
plt.ylabel("Fraction of data")
plt.show()

## 2. Analyse hyperparameters

In [None]:
def custom_colour(df):
    """
    Pick a colour name from the length of df.

    Lengths are binned by inclusive upper bounds; anything above the last
    bound is 'green'.
    """
    n_rows = len(df)

    # (inclusive upper bound, colour), in ascending order of bound.
    size_bands = (
        (10000, 'red'),
        (20000, 'orangered'),
        (30000, 'darkorange'),
        (40000, 'orange'),
        (100000, 'gold'),
        (200000, 'yellow'),
        (300000, 'greenyellow'),
    )

    for upper_bound, band_colour in size_bands:
        if n_rows <= upper_bound:
            return band_colour

    return 'green'

# Dataset label -> loaded table for that dataset.
datasets = {
    "hERG": herg_df,
    "ChromLogD": clogd_df,
    "CAD": cad_df,
    "CYP3A4": cyp3a4_df,
    "LCK": lck_df,
    "JAK2": jak2_df,
    "PPAR_A": ppara_df,
    "PPAR_D": ppard_df,
    "PPAR_G": pparg_df,
}

# Dataset label -> colour derived from the number of rows in its table.
custom_palette = {label: custom_colour(table) for label, table in datasets.items()}


In [None]:
from matplotlib.collections import PolyCollection
from matplotlib.colors import to_rgb

def split_violin_plot(df, param_name, xlim=None):
    """
    Generates split violin plot given DataFrame of studies and parameter name.

    One violin is drawn per dataset, split by the "category" column, and each
    violin is filled with that dataset's colour from custom_palette (the second
    half in a lightened shade).

    df: DataFrame
        DataFrame of studies; must contain "dataset", "category" and param_name columns
    param_name: string
        Name of parameter to plot
    xlim: list
        Lower and upper bounds of x limit e.g. [-0.005,0.005]
    """

    # Look colours up by dataset name, in the same order the violins are drawn,
    # instead of relying on the insertion order of custom_palette.
    dataset_order = list(df['dataset'].dropna().unique())
    colors = [custom_palette[name] for name in dataset_order]

    fig, ax = plt.subplots(figsize=(10, 8))

    sns.violinplot(data=df, y="dataset", x=param_name, hue="category", order=dataset_order,
                   split=True, inner="quart", fill=False, palette=['.4', '.7'], ax=ax)

    # NOTE(review): assumes two PolyCollections (one per category) per dataset,
    # in dataset order - confirm for datasets missing a category.
    for ind, violin in enumerate(ax.findobj(PolyCollection)):
        rgb = to_rgb(colors[ind // 2])
        if ind % 2 != 0:
            # Blend halfway towards white for the second half of each violin.
            rgb = 0.5 + 0.5 * np.array(rgb)
        violin.set_facecolor(rgb)

    if xlim is not None:
        ax.set_xlim(xlim[0], xlim[1])

    plt.show()
    
def split_violin_plot_per_layer_type(df, param_name, xlim=None):
    """
    Generates one split violin plot per dataset, with one violin per layer type.

    df: DataFrame
        DataFrame of studies; must contain "dataset", "params_layer_type",
        "category" and param_name columns
    param_name: string
        Name of parameter to plot
    xlim: list
        Lower and upper bounds of x limit e.g. [-0.005,0.005]
    """

    for dataset in list(df["dataset"].unique()):
        df_dataset = df[df["dataset"] == dataset]

        fig, ax = plt.subplots(figsize=(10, 8))

        # Draw on the axes created above rather than on the implicit current axes.
        sns.violinplot(data=df_dataset, y="params_layer_type", x=param_name, hue="category",
                       split=True, inner="quart", fill=False, palette=['.4', '.7'], ax=ax)

        if xlim is not None:
            ax.set_xlim(xlim[0], xlim[1])

        ax.set_title(dataset)
        plt.show()
    

### Plots per layer

#### 1. Without cutoff for lr bottom5

In [None]:
# Available params: "params_base_lr", "params_dropout", "params_layers_mp", "params_lr_decay", "params_dim_inner", "params_weight_decay"

split_violin_plot_per_layer_type(
    df=full_df_endpoints_per_layer,
    param_name="params_layers_mp",
)


#### 2. With lr cutoff bottom5

In [None]:
# Available params: "params_base_lr", "params_dropout", "params_layers_mp", "params_lr_decay", "params_dim_inner", "params_weight_decay"

split_violin_plot_per_layer_type(
    df=full_df_endpoints_per_layer_with_lr_cutoff_bottom5,
    param_name="params_layers_mp",
)


### Plots per dataset

#### 1. Without lr cutoff

In [None]:
continuous_hparams = ["params_base_lr", "params_dropout", "params_layers_mp", "params_lr_decay", "params_dim_inner", "params_weight_decay"]

# Colour key (redder means fewer data points), by number of data points:
#   <= 10000: red, <= 20000: orangered, <= 30000: darkorange, <= 40000: orange,
#   <= 100000: gold, <= 200000: yellow, <= 300000: greenyellow, > 300000: green

for param in continuous_hparams:
    # Only the learning rate gets a fixed x range; every other parameter autoscales.
    xlim = [-0.005,0.005] if param == "params_base_lr" else None

    split_violin_plot(df=full_df_endpoints, param_name=param, xlim=xlim)

#### 2. With lr cutoff

In [None]:
continuous_hparams = ["params_base_lr", "params_dropout", "params_layers_mp", "params_lr_decay", "params_dim_inner", "params_weight_decay"]

# Colour key (redder means fewer data points), by number of data points:
#   <= 10000: red, <= 20000: orangered, <= 30000: darkorange, <= 40000: orange,
#   <= 100000: gold, <= 200000: yellow, <= 300000: greenyellow, > 300000: green

for param in continuous_hparams:
    # xlim is passed explicitly as None (autoscale). This cell previously read an
    # `xlim` variable left over from the "without lr cutoff" loop, whose final
    # value was None, so it only ran if that cell had been executed first.
    split_violin_plot(df=full_df_endpoints_with_lr_cutoff_bottom5, param_name=param, xlim=None)
    

In [None]:
continuous_hparams = ["params_base_lr", "params_dropout", "params_layers_mp", "params_lr_decay", "params_dim_inner", "params_weight_decay"]

# Trial number, objective value, continuous hyperparameters and dataset label of every completed trial.
cont_hparam_columns = [
    'number', 'value',
    'params_base_lr', 'params_dim_inner', 'params_dropout',
    'params_layers_mp', 'params_layers_post_mp', 'params_layers_pre_mp',
    'params_lr_decay', 'params_weight_decay',
    'dataset',
]
full_df_cont_hparams = full_df_no_top_bottom[cont_hparam_columns]


In [None]:
# WITHOUT LR CUTOFF, ALL DATA
plt.figure(figsize=(10,8))

# Params most correlated to output value: layers_post_mp, layers_pre_mp, lr_decay, dim_inner
# Params most correlated to learning rate: layers_mp, dropout, weight_decay

# Correlate numeric columns only: the string 'dataset' column makes
# DataFrame.corr() raise on pandas >= 2.0 (older versions dropped it silently).
corr = full_df_cont_hparams.select_dtypes(include="number").corr()
sns.heatmap(corr, cmap='icefire', annot=True)


In [None]:
# WITHOUT LR CUTOFF, TOP5/BOTTOM5
plt.figure(figsize=(10,8))

# Params most correlated to output value: layers_post_mp, layers_mp, dropout
# Params most correlated to learning rate: layers_post_mp, layers_pre_mp, dropout

heatmap_columns = ['number', 'value', 'params_base_lr', 'params_dim_inner', 'params_dropout', 'params_layers_mp', 'params_layers_post_mp', 'params_layers_pre_mp', 'params_lr_decay', 'params_weight_decay', 'dataset']

# Correlate numeric columns only: the string 'dataset' column makes
# DataFrame.corr() raise on pandas >= 2.0 (older versions dropped it silently).
corr = full_df_endpoints[heatmap_columns].select_dtypes(include="number").corr()
sns.heatmap(corr, cmap='icefire', annot=True)


In [None]:
# WITH LR CUTOFF
plt.figure(figsize=(10,8))

# Params most correlated to output value: layers_mp, layers_post_mp, weight_decay
# Params most correlated to learning rate: layers_pre_mp, layers_post_mp, dropout, dim_inner

heatmap_columns = ['number', 'value', 'params_base_lr', 'params_dim_inner', 'params_dropout', 'params_layers_mp', 'params_layers_post_mp', 'params_layers_pre_mp', 'params_lr_decay', 'params_weight_decay', 'dataset']

# Correlate numeric columns only: the string 'dataset' column makes
# DataFrame.corr() raise on pandas >= 2.0 (older versions dropped it silently).
corr = full_df_endpoints_with_lr_cutoff_bottom5[heatmap_columns].select_dtypes(include="number").corr()
sns.heatmap(corr, cmap='icefire', annot=True)


### Are important parameters correlated to dataset size?

In [None]:
# Number of rows of full_df_no_top_bottom (completed trials) per dataset.
# NOTE(review): this is a trial count, not the number of molecules in the
# dataset - confirm this is the intended meaning of "dataset size".
dataset_sizes = {
    dataset_name: len(full_df_no_top_bottom[full_df_no_top_bottom['dataset'] == dataset_name])
    for dataset_name in full_df_no_top_bottom['dataset'].unique()
}
    

In [None]:
# Exact-key lookup of each row's dataset in dataset_sizes. The previous substring
# test (`k in x`) would pick the wrong entry if one dataset name contained another.
full_df_endpoints_with_lr_cutoff_bottom5['dataset_size'] = full_df_endpoints_with_lr_cutoff_bottom5['dataset'].map(dataset_sizes)


In [None]:
# WITH LR CUTOFF (for top5)
plt.figure(figsize=(10,8))

# Dataset size seems most correlated to learning rate
full_df_endpoints_with_lr_cutoff_bottom5_top5only = full_df_endpoints_with_lr_cutoff_bottom5[full_df_endpoints_with_lr_cutoff_bottom5['category'] == 'top5']

heatmap_columns = ['number', 'value', 'params_base_lr', 'params_dim_inner', 'params_dropout', 'params_layers_mp', 'params_layers_post_mp', 'params_layers_pre_mp', 'params_lr_decay', 'params_weight_decay', 'dataset','dataset_size']

# Correlate numeric columns only: the string 'dataset' column makes
# DataFrame.corr() raise on pandas >= 2.0 (older versions dropped it silently).
corr = full_df_endpoints_with_lr_cutoff_bottom5_top5only[heatmap_columns].select_dtypes(include="number").corr()
sns.heatmap(corr, cmap='icefire', annot=True)


In [None]:
plt.figure(figsize=(10,8))

# top5 rows of the lr-cutoff frame, selected once and then split per dataset.
top5_with_lr_cutoff = full_df_endpoints_with_lr_cutoff_bottom5[full_df_endpoints_with_lr_cutoff_bottom5['category'] == 'top5']

for dataset in full_df_no_top_bottom['dataset'].unique():
    subset = top5_with_lr_cutoff[top5_with_lr_cutoff['dataset'] == dataset]

    # One point per dataset; marker area grows with the dataset's entry in dataset_sizes.
    marker_area = (dataset_sizes[dataset] / 4) ** 2
    plt.scatter(subset['dataset_size'].mean(), subset['params_layers_mp'].mean(), s=marker_area, label=dataset)

plt.xlabel('Dataset size')
plt.ylabel('Mean number of message passing layers')
plt.title('Mean message passing layers and dataset size for top 5 hyperparameters')
plt.legend()
plt.show()


### Spread of RMSE values

In [None]:
# Overlay, per dataset, the density histogram of the objective values of its top5 trials (lr-cutoff frame).
top5_with_lr_cutoff = full_df_endpoints_with_lr_cutoff_bottom5[full_df_endpoints_with_lr_cutoff_bottom5['category'] == 'top5']

for dataset in full_df_no_top_bottom['dataset'].unique():
    subset = top5_with_lr_cutoff[top5_with_lr_cutoff['dataset'] == dataset]

    sns.histplot(subset['value'], stat='density', label=dataset)

plt.legend()

### Correlation of layers_mp and dim_inner to size of dataset

In [None]:
plt.figure(figsize=(10,8))

per_layer_df = full_df_endpoints_per_layer_with_lr_cutoff_bottom5

for dataset in full_df_no_top_bottom['dataset'].unique():
    dataset_rows = per_layer_df[per_layer_df['dataset'] == dataset]
    subset = dataset_rows[dataset_rows['category'] == 'top5']

    # Marker area grows with the number of rows this dataset has in the per-layer frame.
    marker_area = (len(dataset_rows) / 2) ** 2
    plt.scatter(subset['params_layers_mp'].mean(), subset['params_dim_inner'].mean(), s=marker_area, label=dataset)

plt.xlabel('Mean number of message passing layers')
plt.ylabel('Mean number of inner dimensions')
plt.title('Mean dimensions and message passing layers for top 5 hyperparameters where size corresponds to dataset size')
plt.legend()
plt.show()


In [None]:
plt.figure(figsize=(10,8))

per_layer_df = full_df_endpoints_per_layer_with_lr_cutoff_bottom5

for dataset in full_df_no_top_bottom['dataset'].unique():
    dataset_rows = per_layer_df[per_layer_df['dataset'] == dataset]
    subset = dataset_rows[dataset_rows['category'] == 'top5']

    # Marker area grows with the number of rows this dataset has in the per-layer frame.
    marker_area = (len(dataset_rows) / 2) ** 2
    plt.scatter(subset['params_weight_decay'].mean(), subset['params_dim_inner'].mean(), s=marker_area, label=dataset)

plt.xlabel('Mean weight decay')
plt.ylabel('Mean number of inner dimensions')
plt.title('Mean dimensions and weight decay for top 5 hyperparameters where size corresponds to dataset size')
plt.legend()
plt.show()
    

In [None]:
plt.figure(figsize=(10,8))

per_layer_df = full_df_endpoints_per_layer_with_lr_cutoff_bottom5

for dataset in full_df_no_top_bottom['dataset'].unique():
    dataset_rows = per_layer_df[per_layer_df['dataset'] == dataset]
    subset = dataset_rows[dataset_rows['category'] == 'bottom5']

    # Marker area grows with the number of rows this dataset has in the per-layer frame.
    marker_area = (len(dataset_rows) / 2) ** 2
    plt.scatter(subset['params_layers_mp'].mean(), subset['params_dim_inner'].mean(), s=marker_area, label=dataset)

plt.xlabel('Mean number of message passing layers')
plt.ylabel('Mean number of inner dimensions')
plt.title('Mean dimensions and message passing layers for bottom 5 hyperparameters where size corresponds to dataset size')
plt.legend()
plt.show()


In [None]:
plt.figure(figsize=(10,8))

for dataset in full_df_no_top_bottom['dataset'].unique():
    subset = full_df_no_top_bottom[full_df_no_top_bottom['dataset'] == dataset]

    # Marker area grows with the number of rows (completed trials) for this dataset.
    marker_area = (len(subset) / 4) ** 2
    plt.scatter(subset['params_layers_mp'].mean(), subset['params_dim_inner'].mean(), s=marker_area, label=dataset)

plt.xlabel('Mean number of message passing layers')
plt.ylabel('Mean number of inner dimensions')
plt.title('Mean dimensions and message passing layers for all hyperparameters where size corresponds to dataset size')
plt.legend()
plt.show()


In [None]:
discrete_hparams = ["params_layer_type", "params_act", "params_attention_type"]

def frac_appearance(df, param_name):
    """
    Counts how often each value of a discrete parameter occurs among top5 and bottom5 rows.

    df: DataFrame
        Must contain "dataset", "category" and param_name columns
    param_name: string
        Name of the discrete parameter column

    Returns a tuple of three dictionaries:
      - per dataset: {'top5': {value: count}, 'bottom5': {value: count}},
        keeping only values whose count is non-zero
      - value -> count over the top5 rows of all datasets (zero counts kept)
      - value -> count over the bottom5 rows of all datasets (zero counts kept)
    """
    possible_values = list(df[param_name].unique())

    per_dataset_counts = {name: {} for name in df["dataset"].tolist()}
    top5_totals = dict.fromkeys(possible_values, 0)
    bottom5_totals = dict.fromkeys(possible_values, 0)

    for name in list(df["dataset"].unique()):
        in_dataset = df["dataset"] == name
        top5_values = df[in_dataset & (df["category"] == "top5")][param_name].tolist()
        bottom5_values = df[in_dataset & (df["category"] == "bottom5")][param_name].tolist()

        top5_counts = {value: top5_values.count(value) for value in possible_values}
        bottom5_counts = {value: bottom5_values.count(value) for value in possible_values}

        # Accumulate the cross-dataset totals.
        for value in possible_values:
            top5_totals[value] += top5_counts[value]
            bottom5_totals[value] += bottom5_counts[value]

        # Per-dataset entries keep only the values that actually occurred.
        per_dataset_counts[name] = {
            'top5': {value: count for value, count in top5_counts.items() if count},
            'bottom5': {value: count for value, count in bottom5_counts.items() if count},
        }

    return per_dataset_counts, top5_totals, bottom5_totals

    

In [None]:
# Occurrences of each layer type among the top5 / bottom5 trials.
(
    dataset_param_frac_dict,
    params_frac_dict_top5_across_datasets,
    params_frac_dict_bottom5_across_datasets,
) = frac_appearance(df=full_df_endpoints, param_name="params_layer_type")

print('top5 across datasets:', params_frac_dict_top5_across_datasets)
print('bottom5 across datasets:', params_frac_dict_bottom5_across_datasets)

In [None]:
# Occurrences of each value of params_act among the top5 / bottom5 trials.
(
    dataset_param_frac_dict,
    params_frac_dict_top5_across_datasets,
    params_frac_dict_bottom5_across_datasets,
) = frac_appearance(df=full_df_endpoints, param_name="params_act")

print('top5 across datasets:', params_frac_dict_top5_across_datasets)
print('bottom5 across datasets:', params_frac_dict_bottom5_across_datasets)

In [None]:
# Occurrences of each attention type among the top5 / bottom5 trials.
(
    dataset_param_frac_dict,
    params_frac_dict_top5_across_datasets,
    params_frac_dict_bottom5_across_datasets,
) = frac_appearance(df=full_df_endpoints, param_name="params_attention_type")

print('top5 across datasets:', params_frac_dict_top5_across_datasets)
print('bottom5 across datasets:', params_frac_dict_bottom5_across_datasets)

In [None]:
# Occurrences of each value of params_graph_pooling among the top5 / bottom5 trials.
(
    dataset_param_frac_dict,
    params_frac_dict_top5_across_datasets,
    params_frac_dict_bottom5_across_datasets,
) = frac_appearance(df=full_df_endpoints, param_name="params_graph_pooling")

print('top5 across datasets:', params_frac_dict_top5_across_datasets)
print('bottom5 across datasets:', params_frac_dict_bottom5_across_datasets)

In [None]:
# Occurrences of each stage type among the top5 / bottom5 trials.
(
    dataset_param_frac_dict,
    params_frac_dict_top5_across_datasets,
    params_frac_dict_bottom5_across_datasets,
) = frac_appearance(df=full_df_endpoints, param_name="params_stage_type")

print('top5 across datasets:', params_frac_dict_top5_across_datasets)
print('bottom5 across datasets:', params_frac_dict_bottom5_across_datasets)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

normalised_frames = []

for dataset in full_df_no_top_bottom['dataset'].unique():
    # .copy() so the new column is written to an independent frame rather than
    # to a filtered slice of full_df_no_top_bottom (SettingWithCopyWarning).
    subset_dataset = full_df_no_top_bottom[(full_df_no_top_bottom['dataset'] == dataset)].copy()
    dataset_df = datasets[dataset]

    # Measurement column: the first column whose lower-cased name contains '_mean'
    # (presumably the pIC50 / IC50 mean - verify against the input files).
    col = [i for i in dataset_df.columns if '_mean' in i.lower()]
    col = col[0]

    # 90th percentile of the measured values. nanpercentile skips missing
    # measurements, which would otherwise make the percentile NaN.
    perc_09 = np.nanpercentile(dataset_df[col], 90)

    # Normalise each trial's objective value by the 90th percentile of the dataset.
    subset_dataset['Normalised RMSE'] = (subset_dataset['value'])/perc_09

    normalised_frames.append(subset_dataset)

# Single concatenation instead of growing a DataFrame inside the loop.
dataset_with_norm = pd.concat(normalised_frames) if normalised_frames else pd.DataFrame()


In [None]:
# Density of the 10 lowest normalised values per dataset; datasets whose
# top-10 mean is not below 5 are left out of the plot.
for dataset in full_df_no_top_bottom['dataset'].unique():
    dataset_rows = dataset_with_norm[dataset_with_norm['dataset'] == dataset]
    subset_top10 = dataset_rows.sort_values('Normalised RMSE').head(10)

    top10_mean = subset_top10['Normalised RMSE'].mean()
    if top10_mean < 5:
        sns.kdeplot(subset_top10['Normalised RMSE'], label=dataset)

plt.xlabel('RMSE')
plt.legend()


In [None]:
fig, axes = plt.subplots(3, 3, figsize=(10, 12))

# Flatten the 3x3 grid so that panels can be addressed with one running index.
axes_indices = axes.ravel()

model_type_colours = {'gatconv':'royalblue', 'ginconv':'darkorange', 'gcnconv':'forestgreen', 'generalsampleedgeconv':'red',
       'sageconv':'dimgrey', 'mlp':'brown', 'linear':'hotpink', 'splineconv':'grey', 'generalconv':'darkviolet',
       'generaledgeconv':'olive'}

# Layer types drawn with a solid line; every other type is dashed.
solid_line_types = ['generalsampleedgeconv', 'splineconv', 'generaledgeconv']

# Layer type -> first legend handle found for it, used for one shared figure legend.
labels_dict = {}

for idx, dataset in enumerate(dataset_with_norm['dataset'].unique()):
    panel = axes_indices[idx]
    subset_dataset = dataset_with_norm[(dataset_with_norm['dataset'] == dataset) & (dataset_with_norm["params_base_lr"] >= 0.0001)]

    for model_type in dataset_with_norm['params_layer_type'].unique():
        linetype = '-' if model_type in solid_line_types else '--'

        # The 10 lowest normalised values of this layer type for this dataset.
        subset_dataset_model_type = subset_dataset[subset_dataset['params_layer_type'] == model_type].sort_values('Normalised RMSE')[:10]

        sns.kdeplot(subset_dataset_model_type['Normalised RMSE'], label=model_type, linestyle=linetype, ax=panel, color=model_type_colours[model_type])

        if model_type not in labels_dict:
            handles, labels = panel.get_legend_handles_labels()
            by_label = dict(zip(labels, handles))

            if model_type in by_label:
                labels_dict[model_type] = by_label[model_type]

    # Format the panel once, after every curve is drawn. set_xlim switches x
    # autoscaling off, so calling it after the first curve would freeze the
    # right-hand limit and could clip the curves plotted afterwards.
    panel.set_xlim(left=0.0)
    panel.set_title(dataset, fontsize=10)
    panel.set_xlabel(xlabel='')
    panel.set_ylabel(ylabel='')

fig.legend(labels_dict.values(), labels_dict.keys(), loc='upper right', bbox_to_anchor=(1.22, 0.98), frameon=False)

fig.supxlabel('RMSE/90th percentile')
fig.supylabel('Density')
fig.tight_layout()
