In [2]:
import numpy as np
import pandas as pd
import pickle
import json
from modelling_utilities import post_process_epoch_metrics



In [3]:
def _has_values(fold_values):
    """Return True when ``fold_values`` holds at least one score."""
    # len() works for lists, tuples and NumPy arrays alike; plain truthiness raises
    # on multi-element arrays and wrongly skips a one-element array containing 0.
    try:
        return len(fold_values) > 0
    except TypeError:
        # Unsized values (e.g. None) keep the previous truthiness behaviour.
        return bool(fold_values)


def _rounded_stat(stat_fn, values, ndigits=4):
    """Apply ``stat_fn`` to ``values`` and return a rounded built-in ``float``."""
    # Convert before rounding: a rounded NumPy float32 stays float32 and can be
    # formatted as a long decimal such as 0.44209998846054077 instead of 0.4421.
    return round(float(stat_fn(values)), ndigits)


def detailed_reportout(metrics):
    """Summarise scores per fold and per metric for every data slice.

    Parameters
    ----------
    metrics : dict
        Nested mapping ``{data_slice: {metric: {fold_name: scores}}}`` where
        ``scores`` is a sequence of numbers (list, tuple or NumPy array).
        Folds whose ``scores`` are empty are skipped.

    Returns
    -------
    dict
        Six entries, in this order: ``avg_score_per_fold``,
        ``avg_score_per_metric``, ``max_score_per_metric``,
        ``max_score_per_fold``, ``min_score_per_fold``, ``min_score_per_metric``.
        ``*_per_fold`` values map ``data_slice -> metric -> fold_name -> float``;
        ``*_per_metric`` values map ``data_slice -> metric -> float`` computed over
        the scores of all folds pooled together. Every number is a built-in
        ``float`` rounded to 4 decimals. A metric with no non-empty fold appears
        as an empty dict in the per-fold entries and is absent from the
        per-metric entries.
    """
    stat_fns = {'avg': np.mean, 'max': np.max, 'min': np.min}
    per_fold = {stat: {} for stat in stat_fns}
    per_metric = {stat: {} for stat in stat_fns}

    for data_slice, metrics_and_folds in metrics.items():
        # Scores of all folds pooled per metric, used for the per-metric statistics.
        pooled_scores = {}
        for stat in stat_fns:
            per_fold[stat][data_slice] = {}

        for metric, fold_report in metrics_and_folds.items():
            for stat in stat_fns:
                per_fold[stat][data_slice][metric] = {}

            for fold_name, fold_values in fold_report.items():
                if not _has_values(fold_values):
                    continue
                for stat, stat_fn in stat_fns.items():
                    per_fold[stat][data_slice][metric][fold_name] = _rounded_stat(stat_fn, fold_values)
                pooled_scores.setdefault(metric, []).extend(fold_values)

        for stat, stat_fn in stat_fns.items():
            per_metric[stat][data_slice] = {
                metric: _rounded_stat(stat_fn, scores)
                for metric, scores in pooled_scores.items()
            }

    # Key order is kept as before because callers iterate over this dict when printing.
    output_dict = {'avg_score_per_fold': per_fold['avg'],
                   'avg_score_per_metric': per_metric['avg'],
                   'max_score_per_metric': per_metric['max'],
                   'max_score_per_fold': per_fold['max'],
                   'min_score_per_fold': per_fold['min'],
                   'min_score_per_metric': per_metric['min']}

    return output_dict
    


In [4]:
def performance_summary(results_path):
    """Print every summary produced by ``detailed_reportout`` for one results file.

    Parameters
    ----------
    results_path : str or path-like
        Location of a pickle file whose content is passed unchanged to
        ``detailed_reportout``.

    Returns
    -------
    None
        The report is written to stdout: one section per collection type, and
        within it one block per data slice listing each metric with its value(s),
        followed by a separator line.
    """
    # NOTE: pickle.load can execute arbitrary code — only open result files you trust.
    with open(results_path, "rb") as results_file:
        loaded_results = pickle.load(results_file)

    report = detailed_reportout(loaded_results)

    for collection_type, slices in report.items():
        print('Collection Type: ' + collection_type)

        for dataslice, metrics_collect in slices.items():
            print("Dataslice: ", dataslice)

            for metric_name, values in metrics_collect.items():
                print(f"Metric Summary Type: {metric_name} -- {values}")

            print('_____________________________________')



****

## CNN Experiments

Experiment 1) One Hot Encoding w no AA w no DropOut

In [44]:
# Experiment 1: print the fold/metric summaries stored for the one-hot CNN run (no AA, no dropout).
# NOTE(review): the recorded output starts with an exclude_high_q_values_MinMax_one_hot slice while
# the file name mentions exclude_all_q_values_StandardScaler — confirm the pickle holds both slices.
performance_summary("model_results/ProteinCNN_One_Hot_Encoding_No_AA_No_DropOut_model_Training_Slice_exclude_all_q_values_StandardScaler_one_hot.pkl")

Collection Type: avg_score_per_fold
Dataslice:  exclude_high_q_values_MinMax_one_hot
Metric Summary Type: train_loss -- {'0_fold': 0.2166, '1_fold': 0.2382, '2_fold': 0.2229, '3_fold': 0.2245}
Metric Summary Type: val_loss -- {'0_fold': 0.2324, '1_fold': 0.2424, '2_fold': 0.2365, '3_fold': 0.2266}
Metric Summary Type: train_r2 -- {'0_fold': 0.5699, '1_fold': 0.5251, '2_fold': 0.5577, '3_fold': 0.5547}
Metric Summary Type: val_r2 -- {'0_fold': 0.5372, '1_fold': 0.5236, '2_fold': 0.5285, '3_fold': 0.5472}
Metric Summary Type: train_mae -- {'0_fold': 0.3617, '1_fold': 0.3824, '2_fold': 0.3682, '3_fold': 0.3676}
Metric Summary Type: val_mae -- {'0_fold': 0.378, '1_fold': 0.3893, '2_fold': 0.3825, '3_fold': 0.3753}
Metric Summary Type: train_rmse -- {'0_fold': 0.4492, '1_fold': 0.4707, '2_fold': 0.4547, '3_fold': 0.456}
Metric Summary Type: val_rmse -- {'0_fold': 0.4712, '1_fold': 0.4797, '2_fold': 0.4737, '3_fold': 0.4641}
_____________________________________
Dataslice:  exclude_all_q_val

Experiment 2) Embedding w no AA w no DropOut

In [45]:
# Experiment 2: print the fold/metric summaries stored for the embedding CNN run (no AA, no dropout).
# NOTE(review): the recorded output starts with an exclude_high_q_values_MinMax_embeddings slice while
# the file name mentions exclude_all_q_values_StandardScaler — confirm the pickle holds both slices.
performance_summary("./model_results/ProteinCNN_Embedding_No_AA_No_Dropout_model_Training_Slice_exclude_all_q_values_StandardScaler_embeddings.pkl")

Collection Type: avg_score_per_fold
Dataslice:  exclude_high_q_values_MinMax_embeddings
Metric Summary Type: train_loss -- {'0_fold': 0.2344, '1_fold': 0.243, '2_fold': 0.2135, '3_fold': 0.2165}
Metric Summary Type: val_loss -- {'0_fold': 0.2525, '1_fold': 0.2392, '2_fold': 0.223, '3_fold': 0.2078}
Metric Summary Type: train_r2 -- {'0_fold': 0.5346, '1_fold': 0.5154, '2_fold': 0.5764, '3_fold': 0.5707}
Metric Summary Type: val_r2 -- {'0_fold': 0.4972, '1_fold': 0.5298, '2_fold': 0.5554, '3_fold': 0.5848}
Metric Summary Type: train_mae -- {'0_fold': 0.3784, '1_fold': 0.3793, '2_fold': 0.3529, '3_fold': 0.3571}
Metric Summary Type: val_mae -- {'0_fold': 0.3987, '1_fold': 0.3883, '2_fold': 0.3646, '3_fold': 0.3588}
Metric Summary Type: train_rmse -- {'0_fold': 0.4661, '1_fold': 0.4705, '2_fold': 0.442, '3_fold': 0.4481}
Metric Summary Type: val_rmse -- {'0_fold': 0.4906, '1_fold': 0.4766, '2_fold': 0.4576, '3_fold': 0.4456}
_____________________________________
Dataslice:  exclude_all_q_v

Experiment 3) Embedding w AA w no DropOut

In [46]:
# Experiment 3: print the fold/metric summaries stored for the embedding CNN run (with AA, no dropout).
# NOTE(review): the recorded output starts with an exclude_high_q_values_MinMax_embeddings slice while
# the file name mentions exclude_all_q_values_StandardScaler — confirm the pickle holds both slices.
performance_summary("./model_results/ProteinCNN_Embedding_w_AA_No_Dropout_model_Training_Slice_exclude_all_q_values_StandardScaler_embeddings.pkl")

Collection Type: avg_score_per_fold
Dataslice:  exclude_high_q_values_MinMax_embeddings
Metric Summary Type: train_loss -- {'0_fold': 0.2179, '1_fold': 0.2367, '2_fold': 0.2662, '3_fold': 0.2216}
Metric Summary Type: val_loss -- {'0_fold': 0.2181, '1_fold': 0.2373, '2_fold': 0.276, '3_fold': 0.2355}
Metric Summary Type: train_r2 -- {'0_fold': 0.5674, '1_fold': 0.5281, '2_fold': 0.4717, '3_fold': 0.5605}
Metric Summary Type: val_r2 -- {'0_fold': 0.5658, '1_fold': 0.5337, '2_fold': 0.4497, '3_fold': 0.5295}
Metric Summary Type: train_mae -- {'0_fold': 0.3581, '1_fold': 0.3779, '2_fold': 0.3999, '3_fold': 0.3637}
Metric Summary Type: val_mae -- {'0_fold': 0.3696, '1_fold': 0.3876, '2_fold': 0.424, '3_fold': 0.3789}
Metric Summary Type: train_rmse -- {'0_fold': 0.4475, '1_fold': 0.4671, '2_fold': 0.4923, '3_fold': 0.4546}
Metric Summary Type: val_rmse -- {'0_fold': 0.4569, '1_fold': 0.4767, '2_fold': 0.5169, '3_fold': 0.472}
_____________________________________
Dataslice:  exclude_all_q_v

Experiment 4) Embedding w AA w DropOut - MinMax

In [48]:
# Experiment 4: print the fold/metric summaries stored for the embedding CNN run (with AA, with dropout).
# NOTE(review): the heading for this experiment says MinMax but the file name says StandardScaler;
# the recorded output lists an exclude_high_q_values_MinMax_embeddings slice first — confirm which
# scaler/slice this experiment is meant to report.
performance_summary("./model_results/ProteinCNN_Embedding_w_AA_w_Dropout_model_Training_Slice_exclude_all_q_values_StandardScaler_embeddings.pkl")

Collection Type: avg_score_per_fold
Dataslice:  exclude_high_q_values_MinMax_embeddings
Metric Summary Type: train_loss -- {'0_fold': 0.301, '1_fold': 0.3272, '2_fold': 0.2584, '3_fold': 0.2879}
Metric Summary Type: val_loss -- {'0_fold': 0.2628, '1_fold': 0.2559, '2_fold': 0.2414, '3_fold': 0.2573}
Metric Summary Type: train_r2 -- {'0_fold': 0.4024, '1_fold': 0.3475, '2_fold': 0.4873, '3_fold': 0.4291}
Metric Summary Type: val_r2 -- {'0_fold': 0.4768, '1_fold': 0.497, '2_fold': 0.5185, '3_fold': 0.4859}
Metric Summary Type: train_mae -- {'0_fold': 0.4312, '1_fold': 0.4465, '2_fold': 0.4019, '3_fold': 0.4231}
Metric Summary Type: val_mae -- {'0_fold': 0.4134, '1_fold': 0.4061, '2_fold': 0.3923, '3_fold': 0.4085}
Metric Summary Type: train_rmse -- {'0_fold': 0.5318, '1_fold': 0.5525, '2_fold': 0.4955, '3_fold': 0.5215}
Metric Summary Type: val_rmse -- {'0_fold': 0.5046, '1_fold': 0.4969, '2_fold': 0.4837, '3_fold': 0.4979}
_____________________________________
Dataslice:  exclude_all_q_

Experiment 5) Embedding Max Overfitting

In [None]:
# Experiment 5: print the fold/metric summaries stored for the dropout + batch-norm embedding CNN run.
# NOTE(review): this cell has no execution count and no recorded output — confirm the results
# file exists and re-run the cell before relying on this experiment.
performance_summary("model_results/ProteinCNN_Embedding_w_AA_w_Dropout_w_BatchNorm_model_Training_Slice_exclude_high_q_values_MinMax_embeddings.pkl")

Experiment 6) OneHot Encoding w AA w Dropout

In [51]:
# Experiment 6: print the fold/metric summaries stored for the one-hot CNN run (with AA, with dropout).
# NOTE(review): the recorded output contains only 0_fold and 1_fold, whereas the earlier CNN
# experiments report four folds — confirm this run finished all folds.
performance_summary("./model_results/ProteinCNN_OneHot_w_AA_w_Dropout_model_Training_Slice_exclude_high_q_values_MinMax_one_hot.pkl")

Collection Type: avg_score_per_fold
Dataslice:  exclude_high_q_values_MinMax_one_hot
Metric Summary Type: train_loss -- {'0_fold': 0.3033, '1_fold': 0.3133}
Metric Summary Type: val_loss -- {'0_fold': 0.2694, '1_fold': 0.2715}
Metric Summary Type: train_r2 -- {'0_fold': 0.3947, '1_fold': 0.3803}
Metric Summary Type: val_r2 -- {'0_fold': 0.4671, '1_fold': 0.4582}
Metric Summary Type: train_mae -- {'0_fold': 0.4382, '1_fold': 0.4461}
Metric Summary Type: val_mae -- {'0_fold': 0.419, '1_fold': 0.4241}
Metric Summary Type: train_rmse -- {'0_fold': 0.5371, '1_fold': 0.5453}
Metric Summary Type: val_rmse -- {'0_fold': 0.5107, '1_fold': 0.5141}
_____________________________________
Collection Type: avg_score_per_metric
Dataslice:  exclude_high_q_values_MinMax_one_hot
Metric Summary Type: train_loss -- 0.3083
Metric Summary Type: val_loss -- 0.2704
Metric Summary Type: train_r2 -- 0.3875
Metric Summary Type: val_r2 -- 0.4626
Metric Summary Type: train_mae -- 0.44209998846054077
Metric Summary 

****
## RNN Experiments

Experiment 7) RNN_Embedding_w_AA_w_Dropout_Bidirectional

In [54]:
# Experiment 7: print the fold/metric summaries stored for the bidirectional RNN run (embedding, AA, dropout).
# The recorded per-fold val_r2 values are all close to 0 and three folds are reported.
performance_summary("./model_results/ProteinRNN_Embedding_w_AA_w_Dropout_Bidirectional_model_Training_Slice_exclude_high_q_values_MinMax_embeddings.pkl")

Collection Type: avg_score_per_fold
Dataslice:  exclude_high_q_values_MinMax_embeddings
Metric Summary Type: train_loss -- {'0_fold': 0.5333, '1_fold': 0.5332, '2_fold': 0.5514}
Metric Summary Type: val_loss -- {'0_fold': 0.5033, '1_fold': 0.5096, '2_fold': 0.4962}
Metric Summary Type: train_r2 -- {'0_fold': -0.0623, '1_fold': -0.0659, '2_fold': -0.0862}
Metric Summary Type: val_r2 -- {'0_fold': 0.0051, '1_fold': -0.0004, '2_fold': -0.0032}
Metric Summary Type: train_mae -- {'0_fold': 0.6345, '1_fold': 0.6333, '2_fold': 0.6412}
Metric Summary Type: val_mae -- {'0_fold': 0.6216, '1_fold': 0.6304, '2_fold': 0.6201}
Metric Summary Type: train_rmse -- {'0_fold': 0.7299, '1_fold': 0.7298, '2_fold': 0.7419}
Metric Summary Type: val_rmse -- {'0_fold': 0.7094, '1_fold': 0.7138, '2_fold': 0.7044}
_____________________________________
Collection Type: avg_score_per_metric
Dataslice:  exclude_high_q_values_MinMax_embeddings
Metric Summary Type: train_loss -- 0.5393
Metric Summary Type: val_loss -

Experiment 8) Embedding_w_AA_w_DropOut_No_Bidirectional

In [56]:
# Experiment 8: print the fold/metric summaries stored for the RNN run whose file name omits "Bidirectional".
# The recorded per-fold val_r2 values are all close to 0 and three folds are reported.
performance_summary("./model_results/ProteinRNN_Embedding_w_AA_w_Dropout_model_Training_Slice_exclude_high_q_values_MinMax_embeddings.pkl")

Collection Type: avg_score_per_fold
Dataslice:  exclude_high_q_values_MinMax_embeddings
Metric Summary Type: train_loss -- {'0_fold': 0.5319, '1_fold': 0.5359, '2_fold': 0.5391}
Metric Summary Type: val_loss -- {'0_fold': 0.5022, '1_fold': 0.5031, '2_fold': 0.4939}
Metric Summary Type: train_r2 -- {'0_fold': -0.0594, '1_fold': -0.0712, '2_fold': -0.062}
Metric Summary Type: val_r2 -- {'0_fold': 0.0074, '1_fold': 0.0123, '2_fold': 0.0015}
Metric Summary Type: train_mae -- {'0_fold': 0.6343, '1_fold': 0.6338, '2_fold': 0.6353}
Metric Summary Type: val_mae -- {'0_fold': 0.6198, '1_fold': 0.6311, '2_fold': 0.6202}
Metric Summary Type: train_rmse -- {'0_fold': 0.7286, '1_fold': 0.7314, '2_fold': 0.7335}
Metric Summary Type: val_rmse -- {'0_fold': 0.7086, '1_fold': 0.7093, '2_fold': 0.7027}
_____________________________________
Collection Type: avg_score_per_metric
Dataslice:  exclude_high_q_values_MinMax_embeddings
Metric Summary Type: train_loss -- 0.5356
Metric Summary Type: val_loss -- 0

Experiment 9) GRU Architecture_w_AA_w_DropOut

In [58]:
# Experiment 9: print the fold/metric summaries stored for the GRU run (embedding, AA, dropout).
# NOTE(review): the file name includes "Bidirectional" although the experiment heading does not
# mention it — confirm the intended configuration.
performance_summary("./model_results/ProteinRNN_GRU_Embedding_w_AA_w_Dropout_Bidirectional_model_Training_Slice_exclude_high_q_values_MinMax_embeddings.pkl")

Collection Type: avg_score_per_fold
Dataslice:  exclude_high_q_values_MinMax_embeddings
Metric Summary Type: train_loss -- {'0_fold': 0.5408, '1_fold': 0.5372, '2_fold': 0.5438}
Metric Summary Type: val_loss -- {'0_fold': 0.5032, '1_fold': 0.5058, '2_fold': 0.4945}
Metric Summary Type: train_r2 -- {'0_fold': -0.0772, '1_fold': -0.0737, '2_fold': -0.0713}
Metric Summary Type: val_r2 -- {'0_fold': 0.0053, '1_fold': 0.0071, '2_fold': 0.0003}
Metric Summary Type: train_mae -- {'0_fold': 0.6383, '1_fold': 0.6343, '2_fold': 0.6373}
Metric Summary Type: val_mae -- {'0_fold': 0.6193, '1_fold': 0.628, '2_fold': 0.617}
Metric Summary Type: train_rmse -- {'0_fold': 0.7346, '1_fold': 0.7322, '2_fold': 0.7366}
Metric Summary Type: val_rmse -- {'0_fold': 0.7093, '1_fold': 0.7111, '2_fold': 0.7031}
_____________________________________
Collection Type: avg_score_per_metric
Dataslice:  exclude_high_q_values_MinMax_embeddings
Metric Summary Type: train_loss -- 0.5406
Metric Summary Type: val_loss -- 0.

Experiment 10) Padding Re-training

In [5]:
# Experiment 10: print the fold/metric summaries stored for the CNN run re-trained with padding
# (embedding, AA, dropout); four folds are reported in the recorded output.
performance_summary("./model_results/ProteinCNN_Embedding_w_AA_w_Dropout_w_PADDING_model_Training_Slice_exclude_high_q_values_MinMax_embeddings.pkl")

Collection Type: avg_score_per_fold
Dataslice:  exclude_high_q_values_MinMax_embeddings
Metric Summary Type: train_loss -- {'0_fold': 0.2568, '1_fold': 0.2682, '2_fold': 0.2432, '3_fold': 0.2475}
Metric Summary Type: val_loss -- {'0_fold': 0.2321, '1_fold': 0.2505, '2_fold': 0.2338, '3_fold': 0.2306}
Metric Summary Type: train_r2 -- {'0_fold': 0.4902, '1_fold': 0.4652, '2_fold': 0.5174, '3_fold': 0.5092}
Metric Summary Type: val_r2 -- {'0_fold': 0.5379, '1_fold': 0.5077, '2_fold': 0.5338, '3_fold': 0.5393}
Metric Summary Type: train_mae -- {'0_fold': 0.3956, '1_fold': 0.4092, '2_fold': 0.3879, '3_fold': 0.3935}
Metric Summary Type: val_mae -- {'0_fold': 0.3847, '1_fold': 0.3954, '2_fold': 0.3828, '3_fold': 0.384}
Metric Summary Type: train_rmse -- {'0_fold': 0.4896, '1_fold': 0.5045, '2_fold': 0.4794, '3_fold': 0.4863}
Metric Summary Type: val_rmse -- {'0_fold': 0.4734, '1_fold': 0.4888, '2_fold': 0.4745, '3_fold': 0.4718}
_____________________________________
Collection Type: avg_scor