In [6]:
import sys, os
sys.path.insert(0, os.path.abspath('..'))

In [2]:
prefix = 'v003'
import pandas as pd
import os

def aggregate_pred_dataframe(files):
    dfs = [pd.read_csv(f) for f in files]
    final_df = dfs[0].rename(columns={'pred': 'pred_0'})
    for i in range(1, len(dfs)):
        final_df[f'pred_{i}'] = dfs[i]['pred']
    return final_df.reset_index()

def get_preds_with_prefix(prefix, seed_start=0, seed_end=20):
    csv_files = [file for file in os.listdir('preds') if file.endswith('.csv')]
    csv_files.sort()
    len(csv_files)
    preds = []
    for seed in range(seed_start, seed_end):
        filtered = [file for file in csv_files if file.startswith(f'{prefix}_{seed}')]
        if filtered:
            preds.append((filtered[-1], filtered[-2]))
    return preds

def aggregate_preds(preds):
    df_valid = aggregate_pred_dataframe([f'preds/{i[0]}' for i in preds])
    df_test = aggregate_pred_dataframe([f'preds/{i[1]}' for i in preds])
    
    return df_valid, df_test

df_valid, df_test = aggregate_preds(get_preds_with_prefix('v003', 0, 20))

In [3]:
df_valid

Unnamed: 0,index,protein_index,residue_index,target,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,...,pred_10,pred_11,pred_12,pred_13,pred_14,pred_15,pred_16,pred_17,pred_18,pred_19
0,0,0,0,0,-11.82329,-17.31692,-14.00329,-14.27861,-17.69821,-14.16254,...,-14.90719,-15.90019,-15.05886,-13.62240,-17.32550,-14.80833,-13.21783,-15.16427,-15.13983,-12.68752
1,1,0,1,0,-12.89653,-16.55036,-14.23437,-17.36519,-17.57929,-14.18862,...,-10.79592,-16.82462,-16.03453,-12.49164,-15.48220,-11.96990,-14.31015,-13.33229,-11.94632,-12.62137
2,2,0,2,0,-13.06004,-13.55446,-11.17305,-14.10415,-12.92310,-11.08344,...,-6.70985,-12.79982,-11.34736,-7.52952,-11.36707,-7.48322,-9.58314,-13.33872,-11.29903,-8.02471
3,3,0,3,0,-14.42641,-15.41026,-13.66306,-17.28945,-16.58478,-15.67660,...,-12.47893,-17.27709,-14.75489,-9.13222,-18.93512,-16.26838,-15.84194,-16.14868,-10.88020,-12.17729
4,4,0,4,0,-15.64281,-18.14750,-19.49256,-18.04817,-17.38049,-17.93645,...,-15.07900,-16.80610,-18.48649,-16.55154,-20.94458,-18.37667,-20.22941,-18.88769,-14.35786,-15.01604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21515,21515,75,185,0,-15.92689,-14.98976,-15.33705,-12.78338,-13.38295,-12.63793,...,-11.92964,-16.21548,-12.06545,-15.60360,-15.63728,-15.11029,-13.25970,-17.85756,-10.77539,-14.22052
21516,21516,75,186,0,-20.20508,-14.89825,-16.06885,-13.64040,-16.90621,-13.19201,...,-15.21283,-19.94450,-16.82241,-19.93484,-16.96992,-16.43790,-16.20350,-15.91540,-13.64310,-15.86331
21517,21517,75,187,0,-9.50195,-8.18619,-8.48630,-5.57720,-7.81151,-3.33039,...,-11.10430,-9.98139,-10.39178,-10.19168,-11.40344,-9.32041,-7.58048,-7.45633,-6.45964,-7.29434
21518,21518,75,188,0,-17.47092,-15.77904,-14.38061,-12.88785,-12.45789,-14.50656,...,-16.97337,-15.22210,-11.73640,-13.63595,-16.75432,-14.93985,-12.23582,-15.14988,-11.82430,-11.43609


In [9]:
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, precision_score, matthews_corrcoef

def generate_mean_ensemble_metrics(df, threshold=0):
    sum_preds = df[list(filter(lambda a: a.startswith('pred_'), df.columns.tolist()))].mean(axis=1)
    final_prediction = (sum_preds > threshold).astype(int)

    # Sensitivity (Recall)
    sensitivity = recall_score(df['target'], final_prediction)

    # Specificity
    tn, fp, fn, tp = confusion_matrix(df['target'], final_prediction).ravel()
    specificity = tn / (tn + fp)

    # Accuracy
    accuracy = accuracy_score(df['target'], final_prediction)

    # Precision
    precision = precision_score(df['target'], final_prediction)
    mcc = matthews_corrcoef(df['target'], final_prediction)
    return {
        "sensitivity": sensitivity,
        "specificity": specificity,
        "accuracy": accuracy,
        "precision": precision,
        "mcc": mcc,
    }

generate_mean_ensemble_metrics(df_test, threshold=0)

{'sensitivity': 0.5167464114832536,
 'specificity': 0.991985364578796,
 'accuracy': 0.9673715512968776,
 'precision': 0.7788461538461539,
 'mcc': 0.6189003703171713}

# Ensemble all of 10 models

MCC: 0.6477

In [10]:
from lib.utils import round_dict
import numpy as np
def generate_mean_ensemble_metrics_auto_threshold(df_valid, df_test=None, start=-3, end=1, step=0.1):
    thresholds = np.arange(start, end, step)
    mcc_values = []

    for threshold in thresholds:
        metrics = generate_mean_ensemble_metrics(df_valid, threshold=threshold)
        mcc_values.append(metrics['mcc'])
    
    best_threshold = thresholds[np.argmax(mcc_values)]

    test_metrics = generate_mean_ensemble_metrics(df_test, threshold=best_threshold) if df_test is not None else {}

    return round_dict({
        'best_threshold': best_threshold,
        'valid_mcc': np.max(mcc_values),
        **test_metrics
    }, 4)

generate_mean_ensemble_metrics_auto_threshold(df_valid, df_test)

{'best_threshold': -1.0,
 'valid_mcc': 0.6311,
 'sensitivity': 0.5646,
 'specificity': 0.9894,
 'accuracy': 0.9674,
 'precision': 0.7437,
 'mcc': 0.6316}

# Ensemble with 5 model of higher valid_mcc (Current method)

MCC: 0.6327

In [11]:
valid_mccs = []

for i in range(20):
    df_single_valid = df_valid[['target', f'pred_{i}']]
    df_single_test = df_test[['target', f'pred_{i}']]

    valid_mccs.append(generate_mean_ensemble_metrics_auto_threshold(df_single_valid)['valid_mcc'])


valid_mccs = np.array(valid_mccs)
valid_mccs

array([0.5869, 0.6068, 0.5925, 0.5921, 0.6022, 0.6017, 0.5919, 0.5925,
       0.6027, 0.5988, 0.5903, 0.593 , 0.5884, 0.5823, 0.5999, 0.5896,
       0.5952, 0.5956, 0.5896, 0.5883])

In [12]:
top_5_indices = np.argsort(valid_mccs)[-10:]
top_5_indices

array([ 7, 11, 16, 17,  9, 14,  5,  4,  8,  1])

In [13]:
df_highmcc_valid = df_valid[['target'] + ['pred_' + str(i) for i in top_5_indices]]
df_highmcc_test = df_test[['target'] + ['pred_' + str(i) for i in top_5_indices]]


generate_mean_ensemble_metrics_auto_threshold(df_highmcc_valid, df_highmcc_test)

{'best_threshold': -0.6,
 'valid_mcc': 0.6303,
 'sensitivity': 0.5423,
 'specificity': 0.9908,
 'accuracy': 0.9675,
 'precision': 0.7623,
 'mcc': 0.6271}

# Ensemble with 5 model set of maximal prediction difference

Does not work: this leads to lower mcc

In [8]:
from itertools import combinations
from scipy.spatial.distance import cosine
from scipy.special import expit

pairwise_diff = {}
thresholds = [generate_mean_ensemble_metrics_auto_threshold(df_valid[['target', f'pred_{i}']])['best_threshold'] for i in range(10)]
for i in range(10):
    for j in range(i+1, 10):
        pairwise_diff[(i, j)] = ((np.array(df_valid[f'pred_{i}']) > thresholds[i]) != (np.array(df_valid[f'pred_{j}']) > thresholds[j])).sum()
        pairwise_diff[(j, i)] = pairwise_diff[(i, j)]

In [9]:
pairwise_diff

{(0, 1): 251,
 (1, 0): 251,
 (0, 2): 218,
 (2, 0): 218,
 (0, 3): 232,
 (3, 0): 232,
 (0, 4): 250,
 (4, 0): 250,
 (0, 5): 242,
 (5, 0): 242,
 (0, 6): 206,
 (6, 0): 206,
 (0, 7): 227,
 (7, 0): 227,
 (0, 8): 240,
 (8, 0): 240,
 (0, 9): 244,
 (9, 0): 244,
 (1, 2): 205,
 (2, 1): 205,
 (1, 3): 221,
 (3, 1): 221,
 (1, 4): 279,
 (4, 1): 279,
 (1, 5): 241,
 (5, 1): 241,
 (1, 6): 243,
 (6, 1): 243,
 (1, 7): 236,
 (7, 1): 236,
 (1, 8): 199,
 (8, 1): 199,
 (1, 9): 241,
 (9, 1): 241,
 (2, 3): 208,
 (3, 2): 208,
 (2, 4): 252,
 (4, 2): 252,
 (2, 5): 232,
 (5, 2): 232,
 (2, 6): 214,
 (6, 2): 214,
 (2, 7): 233,
 (7, 2): 233,
 (2, 8): 202,
 (8, 2): 202,
 (2, 9): 248,
 (9, 2): 248,
 (3, 4): 270,
 (4, 3): 270,
 (3, 5): 246,
 (5, 3): 246,
 (3, 6): 228,
 (6, 3): 228,
 (3, 7): 227,
 (7, 3): 227,
 (3, 8): 212,
 (8, 3): 212,
 (3, 9): 246,
 (9, 3): 246,
 (4, 5): 276,
 (5, 4): 276,
 (4, 6): 262,
 (6, 4): 262,
 (4, 7): 281,
 (7, 4): 281,
 (4, 8): 272,
 (8, 4): 272,
 (4, 9): 306,
 (9, 4): 306,
 (5, 6): 238,
 (6, 5

In [10]:
from itertools import combinations
from scipy.special import comb as C
results = []
for comb in combinations(range(10), 5):
    diff_sum = 0
    for i in range(5):
        for j in range(i+1, 5):
            diff_sum += pairwise_diff[(comb[i], comb[j])]
    diff_sum /= C(5, 2)
    
    df_comb_valid = df_valid[['target'] + ['pred_' + str(i) for i in comb]]
    df_comb_test = df_test[['target'] + ['pred_' + str(i) for i in comb]]

    res = generate_mean_ensemble_metrics_auto_threshold(df_comb_valid, df_comb_test, start=-10, end=10, step=1)
    results.append((comb, diff_sum, res['valid_mcc'], res['mcc']))


results.sort(key=lambda x: x[1])


In [11]:
results[:10]

[((1, 2, 3, 6, 8), 215.0, 0.6692, 0.6274),
 ((0, 2, 3, 6, 8), 217.8, 0.6746, 0.6362),
 ((1, 2, 3, 7, 8), 218.4, 0.665, 0.6348),
 ((0, 1, 2, 3, 8), 218.8, 0.6686, 0.645),
 ((0, 1, 2, 6, 8), 219.6, 0.6741, 0.6405),
 ((1, 2, 3, 5, 8), 221.0, 0.6787, 0.6406),
 ((0, 1, 2, 3, 6), 222.6, 0.6758, 0.6324),
 ((2, 3, 6, 7, 8), 223.2, 0.6722, 0.6302),
 ((1, 2, 3, 8, 9), 223.4, 0.6678, 0.6367),
 ((1, 2, 5, 6, 8), 223.6, 0.6728, 0.6267)]

In [12]:
results[-10:]

[((3, 4, 6, 7, 9), 260.0, 0.6714, 0.629),
 ((1, 4, 5, 6, 9), 260.4, 0.6681, 0.6415),
 ((4, 6, 7, 8, 9), 261.2, 0.6761, 0.6285),
 ((1, 4, 6, 7, 9), 262.8, 0.6722, 0.6285),
 ((0, 4, 5, 7, 9), 263.6, 0.6806, 0.6395),
 ((2, 4, 5, 7, 9), 263.8, 0.6733, 0.6356),
 ((3, 4, 5, 7, 9), 266.2, 0.6719, 0.6325),
 ((1, 4, 5, 7, 9), 267.0, 0.6777, 0.6359),
 ((4, 5, 6, 7, 9), 267.4, 0.6802, 0.6355),
 ((4, 5, 7, 8, 9), 268.2, 0.6799, 0.6386)]

In [13]:
result_df = pd.DataFrame(results, columns=['combination', 'difference', 'valid_mcc', 'mcc'])

In [14]:
result_df

Unnamed: 0,combination,difference,valid_mcc,mcc
0,"(1, 2, 3, 6, 8)",215.0,0.6692,0.6274
1,"(0, 2, 3, 6, 8)",217.8,0.6746,0.6362
2,"(1, 2, 3, 7, 8)",218.4,0.6650,0.6348
3,"(0, 1, 2, 3, 8)",218.8,0.6686,0.6450
4,"(0, 1, 2, 6, 8)",219.6,0.6741,0.6405
...,...,...,...,...
247,"(2, 4, 5, 7, 9)",263.8,0.6733,0.6356
248,"(3, 4, 5, 7, 9)",266.2,0.6719,0.6325
249,"(1, 4, 5, 7, 9)",267.0,0.6777,0.6359
250,"(4, 5, 6, 7, 9)",267.4,0.6802,0.6355


In [18]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(result_df['difference'].values.reshape(-1,1), result_df['mcc'].values)

In [19]:
reg.coef_

array([3.51267265e-06])

In [20]:
reg.score(result_df['valid_mcc'].values.reshape(-1,1), result_df['mcc'].values)

-0.017854048910808773