# MBC (Rose) effect on NMI data

## CSV preprocessing and method selection

In [35]:
import pandas
from copy import deepcopy
import numpy
import math
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [36]:
numpy.seterr(all='raise')
numpy.geterr()

{'divide': 'raise', 'over': 'raise', 'under': 'raise', 'invalid': 'raise'}

In [37]:
ssym_df = pandas.read_csv("ssym_analysis.csv")
for _ in ssym_df.columns:
    if str(_).startswith("runtime"):
        ssym_df.drop(_, axis=1, inplace=True)

In [38]:
print(ssym_df.shape)
list(ssym_df.columns)

(684, 132)


['uid',
 'direction',
 'uid2',
 'ACDC-NN',
 'ACDC-NN-Seq',
 'Cartddg',
 'DDG',
 'DDGun',
 'DDGun3D',
 'DUET',
 'Dynamut',
 'Dynamut2',
 'Evo',
 'FoldX',
 'I-Mutant3.0',
 'I-Mutant3.0-Seq',
 'INPS-Seq',
 'INPS3D',
 'KORPM',
 'MAESTRO',
 'MUpro',
 'PoPMuSiC',
 'PremPS',
 'SAAFEC-SEQ',
 'SDM',
 'ThermoNet',
 'ankh',
 'cartesian_ddg',
 'cluster',
 'ddG',
 'delta_kdh',
 'delta_vol',
 'esm1v_1',
 'esm1v_2',
 'esm1v_3',
 'esm1v_4',
 'esm1v_5',
 'esm1v_mean',
 'esm1v_median',
 'esm2_150M',
 'esm2_15B_half',
 'esm2_3B',
 'esm2_650M',
 'esmif_multimer',
 'korpm',
 'mCSM',
 'mif',
 'mifst',
 'mpnn_10_00',
 'mpnn_20_00',
 'mpnn_30_00',
 'msa_1',
 'msa_2',
 'msa_3',
 'msa_4',
 'msa_5',
 'msa_transformer_mean',
 'msa_transformer_median',
 'mutcomputex',
 'pll_esmif_multimer',
 'rel_ASA',
 'stability-oracle',
 'tranception',
 'tranception_weights',
 'code',
 'chain',
 'wild_type',
 'position',
 'mutation',
 'offset_up',
 'uniprot_seq',
 'reduced_msa_file',
 'full_msa_file',
 'MUT',
 'Dyna2',
 'PopMs'

### Selected methods

In [39]:
methods = [
 'ACDC-NN',
 'ACDC-NN-Seq',
 'Cartddg',
 'DDGun3D',
 'DUET',
 'Dynamut',
 'Dynamut2',
 'Evo',
 'FoldX',
 'I-Mutant3.0',
 'I-Mutant3.0-Seq',
 'INPS-Seq',
 'INPS3D',
 'KORPM',
 'MAESTRO',
 'MUpro',
 'PoPMuSiC',
 'PremPS',
 'SAAFEC-SEQ',
 'SDM',
 'ThermoNet',
 'ankh',
 'cartesian_ddg',
 'esm1v_1',
 'esm1v_2',
 'esm1v_3',
 'esm1v_4',
 'esm1v_5',
 'esm1v_mean',
 'esm1v_median',
 'esm2_150M',
 'esm2_15B_half',
 'esm2_3B',
 'esm2_650M',
 'esmif_multimer',
 'mCSM',
 'mif',
 'mifst',
 'mpnn_10_00',
 'mpnn_20_00',
 'mpnn_30_00',
 'msa_transformer_mean',
 'msa_transformer_median',
 'mutcomputex',
 'stability-oracle',
 'tranception',
 'tranception_weights',
]

In [40]:
method_types = {
 # 'uid',
 # 'direction',
 # 'uid2',
 'ACDC-NN': 'transfer',
 'ACDC-NN-Seq': 'transfer',
 'Cartddg': 'biophysical',
 #'DDG',
 #'DDGun',
 'DDGun3D':'untrained',
 'DUET':'supervised',
 'Dynamut':'other',
 'Dynamut2':'other',
 'Evo': 'biophysical',
 'FoldX': 'biophysical',
 'I-Mutant3.0':'supervised',
 'I-Mutant3.0-Seq':'supervised',
 'INPS-Seq':'supervised',
 'INPS3D':'supervised',
 'KORPM': 'biophysical',
 'MAESTRO':'supervised',
 'MUpro':'supervised',
 'PoPMuSiC':'potential',
 'PremPS':'supervised',
 'SAAFEC-SEQ':'supervised',
 'SDM':'potential',
 'ThermoNet':'supervised',
 'ankh':'seq.PLM',
 'cartesian_ddg':'biophysical',
 'esm1v_1':'seq.PLM',
 'esm1v_2':'seq.PLM',
 'esm1v_3':'seq.PLM',
 'esm1v_4':'seq.PLM',
 'esm1v_5':'seq.PLM',
 'esm1v_mean':'seq.PLM',
 'esm1v_median':'seq.PLM',
 'esm2_150M':'seq.PLM',
 'esm2_15B_half':'seq.PLM',
 'esm2_3B':'seq.PLM',
 'esm2_650M':'seq.PLM',
 'esmif_multimer':'struct.PLM',
 'mCSM':'supervised',
 'mif':'struct.PLM',
 'mifst':'struct.PLM',
 'mpnn_10_00':'struct.PLM',
 'mpnn_20_00':'struct.PLM',
 'mpnn_30_00':'struct.PLM',
 'msa_transformer_mean':'seq.PLM',
 'msa_transformer_median':'seq.PLM',
 'mutcomputex':'struct.PLM',
 'stability-oracle':'transfer',
 'tranception':'seq. PLM',
 'tranception_weights':'seq. PLM',
}

In [41]:
labelz=pandas.read_csv('labelz.csv',index_col='Method')
labelz

Unnamed: 0_level_0,type,mass_balance
Method,Unnamed: 1_level_1,Unnamed: 2_level_1
ACDC-NN,transfer,y
ACDC-NN-Seq,transfer,y
Cartddg,biophysical,y
DDGun3D,untrained,y
DUET,supervised,y
Dynamut,other,y
Dynamut2,other,y
Evo,biophysical,n
FoldX,biophysical,n
I-Mutant3.0,supervised,y


In [42]:
ssym_df[methods].isna().sum()

ACDC-NN                   0
ACDC-NN-Seq               0
Cartddg                   0
DDGun3D                   0
DUET                      0
Dynamut                   0
Dynamut2                  0
Evo                       0
FoldX                     0
I-Mutant3.0               2
I-Mutant3.0-Seq           2
INPS-Seq                  0
INPS3D                    0
KORPM                     0
MAESTRO                   0
MUpro                     0
PoPMuSiC                  0
PremPS                    0
SAAFEC-SEQ                0
SDM                       0
ThermoNet                 6
ankh                      0
cartesian_ddg             0
esm1v_1                   0
esm1v_2                   0
esm1v_3                   0
esm1v_4                   0
esm1v_5                   0
esm1v_mean                0
esm1v_median              0
esm2_150M                 0
esm2_15B_half             0
esm2_3B                   0
esm2_650M                 0
esmif_multimer            0
mCSM                

In [43]:
select_df = ssym_df[
    ["uid", "direction", "uid2", "DDG", "chain","wild_type", "position", "mutation"] + methods
].dropna()
# select_df.dropna(inplace=True)
select_df

Unnamed: 0,uid,direction,uid2,DDG,chain,wild_type,position,mutation,ACDC-NN,ACDC-NN-Seq,...,mifst,mpnn_10_00,mpnn_20_00,mpnn_30_00,msa_transformer_mean,msa_transformer_median,mutcomputex,stability-oracle,tranception,tranception_weights
0,1AMQ_191F,dir,1AMQ_180F,-1.6,A,C,180,F,0.155387,0.000000,...,-15.833597,-4.753813,-5.319994,-4.230467,-11.351413,-11.507070,-0.072907,-0.365,-0.030819,-0.029796
1,1AMQ_191F,inv,1AMQ_180F,1.6,A,C,180,F,-0.179437,0.000000,...,11.865127,4.274536,5.143872,4.086548,11.351413,11.507070,0.010589,0.110,0.030819,0.029796
2,1AMQ_191S,dir,1AMQ_180S,-1.9,A,C,180,S,-0.310174,0.000000,...,-7.009702,-0.511659,-2.171657,-0.905479,-5.473701,-6.148736,0.152491,-0.009,-0.015818,-0.015382
3,1AMQ_191S,inv,1AMQ_180S,1.9,A,C,180,S,0.150363,0.000000,...,7.094869,-0.156843,1.406749,0.149925,5.473701,6.148736,-0.109291,0.108,0.015818,0.015382
4,1AMQ_191W,dir,1AMQ_180W,-3.9,A,C,180,W,0.175478,0.000235,...,-16.822426,-4.945947,-5.441107,-4.408648,-13.321842,-13.401335,-0.072507,-0.616,-0.035233,-0.037848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
679,5PTI_35G,inv,5PTI_35G,5.0,A,Y,35,G,2.394038,3.046156,...,1.420484,-5.640223,-5.485600,-5.337170,10.376691,10.705666,-0.999981,-1.087,0.091949,0.097295
680,5PTI_43G,dir,5PTI_43G,-5.7,A,N,43,G,-1.785572,-1.548846,...,-13.269845,-5.804597,-5.387895,-5.760532,-9.403118,-9.475676,-0.957911,-0.303,-0.055134,-0.062423
681,5PTI_43G,inv,5PTI_43G,5.7,A,N,43,G,1.418397,1.558716,...,10.233284,5.155628,5.263233,4.256405,9.403118,9.475676,-0.999996,-0.070,0.055134,0.062423
682,5PTI_45A,dir,5PTI_45A,-6.9,A,F,45,A,-3.439780,-3.219905,...,-12.405282,-5.616822,-5.456846,-5.437554,-12.561387,-13.084740,-0.915575,-0.849,-0.100102,-0.089689


In [44]:
train_rez = pandas.DataFrame()
for _ in ['pearson','spearman']:
    train_rez['plain_' + _ ]= select_df[methods].corrwith(select_df['DDG'],method=_)
train_rez['plain_RMSE']= [ math.sqrt(mean_squared_error(select_df['DDG'],select_df[_m])) for _m in methods ]
#train_rez
train_rez.sort_values('plain_pearson',ascending=False)
#train_rez.sort_values('plain_RMSE',ascending=True)

Unnamed: 0,plain_pearson,plain_spearman,plain_RMSE
PremPS,0.847589,0.834067,1.046586
KORPM,0.696388,0.725962,1.338914
ACDC-NN,0.69389,0.626436,1.393952
ACDC-NN-Seq,0.686184,0.572426,1.389011
mpnn_30_00,0.669151,0.709083,2.279188
mpnn_20_00,0.657558,0.693754,2.420931
DDGun3D,0.649913,0.548728,1.40775
msa_transformer_mean,0.643642,0.634182,5.3422
msa_transformer_median,0.643297,0.631633,5.376427
mifst,0.64059,0.631589,4.822427


### S461

In [45]:
s461_df = pandas.read_csv("s461_relabeled.csv")
for _ in s461_df.columns:
    if str(_).startswith("runtime"):
        s461_df.drop(_, axis=1, inplace=True)
s461_df.shape

(461, 168)

In [46]:
#'rel_ASA_dir'.rsplit('_',maxsplit=1)
#list(s461_df.columns)

In [47]:
s461_dir= [ _.rsplit('_', maxsplit=1)[0] for _ in s461_df.columns if _.endswith('dir')]
s461_dir

['PDB',
 'MUT_D',
 'ddG',
 'KORPM',
 'Cartddg',
 'FoldXD',
 'Evo',
 'Dynamut2',
 'PoPMuSiC',
 'DDGunD',
 'TNetD',
 'ACDCNND',
 'DDG_checked',
 'MAESTRO',
 'FoldX',
 'PremPS',
 'Dynamut',
 'mCSM',
 'SDM',
 'DUET',
 'I-Mutant3.0',
 'I-Mutant3.0-Seq',
 'MUpro',
 'SAAFEC-SEQ',
 'DDGun3D',
 'DDGun',
 'ACDC-NN-Seq',
 'ACDC-NN',
 'INPS-Seq',
 'INPS3D',
 'PopMusic',
 'SOL_ACC',
 'SEC_STR',
 'ThermoNet',
 'msa_1',
 'msa_2',
 'msa_3',
 'msa_4',
 'msa_5',
 'msa_transformer_median',
 'msa_transformer_mean',
 'esmif_multimer',
 'pll_esmif_multimer',
 'esm2_3B',
 'esm2_15B_half',
 'mpnn_10_00',
 'mpnn_20_00',
 'mpnn_30_00',
 'mif',
 'mifst',
 'esm2_150M',
 'esm2_650M',
 'esm1v_1',
 'esm1v_2',
 'esm1v_3',
 'esm1v_4',
 'esm1v_5',
 'esm1v_median',
 'esm1v_mean',
 'tranception',
 'mutcomputex',
 'tranception_weights',
 'korpm_unk',
 'ankh',
 'stability-oracle',
 'cartesian_ddg',
 'on_interface',
 'rel_ASA',
 'delta_kdh',
 'delta_vol',
 'delta_chg',
 'to_proline',
 'to_glycine',
 'to_alanine',
 'from_pro

In [48]:
assert not [ _ for _ in methods if _ not in s461_dir ]

In [49]:
s461_df.head()

Unnamed: 0,uid,uid2,PDB_dir,MUT_D_dir,ddG_dir,KORPM_dir,Cartddg_dir,FoldXD_dir,Evo_dir,Dynamut2_dir,...,K1566_pslm_rfa_2_dir,K1566_pslm_rfa_3_dir,K1566_pslm_rfa_4_dir,K1566_pslm_rfa_5_dir,K1566_pslm_rfa_6_dir,K1566_pslm_rfa_7_dir,K1566_pslm_rfa_8_dir,K1566_pslm_rfa_9_dir,K1566_pslm_rfa_10_dir,K1566_pslm_rfa_11_dir
0,1A0F_11A,1A0F_11A,1A0F,SA11A,-1.8,0.138,1.45,0.48,-0.37,0.545,...,-1.029995,-0.848069,-0.68482,-0.920688,-0.923177,-0.885503,-0.820124,-0.820124,-0.820124,-0.81938
1,1BA3_461D,1BA3_457D,1BA3,HA461D,-1.745,-1.065,-4.47,-0.86,-0.77,0.16,...,-0.783987,-0.923162,-1.061914,-1.157944,-1.173859,-1.132453,-1.122098,-1.122098,-1.122098,-1.154978
2,1BA3_489D,1BA3_485D,1BA3,HA489D,0.287,0.27,0.64,-0.13,0.96,-0.193,...,0.242109,0.125597,0.349911,0.696465,0.708697,0.696808,0.725284,0.725284,0.725284,0.710356
3,1BA3_489K,1BA3_485K,1BA3,HA489K,-0.287,0.398,0.86,0.19,1.12,0.19,...,0.753621,0.66756,1.037881,1.354656,1.390795,1.36418,1.402421,1.402421,1.402421,1.401294
4,1BA3_489M,1BA3_485M,1BA3,HA489M,-0.263,-0.211,-1.08,0.3,1.42,-0.138,...,-0.005739,0.458574,0.606183,0.925346,0.951895,0.958617,0.979611,0.979611,0.979611,0.950914


In [50]:
#s461_df[['KORPM_dir','korpm_dir']].corr()

In [51]:
#select_461_df = s461_df[    ["uid", "direction", "uid2", "DDG", "wild_type", "position", "mutation"] + methods]
#select_461_df.head() , select_461_df.shape

# fit ddMBC/Rose

In [52]:
aa_codes = [
    str.strip(_)
    for _ in "A, R, N, D, C, E, Q, G, H, I, L, K, M, F, P, S, T, W, Y, V".split(",")
]
fit_df = deepcopy(select_df)
#for _ in aa_codes:    fit_df[_] = 0
fit_df.head()

Unnamed: 0,uid,direction,uid2,DDG,chain,wild_type,position,mutation,ACDC-NN,ACDC-NN-Seq,...,mifst,mpnn_10_00,mpnn_20_00,mpnn_30_00,msa_transformer_mean,msa_transformer_median,mutcomputex,stability-oracle,tranception,tranception_weights
0,1AMQ_191F,dir,1AMQ_180F,-1.6,A,C,180,F,0.155387,0.0,...,-15.833597,-4.753813,-5.319994,-4.230467,-11.351413,-11.50707,-0.072907,-0.365,-0.030819,-0.029796
1,1AMQ_191F,inv,1AMQ_180F,1.6,A,C,180,F,-0.179437,0.0,...,11.865127,4.274536,5.143872,4.086548,11.351413,11.50707,0.010589,0.11,0.030819,0.029796
2,1AMQ_191S,dir,1AMQ_180S,-1.9,A,C,180,S,-0.310174,0.0,...,-7.009702,-0.511659,-2.171657,-0.905479,-5.473701,-6.148736,0.152491,-0.009,-0.015818,-0.015382
3,1AMQ_191S,inv,1AMQ_180S,1.9,A,C,180,S,0.150363,0.0,...,7.094869,-0.156843,1.406749,0.149925,5.473701,6.148736,-0.109291,0.108,0.015818,0.015382
4,1AMQ_191W,dir,1AMQ_180W,-3.9,A,C,180,W,0.175478,0.000235,...,-16.822426,-4.945947,-5.441107,-4.408648,-13.321842,-13.401335,-0.072507,-0.616,-0.035233,-0.037848


In [53]:
def encode_mutation(x):
    match x["direction"]:
        case "dir":
            x[x["wild_type"]] -= 1
            x[x["mutation"]] += 1
            return x
        case "inv":
            x[x["wild_type"]] += 1
            x[x["mutation"]] -= 1
            return x

In [54]:
#fit_df = fit_df.apply(encode_mutation, axis=1)
#fit_df

In [55]:
rose_df=pandas.read_csv('rose1985.csv',index_col='Parameter')
rose_df

Unnamed: 0_level_0,Rose1985
Parameter,Unnamed: 1_level_1
A,86.6
C,132.3
D,97.8
E,113.9
F,194.1
G,62.9
H,155.8
I,158.0
K,115.5
L,164.1


In [56]:
# esmif_df.head()
def rose_delta(x):
    _delta = rose_df["Rose1985"][x["wild_type"]] - rose_df["Rose1985"][x["mutation"]]
    x["rose_delta"] = _delta if x["direction"] == "dir" else -_delta
    return x

In [57]:
fit_df = fit_df.apply(rose_delta, 
               axis=1
              )
fit_df

Unnamed: 0,uid,direction,uid2,DDG,chain,wild_type,position,mutation,ACDC-NN,ACDC-NN-Seq,...,mpnn_10_00,mpnn_20_00,mpnn_30_00,msa_transformer_mean,msa_transformer_median,mutcomputex,stability-oracle,tranception,tranception_weights,rose_delta
0,1AMQ_191F,dir,1AMQ_180F,-1.6,A,C,180,F,0.155387,0.000000,...,-4.753813,-5.319994,-4.230467,-11.351413,-11.507070,-0.072907,-0.365,-0.030819,-0.029796,-61.8
1,1AMQ_191F,inv,1AMQ_180F,1.6,A,C,180,F,-0.179437,0.000000,...,4.274536,5.143872,4.086548,11.351413,11.507070,0.010589,0.110,0.030819,0.029796,61.8
2,1AMQ_191S,dir,1AMQ_180S,-1.9,A,C,180,S,-0.310174,0.000000,...,-0.511659,-2.171657,-0.905479,-5.473701,-6.148736,0.152491,-0.009,-0.015818,-0.015382,46.7
3,1AMQ_191S,inv,1AMQ_180S,1.9,A,C,180,S,0.150363,0.000000,...,-0.156843,1.406749,0.149925,5.473701,6.148736,-0.109291,0.108,0.015818,0.015382,-46.7
4,1AMQ_191W,dir,1AMQ_180W,-3.9,A,C,180,W,0.175478,0.000235,...,-4.945947,-5.441107,-4.408648,-13.321842,-13.401335,-0.072507,-0.616,-0.035233,-0.037848,-92.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
679,5PTI_35G,inv,5PTI_35G,5.0,A,Y,35,G,2.394038,3.046156,...,-5.640223,-5.485600,-5.337170,10.376691,10.705666,-0.999981,-1.087,0.091949,0.097295,-114.8
680,5PTI_43G,dir,5PTI_43G,-5.7,A,N,43,G,-1.785572,-1.548846,...,-5.804597,-5.387895,-5.760532,-9.403118,-9.475676,-0.957911,-0.303,-0.055134,-0.062423,40.4
681,5PTI_43G,inv,5PTI_43G,5.7,A,N,43,G,1.418397,1.558716,...,5.155628,5.263233,4.256405,9.403118,9.475676,-0.999996,-0.070,0.055134,0.062423,-40.4
682,5PTI_45A,dir,5PTI_45A,-6.9,A,F,45,A,-3.439780,-3.219905,...,-5.616822,-5.456846,-5.437554,-12.561387,-13.084740,-0.915575,-0.849,-0.100102,-0.089689,107.5


### predict train

In [58]:
#_mods = methods #["cartesian_ddg"]

mods = {}
X_d = {}
train_score = pandas.DataFrame(index=methods)
for _ in methods:
    X_d[_] = fit_df[[_,'rose_delta']].to_numpy()
    y = fit_df["DDG"].to_numpy()
    mods[_] = LinearRegression(fit_intercept=False).fit(X_d[_], y)
    #mods_no_q[_] = LinearRegression(fit_intercept=False).fit(X_d[_], y) 
    #print(        _, mods[_].intercept_, math.sqrt(mods[_].score(X_d[_], y)), models[_].coef_,    )

# ricorda: il problema è _X 
train_score['Fit_score'] = { _ :  math.sqrt(mods[_].score(X_d[_], y)) for _ in mods.keys()}
#train_score['no_Q'] = { _ :  math.sqrt(mods_no_q[_].score(X_d[_], y)) for _ in mods_no_q.keys()}
train_score.sort_values('Fit_score', ascending=False)

Unnamed: 0,Fit_score
PremPS,0.848402
mpnn_30_00,0.707576
KORPM,0.699536
ACDC-NN,0.696509
mpnn_20_00,0.696192
msa_transformer_mean,0.690314
ACDC-NN-Seq,0.689721
msa_transformer_median,0.689638
mifst,0.686558
DDGun3D,0.66229


In [59]:
train_pred=pandas.DataFrame()
for _ in methods:
    train_pred[_] = mods[_].predict(X_d[_])

for _ in ['pearson','spearman']:
    train_rez['ddmbc_rose_' +  _ ] = train_pred.corrwith(select_df['DDG'],method=_)

train_rez['ddmbc_rose_RMSE']= [ math.sqrt(mean_squared_error(select_df['DDG'],train_pred[_m])) for _m in methods]

train_rez.sort_values('plain_pearson',ascending=False)


Unnamed: 0,plain_pearson,plain_spearman,plain_RMSE,ddmbc_rose_pearson,ddmbc_rose_spearman,ddmbc_rose_RMSE
PremPS,0.847589,0.834067,1.046586,0.416583,0.506293,0.980401
KORPM,0.696388,0.725962,1.338914,0.429484,0.50326,1.323487
ACDC-NN,0.69389,0.626436,1.393952,0.329454,0.415291,1.328953
ACDC-NN-Seq,0.686184,0.572426,1.389011,0.337368,0.426674,1.341041
mpnn_30_00,0.669151,0.709083,2.279188,0.470028,0.527657,1.308746
mpnn_20_00,0.657558,0.693754,2.420931,0.441569,0.508794,1.329522
DDGun3D,0.649913,0.548728,1.40775,0.346451,0.399053,1.387662
msa_transformer_mean,0.643642,0.634182,5.3422,0.4406,0.483173,1.339995
msa_transformer_median,0.643297,0.631633,5.376427,0.440168,0.485668,1.341187
mifst,0.64059,0.631589,4.822427,0.416866,0.470027,1.346597


In [60]:
mods['mutcomputex'].coef_

array([ 0.96109457, -0.01455722])

### crossvalidation

In [61]:
# cv
_cv = pandas.DataFrame(index=["CV_mean", "CV_std"])
for _ in methods:
    fit_rsq = cross_val_score(
        mods[_],
        X_d[_],
        y,
        cv=RepeatedStratifiedKFold(random_state=2411,
            n_splits=5,
            n_repeats=10,
            # shuffle=True,
        ).split(X_d[_], y > 0),
    )
    # fit_rsq
    try:
        fit_rsq_no_nan =  [ _score for _score in fit_rsq if _score > 0.0] #fit_rsq[~numpy.isnan(fit_rsq)]
        _r = numpy.sqrt(fit_rsq_no_nan)
        _cv[_] = [_r.mean(), _r.std()]
    except FloatingPointError:
        print(_, fit_rsq_no_nan)
    

In [62]:
train_score = pandas.concat(
    [
        train_score["Fit_score"],
        train_rez["ddmbc_rose_RMSE"],
        _cv.T,
    ],
    axis=1,
)
train_score.sort_values("ddmbc_rose_RMSE")

Unnamed: 0,Fit_score,ddmbc_rose_RMSE,CV_mean,CV_std
PremPS,0.848402,0.980401,0.844373,0.025981
mpnn_30_00,0.707576,1.308746,0.698803,0.043831
KORPM,0.699536,1.323487,0.691018,0.04361
ACDC-NN,0.696509,1.328953,0.6853,0.05728
mpnn_20_00,0.696192,1.329522,0.68582,0.051165
msa_transformer_mean,0.690314,1.339995,0.677203,0.062658
ACDC-NN-Seq,0.689721,1.341041,0.678593,0.059415
msa_transformer_median,0.689638,1.341187,0.67637,0.063117
mifst,0.686558,1.346597,0.674867,0.058753
DDGun3D,0.66229,1.387662,0.649932,0.065831


### fit parameters

In [63]:
params_df=pandas.DataFrame(index=['Delta','ddmbc_Rose'])
for _ in methods:
    params_df[_]= mods[_].coef_
params_df.T

Unnamed: 0,Delta,ddmbc_Rose
ACDC-NN,1.560615,0.003236
ACDC-NN-Seq,1.474562,0.003752
Cartddg,0.234815,-0.007416
DDGun3D,1.18088,0.007607
DUET,0.358874,-0.011749
Dynamut,0.801455,-0.005366
Dynamut2,0.316419,-0.013225
Evo,0.800364,-0.001088
FoldX,0.36903,-0.012697
I-Mutant3.0,0.314322,-0.01282


## Test on 461

In [64]:
select_461_df = deepcopy(
    s461_df[
        ["uid", "code", "wild_type", "position", "mutation", "ddG_dir", "ddG_inv"]
        + ["_".join([_m, "dir"]) for _m in methods]
        # ['_'.join([_m,_]) for _m in methods for _ in ['dir','inv'] ]
    ]
)
select_461_df['direction']='dir'
#for _ in aa_codes:    select_461_df[_]=0
test_461_df=select_461_df.apply(rose_delta,axis=1)
test_461_df

Unnamed: 0,uid,code,wild_type,position,mutation,ddG_dir,ddG_inv,ACDC-NN_dir,ACDC-NN-Seq_dir,Cartddg_dir,...,mpnn_20_00_dir,mpnn_30_00_dir,msa_transformer_mean_dir,msa_transformer_median_dir,mutcomputex_dir,stability-oracle_dir,tranception_dir,tranception_weights_dir,direction,rose_delta
0,1A0F_11A,1A0F,S,11,A,-1.800,1.800,0.225406,0.041723,1.45,...,-2.593541,-3.121864,-2.473579,-2.307158,-0.906813,-0.103,-0.018765,-0.018609,dir,-1.0
1,1BA3_461D,1BA3,H,457,D,-1.745,1.745,-0.230352,-0.447964,-4.47,...,-3.647010,-2.449762,-5.515070,-5.353736,-0.499039,-0.950,-0.007958,-0.008126,dir,58.0
2,1BA3_489D,1BA3,H,485,D,0.287,-0.287,-0.040138,-0.000271,0.64,...,1.273386,1.350624,0.514193,0.590200,0.029220,-0.040,0.003863,0.003940,dir,58.0
3,1BA3_489K,1BA3,H,485,K,-0.287,0.287,0.201055,0.006583,0.86,...,2.620539,2.906657,3.111199,3.013958,0.121793,0.025,0.003370,0.003423,dir,40.3
4,1BA3_489M,1BA3,H,485,M,-0.263,0.263,0.230963,0.079617,-1.08,...,-0.604758,-0.753108,-0.890877,-0.846117,-0.010539,-0.065,-0.002460,-0.002355,dir,-17.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456,4HE7_19G,4HE7,A,19,G,0.060,-0.060,-0.728433,-0.914432,-2.20,...,-1.105547,-1.041088,1.028863,1.028863,-0.037434,-0.155,0.021470,0.006001,dir,23.7
457,4HE7_19K,4HE7,A,19,K,-0.460,0.460,-0.345831,0.026103,0.59,...,0.482251,1.065889,0.065515,0.065515,0.039233,-0.046,0.023775,-0.000700,dir,-28.9
458,5JXB_329G,5JXB,D,25,G,-1.440,1.440,-0.134107,-0.000839,-4.72,...,-0.347571,-0.981272,-8.707702,-8.724204,-0.177725,0.026,-0.008806,-0.007908,dir,34.9
459,5JXB_329P,5JXB,D,25,P,-1.440,1.440,0.134216,0.000536,-8.00,...,-3.173122,-3.122738,-13.558142,-13.668578,-0.171720,-0.164,-0.017972,-0.017719,dir,4.9


### predict test

In [65]:
for _ in methods:
    print( sum(test_461_df[_ + "_dir"].notna()), _ )     
test_461_df.dropna(inplace=True, ignore_index=True)
test_461_df.shape

461 ACDC-NN
461 ACDC-NN-Seq
461 Cartddg
461 DDGun3D
461 DUET
461 Dynamut
461 Dynamut2
461 Evo
461 FoldX
461 I-Mutant3.0
461 I-Mutant3.0-Seq
461 INPS-Seq
461 INPS3D
461 KORPM
461 MAESTRO
461 MUpro
461 PoPMuSiC
461 PremPS
461 SAAFEC-SEQ
461 SDM
461 ThermoNet
461 ankh
461 cartesian_ddg
461 esm1v_1
461 esm1v_2
461 esm1v_3
461 esm1v_4
461 esm1v_5
461 esm1v_mean
461 esm1v_median
461 esm2_150M
461 esm2_15B_half
461 esm2_3B
461 esm2_650M
461 esmif_multimer
461 mCSM
461 mif
461 mifst
461 mpnn_10_00
461 mpnn_20_00
461 mpnn_30_00
461 msa_transformer_mean
461 msa_transformer_median
461 mutcomputex
460 stability-oracle
461 tranception
461 tranception_weights


(460, 56)

In [66]:
plain_perf=deepcopy(test_461_df[[_om + '_dir' for _om in methods ]+['rose_delta']]).rename({ _+'_dir' : _ for _ in methods}, axis=1)
plain_perf
#plain_perf.corrwith(test_461_df['ddG_dir'],method='pearson')

Unnamed: 0,ACDC-NN,ACDC-NN-Seq,Cartddg,DDGun3D,DUET,Dynamut,Dynamut2,Evo,FoldX,I-Mutant3.0,...,mpnn_10_00,mpnn_20_00,mpnn_30_00,msa_transformer_mean,msa_transformer_median,mutcomputex,stability-oracle,tranception,tranception_weights,rose_delta
0,0.225406,0.041723,1.45,0.5,-0.466,0.545,0.545,-0.37,0.551703,-0.90,...,-0.958551,-2.593541,-3.121864,-2.473579,-2.307158,-0.906813,-0.103,-0.018765,-0.018609,-1.0
1,-0.230352,-0.447964,-4.47,-1.2,-1.757,0.160,0.160,-0.77,-0.998639,-1.07,...,-4.696467,-3.647010,-2.449762,-5.515070,-5.353736,-0.499039,-0.950,-0.007958,-0.008126,58.0
2,-0.040138,-0.000271,0.64,-0.1,-0.252,-0.193,-0.193,0.96,-0.119835,0.12,...,1.392470,1.273386,1.350624,0.514193,0.590200,0.029220,-0.040,0.003863,0.003940,58.0
3,0.201055,0.006583,0.86,-0.1,0.358,0.190,0.190,1.12,0.131499,-0.08,...,2.524363,2.620539,2.906657,3.111199,3.013958,0.121793,0.025,0.003370,0.003423,40.3
4,0.230963,0.079617,-1.08,0.0,0.287,-0.138,-0.138,1.42,0.277350,0.24,...,-0.910601,-0.604758,-0.753108,-0.890877,-0.846117,-0.010539,-0.065,-0.002460,-0.002355,-17.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455,-0.728433,-0.914432,-2.20,-0.4,-0.021,-0.133,-0.133,-0.71,0.031963,-0.90,...,-1.571091,-1.105547,-1.041088,1.028863,1.028863,-0.037434,-0.155,0.021470,0.006001,23.7
456,-0.345831,0.026103,0.59,-0.3,-0.518,-0.119,-0.119,0.18,0.296079,-0.46,...,0.625590,0.482251,1.065889,0.065515,0.065515,0.039233,-0.046,0.023775,-0.000700,-28.9
457,-0.134107,-0.000839,-4.72,-0.3,0.254,0.694,0.694,-0.53,-0.465159,-0.24,...,-1.060423,-0.347571,-0.981272,-8.707702,-8.724204,-0.177725,0.026,-0.008806,-0.007908,34.9
458,0.134216,0.000536,-8.00,-0.2,0.362,0.390,0.390,0.42,-0.778776,-0.06,...,-3.308088,-3.173122,-3.122738,-13.558142,-13.668578,-0.171720,-0.164,-0.017972,-0.017719,4.9


## Summary table

In [70]:
# get pythia data
pythia_nmi=pandas.read_csv('pythia_nmi.csv',index_col=[0])
pythia_nmi

Unnamed: 0,plain_pearson,test_pearson,delta_pearson,plain_RMSE,test_RMSE,delta_RMSE,type,mass_balance
pythia,0.425031,0.616999,0.191968,7.390213,1.216855,-6.173358,struct.PLM,n


In [71]:
test_pred=pandas.DataFrame()
test_rez= pandas.DataFrame(index=methods)
for _ in methods:
    X_t =  plain_perf[[_,'rose_delta']].to_numpy() # test_461_df[[_+'_dir']+ aa_codes].to_numpy()
    test_pred[_] = mods[_].predict(X_t)
    
    #for _ in ['pearson','spearman']:
    for _ in ['pearson']:
        test_rez['test_' +  _ ] = test_pred.corrwith(test_461_df['ddG_dir'],method=_)
        test_rez['plain_' +  _ ] = plain_perf.corrwith(test_461_df['ddG_dir'],method=_)
        test_rez['delta_' +  _ ] = test_rez['test_'+_] - test_rez['plain_'+_]

test_rez['plain_RMSE']= [ math.sqrt(mean_squared_error(test_461_df['ddG_dir'],plain_perf[_m])) for _m in methods]
test_rez['test_RMSE']= [ math.sqrt(mean_squared_error(test_461_df['ddG_dir'],test_pred[_m])) for _m in methods]
test_rez['delta_RMSE'] = test_rez['plain_RMSE'] - test_rez['test_RMSE']

#test_rez['type']= [method_types[_] for _ in test_rez.index]a
test_rez = test_rez.merge(labelz,left_index=True,right_index=True)
test_rez = pandas.concat([pythia_nmi,test_rez])
test_rez

Unnamed: 0,plain_pearson,test_pearson,delta_pearson,plain_RMSE,test_RMSE,delta_RMSE,type,mass_balance
pythia,0.425031,0.616999,0.191968,7.390213,1.216855,-6.173358,struct.PLM,n
ACDC-NN,0.604194,0.602964,-0.00123,1.06523,1.185123,-0.119893,transfer,y
ACDC-NN-Seq,0.570004,0.565298,-0.004706,1.101188,1.218214,-0.117026,transfer,y
Cartddg,0.596848,0.615422,0.018574,3.584026,1.003931,2.580095,biophysical,y
DDGun3D,0.634511,0.628999,-0.005512,1.104507,1.135288,-0.030781,untrained,y
DUET,0.593102,0.530999,-0.062103,1.060548,1.180457,-0.119909,supervised,y
Dynamut,0.504483,0.517453,0.01297,1.267236,1.239396,0.02784,other,y
Dynamut2,0.504483,0.472093,-0.03239,1.267236,1.293028,-0.025792,other,y
Evo,0.463202,0.468295,0.005093,1.27358,1.261779,0.011801,biophysical,n
FoldX,0.221282,0.398879,0.177597,2.229436,1.336322,0.893114,biophysical,n


### Rank by Correlation increase 

In [72]:
test_rez.sort_values( 'delta_pearson',ascending=False) #'test_RMSE')

Unnamed: 0,plain_pearson,test_pearson,delta_pearson,plain_RMSE,test_RMSE,delta_RMSE,type,mass_balance
pythia,0.425031,0.616999,0.191968,7.390213,1.216855,-6.173358,struct.PLM,n
mpnn_10_00,0.339341,0.521116,0.181775,2.514741,1.09244,1.422301,struct.PLM,n
FoldX,0.221282,0.398879,0.177597,2.229436,1.336322,0.893114,biophysical,n
mutcomputex,0.326215,0.500248,0.174033,1.387862,1.156582,0.23128,struct.PLM,n
tranception_weights,0.23486,0.407317,0.172457,1.675691,1.224142,0.451549,seq. PLM,n
tranception,0.236225,0.406511,0.170286,1.675464,1.224315,0.451149,seq. PLM,n
msa_transformer_mean,0.301652,0.467062,0.16541,5.835305,1.131992,4.703312,seq.PLM,n
msa_transformer_median,0.288604,0.453755,0.165152,5.95355,1.145814,4.807736,seq.PLM,n
mpnn_20_00,0.394938,0.550892,0.155954,2.362278,1.06756,1.294718,struct.PLM,n
mifst,0.364169,0.519735,0.155566,4.996811,1.095391,3.90142,struct.PLM,n


### Rank improved methods by Correlation

In [73]:
improved = test_rez['delta_pearson'] > 0.0

In [74]:
test_rez.sort_values( 'test_pearson',ascending=False) #'test_RMSE')

Unnamed: 0,plain_pearson,test_pearson,delta_pearson,plain_RMSE,test_RMSE,delta_RMSE,type,mass_balance
stability-oracle,0.618485,0.630406,0.011921,1.187882,1.014463,0.173419,transfer,y
DDGun3D,0.634511,0.628999,-0.005512,1.104507,1.135288,-0.030781,untrained,y
PremPS,0.631689,0.624506,-0.007184,1.027894,1.010993,0.016901,supervised,y
pythia,0.425031,0.616999,0.191968,7.390213,1.216855,-6.173358,struct.PLM,n
Cartddg,0.596848,0.615422,0.018574,3.584026,1.003931,2.580095,biophysical,y
ACDC-NN,0.604194,0.602964,-0.00123,1.06523,1.185123,-0.119893,transfer,y
cartesian_ddg,0.590134,0.600486,0.010352,4.651469,1.056593,3.594877,biophysical,y
KORPM,0.568753,0.594017,0.025264,1.207931,1.139443,0.068488,biophysical,y
INPS3D,0.614779,0.593947,-0.020832,1.012016,1.04399,-0.031975,supervised,y
MAESTRO,0.630013,0.574807,-0.055206,1.042845,1.11991,-0.077065,supervised,y


### Rank improved methods by RMSE

In [75]:
#improved_rmse =  test_rez["delta_RMSE"] > 0.0
improved_rmse_df= test_rez[ improved  ].sort_values("test_RMSE", ascending=True)
improved_rmse_df

Unnamed: 0,plain_pearson,test_pearson,delta_pearson,plain_RMSE,test_RMSE,delta_RMSE,type,mass_balance
Cartddg,0.596848,0.615422,0.018574,3.584026,1.003931,2.580095,biophysical,y
stability-oracle,0.618485,0.630406,0.011921,1.187882,1.014463,0.173419,transfer,y
cartesian_ddg,0.590134,0.600486,0.010352,4.651469,1.056593,3.594877,biophysical,y
ankh,0.43402,0.542109,0.108089,5.589172,1.060666,4.528506,seq.PLM,n
mpnn_20_00,0.394938,0.550892,0.155954,2.362278,1.06756,1.294718,struct.PLM,n
mif,0.448843,0.568634,0.119792,4.344658,1.071738,3.27292,struct.PLM,n
mpnn_30_00,0.399828,0.545135,0.145307,2.343878,1.080932,1.262946,struct.PLM,n
esm2_650M,0.423673,0.529914,0.106241,4.399972,1.085577,3.314396,seq.PLM,n
mpnn_10_00,0.339341,0.521116,0.181775,2.514741,1.09244,1.422301,struct.PLM,n
mifst,0.364169,0.519735,0.155566,4.996811,1.095391,3.90142,struct.PLM,n


### Rank all methods by correlation

In [42]:
#test_all.columns

In [43]:
#all_rez= pandas.DataFrame()
#for _ in ['pearson','spearman']:
#    all_rez[_] = test_all.corrwith(test_461_df['ddG_dir'],method=_)      

In [44]:
#def ndx_me(x):
#    if str(x).endswith('_Rose'):
#        return('ddMBC-corrected')
#    return method_types[x]

#all_rez['type']=[ ndx_me(_) for _ in list(all_rez.index)]


In [45]:
#all_rez[all_rez['pearson'] > 0.45 ].sort_values('pearson',ascending=False)

In [46]:
#all_rez.sort_values('spearman',ascending=False).to_csv('all_rez.csv')

In [47]:
#improved_rmse_d#.to_csv('improved.csv')

In [80]:
test_rez.sort_values('delta_pearson',ascending=False).to_csv("test_rez.csv")