In [2]:
import pandas as pd 
import os 
from tqdm import tqdm 

In [52]:
model_df = pd.read_csv("../../benchmarks/raw_score_files/clinical_subs_AUC_ranking.csv")
dms_df = pd.read_csv("../../raw_score_files/clinical_substitutions_scores.csv")

In [53]:
model_df.columns = ["Model_rank","Model_name", "Average_AUC", "Model type"]
model_df = model_df[["Model_rank","Model_name","Model type","Average_AUC"]]

In [54]:
sup_model_df = model_df[model_df["Model type"] == "Supervised"]
unsup_model_df = model_df[model_df["Model type"] == "Unsupervised"]

In [55]:
display(sup_model_df)
display(unsup_model_df)
display(model_df)

Unnamed: 0,Model_rank,Model_name,Model type,Average_AUC
0,1,ClinPred,Supervised,0.981
1,2,MetaRNN,Supervised,0.977
2,3,BayesDel (addAF),Supervised,0.972
3,4,VEST4,Supervised,0.929
4,5,REVEL,Supervised,0.928
5,6,BayesDel (noAF),Supervised,0.925
6,7,VARITY (R),Supervised,0.921
9,10,VARITY (ER),Supervised,0.918
11,12,gMVP,Supervised,0.914
12,13,CADD,Supervised,0.905


Unnamed: 0,Model_rank,Model_name,Model type,Average_AUC
7,8,TranceptEVE,Unsupervised,0.92
8,9,GEMME,Unsupervised,0.919
10,11,EVE,Unsupervised,0.917
15,16,ESM-1b,Unsupervised,0.892
16,17,PROVEAN,Unsupervised,0.886
19,20,SIFT,Unsupervised,0.878
20,21,SIFT4G,Unsupervised,0.877
21,22,MutationAssessor,Unsupervised,0.877
22,23,MutPred,Unsupervised,0.875
23,24,PrimateAI,Unsupervised,0.855


Unnamed: 0,Model_rank,Model_name,Model type,Average_AUC
0,1,ClinPred,Supervised,0.981
1,2,MetaRNN,Supervised,0.977
2,3,BayesDel (addAF),Supervised,0.972
3,4,VEST4,Supervised,0.929
4,5,REVEL,Supervised,0.928
5,6,BayesDel (noAF),Supervised,0.925
6,7,VARITY (R),Supervised,0.921
7,8,TranceptEVE,Unsupervised,0.92
8,9,GEMME,Unsupervised,0.919
9,10,VARITY (ER),Supervised,0.918


In [56]:
def compute_bootstrap_standard_error(df, number_assay_reshuffle=10000):
    """
    Computes the non-parametric bootstrap standard error for the mean estimate of a given performance metric (eg., Spearman, AUC) across DMS assays (ie., the sample standard deviation of the mean across bootstrap samples)
    """
    model_names = df.columns
    mean_performance_across_samples = []
    for sample in tqdm(range(number_assay_reshuffle)):
        mean_performance_across_samples.append(df.sample(frac=1.0, replace=True).mean(axis=0)) #Resample a dataset of the same size (with replacement) then take the sample mean
    mean_performance_across_samples=pd.DataFrame(data=mean_performance_across_samples,columns=model_names)
    # print(mean_performance_across_samples.head())
    return mean_performance_across_samples.std(ddof=1).T #Unbiased estimate with ddof=1

In [57]:
dms_df = dms_df[["protein"] + [column for column in dms_df.columns if column.startswith("AUC")]]
manual_mapping = {"AUC_BayesDel_addAF_score":"BayesDel (addAF)",
                  "AUC_BayesDel_noAF_score": "BayesDel (noAF)",
                  "AUC_VARITY_R_LOO_score":"VARITY (R)",
                  "AUC_VARITY_ER_LOO_score":"VARITY (ER)",
                  "AUC_Polyphen2_HDIV_score": "PolyPhen2 (HDIV)",
                  "AUC_Polyphen2_HVAR_score":"PolyPhen2 (HVAR)"}
dms_df_sup = dms_df[["protein"] + [column for column in dms_df.columns if column != "protein" and column.split("_")[1] in sup_model_df["Model_name"].tolist()] + list(manual_mapping.keys())]
new_cols = [] 
for column in dms_df_sup.columns:
    if column in manual_mapping:
        new_cols.append(manual_mapping[column])
    elif column == "protein":
        new_cols.append(column)
    else:
        new_cols.append(column.split("_")[1])
dms_df_sup.columns = new_cols 
dms_df_unsup = dms_df[["protein"] + [column for column in dms_df.columns if column != "protein" and column.split("_")[1] in unsup_model_df["Model_name"].tolist()]]
dms_df_unsup.columns = ["protein"] + [column.split("_")[1] for column in dms_df_unsup.columns if column != "protein"]
dms_df_sup = dms_df_sup.set_index("protein")
dms_df_unsup = dms_df_unsup.set_index("protein")
sup_std_err = compute_bootstrap_standard_error(dms_df_sup.subtract(dms_df_sup["ClinPred"],axis=0))
unsup_std_err = compute_bootstrap_standard_error(dms_df_unsup.subtract(dms_df_unsup["TranceptEVE"],axis=0))

100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1054.69it/s]
100%|███████████████████████████████████| 10000/10000 [00:08<00:00, 1155.69it/s]


In [58]:
sup_std_err

MetaRNN             0.001316
REVEL               0.003007
ClinPred            0.000000
gMVP                0.003549
VEST4               0.002694
CADD                0.003044
DEOGEN2             0.003583
MPC                 0.004278
DANN                0.004129
FATHMM              0.005645
MutationTaster      0.003696
BayesDel (addAF)    0.001435
BayesDel (noAF)     0.002835
VARITY (R)          0.003205
VARITY (ER)         0.003371
PolyPhen2 (HDIV)    0.003572
PolyPhen2 (HVAR)    0.003348
dtype: float64

In [59]:
unsup_std_err

SIFT4G              0.003005
PrimateAI           0.003874
SIFT                0.003213
PROVEAN             0.002972
MutationAssessor    0.003449
MutPred             0.004916
LIST-S2             0.004376
LRT                 0.004267
EVE                 0.001854
TranceptEVE         0.000000
GEMME               0.002172
dtype: float64

In [60]:
summary_sup = sup_model_df.merge(pd.DataFrame(sup_std_err,columns=["Bootstrap_standard_error_AUC"]).reset_index(names="Model_name"),on="Model_name")

In [61]:
summary_unsup = unsup_model_df.merge(pd.DataFrame(unsup_std_err,columns=["Bootstrap_standard_error_AUC"]).reset_index(names="Model_name"),on="Model_name")

In [62]:
dms_df_sup

Unnamed: 0_level_0,MetaRNN,REVEL,ClinPred,gMVP,VEST4,CADD,DEOGEN2,MPC,DANN,FATHMM,MutationTaster,BayesDel (addAF),BayesDel (noAF),VARITY (R),VARITY (ER),PolyPhen2 (HDIV),PolyPhen2 (HVAR)
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
NP_001291646.4,1.000000,0.992958,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.978873,0.690141,0.985915,1.000000,0.950704,1.000000,1.000000,0.992958,0.992958
NP_001104262.1,0.977118,0.953757,0.987229,0.971903,0.932631,0.932152,,,0.919221,0.930609,0.841156,0.960834,0.934972,,,0.869519,0.902778
NP_004357.3,1.000000,0.975000,1.000000,0.987500,0.962500,0.950000,1.000000,0.912500,0.950000,0.875000,0.600000,1.000000,0.962500,0.950000,0.962500,0.950000,0.925000
NP_004270.2,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.916667,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
NP_055134.2,0.928571,0.392857,0.892857,0.500000,0.607143,0.321429,0.178571,,0.357143,0.071429,0.625000,0.785714,0.428571,0.500000,0.464286,0.357143,0.464286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NP_065069.1,0.500000,0.500000,0.500000,0.500000,0.500000,1.000000,0.500000,0.500000,0.500000,,0.750000,1.000000,1.000000,0.500000,0.500000,0.500000,0.500000
NP_000162.2,1.000000,1.000000,1.000000,,0.990385,0.959936,,,0.817308,0.729167,0.775641,1.000000,0.983974,,,0.846154,0.894231
NP_000479.1,0.957143,0.514706,0.985714,0.970588,0.771429,0.657143,0.647059,0.794118,0.792857,0.647059,0.725000,0.957143,0.721429,0.705882,0.735294,0.661765,0.588235
NP_005586.1,0.875000,0.750000,1.000000,,1.000000,1.000000,0.750000,,0.375000,0.875000,0.500000,1.000000,1.000000,0.500000,0.500000,1.000000,1.000000


In [63]:
dms_df_unsup

Unnamed: 0_level_0,SIFT4G,PrimateAI,SIFT,PROVEAN,MutationAssessor,MutPred,LIST-S2,LRT,EVE,TranceptEVE,GEMME
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
NP_001291646.4,1.000000,0.942446,1.000000,1.000000,1.000000,,0.915493,0.978873,0.914457,0.974337,0.999145
NP_001104262.1,0.707216,0.946254,0.817582,0.907993,,1.000000,0.805396,0.901181,0.932418,0.955407,0.944817
NP_004357.3,0.887500,1.000000,0.925000,0.887500,0.950000,1.000000,0.900000,0.925000,1.000000,1.000000,1.000000
NP_004270.2,1.000000,1.000000,1.000000,1.000000,1.000000,,1.000000,0.833333,1.000000,1.000000,1.000000
NP_055134.2,0.107143,0.607143,0.107143,0.428571,0.428571,0.428571,0.285714,0.750000,0.464286,0.464286,0.500000
...,...,...,...,...,...,...,...,...,...,...,...
NP_065069.1,0.500000,0.500000,0.500000,0.500000,0.500000,0.000000,,0.750000,1.000000,1.000000,1.000000
NP_000162.2,0.980769,0.820513,0.985577,0.977564,1.000000,1.000000,0.926282,0.722756,1.000000,1.000000,1.000000
NP_000479.1,0.717857,0.600000,0.647059,0.529412,0.823529,0.720000,0.692857,0.635714,0.828571,0.721429,0.735714
NP_005586.1,0.687500,0.500000,1.000000,0.750000,1.000000,,1.000000,0.500000,0.750000,0.875000,0.750000


In [64]:
summary_sup = summary_sup.sort_values(by="Average_AUC",ascending=False)
summary_sup["Model_rank"] = list(range(1,len(summary_sup)+1))
summary_sup

Unnamed: 0,Model_rank,Model_name,Model type,Average_AUC,Bootstrap_standard_error_AUC
0,1,ClinPred,Supervised,0.981,0.0
1,2,MetaRNN,Supervised,0.977,0.001316
2,3,BayesDel (addAF),Supervised,0.972,0.001435
3,4,VEST4,Supervised,0.929,0.002694
4,5,REVEL,Supervised,0.928,0.003007
5,6,BayesDel (noAF),Supervised,0.925,0.002835
6,7,VARITY (R),Supervised,0.921,0.003205
7,8,VARITY (ER),Supervised,0.918,0.003371
8,9,gMVP,Supervised,0.914,0.003549
9,10,CADD,Supervised,0.905,0.003044


In [65]:
summary_unsup = summary_unsup.sort_values(by="Average_AUC",ascending=False)
summary_unsup["Model_rank"] = list(range(1,len(summary_unsup)+1))
summary_unsup

Unnamed: 0,Model_rank,Model_name,Model type,Average_AUC,Bootstrap_standard_error_AUC
0,1,TranceptEVE,Unsupervised,0.92,0.0
1,2,GEMME,Unsupervised,0.919,0.002172
2,3,EVE,Unsupervised,0.917,0.001854
3,4,PROVEAN,Unsupervised,0.886,0.002972
4,5,SIFT,Unsupervised,0.878,0.003213
5,6,SIFT4G,Unsupervised,0.877,0.003005
6,7,MutationAssessor,Unsupervised,0.877,0.003449
7,8,MutPred,Unsupervised,0.875,0.004916
8,9,PrimateAI,Unsupervised,0.855,0.003874
9,10,LIST-S2,Unsupervised,0.842,0.004376


In [69]:
dms_df_sup.index.names = ["RefSeq ID"] 
dms_df_unsup.index.names = ["RefSeq ID"]
dms_df_sup.round(3).to_csv("../../benchmarks/clinical_supervised/substitutions/AUC/clinical_substitutions_AUC_DMS_level.csv")
dms_df_unsup.round(3).to_csv("../../benchmarks/clinical_zero_shot/substitutions/AUC/clinical_substitutions_AUC_DMS_level.csv")
summary_sup.round(3).to_csv("../../benchmarks/clinical_supervised/substitutions/AUC/Summary_performance_clinical_substitutions_AUC.csv",index=False)
summary_unsup.round(3).to_csv("../../benchmarks/clinical_zero_shot/substitutions/AUC/Summary_performance_clinical_substitutions_AUC.csv",index=False)

# Getting files for indels 

In [22]:
# Change this to local folder where indel scores are stored (scores available on proteingym.org)
score_folder = "/n/groups/marks/projects/marks_lab_and_oatml/ProteinGym/model_scores/zero_shot_clinical_indels"
ref_df = pd.read_csv("../../reference_files/clinical_indels.csv")

In [23]:
# There are the input dataset csvs for the clinical indels (also available on proteingym.org)
variant_folder = "/n/groups/marks/projects/marks_lab_and_oatml/ProteinGym/Clinical_datasets/indels"
dms_df = []
for file in os.listdir(variant_folder):
    df = pd.read_csv(f"{variant_folder}/{file}")
    df["DMS_id"] = os.path.splitext(file)[0]
    dms_df.append(df)
dms_df = pd.concat(dms_df)
dms_df["DMS_bin_score"].value_counts()

DMS_bin_score
1    1760
0     839
Name: count, dtype: int64

In [24]:
dms_df = dms_df.rename(columns={"mutant":"mutated_sequence"})
dms_df

Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1
0,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE
0,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672
0,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82
0,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55
...,...,...,...
0,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1
0,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1
0,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3
0,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1


In [25]:
column_map = { 
    "HMM":"HMM_score",
    "Progen2":"Progen2_score_wt_ratio",
    "RITA":"RITA_score_wt_ratio",
    "ProtGPT2":"ProtGPT2_score_wt_ratio",
    "Tranception":"avg_score",
    "Provean":"provean_score",
    "Unirep":"Unirep_score_wt_ratio"
}

In [26]:
all_dfs = []
for model_score_folder in tqdm(os.listdir(score_folder)):
    print("-----------------------------------------------------")
    model_dir = f"{score_folder}/{model_score_folder}"
    if all([os.path.isdir(f"{model_dir}/{file}") for file in os.listdir(f"{model_dir}")]):
        print(f"Detected subdirectories in {model_score_folder}, finding scores in each subdirectory")
        for subdir in os.listdir(model_dir):
            print(model_dir, subdir)
            comb_df = []
            for i,row in ref_df.iterrows():
                score_df = pd.read_csv(f"{model_dir}/{subdir}/{row['DMS_id']}.csv")
                score_df["target_seq"] = row["target_seq"] 
                comb_df.append(score_df)
            comb_df = pd.concat(comb_df).drop_duplicates(subset="mutated_sequence")
            dms_df = dms_df.merge(comb_df[["mutated_sequence",column_map[model_score_folder]]],on="mutated_sequence",how="left")
            dms_df = dms_df.rename(columns={column_map[model_score_folder]:model_score_folder + "_" + subdir})
    else:
        print(model_score_folder)
        comb_df = []
        for i,row in ref_df.iterrows():
            score_df = pd.read_csv(f"{model_dir}/{row['DMS_id']}.csv")
            score_df["target_seq"] = row["target_seq"] 
            score_df["dataset"] = row["dataset"] 
            comb_df.append(score_df)
        comb_df = pd.concat(comb_df).drop_duplicates(subset="mutated_sequence")
        dms_df = dms_df.merge(comb_df[["mutated_sequence",column_map[model_score_folder]]],on="mutated_sequence",how="left")
        dms_df = dms_df.rename(columns={column_map[model_score_folder]:model_score_folder})
        print("-----------------------------------------------------")

  0%|                                                     | 0/7 [00:00<?, ?it/s]

-----------------------------------------------------
Provean


Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id,Provean
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1,-17.363
1,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE,-1.896
2,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672,-8.575
3,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82,-0.660
4,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55,-0.938
...,...,...,...,...
2594,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1,-19.412
2595,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1,-0.759
2596,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3,-11.948
2597,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1,-10.928


 14%|██████▍                                      | 1/7 [00:04<00:29,  4.89s/it]

-----------------------------------------------------
-----------------------------------------------------
Detected subdirectories in Tranception, finding scores in each subdirectory
/n/groups/marks/projects/marks_lab_and_oatml/ProteinGym/model_scores/zero_shot_clinical_indels/Tranception Tranception_M_no_retrieval


Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id,Provean,Tranception_Tranception_M_no_retrieval
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1,-17.363,-0.133442
1,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE,-1.896,-0.000314
2,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672,-8.575,-0.050021
3,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82,-0.660,-0.001915
4,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55,-0.938,0.001168
...,...,...,...,...,...
2594,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1,-19.412,-0.060956
2595,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1,-0.759,0.000792
2596,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3,-11.948,0.000075
2597,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1,-10.928,-0.001379


/n/groups/marks/projects/marks_lab_and_oatml/ProteinGym/model_scores/zero_shot_clinical_indels/Tranception Tranception_M


Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id,Provean,Tranception_Tranception_M_no_retrieval,Tranception_Tranception_M
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1,-17.363,-0.133442,-0.049923
1,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE,-1.896,-0.000314,0.001216
2,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672,-8.575,-0.050021,-0.023363
3,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82,-0.660,-0.001915,-0.001803
4,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55,-0.938,0.001168,-0.006240
...,...,...,...,...,...,...
2594,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1,-19.412,-0.060956,-0.025507
2595,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1,-0.759,0.000792,0.000135
2596,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3,-11.948,0.000075,-0.000514
2597,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1,-10.928,-0.001379,-0.001183


/n/groups/marks/projects/marks_lab_and_oatml/ProteinGym/model_scores/zero_shot_clinical_indels/Tranception Tranception_L_no_retrieval


Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id,Provean,Tranception_Tranception_M_no_retrieval,Tranception_Tranception_M,Tranception_Tranception_L_no_retrieval
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1,-17.363,-0.133442,-0.049923,-0.153706
1,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE,-1.896,-0.000314,0.001216,-0.000853
2,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672,-8.575,-0.050021,-0.023363,-0.054832
3,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82,-0.660,-0.001915,-0.001803,-0.001869
4,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55,-0.938,0.001168,-0.006240,0.001027
...,...,...,...,...,...,...,...
2594,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1,-19.412,-0.060956,-0.025507,-0.363128
2595,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1,-0.759,0.000792,0.000135,0.000846
2596,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3,-11.948,0.000075,-0.000514,-0.000123
2597,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1,-10.928,-0.001379,-0.001183,-0.001036


/n/groups/marks/projects/marks_lab_and_oatml/ProteinGym/model_scores/zero_shot_clinical_indels/Tranception Tranception_L


Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id,Provean,Tranception_Tranception_M_no_retrieval,Tranception_Tranception_M,Tranception_Tranception_L_no_retrieval,Tranception_Tranception_L
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1,-17.363,-0.133442,-0.049923,-0.153706,-0.057961
1,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE,-1.896,-0.000314,0.001216,-0.000853,0.001001
2,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672,-8.575,-0.050021,-0.023363,-0.054832,-0.026527
3,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82,-0.660,-0.001915,-0.001803,-0.001869,-0.001739
4,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55,-0.938,0.001168,-0.006240,0.001027,-0.006503
...,...,...,...,...,...,...,...,...
2594,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1,-19.412,-0.060956,-0.025507,-0.363128,-0.144996
2595,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1,-0.759,0.000792,0.000135,0.000846,0.000072
2596,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3,-11.948,0.000075,-0.000514,-0.000123,-0.000587
2597,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1,-10.928,-0.001379,-0.001183,-0.001036,-0.000751


 29%|████████████▊                                | 2/7 [00:20<00:55, 11.16s/it]

-----------------------------------------------------
HMM


Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id,Provean,Tranception_Tranception_M_no_retrieval,Tranception_Tranception_M,Tranception_Tranception_L_no_retrieval,Tranception_Tranception_L,HMM
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1,-17.363,-0.133442,-0.049923,-0.153706,-0.057961,-1.546509
1,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE,-1.896,-0.000314,0.001216,-0.000853,0.001001,2.155273
2,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672,-8.575,-0.050021,-0.023363,-0.054832,-0.026527,-7.489136
3,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82,-0.660,-0.001915,-0.001803,-0.001869,-0.001739,2.226807
4,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55,-0.938,0.001168,-0.006240,0.001027,-0.006503,-7.765015
...,...,...,...,...,...,...,...,...,...
2594,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1,-19.412,-0.060956,-0.025507,-0.363128,-0.144996,0.289551
2595,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1,-0.759,0.000792,0.000135,0.000846,0.000072,-2.996582
2596,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3,-11.948,0.000075,-0.000514,-0.000123,-0.000587,-1.841553
2597,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1,-10.928,-0.001379,-0.001183,-0.001036,-0.000751,-3.540039


 43%|███████████████████▎                         | 3/7 [00:25<00:34,  8.55s/it]

-----------------------------------------------------
-----------------------------------------------------
Detected subdirectories in RITA, finding scores in each subdirectory
/n/groups/marks/projects/marks_lab_and_oatml/ProteinGym/model_scores/zero_shot_clinical_indels/RITA xlarge


Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id,Provean,Tranception_Tranception_M_no_retrieval,Tranception_Tranception_M,Tranception_Tranception_L_no_retrieval,Tranception_Tranception_L,HMM,RITA_xlarge
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1,-17.363,-0.133442,-0.049923,-0.153706,-0.057961,-1.546509,-0.266850
1,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE,-1.896,-0.000314,0.001216,-0.000853,0.001001,2.155273,0.003055
2,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672,-8.575,-0.050021,-0.023363,-0.054832,-0.026527,-7.489136,-0.144808
3,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82,-0.660,-0.001915,-0.001803,-0.001869,-0.001739,2.226807,0.008198
4,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55,-0.938,0.001168,-0.006240,0.001027,-0.006503,-7.765015,-0.007163
...,...,...,...,...,...,...,...,...,...,...
2594,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1,-19.412,-0.060956,-0.025507,-0.363128,-0.144996,0.289551,-0.947698
2595,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1,-0.759,0.000792,0.000135,0.000846,0.000072,-2.996582,0.151447
2596,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3,-11.948,0.000075,-0.000514,-0.000123,-0.000587,-1.841553,0.000644
2597,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1,-10.928,-0.001379,-0.001183,-0.001036,-0.000751,-3.540039,-0.467092


/n/groups/marks/projects/marks_lab_and_oatml/ProteinGym/model_scores/zero_shot_clinical_indels/RITA large


Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id,Provean,Tranception_Tranception_M_no_retrieval,Tranception_Tranception_M,Tranception_Tranception_L_no_retrieval,Tranception_Tranception_L,HMM,RITA_xlarge,RITA_large
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1,-17.363,-0.133442,-0.049923,-0.153706,-0.057961,-1.546509,-0.266850,-0.288258
1,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE,-1.896,-0.000314,0.001216,-0.000853,0.001001,2.155273,0.003055,0.003223
2,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672,-8.575,-0.050021,-0.023363,-0.054832,-0.026527,-7.489136,-0.144808,-0.147896
3,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82,-0.660,-0.001915,-0.001803,-0.001869,-0.001739,2.226807,0.008198,-0.019391
4,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55,-0.938,0.001168,-0.006240,0.001027,-0.006503,-7.765015,-0.007163,-0.002098
...,...,...,...,...,...,...,...,...,...,...,...
2594,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1,-19.412,-0.060956,-0.025507,-0.363128,-0.144996,0.289551,-0.947698,-0.869825
2595,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1,-0.759,0.000792,0.000135,0.000846,0.000072,-2.996582,0.151447,0.147280
2596,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3,-11.948,0.000075,-0.000514,-0.000123,-0.000587,-1.841553,0.000644,0.002405
2597,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1,-10.928,-0.001379,-0.001183,-0.001036,-0.000751,-3.540039,-0.467092,-0.186703


/n/groups/marks/projects/marks_lab_and_oatml/ProteinGym/model_scores/zero_shot_clinical_indels/RITA small


Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id,Provean,Tranception_Tranception_M_no_retrieval,Tranception_Tranception_M,Tranception_Tranception_L_no_retrieval,Tranception_Tranception_L,HMM,RITA_xlarge,RITA_large,RITA_small
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1,-17.363,-0.133442,-0.049923,-0.153706,-0.057961,-1.546509,-0.266850,-0.288258,-0.141813
1,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE,-1.896,-0.000314,0.001216,-0.000853,0.001001,2.155273,0.003055,0.003223,-0.003577
2,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672,-8.575,-0.050021,-0.023363,-0.054832,-0.026527,-7.489136,-0.144808,-0.147896,-0.081753
3,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82,-0.660,-0.001915,-0.001803,-0.001869,-0.001739,2.226807,0.008198,-0.019391,-0.018378
4,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55,-0.938,0.001168,-0.006240,0.001027,-0.006503,-7.765015,-0.007163,-0.002098,0.007622
...,...,...,...,...,...,...,...,...,...,...,...,...
2594,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1,-19.412,-0.060956,-0.025507,-0.363128,-0.144996,0.289551,-0.947698,-0.869825,-0.017233
2595,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1,-0.759,0.000792,0.000135,0.000846,0.000072,-2.996582,0.151447,0.147280,0.154035
2596,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3,-11.948,0.000075,-0.000514,-0.000123,-0.000587,-1.841553,0.000644,0.002405,-0.001330
2597,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1,-10.928,-0.001379,-0.001183,-0.001036,-0.000751,-3.540039,-0.467092,-0.186703,-0.038070


/n/groups/marks/projects/marks_lab_and_oatml/ProteinGym/model_scores/zero_shot_clinical_indels/RITA medium


Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id,Provean,Tranception_Tranception_M_no_retrieval,Tranception_Tranception_M,Tranception_Tranception_L_no_retrieval,Tranception_Tranception_L,HMM,RITA_xlarge,RITA_large,RITA_small,RITA_medium
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1,-17.363,-0.133442,-0.049923,-0.153706,-0.057961,-1.546509,-0.266850,-0.288258,-0.141813,-0.281294
1,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE,-1.896,-0.000314,0.001216,-0.000853,0.001001,2.155273,0.003055,0.003223,-0.003577,-0.000019
2,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672,-8.575,-0.050021,-0.023363,-0.054832,-0.026527,-7.489136,-0.144808,-0.147896,-0.081753,-0.130019
3,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82,-0.660,-0.001915,-0.001803,-0.001869,-0.001739,2.226807,0.008198,-0.019391,-0.018378,-0.017988
4,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55,-0.938,0.001168,-0.006240,0.001027,-0.006503,-7.765015,-0.007163,-0.002098,0.007622,0.001671
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2594,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1,-19.412,-0.060956,-0.025507,-0.363128,-0.144996,0.289551,-0.947698,-0.869825,-0.017233,-0.188783
2595,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1,-0.759,0.000792,0.000135,0.000846,0.000072,-2.996582,0.151447,0.147280,0.154035,0.150763
2596,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3,-11.948,0.000075,-0.000514,-0.000123,-0.000587,-1.841553,0.000644,0.002405,-0.001330,-0.000389
2597,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1,-10.928,-0.001379,-0.001183,-0.001036,-0.000751,-3.540039,-0.467092,-0.186703,-0.038070,-0.074762


/n/groups/marks/projects/marks_lab_and_oatml/ProteinGym/model_scores/zero_shot_clinical_indels/RITA ensemble


Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id,Provean,Tranception_Tranception_M_no_retrieval,Tranception_Tranception_M,Tranception_Tranception_L_no_retrieval,Tranception_Tranception_L,HMM,RITA_xlarge,RITA_large,RITA_small,RITA_medium,RITA_ensemble
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1,-17.363,-0.133442,-0.049923,-0.153706,-0.057961,-1.546509,-0.266850,-0.288258,-0.141813,-0.281294,-0.244554
1,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE,-1.896,-0.000314,0.001216,-0.000853,0.001001,2.155273,0.003055,0.003223,-0.003577,-0.000019,0.000671
2,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672,-8.575,-0.050021,-0.023363,-0.054832,-0.026527,-7.489136,-0.144808,-0.147896,-0.081753,-0.130019,-0.126119
3,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82,-0.660,-0.001915,-0.001803,-0.001869,-0.001739,2.226807,0.008198,-0.019391,-0.018378,-0.017988,-0.011890
4,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55,-0.938,0.001168,-0.006240,0.001027,-0.006503,-7.765015,-0.007163,-0.002098,0.007622,0.001671,0.000008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2594,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1,-19.412,-0.060956,-0.025507,-0.363128,-0.144996,0.289551,-0.947698,-0.869825,-0.017233,-0.188783,-0.505885
2595,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1,-0.759,0.000792,0.000135,0.000846,0.000072,-2.996582,0.151447,0.147280,0.154035,0.150763,0.150881
2596,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3,-11.948,0.000075,-0.000514,-0.000123,-0.000587,-1.841553,0.000644,0.002405,-0.001330,-0.000389,0.000333
2597,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1,-10.928,-0.001379,-0.001183,-0.001036,-0.000751,-3.540039,-0.467092,-0.186703,-0.038070,-0.074762,-0.191657


 57%|█████████████████████████▋                   | 4/7 [00:47<00:41, 13.67s/it]

-----------------------------------------------------
ProtGPT2


Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id,Provean,Tranception_Tranception_M_no_retrieval,Tranception_Tranception_M,Tranception_Tranception_L_no_retrieval,Tranception_Tranception_L,HMM,RITA_xlarge,RITA_large,RITA_small,RITA_medium,RITA_ensemble,ProtGPT2
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1,-17.363,-0.133442,-0.049923,-0.153706,-0.057961,-1.546509,-0.266850,-0.288258,-0.141813,-0.281294,-0.244554,-0.047654
1,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE,-1.896,-0.000314,0.001216,-0.000853,0.001001,2.155273,0.003055,0.003223,-0.003577,-0.000019,0.000671,-0.024506
2,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672,-8.575,-0.050021,-0.023363,-0.054832,-0.026527,-7.489136,-0.144808,-0.147896,-0.081753,-0.130019,-0.126119,-0.012255
3,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82,-0.660,-0.001915,-0.001803,-0.001869,-0.001739,2.226807,0.008198,-0.019391,-0.018378,-0.017988,-0.011890,0.010934
4,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55,-0.938,0.001168,-0.006240,0.001027,-0.006503,-7.765015,-0.007163,-0.002098,0.007622,0.001671,0.000008,0.001553
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2594,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1,-19.412,-0.060956,-0.025507,-0.363128,-0.144996,0.289551,-0.947698,-0.869825,-0.017233,-0.188783,-0.505885,-0.202555
2595,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1,-0.759,0.000792,0.000135,0.000846,0.000072,-2.996582,0.151447,0.147280,0.154035,0.150763,0.150881,-0.023925
2596,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3,-11.948,0.000075,-0.000514,-0.000123,-0.000587,-1.841553,0.000644,0.002405,-0.001330,-0.000389,0.000333,-0.006887
2597,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1,-10.928,-0.001379,-0.001183,-0.001036,-0.000751,-3.540039,-0.467092,-0.186703,-0.038070,-0.074762,-0.191657,-0.005451


 71%|████████████████████████████████▏            | 5/7 [00:52<00:21, 10.59s/it]

-----------------------------------------------------
-----------------------------------------------------
Unirep


Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id,Provean,Tranception_Tranception_M_no_retrieval,Tranception_Tranception_M,Tranception_Tranception_L_no_retrieval,Tranception_Tranception_L,HMM,RITA_xlarge,RITA_large,RITA_small,RITA_medium,RITA_ensemble,ProtGPT2,Unirep
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1,-17.363,-0.133442,-0.049923,-0.153706,-0.057961,-1.546509,-0.266850,-0.288258,-0.141813,-0.281294,-0.244554,-0.047654,0.084321
1,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE,-1.896,-0.000314,0.001216,-0.000853,0.001001,2.155273,0.003055,0.003223,-0.003577,-0.000019,0.000671,-0.024506,0.006018
2,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672,-8.575,-0.050021,-0.023363,-0.054832,-0.026527,-7.489136,-0.144808,-0.147896,-0.081753,-0.130019,-0.126119,-0.012255,0.012318
3,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82,-0.660,-0.001915,-0.001803,-0.001869,-0.001739,2.226807,0.008198,-0.019391,-0.018378,-0.017988,-0.011890,0.010934,0.002105
4,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55,-0.938,0.001168,-0.006240,0.001027,-0.006503,-7.765015,-0.007163,-0.002098,0.007622,0.001671,0.000008,0.001553,-0.005755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2594,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1,-19.412,-0.060956,-0.025507,-0.363128,-0.144996,0.289551,-0.947698,-0.869825,-0.017233,-0.188783,-0.505885,-0.202555,-0.001551
2595,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1,-0.759,0.000792,0.000135,0.000846,0.000072,-2.996582,0.151447,0.147280,0.154035,0.150763,0.150881,-0.023925,-0.000730
2596,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3,-11.948,0.000075,-0.000514,-0.000123,-0.000587,-1.841553,0.000644,0.002405,-0.001330,-0.000389,0.000333,-0.006887,0.002791
2597,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1,-10.928,-0.001379,-0.001183,-0.001036,-0.000751,-3.540039,-0.467092,-0.186703,-0.038070,-0.074762,-0.191657,-0.005451,0.002049


 86%|██████████████████████████████████████▌      | 6/7 [00:57<00:08,  8.79s/it]

-----------------------------------------------------
-----------------------------------------------------
Detected subdirectories in Progen2, finding scores in each subdirectory
/n/groups/marks/projects/marks_lab_and_oatml/ProteinGym/model_scores/zero_shot_clinical_indels/Progen2 XLarge


Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id,Provean,Tranception_Tranception_M_no_retrieval,Tranception_Tranception_M,Tranception_Tranception_L_no_retrieval,Tranception_Tranception_L,HMM,RITA_xlarge,RITA_large,RITA_small,RITA_medium,RITA_ensemble,ProtGPT2,Unirep,Progen2_XLarge
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1,-17.363,-0.133442,-0.049923,-0.153706,-0.057961,-1.546509,-0.266850,-0.288258,-0.141813,-0.281294,-0.244554,-0.047654,0.084321,-0.000268
1,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE,-1.896,-0.000314,0.001216,-0.000853,0.001001,2.155273,0.003055,0.003223,-0.003577,-0.000019,0.000671,-0.024506,0.006018,-0.000005
2,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672,-8.575,-0.050021,-0.023363,-0.054832,-0.026527,-7.489136,-0.144808,-0.147896,-0.081753,-0.130019,-0.126119,-0.012255,0.012318,-0.000111
3,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82,-0.660,-0.001915,-0.001803,-0.001869,-0.001739,2.226807,0.008198,-0.019391,-0.018378,-0.017988,-0.011890,0.010934,0.002105,0.000003
4,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55,-0.938,0.001168,-0.006240,0.001027,-0.006503,-7.765015,-0.007163,-0.002098,0.007622,0.001671,0.000008,0.001553,-0.005755,-0.000007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2594,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1,-19.412,-0.060956,-0.025507,-0.363128,-0.144996,0.289551,-0.947698,-0.869825,-0.017233,-0.188783,-0.505885,-0.202555,-0.001551,-0.003069
2595,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1,-0.759,0.000792,0.000135,0.000846,0.000072,-2.996582,0.151447,0.147280,0.154035,0.150763,0.150881,-0.023925,-0.000730,0.000023
2596,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3,-11.948,0.000075,-0.000514,-0.000123,-0.000587,-1.841553,0.000644,0.002405,-0.001330,-0.000389,0.000333,-0.006887,0.002791,-0.000007
2597,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1,-10.928,-0.001379,-0.001183,-0.001036,-0.000751,-3.540039,-0.467092,-0.186703,-0.038070,-0.074762,-0.191657,-0.005451,0.002049,-0.000362


/n/groups/marks/projects/marks_lab_and_oatml/ProteinGym/model_scores/zero_shot_clinical_indels/Progen2 base


Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id,Provean,Tranception_Tranception_M_no_retrieval,Tranception_Tranception_M,Tranception_Tranception_L_no_retrieval,Tranception_Tranception_L,HMM,RITA_xlarge,RITA_large,RITA_small,RITA_medium,RITA_ensemble,ProtGPT2,Unirep,Progen2_XLarge,Progen2_base
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1,-17.363,-0.133442,-0.049923,-0.153706,-0.057961,-1.546509,-0.266850,-0.288258,-0.141813,-0.281294,-0.244554,-0.047654,0.084321,-0.000268,-4.123227e-04
1,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE,-1.896,-0.000314,0.001216,-0.000853,0.001001,2.155273,0.003055,0.003223,-0.003577,-0.000019,0.000671,-0.024506,0.006018,-0.000005,-1.835451e-06
2,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672,-8.575,-0.050021,-0.023363,-0.054832,-0.026527,-7.489136,-0.144808,-0.147896,-0.081753,-0.130019,-0.126119,-0.012255,0.012318,-0.000111,-1.504642e-04
3,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82,-0.660,-0.001915,-0.001803,-0.001869,-0.001739,2.226807,0.008198,-0.019391,-0.018378,-0.017988,-0.011890,0.010934,0.002105,0.000003,-2.603597e-06
4,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55,-0.938,0.001168,-0.006240,0.001027,-0.006503,-7.765015,-0.007163,-0.002098,0.007622,0.001671,0.000008,0.001553,-0.005755,-0.000007,2.741197e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2594,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1,-19.412,-0.060956,-0.025507,-0.363128,-0.144996,0.289551,-0.947698,-0.869825,-0.017233,-0.188783,-0.505885,-0.202555,-0.001551,-0.003069,-2.038769e-03
2595,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1,-0.759,0.000792,0.000135,0.000846,0.000072,-2.996582,0.151447,0.147280,0.154035,0.150763,0.150881,-0.023925,-0.000730,0.000023,1.825281e-07
2596,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3,-11.948,0.000075,-0.000514,-0.000123,-0.000587,-1.841553,0.000644,0.002405,-0.001330,-0.000389,0.000333,-0.006887,0.002791,-0.000007,-7.308263e-06
2597,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1,-10.928,-0.001379,-0.001183,-0.001036,-0.000751,-3.540039,-0.467092,-0.186703,-0.038070,-0.074762,-0.191657,-0.005451,0.002049,-0.000362,-3.104178e-05


/n/groups/marks/projects/marks_lab_and_oatml/ProteinGym/model_scores/zero_shot_clinical_indels/Progen2 large


Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id,Provean,Tranception_Tranception_M_no_retrieval,Tranception_Tranception_M,Tranception_Tranception_L_no_retrieval,Tranception_Tranception_L,HMM,RITA_xlarge,RITA_large,RITA_small,RITA_medium,RITA_ensemble,ProtGPT2,Unirep,Progen2_XLarge,Progen2_base,Progen2_large
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1,-17.363,-0.133442,-0.049923,-0.153706,-0.057961,-1.546509,-0.266850,-0.288258,-0.141813,-0.281294,-0.244554,-0.047654,0.084321,-0.000268,-4.123227e-04,-0.000417
1,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE,-1.896,-0.000314,0.001216,-0.000853,0.001001,2.155273,0.003055,0.003223,-0.003577,-0.000019,0.000671,-0.024506,0.006018,-0.000005,-1.835451e-06,-0.000007
2,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672,-8.575,-0.050021,-0.023363,-0.054832,-0.026527,-7.489136,-0.144808,-0.147896,-0.081753,-0.130019,-0.126119,-0.012255,0.012318,-0.000111,-1.504642e-04,-0.000098
3,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82,-0.660,-0.001915,-0.001803,-0.001869,-0.001739,2.226807,0.008198,-0.019391,-0.018378,-0.017988,-0.011890,0.010934,0.002105,0.000003,-2.603597e-06,-0.000008
4,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55,-0.938,0.001168,-0.006240,0.001027,-0.006503,-7.765015,-0.007163,-0.002098,0.007622,0.001671,0.000008,0.001553,-0.005755,-0.000007,2.741197e-06,-0.000002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2594,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1,-19.412,-0.060956,-0.025507,-0.363128,-0.144996,0.289551,-0.947698,-0.869825,-0.017233,-0.188783,-0.505885,-0.202555,-0.001551,-0.003069,-2.038769e-03,-0.001900
2595,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1,-0.759,0.000792,0.000135,0.000846,0.000072,-2.996582,0.151447,0.147280,0.154035,0.150763,0.150881,-0.023925,-0.000730,0.000023,1.825281e-07,0.000024
2596,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3,-11.948,0.000075,-0.000514,-0.000123,-0.000587,-1.841553,0.000644,0.002405,-0.001330,-0.000389,0.000333,-0.006887,0.002791,-0.000007,-7.308263e-06,-0.000009
2597,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1,-10.928,-0.001379,-0.001183,-0.001036,-0.000751,-3.540039,-0.467092,-0.186703,-0.038070,-0.074762,-0.191657,-0.005451,0.002049,-0.000362,-3.104178e-05,-0.000129


/n/groups/marks/projects/marks_lab_and_oatml/ProteinGym/model_scores/zero_shot_clinical_indels/Progen2 small


Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id,Provean,Tranception_Tranception_M_no_retrieval,Tranception_Tranception_M,Tranception_Tranception_L_no_retrieval,Tranception_Tranception_L,HMM,RITA_xlarge,RITA_large,RITA_small,RITA_medium,RITA_ensemble,ProtGPT2,Unirep,Progen2_XLarge,Progen2_base,Progen2_large,Progen2_small
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1,-17.363,-0.133442,-0.049923,-0.153706,-0.057961,-1.546509,-0.266850,-0.288258,-0.141813,-0.281294,-0.244554,-0.047654,0.084321,-0.000268,-4.123227e-04,-0.000417,-0.000178
1,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE,-1.896,-0.000314,0.001216,-0.000853,0.001001,2.155273,0.003055,0.003223,-0.003577,-0.000019,0.000671,-0.024506,0.006018,-0.000005,-1.835451e-06,-0.000007,-0.000004
2,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672,-8.575,-0.050021,-0.023363,-0.054832,-0.026527,-7.489136,-0.144808,-0.147896,-0.081753,-0.130019,-0.126119,-0.012255,0.012318,-0.000111,-1.504642e-04,-0.000098,-0.000080
3,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82,-0.660,-0.001915,-0.001803,-0.001869,-0.001739,2.226807,0.008198,-0.019391,-0.018378,-0.017988,-0.011890,0.010934,0.002105,0.000003,-2.603597e-06,-0.000008,-0.000011
4,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55,-0.938,0.001168,-0.006240,0.001027,-0.006503,-7.765015,-0.007163,-0.002098,0.007622,0.001671,0.000008,0.001553,-0.005755,-0.000007,2.741197e-06,-0.000002,0.000006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2594,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1,-19.412,-0.060956,-0.025507,-0.363128,-0.144996,0.289551,-0.947698,-0.869825,-0.017233,-0.188783,-0.505885,-0.202555,-0.001551,-0.003069,-2.038769e-03,-0.001900,-0.000763
2595,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1,-0.759,0.000792,0.000135,0.000846,0.000072,-2.996582,0.151447,0.147280,0.154035,0.150763,0.150881,-0.023925,-0.000730,0.000023,1.825281e-07,0.000024,0.000025
2596,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3,-11.948,0.000075,-0.000514,-0.000123,-0.000587,-1.841553,0.000644,0.002405,-0.001330,-0.000389,0.000333,-0.006887,0.002791,-0.000007,-7.308263e-06,-0.000009,-0.000012
2597,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1,-10.928,-0.001379,-0.001183,-0.001036,-0.000751,-3.540039,-0.467092,-0.186703,-0.038070,-0.074762,-0.191657,-0.005451,0.002049,-0.000362,-3.104178e-05,-0.000129,-0.000025


/n/groups/marks/projects/marks_lab_and_oatml/ProteinGym/model_scores/zero_shot_clinical_indels/Progen2 Medium


Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id,Provean,Tranception_Tranception_M_no_retrieval,Tranception_Tranception_M,Tranception_Tranception_L_no_retrieval,Tranception_Tranception_L,HMM,RITA_xlarge,...,RITA_small,RITA_medium,RITA_ensemble,ProtGPT2,Unirep,Progen2_XLarge,Progen2_base,Progen2_large,Progen2_small,Progen2_Medium
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1,-17.363,-0.133442,-0.049923,-0.153706,-0.057961,-1.546509,-0.266850,...,-0.141813,-0.281294,-0.244554,-0.047654,0.084321,-0.000268,-4.123227e-04,-0.000417,-0.000178,-0.000388
1,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE,-1.896,-0.000314,0.001216,-0.000853,0.001001,2.155273,0.003055,...,-0.003577,-0.000019,0.000671,-0.024506,0.006018,-0.000005,-1.835451e-06,-0.000007,-0.000004,-0.000005
2,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672,-8.575,-0.050021,-0.023363,-0.054832,-0.026527,-7.489136,-0.144808,...,-0.081753,-0.130019,-0.126119,-0.012255,0.012318,-0.000111,-1.504642e-04,-0.000098,-0.000080,-0.000162
3,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82,-0.660,-0.001915,-0.001803,-0.001869,-0.001739,2.226807,0.008198,...,-0.018378,-0.017988,-0.011890,0.010934,0.002105,0.000003,-2.603597e-06,-0.000008,-0.000011,-0.000010
4,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55,-0.938,0.001168,-0.006240,0.001027,-0.006503,-7.765015,-0.007163,...,0.007622,0.001671,0.000008,0.001553,-0.005755,-0.000007,2.741197e-06,-0.000002,0.000006,0.000002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2594,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1,-19.412,-0.060956,-0.025507,-0.363128,-0.144996,0.289551,-0.947698,...,-0.017233,-0.188783,-0.505885,-0.202555,-0.001551,-0.003069,-2.038769e-03,-0.001900,-0.000763,-0.001437
2595,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1,-0.759,0.000792,0.000135,0.000846,0.000072,-2.996582,0.151447,...,0.154035,0.150763,0.150881,-0.023925,-0.000730,0.000023,1.825281e-07,0.000024,0.000025,0.000024
2596,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3,-11.948,0.000075,-0.000514,-0.000123,-0.000587,-1.841553,0.000644,...,-0.001330,-0.000389,0.000333,-0.006887,0.002791,-0.000007,-7.308263e-06,-0.000009,-0.000012,-0.000011
2597,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1,-10.928,-0.001379,-0.001183,-0.001036,-0.000751,-3.540039,-0.467092,...,-0.038070,-0.074762,-0.191657,-0.005451,0.002049,-0.000362,-3.104178e-05,-0.000129,-0.000025,-0.000070


/n/groups/marks/projects/marks_lab_and_oatml/ProteinGym/model_scores/zero_shot_clinical_indels/Progen2 ensemble


Unnamed: 0,mutated_sequence,DMS_bin_score,DMS_id,Provean,Tranception_Tranception_M_no_retrieval,Tranception_Tranception_M,Tranception_Tranception_L_no_retrieval,Tranception_Tranception_L,HMM,RITA_xlarge,...,RITA_medium,RITA_ensemble,ProtGPT2,Unirep,Progen2_XLarge,Progen2_base,Progen2_large,Progen2_small,Progen2_Medium,Progen2_ensemble
0,MEEPEEPADSGQSLVPVYIYSPEYVSMCDSLAKIPKRASMVHSLIE...,1,NP_060956.1,-17.363,-0.133442,-0.049923,-0.153706,-0.057961,-1.546509,-0.266850,...,-0.281294,-0.244554,-0.047654,0.084321,-0.000268,-4.123227e-04,-0.000417,-0.000178,-0.000388,-3.326833e-04
1,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...,0,UPI0000125CAE,-1.896,-0.000314,0.001216,-0.000853,0.001001,2.155273,0.003055,...,-0.000019,0.000671,-0.024506,0.006018,-0.000005,-1.835451e-06,-0.000007,-0.000004,-0.000005,-4.340888e-06
2,MAKKVAVIGAGVSGLISLKCCVDEGLEPTCFERTEDIGGVWRFKEN...,0,UPI000013C672,-8.575,-0.050021,-0.023363,-0.054832,-0.026527,-7.489136,-0.144808,...,-0.130019,-0.126119,-0.012255,0.012318,-0.000111,-1.504642e-04,-0.000098,-0.000080,-0.000162,-1.203662e-04
3,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...,0,UPI000035AA82,-0.660,-0.001915,-0.001803,-0.001869,-0.001739,2.226807,0.008198,...,-0.017988,-0.011890,0.010934,0.002105,0.000003,-2.603597e-06,-0.000008,-0.000011,-0.000010,-5.596101e-06
4,MNYPGRGSPRSPEHNGRGGGGGAWELGSDARPAFGGGVCCFEHLPG...,0,UPI00001D7B55,-0.938,0.001168,-0.006240,0.001027,-0.006503,-7.765015,-0.007163,...,0.001671,0.000008,0.001553,-0.005755,-0.000007,2.741197e-06,-0.000002,0.000006,0.000002,4.799781e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2594,MRKHVLAASFSMLSLLVIMGDTDSKTDSSFIMDSDPRRCMRHHYVD...,1,NP_000257.1,-19.412,-0.060956,-0.025507,-0.363128,-0.144996,0.289551,-0.947698,...,-0.188783,-0.505885,-0.202555,-0.001551,-0.003069,-2.038769e-03,-0.001900,-0.000763,-0.001437,-1.841712e-03
2595,MVFESVVVDVLNRFLGDYVVDLDTSQLSLGIWKGAVALKNLQIKEN...,0,UPI000015FEE1,-0.759,0.000792,0.000135,0.000846,0.000072,-2.996582,0.151447,...,0.150763,0.150881,-0.023925,-0.000730,0.000023,1.825281e-07,0.000024,0.000025,0.000024,1.904155e-05
2596,MKALIAALLLITLQYSCAVSPTDCSAVEPEAEKALDLINKRRRDGY...,0,UPI000012CBC3,-11.948,0.000075,-0.000514,-0.000123,-0.000587,-1.841553,0.000644,...,-0.000389,0.000333,-0.006887,0.002791,-0.000007,-7.308263e-06,-0.000009,-0.000012,-0.000011,-9.179144e-06
2597,MAFVPVIPESYSHVLAEFESLDPLLSALRLDSSRLKCTSIAVSRKW...,1,NP_852608.1,-10.928,-0.001379,-0.001183,-0.001036,-0.000751,-3.540039,-0.467092,...,-0.074762,-0.191657,-0.005451,0.002049,-0.000362,-3.104178e-05,-0.000129,-0.000025,-0.000070,-1.233418e-04


100%|█████████████████████████████████████████████| 7/7 [01:22<00:00, 11.85s/it]


In [36]:
dms_df.loc[dms_df["Unirep"].isna(),"Unirep"] = dms_df["Unirep"].mean()

In [37]:
dms_df.isna().any()

mutated_sequence                          False
DMS_bin_score                             False
DMS_id                                    False
Provean                                   False
Tranception_Tranception_M_no_retrieval    False
Tranception_Tranception_M                 False
Tranception_Tranception_L_no_retrieval    False
Tranception_Tranception_L                 False
HMM                                       False
RITA_xlarge                               False
RITA_large                                False
RITA_small                                False
RITA_medium                               False
RITA_ensemble                             False
ProtGPT2                                  False
Unirep                                    False
Progen2_XLarge                            False
Progen2_base                              False
Progen2_large                             False
Progen2_small                             False
Progen2_Medium                          

In [None]:
def compute_std_error_indels(df):
    model_names = df.columns
    mean_performance_across_samples = []
    for sample in tqdm(range(number_assay_reshuffle)):
        mean_performance_across_samples.append(df.sample(frac=1.0, replace=True).mean(axis=0)) #Resample a dataset of the same size (with replacement) then take the sample mean
    mean_performance_across_samples=pd.DataFrame(data=mean_performance_across_samples,columns=model_names)
    # print(mean_performance_across_samples.head())
    return mean_performance_across_samples.std(ddof=1).T #Unbiased estimate with ddof=1

In [54]:
dms_df = dms_df.drop(columns=["RITA_ensemble","Progen2_ensemble"])

In [55]:
from sklearn.metrics import roc_auc_score
auc_df = {"Model_name":[],"Average_AUC":[]}
for column in dms_df.columns:
    if column in ["mutated_sequence","DMS_bin_score","DMS_id"]:
        continue 
    auc_df["Average_AUC"].append(roc_auc_score(~dms_df["DMS_bin_score"],dms_df[column]))
    auc_df["Model_name"].append(column)

In [61]:
auc_df = pd.DataFrame(auc_df)
auc_df = auc_df.sort_values(by="Average_AUC",ascending=False)
auc_df["Model_rank"] = list(range(1,len(auc_df)+1))
auc_df["Model type"] = "Unsupervised"
auc_df = auc_df[["Model_rank","Model_name","Model type","Average_AUC"]]
auc_df["Model_name"] = ["Provean","RITA L", "RITA XL", "RITA M", "Tranception L no retrieval", "ProGen2 L", "Tranception M no retrieval", "ProGen2 M", "ProGen2 Base", "Tranception L", "ProGen2 XL", "Tranception M", "RITA S", "ProGen2 S", "Hidden Markov Model", "ProtGPT2", "UniRep"]
auc_df

Unnamed: 0,Model_rank,Model_name,Model type,Average_AUC
0,1,Provean,Unsupervised,0.92659
7,2,RITA L,Unsupervised,0.921079
6,3,RITA XL,Unsupervised,0.915105
9,4,RITA M,Unsupervised,0.891299
3,5,Tranception L no retrieval,Unsupervised,0.862819
14,6,ProGen2 L,Unsupervised,0.847056
1,7,Tranception M no retrieval,Unsupervised,0.844785
16,8,ProGen2 M,Unsupervised,0.843086
13,9,ProGen2 Base,Unsupervised,0.842362
4,10,Tranception L,Unsupervised,0.83999


In [62]:
auc_df.round(3).to_csv("../../benchmarks/clinical_zero_shot/indels/AUC/Summary_performance_clinical_indels_AUC.csv",index=False)