## Summary

---

## Imports

In [2]:
import functools
from pathlib import Path

import elaspic2 as el2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from elaspic2.plugins.msa_analysis import analyze_msa
from scipy import stats
from sklearn import metrics, model_selection
from tqdm.auto import tqdm



In [3]:
pd.set_option("max_columns", 1000)
pd.set_option("max_rows", 1000)

## Parameters

In [4]:
NOTEBOOK_DIR = Path("37_cagi6_sherloc_combine_results").resolve()
NOTEBOOK_DIR.mkdir(exist_ok=True)

NOTEBOOK_DIR

PosixPath('/lustre07/scratch/mjslee/workspace/elaspic2-cagi6/notebooks/37_cagi6_sherloc_combine_results')

## Load results

In [5]:
DATASET_NAME = "cagi6-sherloc"
DATASET_PATH = str(
    NOTEBOOK_DIR.parent.joinpath("30_cagi6_sherloc", "input-data-gby-protein.parquet")
)
TASK_COUNT = 4182

DATASET_NAME, DATASET_PATH, TASK_COUNT

('cagi6-sherloc',
 '/lustre07/scratch/mjslee/workspace/elaspic2-cagi6/notebooks/30_cagi6_sherloc/input-data-gby-protein.parquet',
 4182)

In [5]:
pfile = pq.ParquetFile(DATASET_PATH)

assert TASK_COUNT == pfile.num_row_groups

In [6]:
total_num_mutations = 0
for row_group in tqdm(range(pfile.num_row_groups)):
    num_mutations = (
        pfile.read_row_group(row_group, columns=["mutation"])
        .to_pandas()["mutation"]
        .str.len()
        .sum()
    )

    total_num_mutations += num_mutations

total_num_mutations

NameError: name 'pfile' is not defined

In [13]:
def get_result_files(result_dir, task_count=TASK_COUNT):
    if any([f in str(result_dir) for f in ["msa_analysis","ccmpred"]]):
        prefix = "result"
    elif "31_run_alphafold/" in str(result_dir):
        prefix = "results"
    else:
        prefix = "shard"

    present_files = []
    missing_files = []
    for i in tqdm(range(1, task_count + 1)):
        path = result_dir.joinpath(f"{prefix}-{i}-of-{task_count}.parquet")
        if path.is_file():
            present_files.append(path)
        else:
            missing_files.append(path)
    return present_files, missing_files

In [8]:
def read_files(files):
    dfs = []
    for file in tqdm(files):
        df = pq.read_table(file).to_pandas(integer_object_nulls=True)
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

In [9]:
def read_rosetta_files(pfile):
    result_dfs = []
    for task_id in tqdm(range(1, pfile.num_row_groups + 1)):
        row = (
            pfile.read_row_group(
                task_id - 1, columns=["protein_id", "mutation_id", "mutation"]
            )
            .to_pandas()
            .iloc[0]
        )

        input_df = pd.DataFrame(
            {
                "protein_id": [row["protein_id"]] * len(row["mutation_id"]),
                "mutation_id": row["mutation_id"],
                "mutation": row["mutation"],
            }
        )

        path = NOTEBOOK_DIR.parent.joinpath(
            "31_run_rosetta_ddg",
            DATASET_NAME,
            f"shard-{task_id}-of-{TASK_COUNT}.parquet",
        )
        try:
            rosetta_df = pq.read_table(path).to_pandas()
        except pa.ArrowInvalid:
            print(f"Unreadable file for {path}")
            continue

        if rosetta_df.empty:
            print(f"Empty file for {path}")
            continue

        del rosetta_df["protein_id"]
        del rosetta_df["mutation_id"]

        result_df = input_df.merge(rosetta_df, on=["mutation"])
        result_dfs.append(result_df)
    return pd.concat(result_dfs, ignore_index=True)

### ProteinSolver

In [10]:
ps_result_dir = NOTEBOOK_DIR.parent.joinpath("31_run_proteinsolver", DATASET_NAME)

In [11]:
present_files, missing_files = get_result_files(ps_result_dir)

assert len(missing_files) == 0
len(present_files), len(missing_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4182.0), HTML(value='')))




(4182, 0)

In [12]:
result_ps_df = read_files(present_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4182.0), HTML(value='')))




In [13]:
display(result_ps_df.head(2))
print(len(result_ps_df))  # 221816

Unnamed: 0,protein_id,mutation,mutation_id,effect,proteinsolver_core_score_wt,proteinsolver_core_score_mut,proteinsolver_core_features_residue_wt,proteinsolver_core_features_protein_wt,proteinsolver_core_features_residue_mut,proteinsolver_core_features_protein_mut
0,P26678,R9C,NM_002667.3:c.25C>T,Pathogenic,0.091593,0.003965,"[0.48249053955078125, 0.24484702944755554, -0....","[-2.27313494682312, 0.6932704448699951, -0.195...","[-1.904166340827942, 1.0505309104919434, 0.088...","[-2.3197972774505615, 0.8016344308853149, -0.1..."
1,P26678,T8P,NM_002667.3:c.22A>C,Uncertain significance,0.041601,0.000594,"[2.043767213821411, 0.9361261129379272, -0.415...","[-2.27313494682312, 0.6932705044746399, -0.195...","[0.12865065038204193, 0.6873304843902588, 2.20...","[-2.2835092544555664, 0.6785972118377686, -0.1..."


221816


### ProtBert

In [14]:
pb_result_dir = NOTEBOOK_DIR.parent.joinpath("31_run_protbert", DATASET_NAME)

In [15]:
present_files, missing_files = get_result_files(pb_result_dir)

assert len(missing_files) == 0
len(present_files), len(missing_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4182.0), HTML(value='')))




(4182, 0)

In [16]:
result_pb_df = read_files(present_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4182.0), HTML(value='')))




In [17]:
display(result_pb_df.head(2))
print(len(result_pb_df))  # 221793

Unnamed: 0,protein_id,mutation,mutation_id,effect,protbert_core_score_wt,protbert_core_score_mut,protbert_core_features_residue_wt,protbert_core_features_protein_wt,protbert_core_features_residue_mut,protbert_core_features_protein_mut
0,P26678,R9C,NM_002667.3:c.25C>T,Pathogenic,0.068529,0.007849,"[0.15305250883102417, -0.11011786758899689, 0....","[0.05504663661122322, -0.04564127326011658, 0....","[0.09072457253932953, -0.12460881471633911, 0....","[0.02796473540365696, -0.055114783346652985, 0..."
1,P26678,T8P,NM_002667.3:c.22A>C,Uncertain significance,0.062104,0.02722,"[0.056363195180892944, -0.020384633913636208, ...","[0.05504663661122322, -0.04564127326011658, 0....","[0.03299185633659363, -0.007563109043985605, 0...","[0.04508848860859871, -0.052698392421007156, 0..."


221793


### Rosetta

In [18]:
ra_result_dir = NOTEBOOK_DIR.parent.joinpath("31_run_rosetta_ddg", DATASET_NAME)

In [19]:
present_files, missing_files = get_result_files(ra_result_dir)

assert len(missing_files) == 0
len(present_files), len(missing_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4182.0), HTML(value='')))




(4182, 0)

In [20]:
result_ra_df = read_rosetta_files(pfile)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4182.0), HTML(value='')))

Empty file for /gpfs/fs0/scratch/p/pmkim/strokach/workspace/elaspic2-cagi6/notebooks/31_run_rosetta_ddg/cagi6-sherloc/shard-1098-of-4182.parquet



In [21]:
display(result_ra_df.head(2))
print(len(result_ra_df))  # 221793

Unnamed: 0,protein_id,mutation_id,mutation,effect,rosetta_cart_bonded_wt,rosetta_rama_prepro_wt,rosetta_ref_wt,rosetta_hxl_tors_wt,rosetta_p_aa_pp_wt,rosetta_fa_dun_semi_wt,rosetta_fa_dun_rot_wt,rosetta_fa_dun_dev_wt,rosetta_omega_wt,rosetta_dslf_fa13_wt,rosetta_hbond_sc_wt,rosetta_hbond_bb_sc_wt,rosetta_hbond_lr_bb_wt,rosetta_hbond_sr_bb_wt,rosetta_fa_intra_elec_wt,rosetta_fa_elec_wt,rosetta_lk_ball_bridge_uncpl_wt,rosetta_lk_ball_bridge_wt,rosetta_lk_ball_iso_wt,rosetta_lk_ball_wt,rosetta_fa_intra_sol_xover4_wt,rosetta_fa_intra_rep_xover4_wt,rosetta_fa_intra_atr_xover4_wt,rosetta_fa_sol_wt,rosetta_fa_rep_wt,rosetta_fa_atr_wt,rosetta_dg_wt,rosetta_cart_bonded_change,rosetta_rama_prepro_change,rosetta_ref_change,rosetta_hxl_tors_change,rosetta_p_aa_pp_change,rosetta_fa_dun_semi_change,rosetta_fa_dun_rot_change,rosetta_fa_dun_dev_change,rosetta_omega_change,rosetta_dslf_fa13_change,rosetta_hbond_sc_change,rosetta_hbond_bb_sc_change,rosetta_hbond_lr_bb_change,rosetta_hbond_sr_bb_change,rosetta_fa_intra_elec_change,rosetta_fa_elec_change,rosetta_lk_ball_bridge_uncpl_change,rosetta_lk_ball_bridge_change,rosetta_lk_ball_iso_change,rosetta_lk_ball_change,rosetta_fa_intra_sol_xover4_change,rosetta_fa_intra_rep_xover4_change,rosetta_fa_intra_atr_xover4_change,rosetta_fa_sol_change,rosetta_fa_rep_change,rosetta_fa_atr_change,rosetta_dg_change
0,P26678,NM_002667.3:c.25C>T,R9C,Pathogenic,34.796667,18.080333,13.08,5.875333,-4.878333,33.826333,30.42,37.201667,16.900333,0.0,0.0,-0.495,0.0,-46.239,-5.748,-96.294333,-1.515667,-0.220333,-145.437333,123.757667,12.501667,9.146667,-22.659333,279.312667,18.309,-311.028667,-1.308,-0.577667,0.614667,4.553,0.021333,0.376,0.173,-1.592,-0.074667,-0.012667,0.0,0.0,0.495,0.0,-0.011,-0.510333,0.204667,0.508667,0.04,1.679,-0.050333,-0.104333,-0.147,0.325,-4.542667,0.084667,2.215667,3.669
1,P26678,NM_002667.3:c.22A>C,T8P,Uncertain significance,35.531,17.849,13.08,9.764,-5.052,33.549,30.357,43.524,16.889,0.0,0.0,-0.743,0.0,-45.885,-4.684,-97.176,-1.163,-0.196,-144.679,123.189,12.223,9.169,-22.419,278.164,18.065,-310.411,8.945,12.124333,2.623,-3.697,-2.731667,0.545667,1.057667,0.810667,0.199333,1.852333,0.0,-1.577,0.007667,0.0,1.358,0.671,1.583667,0.171667,0.014,0.872,0.017,-0.134,-0.1,0.435333,-1.319333,6.835,-1.977333,19.644


221793


### MSA

In [22]:
msa_result_dir = NOTEBOOK_DIR.parent.joinpath("31_run_msa_analysis", DATASET_NAME)

In [23]:
present_files, missing_files = get_result_files(msa_result_dir)

assert len(missing_files) == 0
len(present_files), len(missing_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4182.0), HTML(value='')))




(4182, 0)

In [24]:
result_msa_df = read_files(present_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4182.0), HTML(value='')))




In [25]:
display(result_msa_df.head(2))
print(len(result_msa_df))  # 221793

Unnamed: 0,protein_id,mutation_id,mutation,msa_count_wt,msa_count_mut,msa_count_total,msa_proba_wt,msa_proba_mut,msa_proba_total,msa_length,msa_proba,msa_H,msa_KL
0,P26678,NM_002667.3:c.25C>T,R9C,87.0,0.0,100.0,-0.310155,-4.787492,-88.633441,101,-0.536553,0.326639,2.625994
1,P26678,NM_002667.3:c.22A>C,T8P,95.0,0.0,100.0,-0.223144,-4.787492,-89.393727,101,-0.536553,0.074447,2.917469


221793


### AlphaFold

In [26]:
af_result_dir = NOTEBOOK_DIR.parent.joinpath("31_run_alphafold", DATASET_NAME)

In [27]:
present_files, missing_files = get_result_files(af_result_dir)

len(present_files), len(missing_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4182.0), HTML(value='')))




(4182, 0)

In [28]:
result_af_df = read_files(present_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4182.0), HTML(value='')))




In [29]:
display(result_af_df.head(2))
print(len(result_af_df))  # 1,069,652

Unnamed: 0,protein_id,mutation,alphafold_core_score_plddt,alphafold_core_features_experimentally_resolved,alphafold_core_features_predicted_lddt,alphafold_core_features_msa_first_row,alphafold_core_features_single,alphafold_core_features_structure_module,alphafold_core_score_predicted_aligned_error_row_mean,alphafold_core_score_predicted_aligned_error_col_mean,alphafold_core_score_predicted_aligned_error_diag,alphafold_core_features_distogram_row_mean,alphafold_core_features_pair_row_mean,alphafold_core_features_distogram_row_max,alphafold_core_features_pair_row_max,alphafold_core_features_distogram_col_mean,alphafold_core_features_pair_col_mean,alphafold_core_features_distogram_col_max,alphafold_core_features_pair_col_max,alphafold_core_features_distogram_diag,alphafold_core_features_pair_diag,alphafold_core_score_msa_logproba_first_wt,alphafold_core_score_msa_logproba_first_mut,alphafold_core_score_msa_logproba_mean_wt,alphafold_core_score_msa_logproba_mean_mut,alphafold_core_score_msa_logproba_max_wt,alphafold_core_score_msa_logproba_max_mut,alphafold_core_features_msa_first,alphafold_core_features_msa_mean,alphafold_core_features_msa_max
0,P26678,R9C,60.591231,"[1.1957175, 1.2158396, 1.4719502, 1.0110891, 1...","[-6.112609, -7.1444016, -6.2029376, -5.485528,...","[-1.8621206, 4.2266498, -12.190193, 5.9252286,...","[19.881775, -7.916237, 24.432047, -24.314926, ...","[0.0069257915, 0.010214679, -0.0057431664, 0.0...",17.316183,15.080112,0.260079,"[-4.09479360626294, -8.069414707330557, -8.000...","[2.7896990598394322, 10.789292047230097, -2.49...","[155.1204071044922, 2.5760769844055176, 3.8069...","[34.4866943359375, 747.40869140625, 18.8361473...","[-4.09479360626294, -8.069414707330557, -8.000...","[1.7381559366790147, 11.034781367159807, -4.02...","[155.1204071044922, 2.5760769844055176, 3.8069...","[34.4866943359375, 747.40869140625, 18.8361473...","[155.1204071044922, -9.181709289550781, -33.72...","[34.4866943359375, 747.40869140625, 18.8361473...",-0.002302,-9.826518,-1.083345,-4.675444,-0.000687,-3.604242,"[0.6741787195205688, 3.8331971168518066, -18.1...","[-0.5440018840838136, -1.4893573264437399, -2....","[4.5283708572387695, 11.694952964782715, 5.611..."
1,P26678,T8P,60.938517,"[-0.9147486, -0.931573, -1.0168346, -1.1108581...","[-5.815353, -7.58807, -6.37847, -5.5572815, -4...","[5.016503, 6.580977, -4.220338, 2.137466, -4.0...","[3.3123631, 13.751325, 40.752693, -40.537956, ...","[0.0055783167, 0.006282449, -0.0057957303, 0.0...",18.275015,16.204138,0.250928,"[-4.033567515703348, -7.808617578102992, -7.83...","[3.765430871110696, 9.19120905491022, -2.54373...","[156.63250732421875, 2.5760769844055176, 3.806...","[51.281803131103516, 737.6687622070312, 13.462...","[-4.033567515703348, -7.808617578102992, -7.83...","[1.717591334420901, 9.950064335304956, -2.4488...","[156.63250732421875, 2.5760769844055176, 3.806...","[51.281803131103516, 737.6687622070312, 13.462...","[156.63250732421875, -7.953685283660889, -38.0...","[51.281803131103516, 737.6687622070312, 13.462...",-0.001565,-8.594054,-1.364722,-4.070019,-0.000725,-3.079609,"[7.693594932556152, 13.351690292358398, -8.099...","[-0.5604841870175102, -0.5929731388730327, -1....","[7.693594932556152, 13.933950424194336, 2.5906..."


1069652


In [30]:
result_af_df = result_af_df.drop_duplicates(subset=["protein_id", "mutation"])

In [31]:
display(result_af_df.head(2))
print(len(result_af_df))  # 216263

Unnamed: 0,protein_id,mutation,alphafold_core_score_plddt,alphafold_core_features_experimentally_resolved,alphafold_core_features_predicted_lddt,alphafold_core_features_msa_first_row,alphafold_core_features_single,alphafold_core_features_structure_module,alphafold_core_score_predicted_aligned_error_row_mean,alphafold_core_score_predicted_aligned_error_col_mean,alphafold_core_score_predicted_aligned_error_diag,alphafold_core_features_distogram_row_mean,alphafold_core_features_pair_row_mean,alphafold_core_features_distogram_row_max,alphafold_core_features_pair_row_max,alphafold_core_features_distogram_col_mean,alphafold_core_features_pair_col_mean,alphafold_core_features_distogram_col_max,alphafold_core_features_pair_col_max,alphafold_core_features_distogram_diag,alphafold_core_features_pair_diag,alphafold_core_score_msa_logproba_first_wt,alphafold_core_score_msa_logproba_first_mut,alphafold_core_score_msa_logproba_mean_wt,alphafold_core_score_msa_logproba_mean_mut,alphafold_core_score_msa_logproba_max_wt,alphafold_core_score_msa_logproba_max_mut,alphafold_core_features_msa_first,alphafold_core_features_msa_mean,alphafold_core_features_msa_max
0,P26678,R9C,60.591231,"[1.1957175, 1.2158396, 1.4719502, 1.0110891, 1...","[-6.112609, -7.1444016, -6.2029376, -5.485528,...","[-1.8621206, 4.2266498, -12.190193, 5.9252286,...","[19.881775, -7.916237, 24.432047, -24.314926, ...","[0.0069257915, 0.010214679, -0.0057431664, 0.0...",17.316183,15.080112,0.260079,"[-4.09479360626294, -8.069414707330557, -8.000...","[2.7896990598394322, 10.789292047230097, -2.49...","[155.1204071044922, 2.5760769844055176, 3.8069...","[34.4866943359375, 747.40869140625, 18.8361473...","[-4.09479360626294, -8.069414707330557, -8.000...","[1.7381559366790147, 11.034781367159807, -4.02...","[155.1204071044922, 2.5760769844055176, 3.8069...","[34.4866943359375, 747.40869140625, 18.8361473...","[155.1204071044922, -9.181709289550781, -33.72...","[34.4866943359375, 747.40869140625, 18.8361473...",-0.002302,-9.826518,-1.083345,-4.675444,-0.000687,-3.604242,"[0.6741787195205688, 3.8331971168518066, -18.1...","[-0.5440018840838136, -1.4893573264437399, -2....","[4.5283708572387695, 11.694952964782715, 5.611..."
1,P26678,T8P,60.938517,"[-0.9147486, -0.931573, -1.0168346, -1.1108581...","[-5.815353, -7.58807, -6.37847, -5.5572815, -4...","[5.016503, 6.580977, -4.220338, 2.137466, -4.0...","[3.3123631, 13.751325, 40.752693, -40.537956, ...","[0.0055783167, 0.006282449, -0.0057957303, 0.0...",18.275015,16.204138,0.250928,"[-4.033567515703348, -7.808617578102992, -7.83...","[3.765430871110696, 9.19120905491022, -2.54373...","[156.63250732421875, 2.5760769844055176, 3.806...","[51.281803131103516, 737.6687622070312, 13.462...","[-4.033567515703348, -7.808617578102992, -7.83...","[1.717591334420901, 9.950064335304956, -2.4488...","[156.63250732421875, 2.5760769844055176, 3.806...","[51.281803131103516, 737.6687622070312, 13.462...","[156.63250732421875, -7.953685283660889, -38.0...","[51.281803131103516, 737.6687622070312, 13.462...",-0.001565,-8.594054,-1.364722,-4.070019,-0.000725,-3.079609,"[7.693594932556152, 13.351690292358398, -8.099...","[-0.5604841870175102, -0.5929731388730327, -1....","[7.693594932556152, 13.933950424194336, 2.5906..."


216263


### CCMpred

In [10]:
ccmpred_result_dir = NOTEBOOK_DIR.parent.joinpath("31_run_ccmpred", DATASET_NAME)

In [14]:
present_files, missing_files = get_result_files(ccmpred_result_dir)

len(present_files), len(missing_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4182.0), HTML(value='')))




(3715, 467)

In [15]:
result_ccmpred_df = read_files(present_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3715.0), HTML(value='')))




In [16]:
display(result_ccmpred_df.head(2))
print(len(result_ccmpred_df))

Unnamed: 0,protein_id,mutation,mutation_id,effect,ccmpred_logproba_i_wt,ccmpred_logproba_i_mut,ccmpred_logproba_ij_ab_wt,ccmpred_logproba_ji_ab_wt,ccmpred_logproba_ij_ba_wt,ccmpred_logproba_ji_ba_wt,ccmpred_logproba_ij_ab_mut,ccmpred_logproba_ji_ab_mut,ccmpred_logproba_ij_ba_mut,ccmpred_logproba_ji_ba_mut
0,P26678,R9C,NM_002667.3:c.25C>T,Pathogenic,3.80785,-0.696719,-3e-05,-3e-05,-0.000238,-0.000238,-6.5e-05,-6.5e-05,-0.000246,-0.000246
1,P26678,T8P,NM_002667.3:c.22A>C,Uncertain significance,3.87664,-0.697235,-3.1e-05,-3.1e-05,-7.7e-05,-7.7e-05,-3.3e-05,-3.3e-05,0.000797,0.000797


183057


## Fix 1char protein_id columns

In [32]:
assert len(result_msa_df) == len(result_msa_df["mutation_id"].drop_duplicates()) 

mutation_id_to_protein_id = result_msa_df[["protein_id", "mutation_id"]].drop_duplicates().set_index("mutation_id").to_dict()

In [33]:
for name, df in [
    ("ps", result_ps_df),
    ("pb", result_pb_df),
    ("ra", result_ra_df),
    ("msa", result_msa_df),
    ("af", result_af_df),
]:
    print(
        name,
        len(df),
        df["protein_id"].isnull().sum(),
        df["protein_id"].str.len().value_counts().to_dict(),
    )

ps 221816 0 {6: 221270, 1: 546}
pb 221793 0 {6: 221247, 1: 546}
ra 221793 0 {6: 221793}
msa 221793 0 {6: 221793}
af 216263 0 {6: 216263}


In [34]:
mask = result_ps_df["protein_id"].str.len() == 1
result_ps_df.loc[mask, "protein_id"] = result_ps_df.loc[mask, "mutation_id"].map(mutation_id_to_protein_id)

In [35]:
mask = result_pb_df["protein_id"].str.len() == 1
result_pb_df.loc[mask, "protein_id"] = result_pb_df.loc[mask, "mutation_id"].map(mutation_id_to_protein_id)

In [36]:
for name, df in [
    ("ps", result_ps_df),
    ("pb", result_pb_df),
    ("ra", result_ra_df),
    ("msa", result_msa_df),
    ("af", result_af_df),
]:
    print(
        name,
        len(df),
        df["protein_id"].isnull().sum(),
        df["protein_id"].str.len().value_counts().to_dict(),
    )

ps 221816 546 {6.0: 221270}
pb 221793 546 {6.0: 221247}
ra 221793 0 {6: 221793}
msa 221793 0 {6: 221793}
af 216263 0 {6: 216263}


## Combine results

In [37]:
len(result_ps_df) == len(result_ps_df["mutation_id"].unique())

True

In [38]:
result_df = (
    result_ps_df.merge(
        result_pb_df, on=["protein_id", "mutation", "mutation_id", "effect"], how="left"
    )
    .merge(
        result_ra_df, on=["protein_id", "mutation", "mutation_id", "effect"], how="left"
    )
    .merge(result_msa_df, on=["protein_id", "mutation", "mutation_id"], how="left")
    .merge(result_af_df, on=["protein_id", "mutation"], how="left")
    .merge(result_ccmpred_df, on=["protein_id", "mutation"], how="left")
)

# assert len(result_df) == len(result_df["mutation_id"].unique())
assert not (
    set(result_ps_df["mutation_id"])
    | set(result_pb_df["mutation_id"])
    | set(result_ra_df["mutation_id"])
    | set(result_msa_df["mutation_id"])
    # | set(result_af_df["mutation_id"])
) - set(result_df["mutation_id"])

In [39]:
display(result_df.head(2))
print(len(result_df))

Unnamed: 0,protein_id,mutation,mutation_id,effect,proteinsolver_core_score_wt,proteinsolver_core_score_mut,proteinsolver_core_features_residue_wt,proteinsolver_core_features_protein_wt,proteinsolver_core_features_residue_mut,proteinsolver_core_features_protein_mut,protbert_core_score_wt,protbert_core_score_mut,protbert_core_features_residue_wt,protbert_core_features_protein_wt,protbert_core_features_residue_mut,protbert_core_features_protein_mut,rosetta_cart_bonded_wt,rosetta_rama_prepro_wt,rosetta_ref_wt,rosetta_hxl_tors_wt,rosetta_p_aa_pp_wt,rosetta_fa_dun_semi_wt,rosetta_fa_dun_rot_wt,rosetta_fa_dun_dev_wt,rosetta_omega_wt,rosetta_dslf_fa13_wt,rosetta_hbond_sc_wt,rosetta_hbond_bb_sc_wt,rosetta_hbond_lr_bb_wt,rosetta_hbond_sr_bb_wt,rosetta_fa_intra_elec_wt,rosetta_fa_elec_wt,rosetta_lk_ball_bridge_uncpl_wt,rosetta_lk_ball_bridge_wt,rosetta_lk_ball_iso_wt,rosetta_lk_ball_wt,rosetta_fa_intra_sol_xover4_wt,rosetta_fa_intra_rep_xover4_wt,rosetta_fa_intra_atr_xover4_wt,rosetta_fa_sol_wt,rosetta_fa_rep_wt,rosetta_fa_atr_wt,rosetta_dg_wt,rosetta_cart_bonded_change,rosetta_rama_prepro_change,rosetta_ref_change,rosetta_hxl_tors_change,rosetta_p_aa_pp_change,rosetta_fa_dun_semi_change,rosetta_fa_dun_rot_change,rosetta_fa_dun_dev_change,rosetta_omega_change,rosetta_dslf_fa13_change,rosetta_hbond_sc_change,rosetta_hbond_bb_sc_change,rosetta_hbond_lr_bb_change,rosetta_hbond_sr_bb_change,rosetta_fa_intra_elec_change,rosetta_fa_elec_change,rosetta_lk_ball_bridge_uncpl_change,rosetta_lk_ball_bridge_change,rosetta_lk_ball_iso_change,rosetta_lk_ball_change,rosetta_fa_intra_sol_xover4_change,rosetta_fa_intra_rep_xover4_change,rosetta_fa_intra_atr_xover4_change,rosetta_fa_sol_change,rosetta_fa_rep_change,rosetta_fa_atr_change,rosetta_dg_change,msa_count_wt,msa_count_mut,msa_count_total,msa_proba_wt,msa_proba_mut,msa_proba_total,msa_length,msa_proba,msa_H,msa_KL,alphafold_core_score_plddt,alphafold_core_features_experimentally_resolved,alphafold_core_features_predicted_lddt,alphafold_core_features_msa_first_row,alphafold_core_features_single,alphafold_core_features_structure_module,alphafold_core_score_predicted_aligned_error_row_mean,alphafold_core_score_predicted_aligned_error_col_mean,alphafold_core_score_predicted_aligned_error_diag,alphafold_core_features_distogram_row_mean,alphafold_core_features_pair_row_mean,alphafold_core_features_distogram_row_max,alphafold_core_features_pair_row_max,alphafold_core_features_distogram_col_mean,alphafold_core_features_pair_col_mean,alphafold_core_features_distogram_col_max,alphafold_core_features_pair_col_max,alphafold_core_features_distogram_diag,alphafold_core_features_pair_diag,alphafold_core_score_msa_logproba_first_wt,alphafold_core_score_msa_logproba_first_mut,alphafold_core_score_msa_logproba_mean_wt,alphafold_core_score_msa_logproba_mean_mut,alphafold_core_score_msa_logproba_max_wt,alphafold_core_score_msa_logproba_max_mut,alphafold_core_features_msa_first,alphafold_core_features_msa_mean,alphafold_core_features_msa_max
0,P26678,R9C,NM_002667.3:c.25C>T,Pathogenic,0.091593,0.003965,"[0.48249053955078125, 0.24484702944755554, -0....","[-2.27313494682312, 0.6932704448699951, -0.195...","[-1.904166340827942, 1.0505309104919434, 0.088...","[-2.3197972774505615, 0.8016344308853149, -0.1...",0.068529,0.007849,"[0.15305250883102417, -0.11011786758899689, 0....","[0.05504663661122322, -0.04564127326011658, 0....","[0.09072457253932953, -0.12460881471633911, 0....","[0.02796473540365696, -0.055114783346652985, 0...",34.796667,18.080333,13.08,5.875333,-4.878333,33.826333,30.42,37.201667,16.900333,0.0,0.0,-0.495,0.0,-46.239,-5.748,-96.294333,-1.515667,-0.220333,-145.437333,123.757667,12.501667,9.146667,-22.659333,279.312667,18.309,-311.028667,-1.308,-0.577667,0.614667,4.553,0.021333,0.376,0.173,-1.592,-0.074667,-0.012667,0.0,0.0,0.495,0.0,-0.011,-0.510333,0.204667,0.508667,0.04,1.679,-0.050333,-0.104333,-0.147,0.325,-4.542667,0.084667,2.215667,3.669,87.0,0.0,100.0,-0.310155,-4.787492,-88.633441,101.0,-0.536553,0.326639,2.625994,60.591231,"[1.1957175, 1.2158396, 1.4719502, 1.0110891, 1...","[-6.112609, -7.1444016, -6.2029376, -5.485528,...","[-1.8621206, 4.2266498, -12.190193, 5.9252286,...","[19.881775, -7.916237, 24.432047, -24.314926, ...","[0.0069257915, 0.010214679, -0.0057431664, 0.0...",17.316183,15.080112,0.260079,"[-4.09479360626294, -8.069414707330557, -8.000...","[2.7896990598394322, 10.789292047230097, -2.49...","[155.1204071044922, 2.5760769844055176, 3.8069...","[34.4866943359375, 747.40869140625, 18.8361473...","[-4.09479360626294, -8.069414707330557, -8.000...","[1.7381559366790147, 11.034781367159807, -4.02...","[155.1204071044922, 2.5760769844055176, 3.8069...","[34.4866943359375, 747.40869140625, 18.8361473...","[155.1204071044922, -9.181709289550781, -33.72...","[34.4866943359375, 747.40869140625, 18.8361473...",-0.002302,-9.826518,-1.083345,-4.675444,-0.000687,-3.604242,"[0.6741787195205688, 3.8331971168518066, -18.1...","[-0.5440018840838136, -1.4893573264437399, -2....","[4.5283708572387695, 11.694952964782715, 5.611..."
1,P26678,T8P,NM_002667.3:c.22A>C,Uncertain significance,0.041601,0.000594,"[2.043767213821411, 0.9361261129379272, -0.415...","[-2.27313494682312, 0.6932705044746399, -0.195...","[0.12865065038204193, 0.6873304843902588, 2.20...","[-2.2835092544555664, 0.6785972118377686, -0.1...",0.062104,0.02722,"[0.056363195180892944, -0.020384633913636208, ...","[0.05504663661122322, -0.04564127326011658, 0....","[0.03299185633659363, -0.007563109043985605, 0...","[0.04508848860859871, -0.052698392421007156, 0...",35.531,17.849,13.08,9.764,-5.052,33.549,30.357,43.524,16.889,0.0,0.0,-0.743,0.0,-45.885,-4.684,-97.176,-1.163,-0.196,-144.679,123.189,12.223,9.169,-22.419,278.164,18.065,-310.411,8.945,12.124333,2.623,-3.697,-2.731667,0.545667,1.057667,0.810667,0.199333,1.852333,0.0,-1.577,0.007667,0.0,1.358,0.671,1.583667,0.171667,0.014,0.872,0.017,-0.134,-0.1,0.435333,-1.319333,6.835,-1.977333,19.644,95.0,0.0,100.0,-0.223144,-4.787492,-89.393727,101.0,-0.536553,0.074447,2.917469,60.938517,"[-0.9147486, -0.931573, -1.0168346, -1.1108581...","[-5.815353, -7.58807, -6.37847, -5.5572815, -4...","[5.016503, 6.580977, -4.220338, 2.137466, -4.0...","[3.3123631, 13.751325, 40.752693, -40.537956, ...","[0.0055783167, 0.006282449, -0.0057957303, 0.0...",18.275015,16.204138,0.250928,"[-4.033567515703348, -7.808617578102992, -7.83...","[3.765430871110696, 9.19120905491022, -2.54373...","[156.63250732421875, 2.5760769844055176, 3.806...","[51.281803131103516, 737.6687622070312, 13.462...","[-4.033567515703348, -7.808617578102992, -7.83...","[1.717591334420901, 9.950064335304956, -2.4488...","[156.63250732421875, 2.5760769844055176, 3.806...","[51.281803131103516, 737.6687622070312, 13.462...","[156.63250732421875, -7.953685283660889, -38.0...","[51.281803131103516, 737.6687622070312, 13.462...",-0.001565,-8.594054,-1.364722,-4.070019,-0.000725,-3.079609,"[7.693594932556152, 13.351690292358398, -8.099...","[-0.5604841870175102, -0.5929731388730327, -1....","[7.693594932556152, 13.933950424194336, 2.5906..."


221816


## Calculate EL2 score

In [40]:
model = el2.ELASPIC2()

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
Some weights of the model checkpoint at /gpfs/fs0/scratch/p/pmkim/strokach/workspace/elaspic2/src/elaspic2/plugins/protbert/data/prot_bert_bfd were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClas

In [41]:
proteinsolver_columnms = [c for c in result_df if c.startswith("proteinsolver_core_score")]
protbert_columns = [c for c in result_df if c.startswith("protbert_core_")]

el2_missing = result_df[proteinsolver_columnms + protbert_columns].isnull().any(axis=1)
result_df["el2_score"] = np.nan
result_df.loc[~el2_missing, "el2_score"] = model.predict_mutation_effect(
    [
        t._asdict()
        for t in result_df.loc[~el2_missing, proteinsolver_columnms + protbert_columns].itertuples(
            index=False
        )
    ]
)

## Calculate deltas

In [42]:
for column in list(result_df):
    if not column.endswith("_mut"):
        continue

    column_wt = column.removesuffix("_mut") + "_wt"
    if column_wt not in result_df:
        print(column_wt)
        continue

    column_change = column.removesuffix("_mut") + "_change"
    result_df[column_change] = result_df[column_wt] - result_df[column]
    del result_df[column]

## Encode mutation

In [43]:
# For historic reasons, have to use analyze_msa.RESIDUE_ENCODING_WT/MUT
result_df["aa_wt_onehot"] = (
    result_df["mutation"].str[0].map(analyze_msa.RESIDUE_ENCODING_WT).apply(list)
)
result_df["aa_mut_onehot"] = (
    result_df["mutation"].str[-1].map(analyze_msa.RESIDUE_ENCODING_MUT).apply(list)
)

## Save results

In [44]:
output_file = NOTEBOOK_DIR.joinpath("combined-results.parquet")

output_file

PosixPath('/gpfs/fs0/scratch/p/pmkim/strokach/workspace/elaspic2-cagi6/notebooks/37_cagi6_sherloc_combine_results/combined-results.parquet')

In [45]:
pq.write_table(
    pa.Table.from_pandas(result_df, preserve_index=False),
    output_file,
    row_group_size=10_000,
)

## Exploratory data analysis

In [46]:
proteinsolver_columnms = [c for c in result_df if c.startswith("proteinsolver_")]
protbert_columns = [c for c in result_df if c.startswith("protbert_")]
rosetta_columns = [c for c in result_df if c.startswith("rosetta_")]
alphafold_columns = [c for c in result_df if c.startswith("alphafold_")]

In [47]:
result_df["effect"].value_counts()

Uncertain significance    147067
Likely benign              11398
Benign                      9730
Pathogenic                  5834
Likely pathogenic           2506
Name: effect, dtype: int64

In [48]:
effect_map = {
    "Uncertain significance": 0,
    "Likely benign": -1,
    "Benign": -2,
    "Likely pathogenic": 1,
    "Pathogenic": 2,
}

result_df["effect_score"] = result_df["effect"].map(effect_map)

In [49]:
score_columns = [
    "el2_score",
    "alphafold_core_score_msa_logproba_mean_change",
    "protbert_core_score_change",
    "proteinsolver_core_score_change",
    "msa_KL",
    "rosetta_dg_change",
]

df = result_df.dropna(subset=score_columns + ["effect_score"])
df = df[df["effect_score"] != 0].reset_index(drop=True)

for col in score_columns:
    corr = stats.spearmanr(df["effect_score"], df[col])
    auc = metrics.roc_auc_score(df["effect_score"] > 0, df[col])
    print(col, corr[0], auc)

el2_score 0.5009009313320716 0.8733030112155488
alphafold_core_score_msa_logproba_mean_change 0.490884351852328 0.8601820569629077
protbert_core_score_change 0.42966405016149745 0.8192752936611206
proteinsolver_core_score_change 0.29156189775258534 0.712874183165052
msa_KL 0.35563523056628843 0.7697417905539332
rosetta_dg_change 0.24768921716995493 0.6824360622464656


In [50]:
score_columns = [
    "el2_score",
    "alphafold_core_score_msa_logproba_mean_change",
    "protbert_core_score_change",
    "proteinsolver_core_score_change",
    "msa_KL",
    "rosetta_dg_change",
]

for column in score_columns:
    print(f"{column} {result_df[column].isnull().sum()}")

el2_score 23
alphafold_core_score_msa_logproba_mean_change 546
protbert_core_score_change 23
proteinsolver_core_score_change 0
msa_KL 569
rosetta_dg_change 714
