## Summary

---

## Imports

In [1]:
import functools
from pathlib import Path

import elaspic2 as el2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from scipy import stats
from sklearn import metrics, model_selection
from tqdm.auto import tqdm



In [2]:
# Let wide/long frames render in full (up to 1000 columns / rows).
# The fully-qualified "display.*" keys are required: the bare "max_columns" /
# "max_rows" patterns also match "styler.render.max_columns" /
# "styler.render.max_rows" in pandas >= 1.4, and an ambiguous pattern raises
# OptionError.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

## Parameters

In [70]:
# Output directory for this notebook. The relative name is resolved against the
# kernel's current working directory, so the notebook is expected to be started
# from the notebooks folder.
NOTEBOOK_DIR = Path("37_cagi6_sherloc_combine_results").resolve()
# Create the directory on first run; a pre-existing directory is not an error.
NOTEBOOK_DIR.mkdir(exist_ok=True)

NOTEBOOK_DIR

PosixPath('/home/kimlab5/strokach/workspace/elaspic/elaspic2-cagi6/notebooks/37_cagi6_sherloc_combine_results')

## Load results

In [4]:
# Name of the dataset sub-directory inside each upstream "31_run_*" results folder.
DATASET_NAME = "cagi6-sherloc"
# Total number of shards each upstream job was split into; shard files are
# named "shard-{i}-of-{TASK_COUNT}.parquet".
TASK_COUNT = 4182

DATASET_NAME, TASK_COUNT

('cagi6-sherloc', 4182)

In [5]:
def get_result_files(result_dir, task_count=None):
    """Split the expected shard files of a results directory into present and missing.

    Shards are expected at ``result_dir/shard-{i}-of-{task_count}.parquet`` for
    ``i`` running from 1 to ``task_count`` inclusive.

    Args:
        result_dir: Directory that holds the shard files (``str`` or ``Path``).
        task_count: Total number of shards. Defaults to the notebook-level
            ``TASK_COUNT`` so existing one-argument calls keep working.

    Returns:
        A ``(present_files, missing_files)`` tuple of two lists of ``Path``
        objects, each in ascending shard order.
    """
    if task_count is None:
        task_count = TASK_COUNT
    # Accept plain strings as well as Path objects.
    result_dir = Path(result_dir)

    present_files = []
    missing_files = []
    for i in tqdm(range(1, task_count + 1)):
        path = result_dir.joinpath(f"shard-{i}-of-{task_count}.parquet")
        if path.is_file():
            present_files.append(path)
        else:
            missing_files.append(path)
    return present_files, missing_files

In [6]:
def read_files(files):
    """Load every parquet file in ``files`` and stack them into one DataFrame.

    Each file is read with pyarrow and converted to pandas with
    ``integer_object_nulls=True``. The per-file frames are concatenated in input
    order and the result gets a fresh 0..n-1 index.
    """
    frames = [
        pq.read_table(shard_path).to_pandas(integer_object_nulls=True)
        for shard_path in tqdm(files)
    ]
    return pd.concat(frames, ignore_index=True)

### ProteinSolver

In [7]:
# ProteinSolver shards live in a sibling notebook folder, one sub-directory per dataset.
ps_result_dir = NOTEBOOK_DIR.parent / "31_run_proteinsolver" / DATASET_NAME

In [8]:
# Split the expected ProteinSolver shard paths into those found on disk and those absent.
present_files, missing_files = get_result_files(ps_result_dir)

# Display how many shards are present vs. missing.
len(present_files), len(missing_files)

  0%|          | 0/4182 [00:00<?, ?it/s]

(4182, 0)

In [9]:
# Load all available ProteinSolver shards into a single frame.
result_ps_df = read_files(present_files)

  0%|          | 0/4182 [00:00<?, ?it/s]

In [10]:
# Preview the first two ProteinSolver rows and print the total row count.
display(result_ps_df.head(2))
print(len(result_ps_df))

Unnamed: 0,protein_id,mutation,mutation_id,effect,proteinsolver_core_score_wt,proteinsolver_core_score_mut,proteinsolver_core_features_residue_wt,proteinsolver_core_features_protein_wt,proteinsolver_core_features_residue_mut,proteinsolver_core_features_protein_mut
0,P26678,R9C,NM_002667.3:c.25C>T,Pathogenic,0.091593,0.003965,"[0.48249053955078125, 0.24484702944755554, -0....","[-2.27313494682312, 0.6932704448699951, -0.195...","[-1.904166340827942, 1.0505309104919434, 0.088...","[-2.3197972774505615, 0.8016344308853149, -0.1..."
1,P26678,T8P,NM_002667.3:c.22A>C,Uncertain significance,0.041601,0.000594,"[2.043767213821411, 0.9361261129379272, -0.415...","[-2.27313494682312, 0.6932705044746399, -0.195...","[0.12865065038204193, 0.6873304843902588, 2.20...","[-2.2835092544555664, 0.6785972118377686, -0.1..."


221816


### ProtBert

In [11]:
# ProtBert shards live in a sibling notebook folder, one sub-directory per dataset.
pb_result_dir = NOTEBOOK_DIR.parent / "31_run_protbert" / DATASET_NAME

In [12]:
# Split the expected ProtBert shard paths into those found on disk and those absent.
# NOTE: this rebinds present_files / missing_files from the previous section.
present_files, missing_files = get_result_files(pb_result_dir)

# Display how many shards are present vs. missing.
len(present_files), len(missing_files)

  0%|          | 0/4182 [00:00<?, ?it/s]

(4071, 111)

In [13]:
# Load all available ProtBert shards into a single frame.
result_pb_df = read_files(present_files)

  0%|          | 0/4071 [00:00<?, ?it/s]

In [14]:
# Preview the first two ProtBert rows and print the total row count.
display(result_pb_df.head(2))
print(len(result_pb_df))

Unnamed: 0,protein_id,mutation,mutation_id,effect,protbert_core_score_wt,protbert_core_score_mut,protbert_core_features_residue_wt,protbert_core_features_protein_wt,protbert_core_features_residue_mut,protbert_core_features_protein_mut
0,P26678,R9C,NM_002667.3:c.25C>T,Pathogenic,0.068529,0.007849,"[0.15305250883102417, -0.11011786758899689, 0....","[0.05504663661122322, -0.04564127326011658, 0....","[0.09072457253932953, -0.12460881471633911, 0....","[0.02796473540365696, -0.055114783346652985, 0..."
1,P26678,T8P,NM_002667.3:c.22A>C,Uncertain significance,0.062104,0.02722,"[0.056363195180892944, -0.020384633913636208, ...","[0.05504663661122322, -0.04564127326011658, 0....","[0.03299185633659363, -0.007563109043985605, 0...","[0.04508848860859871, -0.052698392421007156, 0..."


215728


### MSA

### AlphaFold

In [15]:
# AlphaFold (wild-type + mutant) shards live in a sibling notebook folder, one sub-directory per dataset.
af_result_dir = NOTEBOOK_DIR.parent / "31_run_alphafold" / DATASET_NAME

In [16]:
# Split the expected AlphaFold shard paths into those found on disk and those absent.
# NOTE: this rebinds present_files / missing_files from the previous section.
present_files, missing_files = get_result_files(af_result_dir)

# Display how many shards are present vs. missing.
len(present_files), len(missing_files)

  0%|          | 0/4182 [00:00<?, ?it/s]

(2609, 1573)

In [17]:
# Load all available AlphaFold shards into a single frame.
result_af_df = read_files(present_files)

  0%|          | 0/2609 [00:00<?, ?it/s]

In [18]:
# Preview the first two AlphaFold rows and print the total row count.
display(result_af_df.head(2))
print(len(result_af_df))

Unnamed: 0,protein_id,mutation,mutation_id,effect,alphafold_core_scores_residue_plddt_wt,alphafold_core_scores_protein_plddt_wt,alphafold_core_scores_protein_max_predicted_aligned_error_wt,alphafold_core_scores_proten_ptm_wt,alphafold_core_features_residue_experimentally_resolved_wt,alphafold_core_features_residue_predicted_lddt_wt,alphafold_core_features_residue_msa_first_row_wt,alphafold_core_features_residue_single_wt,alphafold_core_features_residue_structure_module_wt,alphafold_core_features_protein_experimentally_resolved_wt,alphafold_core_features_protein_predicted_lddt_wt,alphafold_core_features_protein_msa_first_row_wt,alphafold_core_features_protein_single_wt,alphafold_core_features_protein_structure_module_wt,alphafold_core_scores_residue_plddt_mut,alphafold_core_scores_protein_plddt_mut,alphafold_core_scores_protein_max_predicted_aligned_error_mut,alphafold_core_scores_proten_ptm_mut,alphafold_core_features_residue_experimentally_resolved_mut,alphafold_core_features_residue_predicted_lddt_mut,alphafold_core_features_residue_msa_first_row_mut,alphafold_core_features_residue_single_mut,alphafold_core_features_residue_structure_module_mut,alphafold_core_features_protein_experimentally_resolved_mut,alphafold_core_features_protein_predicted_lddt_mut,alphafold_core_features_protein_msa_first_row_mut,alphafold_core_features_protein_single_mut,alphafold_core_features_protein_structure_module_mut
0,P26678,R9C,NM_002667.3:c.25C>T,Pathogenic,60.585841,76.416854,31.75,0.427586,"[1.1943495, 1.2143694, 1.4703865, 1.009636, 1....","[-6.1107526, -7.1425943, -6.2013907, -5.484081...","[-1.8559321, 4.2250576, -12.1879635, 5.93118, ...","[19.911064, -7.9082317, 24.44313, -24.324299, ...","[0.0069303215, 0.010208584, -0.0057431404, 0.0...","[0.47212097, 0.49641332, 0.6147766, 0.47432983...","[-6.029762, -7.716437, -6.686019, -5.9195952, ...","[1.6403109, 2.4395254, -3.7391555, 2.6988323, ...","[8.636584, 8.638843, 21.079756, -9.497276, -2....","[0.0024530103, 0.010949803, -0.00598975, 0.002...",60.582598,76.403723,31.75,0.427867,"[1.0641325, 1.082403, 1.2580698, 0.9167514, 1....","[-6.103961, -7.1548443, -6.2082458, -5.489683,...","[-1.6113122, 3.3064485, -10.750493, 4.08933, -...","[12.559908, -16.30187, 21.260107, -27.38837, 8...","[0.006838836, 0.010236204, -0.005743468, 0.001...","[0.4698432, 0.4942151, 0.61180365, 0.47257632,...","[-6.025748, -7.713504, -6.682801, -5.9158244, ...","[1.659786, 2.444791, -3.6953018, 2.7094986, -4...","[8.545251, 8.564394, 21.180698, -9.621022, -2....","[0.0024241176, 0.010975119, -0.005989004, 0.00..."
1,P26678,T8P,NM_002667.3:c.22A>C,Uncertain significance,60.939363,76.416854,31.75,0.427586,"[-0.9143661, -0.9311734, -1.0163504, -1.110276...","[-5.8135767, -7.5865355, -6.377211, -5.5560055...","[5.013483, 6.58278, -4.2263947, 2.135174, -4.0...","[3.3193378, 13.778628, 40.745605, -40.561554, ...","[0.005573705, 0.0062753484, -0.005795421, 0.00...","[0.47212097, 0.49641332, 0.6147766, 0.47432983...","[-6.029762, -7.716437, -6.686019, -5.9195952, ...","[1.6403109, 2.4395254, -3.7391555, 2.6988323, ...","[8.636584, 8.638843, 21.079756, -9.497276, -2....","[0.0024530103, 0.010949803, -0.00598975, 0.002...",60.87805,76.477451,31.75,0.428083,"[-0.82701945, -0.8436323, -0.9510052, -1.03694...","[-5.8004627, -7.553446, -6.3580575, -5.5442915...","[5.503637, 5.029438, -2.695966, 2.3204188, -4....","[1.4206113, 14.201282, 40.124996, -40.59988, -...","[0.005509153, 0.006440036, -0.005802607, 0.002...","[0.47837487, 0.5026877, 0.6215101, 0.4798556, ...","[-6.0175996, -7.697442, -6.6742005, -5.9090075...","[1.6779923, 2.4558294, -3.6973171, 2.6875021, ...","[8.556296, 8.658266, 21.207325, -9.4746275, -2...","[0.002478923, 0.010917299, -0.0059956484, 0.00..."


109692


### AlphaFold WT

In [36]:
# Wild-type-only AlphaFold shards live in a sibling notebook folder, one sub-directory per dataset.
afwt_result_dir = NOTEBOOK_DIR.parent / "31_run_alphafold_wt" / DATASET_NAME

In [37]:
# Split the expected wild-type AlphaFold shard paths into those found on disk and those absent.
# NOTE: this rebinds present_files / missing_files from the previous section.
present_files, missing_files = get_result_files(afwt_result_dir)

# Display how many shards are present vs. missing.
len(present_files), len(missing_files)

  0%|          | 0/4182 [00:00<?, ?it/s]

(381, 3801)

In [43]:
# Load the wild-type AlphaFold shards and index them by protein_id for the
# per-protein lookups done further down. The index is not checked for
# uniqueness here; the lookup loop handles proteins that map to several rows.
result_afwt_df = read_files(present_files).set_index("protein_id")

  0%|          | 0/381 [00:00<?, ?it/s]

In [44]:
# Preview the first two wild-type AlphaFold rows and print the total row count.
display(result_afwt_df.head(2))
print(len(result_afwt_df))

Unnamed: 0_level_0,plddt,max_predicted_aligned_error,ptm,experimentally_resolved,predicted_lddt,msa_first_row,single,structure_module
protein_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Q8N5M1,"[35.972237013727856, 26.117611200477302, 25.28...",31.75,0.757356,"[[-1.6529428958892822, -1.745883822441101, -2....","[[-3.664194107055664, -4.06497859954834, -2.82...","[[-2.9925522804260254, 7.615219593048096, -5.5...","[[-65.88994598388672, -1.3906073570251465, -8....","[[0.012274660170078278, 0.004892393946647644, ..."
Q13216,"[70.15048890426986, 77.02306691216653, 78.4936...",31.75,0.855871,"[[-0.2973601818084717, -0.24902477860450745, -...","[[-5.376760005950928, -5.810971260070801, -5.3...","[[-6.265456676483154, -0.4766066074371338, -5....","[[-3.569941520690918, 51.02355194091797, -2.75...","[[4.6797096729278564e-05, 0.011432409286499023..."


381


In [41]:
# Columns that together identify one mutation row in every result table.
mutation_key_columns = ["protein_id", "mutation", "mutation_id", "effect"]

# NOTE: despite the "_protein_ids" suffix, both sets below hold
# (protein_id, mutation, mutation_id, effect) tuples, i.e. one entry per
# mutation. The names are kept because later cells refer to them.
# itertuples(index=False, name=None) yields the same plain tuples as the
# row-wise `.apply(tuple, axis=1)` without building a Series per row.
af_finished_protein_ids = set(
    result_af_df[mutation_key_columns].itertuples(index=False, name=None)
)

# Mutations that have ProtBert results but no full AlphaFold (wt + mut) result.
# NOTE(review): the reference set is the ProtBert table, whereas the final merge
# is based on the ProteinSolver table; mutations absent from the ProtBert results
# are never considered for the wild-type AlphaFold fallback. Confirm this is intended.
af_missing_protein_ids = (
    set(result_pb_df[mutation_key_columns].itertuples(index=False, name=None))
    - af_finished_protein_ids
)

len(af_missing_protein_ids)

109475

In [88]:
# Scratch check: casting an integer array with .astype(np.float32) yields a float32 array.
# The result is only displayed; nothing downstream uses it.
np.array([1,2,3]).astype(np.float32)

array([1., 2., 3.], dtype=float32)

In [92]:
def get_mutation_embeddings(idx, predictions):
    """Extract per-residue and per-protein AlphaFold values for one position.

    Args:
        idx: Zero-based residue index; must be non-negative.
        predictions: Mapping (e.g. one row of the wild-type AlphaFold table)
            with the keys "plddt", "max_predicted_aligned_error", "ptm",
            "experimentally_resolved", "predicted_lddt", "msa_first_row",
            "single" and "structure_module".

    Returns:
        A dict with four "scores_*" entries followed by one
        "features_residue_*" entry per embedding (the value at ``idx``) and one
        "features_protein_*" entry per embedding (the mean over axis 0), all
        embeddings cast to float32. The "scores_proten_ptm" key spelling is
        kept as-is because it matches existing column names.
    """
    assert idx >= 0

    embedding_names = (
        "experimentally_resolved",
        "predicted_lddt",
        "msa_first_row",
        "single",
        "structure_module",
    )
    # Look all embeddings up front so a missing key fails before any output is built.
    embeddings = {name: predictions[name] for name in embedding_names}

    output = {
        "scores_residue_plddt": predictions["plddt"][idx],
        "scores_protein_plddt": np.mean(predictions["plddt"]),
        "scores_protein_max_predicted_aligned_error": predictions["max_predicted_aligned_error"],
        "scores_proten_ptm": predictions["ptm"],
    }
    # Residue-level features first, then protein-level ones, to keep the key order stable.
    for name, values in embeddings.items():
        output[f"features_residue_{name}"] = values[idx].astype(np.float32)
    for name, values in embeddings.items():
        output[f"features_protein_{name}"] = values.mean(axis=0).astype(np.float32)

    return output


# get_mutation_embeddings(0, result_afwt_df.loc["Q8N5M1"])

In [93]:
# For every mutation without a full AlphaFold result, fall back to features
# taken from the wild-type-only AlphaFold predictions of the same protein.
results = []
missing = 0  # number of mutations whose protein has no wild-type prediction
proteins_with_multiple_rows = set()
for protein_id, mutation, mutation_id, effect in tqdm(af_missing_protein_ids):
    try:
        predictions = result_afwt_df.loc[protein_id]
    except KeyError:
        missing += 1
        continue

    # A non-unique index makes .loc return a frame; keep the first row and remember the protein.
    if isinstance(predictions, pd.DataFrame):
        proteins_with_multiple_rows.add(protein_id)
        predictions = predictions.iloc[0]

    # Mutations look like "R9C": the middle part is the 1-based residue position.
    residue_idx = int(mutation[1:-1]) - 1
    features = {}
    for key, value in get_mutation_embeddings(residue_idx, predictions).items():
        features[f"alphafold_core_{key}_wt"] = value

    results.append(
        {
            "protein_id": protein_id,
            "mutation": mutation,
            "mutation_id": mutation_id,
            "effect": effect,
            **features,
        }
    )

result_af2_df = pd.DataFrame(results)
len(result_af2_df), missing

  0%|          | 0/109475 [00:00<?, ?it/s]

(23622, 85853)

### Rosetta

In [19]:
# Rosetta ddG shards live in a sibling notebook folder, one sub-directory per dataset.
ra_result_dir = NOTEBOOK_DIR.parent / "31_run_rosetta_ddg" / DATASET_NAME

In [20]:
# Split the expected Rosetta shard paths into those found on disk and those absent.
# NOTE: this rebinds present_files / missing_files from the previous section.
present_files, missing_files = get_result_files(ra_result_dir)

# Display how many shards are present vs. missing.
len(present_files), len(missing_files)

  0%|          | 0/4182 [00:00<?, ?it/s]

(2289, 1893)

In [21]:
# Load all available Rosetta shards into a single frame.
result_ra_df = read_files(present_files)

  0%|          | 0/2289 [00:00<?, ?it/s]

In [22]:
# Preview the first two Rosetta rows and print the total row count.
display(result_ra_df.head(2))
print(len(result_ra_df))

Unnamed: 0,protein_id,mutation,mutation_id,effect,rosetta_cart_bonded_wt,rosetta_rama_prepro_wt,rosetta_ref_wt,rosetta_hxl_tors_wt,rosetta_p_aa_pp_wt,rosetta_fa_dun_semi_wt,rosetta_fa_dun_rot_wt,rosetta_fa_dun_dev_wt,rosetta_omega_wt,rosetta_dslf_fa13_wt,rosetta_hbond_sc_wt,rosetta_hbond_bb_sc_wt,rosetta_hbond_lr_bb_wt,rosetta_hbond_sr_bb_wt,rosetta_fa_intra_elec_wt,rosetta_fa_elec_wt,rosetta_lk_ball_bridge_uncpl_wt,rosetta_lk_ball_bridge_wt,rosetta_lk_ball_iso_wt,rosetta_lk_ball_wt,rosetta_fa_intra_sol_xover4_wt,rosetta_fa_intra_rep_xover4_wt,rosetta_fa_intra_atr_xover4_wt,rosetta_fa_sol_wt,rosetta_fa_rep_wt,rosetta_fa_atr_wt,rosetta_dg_wt,rosetta_cart_bonded_change,rosetta_rama_prepro_change,rosetta_ref_change,rosetta_hxl_tors_change,rosetta_p_aa_pp_change,rosetta_fa_dun_semi_change,rosetta_fa_dun_rot_change,rosetta_fa_dun_dev_change,rosetta_omega_change,rosetta_dslf_fa13_change,rosetta_hbond_sc_change,rosetta_hbond_bb_sc_change,rosetta_hbond_lr_bb_change,rosetta_hbond_sr_bb_change,rosetta_fa_intra_elec_change,rosetta_fa_elec_change,rosetta_lk_ball_bridge_uncpl_change,rosetta_lk_ball_bridge_change,rosetta_lk_ball_iso_change,rosetta_lk_ball_change,rosetta_fa_intra_sol_xover4_change,rosetta_fa_intra_rep_xover4_change,rosetta_fa_intra_atr_xover4_change,rosetta_fa_sol_change,rosetta_fa_rep_change,rosetta_fa_atr_change,rosetta_dg_change
0,P26678,R9C,NM_002667.3:c.25C>T,Pathogenic,34.796667,18.080333,13.08,5.875333,-4.878333,33.826333,30.42,37.201667,16.900333,0.0,0.0,-0.495,0.0,-46.239,-5.748,-96.294333,-1.515667,-0.220333,-145.437333,123.757667,12.501667,9.146667,-22.659333,279.312667,18.309,-311.028667,-1.308,-0.577667,0.614667,4.553,0.021333,0.376,0.173,-1.592,-0.074667,-0.012667,0.0,0.0,0.495,0.0,-0.011,-0.510333,0.204667,0.508667,0.04,1.679,-0.050333,-0.104333,-0.147,0.325,-4.542667,0.084667,2.215667,3.669
1,P26678,T8P,NM_002667.3:c.22A>C,Uncertain significance,35.531,17.849,13.08,9.764,-5.052,33.549,30.357,43.524,16.889,0.0,0.0,-0.743,0.0,-45.885,-4.684,-97.176,-1.163,-0.196,-144.679,123.189,12.223,9.169,-22.419,278.164,18.065,-310.411,8.945,12.124333,2.623,-3.697,-2.731667,0.545667,1.057667,0.810667,0.199333,1.852333,0.0,-1.577,0.007667,0.0,1.358,0.671,1.583667,0.171667,0.014,0.872,0.017,-0.134,-0.1,0.435333,-1.319333,6.835,-1.977333,19.644


103780


## Combine results

In [94]:
# mutation_id must identify each ProteinSolver row uniquely, since that table is the base of the merge.
result_ps_df["mutation_id"].is_unique

True

In [95]:
# Columns shared by every result table; together they identify one mutation.
merge_keys = ["protein_id", "mutation", "mutation_id", "effect"]

# Left-join every other tool's results onto the ProteinSolver table, so the
# combined frame keeps exactly the ProteinSolver rows. The AlphaFold table is
# the full (wt + mut) results stacked with the wild-type-only fallback rows.
result_df = functools.reduce(
    lambda left, right: left.merge(right, on=merge_keys, how="left"),
    [
        result_pb_df,
        result_ra_df,
        pd.concat([result_af_df, result_af2_df], ignore_index=True),
    ],
    result_ps_df,
)

# The joins must not have duplicated any mutation ...
assert result_df["mutation_id"].nunique(dropna=False) == len(result_df)
# ... and every mutation seen by ProtBert, Rosetta or AlphaFold must be in the result.
merged_mutation_ids = set(result_df["mutation_id"])
assert all(
    set(source_df["mutation_id"]) <= merged_mutation_ids
    for source_df in (result_pb_df, result_ra_df, result_af_df)
)

In [96]:
# Preview the first two combined rows and print the total row count.
display(result_df.head(2))
print(len(result_df))

Unnamed: 0,protein_id,mutation,mutation_id,effect,proteinsolver_core_score_wt,proteinsolver_core_score_mut,proteinsolver_core_features_residue_wt,proteinsolver_core_features_protein_wt,proteinsolver_core_features_residue_mut,proteinsolver_core_features_protein_mut,protbert_core_score_wt,protbert_core_score_mut,protbert_core_features_residue_wt,protbert_core_features_protein_wt,protbert_core_features_residue_mut,protbert_core_features_protein_mut,rosetta_cart_bonded_wt,rosetta_rama_prepro_wt,rosetta_ref_wt,rosetta_hxl_tors_wt,rosetta_p_aa_pp_wt,rosetta_fa_dun_semi_wt,rosetta_fa_dun_rot_wt,rosetta_fa_dun_dev_wt,rosetta_omega_wt,rosetta_dslf_fa13_wt,rosetta_hbond_sc_wt,rosetta_hbond_bb_sc_wt,rosetta_hbond_lr_bb_wt,rosetta_hbond_sr_bb_wt,rosetta_fa_intra_elec_wt,rosetta_fa_elec_wt,rosetta_lk_ball_bridge_uncpl_wt,rosetta_lk_ball_bridge_wt,rosetta_lk_ball_iso_wt,rosetta_lk_ball_wt,rosetta_fa_intra_sol_xover4_wt,rosetta_fa_intra_rep_xover4_wt,rosetta_fa_intra_atr_xover4_wt,rosetta_fa_sol_wt,rosetta_fa_rep_wt,rosetta_fa_atr_wt,rosetta_dg_wt,rosetta_cart_bonded_change,rosetta_rama_prepro_change,rosetta_ref_change,rosetta_hxl_tors_change,rosetta_p_aa_pp_change,rosetta_fa_dun_semi_change,rosetta_fa_dun_rot_change,rosetta_fa_dun_dev_change,rosetta_omega_change,rosetta_dslf_fa13_change,rosetta_hbond_sc_change,rosetta_hbond_bb_sc_change,rosetta_hbond_lr_bb_change,rosetta_hbond_sr_bb_change,rosetta_fa_intra_elec_change,rosetta_fa_elec_change,rosetta_lk_ball_bridge_uncpl_change,rosetta_lk_ball_bridge_change,rosetta_lk_ball_iso_change,rosetta_lk_ball_change,rosetta_fa_intra_sol_xover4_change,rosetta_fa_intra_rep_xover4_change,rosetta_fa_intra_atr_xover4_change,rosetta_fa_sol_change,rosetta_fa_rep_change,rosetta_fa_atr_change,rosetta_dg_change,alphafold_core_scores_residue_plddt_wt,alphafold_core_scores_protein_plddt_wt,alphafold_core_scores_protein_max_predicted_aligned_error_wt,alphafold_core_scores_proten_ptm_wt,alphafold_core_features_residue_experimentally_resolved_wt,alphafold_
core_features_residue_predicted_lddt_wt,alphafold_core_features_residue_msa_first_row_wt,alphafold_core_features_residue_single_wt,alphafold_core_features_residue_structure_module_wt,alphafold_core_features_protein_experimentally_resolved_wt,alphafold_core_features_protein_predicted_lddt_wt,alphafold_core_features_protein_msa_first_row_wt,alphafold_core_features_protein_single_wt,alphafold_core_features_protein_structure_module_wt,alphafold_core_scores_residue_plddt_mut,alphafold_core_scores_protein_plddt_mut,alphafold_core_scores_protein_max_predicted_aligned_error_mut,alphafold_core_scores_proten_ptm_mut,alphafold_core_features_residue_experimentally_resolved_mut,alphafold_core_features_residue_predicted_lddt_mut,alphafold_core_features_residue_msa_first_row_mut,alphafold_core_features_residue_single_mut,alphafold_core_features_residue_structure_module_mut,alphafold_core_features_protein_experimentally_resolved_mut,alphafold_core_features_protein_predicted_lddt_mut,alphafold_core_features_protein_msa_first_row_mut,alphafold_core_features_protein_single_mut,alphafold_core_features_protein_structure_module_mut
0,P26678,R9C,NM_002667.3:c.25C>T,Pathogenic,0.091593,0.003965,"[0.48249053955078125, 0.24484702944755554, -0....","[-2.27313494682312, 0.6932704448699951, -0.195...","[-1.904166340827942, 1.0505309104919434, 0.088...","[-2.3197972774505615, 0.8016344308853149, -0.1...",0.068529,0.007849,"[0.15305250883102417, -0.11011786758899689, 0....","[0.05504663661122322, -0.04564127326011658, 0....","[0.09072457253932953, -0.12460881471633911, 0....","[0.02796473540365696, -0.055114783346652985, 0...",34.796667,18.080333,13.08,5.875333,-4.878333,33.826333,30.42,37.201667,16.900333,0.0,0.0,-0.495,0.0,-46.239,-5.748,-96.294333,-1.515667,-0.220333,-145.437333,123.757667,12.501667,9.146667,-22.659333,279.312667,18.309,-311.028667,-1.308,-0.577667,0.614667,4.553,0.021333,0.376,0.173,-1.592,-0.074667,-0.012667,0.0,0.0,0.495,0.0,-0.011,-0.510333,0.204667,0.508667,0.04,1.679,-0.050333,-0.104333,-0.147,0.325,-4.542667,0.084667,2.215667,3.669,60.585841,76.416854,31.75,0.427586,"[1.1943495, 1.2143694, 1.4703865, 1.009636, 1....","[-6.1107526, -7.1425943, -6.2013907, -5.484081...","[-1.8559321, 4.2250576, -12.1879635, 5.93118, ...","[19.911064, -7.9082317, 24.44313, -24.324299, ...","[0.0069303215, 0.010208584, -0.0057431404, 0.0...","[0.47212097, 0.49641332, 0.6147766, 0.47432983...","[-6.029762, -7.716437, -6.686019, -5.9195952, ...","[1.6403109, 2.4395254, -3.7391555, 2.6988323, ...","[8.636584, 8.638843, 21.079756, -9.497276, -2....","[0.0024530103, 0.010949803, -0.00598975, 0.002...",60.582598,76.403723,31.75,0.427867,"[1.0641325, 1.082403, 1.2580698, 0.9167514, 1....","[-6.103961, -7.1548443, -6.2082458, -5.489683,...","[-1.6113122, 3.3064485, -10.750493, 4.08933, -...","[12.559908, -16.30187, 21.260107, -27.38837, 8...","[0.006838836, 0.010236204, -0.005743468, 0.001...","[0.4698432, 0.4942151, 0.61180365, 0.47257632,...","[-6.025748, -7.713504, -6.682801, -5.9158244, ...","[1.659786, 2.444791, -3.6953018, 2.7094986, -4...","[8.545251, 8.564394, 21.180698, -9.621022, 
-2....","[0.0024241176, 0.010975119, -0.005989004, 0.00..."
1,P26678,T8P,NM_002667.3:c.22A>C,Uncertain significance,0.041601,0.000594,"[2.043767213821411, 0.9361261129379272, -0.415...","[-2.27313494682312, 0.6932705044746399, -0.195...","[0.12865065038204193, 0.6873304843902588, 2.20...","[-2.2835092544555664, 0.6785972118377686, -0.1...",0.062104,0.02722,"[0.056363195180892944, -0.020384633913636208, ...","[0.05504663661122322, -0.04564127326011658, 0....","[0.03299185633659363, -0.007563109043985605, 0...","[0.04508848860859871, -0.052698392421007156, 0...",35.531,17.849,13.08,9.764,-5.052,33.549,30.357,43.524,16.889,0.0,0.0,-0.743,0.0,-45.885,-4.684,-97.176,-1.163,-0.196,-144.679,123.189,12.223,9.169,-22.419,278.164,18.065,-310.411,8.945,12.124333,2.623,-3.697,-2.731667,0.545667,1.057667,0.810667,0.199333,1.852333,0.0,-1.577,0.007667,0.0,1.358,0.671,1.583667,0.171667,0.014,0.872,0.017,-0.134,-0.1,0.435333,-1.319333,6.835,-1.977333,19.644,60.939363,76.416854,31.75,0.427586,"[-0.9143661, -0.9311734, -1.0163504, -1.110276...","[-5.8135767, -7.5865355, -6.377211, -5.5560055...","[5.013483, 6.58278, -4.2263947, 2.135174, -4.0...","[3.3193378, 13.778628, 40.745605, -40.561554, ...","[0.005573705, 0.0062753484, -0.005795421, 0.00...","[0.47212097, 0.49641332, 0.6147766, 0.47432983...","[-6.029762, -7.716437, -6.686019, -5.9195952, ...","[1.6403109, 2.4395254, -3.7391555, 2.6988323, ...","[8.636584, 8.638843, 21.079756, -9.497276, -2....","[0.0024530103, 0.010949803, -0.00598975, 0.002...",60.87805,76.477451,31.75,0.428083,"[-0.82701945, -0.8436323, -0.9510052, -1.03694...","[-5.8004627, -7.553446, -6.3580575, -5.5442915...","[5.503637, 5.029438, -2.695966, 2.3204188, -4....","[1.4206113, 14.201282, 40.124996, -40.59988, -...","[0.005509153, 0.006440036, -0.005802607, 0.002...","[0.47837487, 0.5026877, 0.6215101, 0.4798556, ...","[-6.0175996, -7.697442, -6.6742005, -5.9090075...","[1.6779923, 2.4558294, -3.6973171, 2.6875021, ...","[8.556296, 8.658266, 21.207325, -9.4746275, -2...","[0.002478923, 0.010917299, -0.0059956484, 
0.00..."


221816


## Calculate EL2 score

In [97]:
# Instantiate the ELASPIC2 model; it is used below via predict_mutation_effect.
model = el2.ELASPIC2()



In [98]:
# Input columns handed to ELASPIC2: ProteinSolver score columns and all ProtBert core columns.
# (The "columnms" spelling is kept because the name is reused further down.)
proteinsolver_columnms = [
    name for name in result_df.columns if name.startswith("proteinsolver_core_score")
]
protbert_columns = [name for name in result_df.columns if name.startswith("protbert_core_")]
el2_input_columns = proteinsolver_columnms + protbert_columns

# Rows lacking any input value cannot be scored and keep a NaN el2_score.
el2_missing = result_df[el2_input_columns].isna().any(axis=1)
result_df["el2_score"] = np.nan
result_df.loc[~el2_missing, "el2_score"] = model.predict_mutation_effect(
    [
        # One {column name: value} dict per scorable mutation.
        row._asdict()
        for row in result_df.loc[~el2_missing, el2_input_columns].itertuples(index=False)
    ]
)

## Calculate deltas

In [99]:
# Replace every "<stem>_mut" column with "<stem>_change" = wild-type minus mutant.
# The column list is snapshotted first because the frame is modified inside the loop.
for column in [name for name in result_df.columns if name.endswith("_mut")]:
    stem = column[: -len("_mut")]
    column_wt = stem + "_wt"
    if column_wt in result_df:
        result_df[stem + "_change"] = result_df[column_wt] - result_df[column]
        del result_df[column]
    else:
        # No wild-type counterpart: report it and leave the mutant column untouched.
        print(column_wt)

## Encode mutation

In [100]:
amino_acids = list("ARNDCEQGHILKMFPSTWYV")

# One-hot encode the wild-type (first character) and mutant (last character)
# residue of each mutation string.
#
# The category set is pinned to the 20 amino acids so every vector has length 20
# in the same order, whichever residues occur in the data. Previously
# `amino_acids` was defined but never used, so the width and order of the
# encoding depended on which letters happened to be present.
# Categories are sorted alphabetically, which is the layout a plain
# `pd.get_dummies` call produces when all 20 residues occur, so output for such
# data is unchanged. Letters outside the 20 amino acids encode as an all-zero
# vector instead of adding an extra position.
# The dtype is pinned so the encoded values stay integers rather than becoming
# booleans under the pandas >= 2.0 `get_dummies` default.
onehot_dtype = pd.CategoricalDtype(categories=sorted(amino_acids))
for onehot_column, residue_index in (("aa_wt_onehot", 0), ("aa_mut_onehot", -1)):
    residues = result_df["mutation"].str[residue_index].astype(onehot_dtype)
    result_df[onehot_column] = pd.get_dummies(residues, dtype="uint8").apply(list, axis=1)

## Save results

In [101]:
# Destination of the combined table, inside this notebook's output directory.
output_file = NOTEBOOK_DIR / "combined-results.parquet"

output_file

PosixPath('/home/kimlab5/strokach/workspace/elaspic/elaspic2-cagi6/notebooks/37_cagi6_sherloc_combine_results/combined-results.parquet')

In [103]:
# Persist the combined frame as parquet. The pandas index is not stored, and
# rows are written in row groups of 10,000.
pq.write_table(
    pa.Table.from_pandas(result_df, preserve_index=False),
    output_file,
    row_group_size=10_000,
)

## Exploratory data analysis

In [30]:
# Group the combined columns by originating tool, using the column-name prefix.
# NOTE: this rebinds proteinsolver_columnms / protbert_columns with broader
# prefixes than the EL2-scoring cell uses ("proteinsolver_" here versus
# "proteinsolver_core_score" there), so the lists differ depending on which
# cell ran last.
proteinsolver_columnms = [c for c in result_df if c.startswith("proteinsolver_")]
protbert_columns = [c for c in result_df if c.startswith("protbert_")]
rosetta_columns = [c for c in result_df if c.startswith("rosetta_")]
alphafold_columns = [c for c in result_df if c.startswith("alphafold_")]

In [31]:
# Class balance of the clinical labels. value_counts() skips missing values by
# default, so rows without an effect label are not counted here.
result_df["effect"].value_counts()

Uncertain significance    147067
Likely benign              11398
Benign                      9730
Pathogenic                  5834
Likely pathogenic           2506
Name: effect, dtype: int64

In [32]:
# Ordinal encoding of the clinical label, from most benign (-2) to most
# pathogenic (+2); "Uncertain significance" sits at 0.
effect_map = {
    "Benign": -2,
    "Likely benign": -1,
    "Uncertain significance": 0,
    "Likely pathogenic": 1,
    "Pathogenic": 2,
}

# Labels that are missing or not in the map become NaN.
result_df["effect_score"] = result_df["effect"].map(effect_map)

In [33]:
# Scores to evaluate against the clinical labels.
score_columns = [
    "el2_score",
    "proteinsolver_core_score_change",
    "protbert_core_score_change",
    "rosetta_dg_change",
    # AlphaFold scores are currently left out of the comparison:
    #     "alphafold_core_scores_residue_plddt_wt",
    #     "alphafold_core_scores_protein_plddt_wt",
    #     "alphafold_core_scores_protein_max_predicted_aligned_error_wt",
    #     "alphafold_core_scores_proten_ptm_wt",
]

# Keep only mutations that have every score plus a label, and whose label is
# informative (effect_score 0, i.e. "Uncertain significance", is dropped).
has_all_values = result_df[score_columns + ["effect_score"]].notna().all(axis=1)
has_informative_label = result_df["effect_score"] != 0
df = result_df[has_all_values & has_informative_label].reset_index(drop=True)

# Per score: Spearman correlation with the ordinal label, and ROC AUC for
# separating pathogenic-side (> 0) from benign-side labels.
is_pathogenic = df["effect_score"] > 0
for col in score_columns:
    rho = stats.spearmanr(df["effect_score"], df[col])[0]
    auc = metrics.roc_auc_score(is_pathogenic, df[col])
    print(col, rho, auc)

el2_score 0.5069462415801477 0.8639912651215551
proteinsolver_core_score_change 0.3020300988279762 0.7120252529481002
protbert_core_score_change 0.4308051747020243 0.8082194043302708
rosetta_dg_change 0.2574446281608557 0.6799496305607562
