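## Summary

Load the combined CAGI6 Sherloc results, compare individual score columns against the clinical significance labels (Spearman correlation and ROC AUC), and apply the trained models to generate per-model and averaged predictions.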

---

## Imports

In [1]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from scipy import stats
from sklearn import metrics
from tqdm.auto import tqdm

## Parameters

In [2]:
# Directory associated with this notebook, resolved to an absolute path
# starting from the current working directory.
NOTEBOOK_DIR = Path("45_validate_models").resolve()
# Create the directory if it is not there yet (no error when it already exists).
NOTEBOOK_DIR.mkdir(exist_ok=True)

NOTEBOOK_DIR

PosixPath('/home/kimlab5/strokach/workspace/elaspic/elaspic2-cagi6/notebooks/45_validate_models')

In [3]:
# Put the `src` directory (two levels above NOTEBOOK_DIR) at the front of the
# import path, once, so that the `helpers` module can be imported below.
src_dir = str(NOTEBOOK_DIR.parents[1] / "src")
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

import helpers

## Load results

In [4]:
# Combined results Parquet file, stored in a sibling directory of NOTEBOOK_DIR.
input_file = (
    NOTEBOOK_DIR.parent / "37_cagi6_sherloc_combine_results" / "combined-results.parquet"
)

input_file

PosixPath('/home/kimlab5/strokach/workspace/elaspic/elaspic2-cagi6/notebooks/37_cagi6_sherloc_combine_results/combined-results.parquet')

In [5]:
# Read the Parquet file with pyarrow and convert it to a pandas DataFrame.
result_df = pq.read_table(input_file).to_pandas()

# Preview the first two rows and report the total number of rows.
display(result_df.head(2))
print(len(result_df))

Unnamed: 0,protein_id,mutation,mutation_id,effect,proteinsolver_core_score_wt,proteinsolver_core_features_residue_wt,proteinsolver_core_features_protein_wt,protbert_core_score_wt,protbert_core_features_residue_wt,protbert_core_features_protein_wt,...,alphafold_core_features_residue_msa_first_row_change,alphafold_core_features_residue_single_change,alphafold_core_features_residue_structure_module_change,alphafold_core_features_protein_experimentally_resolved_change,alphafold_core_features_protein_predicted_lddt_change,alphafold_core_features_protein_msa_first_row_change,alphafold_core_features_protein_single_change,alphafold_core_features_protein_structure_module_change,aa_wt_onehot,aa_mut_onehot
0,P26678,R9C,NM_002667.3:c.25C>T,Pathogenic,0.091593,"[0.48249053955078125, 0.24484702944755554, -0....","[-2.27313494682312, 0.6932704448699951, -0.195...",0.068529,"[0.15305250883102417, -0.11011786758899689, 0....","[0.05504663661122322, -0.04564127326011658, 0....",...,"[-0.24461997, 0.91860914, -1.4374704, 1.841849...","[7.351156, 8.393639, 3.1830235, 3.0640717, 4.0...","[9.148568e-05, -2.7619302e-05, 3.2782555e-07, ...","[0.0022777617, 0.0021982193, 0.0029729605, 0.0...","[-0.004014015, -0.0029330254, -0.003218174, -0...","[-0.019475102, -0.0052657127, -0.04385376, -0....","[0.09133339, 0.074448586, -0.10094261, 0.12374...","[2.8892653e-05, -2.5316142e-05, -7.459894e-07,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,P26678,T8P,NM_002667.3:c.22A>C,Uncertain significance,0.041601,"[2.043767213821411, 0.9361261129379272, -0.415...","[-2.27313494682312, 0.6932705044746399, -0.195...",0.062104,"[0.056363195180892944, -0.020384633913636208, ...","[0.05504663661122322, -0.04564127326011658, 0....",...,"[-0.4901538, 1.5533419, -1.5304286, -0.1852448...","[1.8987266, -0.4226532, 0.6206093, 0.038326263...","[6.455183e-05, -0.00016468763, 7.186085e-06, 4...","[-0.006253898, -0.0062743723, -0.006733477, -0...","[-0.012162209, -0.018994808, -0.011818409, -0....","[-0.03768146, -0.016304016, -0.041838408, 0.01...","[0.08028793, -0.019423485, -0.1275692, -0.0226...","[-2.5912654e-05, 3.2503158e-05, 5.8985315e-06,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."


221816


In [6]:
# Ordinal encoding of the clinical significance labels, ordered from benign
# (negative) through uncertain (zero) to pathogenic (positive).
effect_map = {
    "Benign": -2,
    "Likely benign": -1,
    "Uncertain significance": 0,
    "Likely pathogenic": 1,
    "Pathogenic": 2,
}

# Values of `effect` that are not keys of `effect_map` are mapped to NaN.
result_df["effect_score"] = result_df["effect"].map(effect_map)

## Exploratory data analysis

In [7]:
# Number of rows per clinical significance label.
result_df["effect"].value_counts()

Uncertain significance    147067
Likely benign              11398
Benign                      9730
Pathogenic                  5834
Likely pathogenic           2506
Name: effect, dtype: int64

In [8]:
# NOTE(review): this cell repeats, verbatim, the `effect_map` definition and the
# `effect_score` assignment made right after the data is loaded. Re-running it is
# harmless (same mapping, same column), but one of the two copies can be removed.
effect_map = {
    "Uncertain significance": 0,
    "Likely benign": -1,
    "Benign": -2,
    "Likely pathogenic": 1,
    "Pathogenic": 2,
}

# Values of `effect` that are not keys of `effect_map` are mapped to NaN.
result_df["effect_score"] = result_df["effect"].map(effect_map)

In [9]:
# Score columns that are compared against the clinical labels below.
score_columns = [
    "el2_score",
    "proteinsolver_core_score_change",
    "protbert_core_score_change",
    "rosetta_dg_change",
    #     "alphafold_core_scores_residue_plddt_wt",
    #     "alphafold_core_scores_protein_plddt_wt",
    #     "alphafold_core_scores_protein_max_predicted_aligned_error_wt",
    #     "alphafold_core_scores_proten_ptm_wt",
]

# Keep only the rows where every score and the numeric label are present.
df = result_df.dropna(subset=[*score_columns, "effect_score"])
# df = df[df["effect_score"].isin([-1, 1])].reset_index(drop=True)

# Binary target for the AUC: rows with a positive effect score form the positive
# class; everything else (including score 0) is the negative class.
is_positive = df["effect_score"] > 0

# For each score, print its Spearman correlation with the ordinal label and its ROC AUC.
for col in score_columns:
    rho, _pvalue = stats.spearmanr(df["effect_score"], df[col])
    auc = metrics.roc_auc_score(is_positive, df[col])
    print(col, rho, auc)

el2_score 0.19271749853401182 0.819760687160709
proteinsolver_core_score_change 0.11067870025313856 0.6783941301907526
protbert_core_score_change 0.1620701018909018 0.7812989848001559
rosetta_dg_change 0.10357751324165007 0.6450253131239799


## Combine

In [10]:
# Preview the first two rows; the frame now also carries the `effect_score` column.
result_df.head(2)

Unnamed: 0,protein_id,mutation,mutation_id,effect,proteinsolver_core_score_wt,proteinsolver_core_features_residue_wt,proteinsolver_core_features_protein_wt,protbert_core_score_wt,protbert_core_features_residue_wt,protbert_core_features_protein_wt,...,alphafold_core_features_residue_single_change,alphafold_core_features_residue_structure_module_change,alphafold_core_features_protein_experimentally_resolved_change,alphafold_core_features_protein_predicted_lddt_change,alphafold_core_features_protein_msa_first_row_change,alphafold_core_features_protein_single_change,alphafold_core_features_protein_structure_module_change,aa_wt_onehot,aa_mut_onehot,effect_score
0,P26678,R9C,NM_002667.3:c.25C>T,Pathogenic,0.091593,"[0.48249053955078125, 0.24484702944755554, -0....","[-2.27313494682312, 0.6932704448699951, -0.195...",0.068529,"[0.15305250883102417, -0.11011786758899689, 0....","[0.05504663661122322, -0.04564127326011658, 0....",...,"[7.351156, 8.393639, 3.1830235, 3.0640717, 4.0...","[9.148568e-05, -2.7619302e-05, 3.2782555e-07, ...","[0.0022777617, 0.0021982193, 0.0029729605, 0.0...","[-0.004014015, -0.0029330254, -0.003218174, -0...","[-0.019475102, -0.0052657127, -0.04385376, -0....","[0.09133339, 0.074448586, -0.10094261, 0.12374...","[2.8892653e-05, -2.5316142e-05, -7.459894e-07,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.0
1,P26678,T8P,NM_002667.3:c.22A>C,Uncertain significance,0.041601,"[2.043767213821411, 0.9361261129379272, -0.415...","[-2.27313494682312, 0.6932705044746399, -0.195...",0.062104,"[0.056363195180892944, -0.020384633913636208, ...","[0.05504663661122322, -0.04564127326011658, 0....",...,"[1.8987266, -0.4226532, 0.6206093, 0.038326263...","[6.455183e-05, -0.00016468763, 7.186085e-06, 4...","[-0.006253898, -0.0062743723, -0.006733477, -0...","[-0.012162209, -0.018994808, -0.011818409, -0....","[-0.03768146, -0.016304016, -0.041838408, 0.01...","[0.08028793, -0.019423485, -0.1275692, -0.0226...","[-2.5912654e-05, 3.2503158e-05, 5.8985315e-06,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",0.0


## Load ML models and make predictions

In [11]:
# Work on a copy so that prediction columns are added to `prediction_df`
# rather than to `result_df`.
prediction_df = result_df.copy()

In [12]:
# (unique_id, model_type) pairs identifying the trained models to evaluate.
# The inline comments appear to label the feature set behind each model id.
model_infos = [
    # `base` + `AFwt`
    ("7f9826be", "initial"),
    ("7f9826be", "optimized"),
    # `base` + `AFwt`
    ("fd28687b", "initial"),
    ("fd28687b", "optimized"),
    # `base` + `AFwt` + `AFmut`
    ("900500fe", "initial"),
    ("900500fe", "optimized"),
    # `base` + `AFwt` + `AFmut`
    ("be3bdad5", "initial"),
    ("be3bdad5", "optimized"),
    # `base` + `EL2` + `AFwt`
    ("6999e5aa", "initial"),
    ("6999e5aa", "optimized"),
    # `base` + `EL2` + `AFwt` + `AFmut` [no opt]
    ("4df6fd79", "initial"),
    # `base`
    ("0d59c727", "initial"),
    ("0d59c727", "optimized"),
    # `base - rosetta`
    ("eabf01fe", "initial"),
    ("eabf01fe", "optimized"),
]


# For every model configuration: load its feature lists and fitted models, build
# the feature matrix for the rows that have all required features, store one
# prediction column per model, and store the row-wise mean of those columns.
for unique_id, model_type in tqdm(model_infos):
    scalar_features, vector_features = helpers.load_features(NOTEBOOK_DIR.parent, unique_id)
    features_to_exclude = helpers.load_features_to_exclude(
        NOTEBOOK_DIR.parent, unique_id, model_type
    )
    # NOTE(review): loaded but not used inside this loop; kept in case a later
    # cell relies on it. Drop the call if nothing else needs it.
    best_parameters = helpers.load_best_parameters(NOTEBOOK_DIR.parent, unique_id, model_type)
    models = helpers.load_best_models(NOTEBOOK_DIR.parent, unique_id, model_type)

    # Rows with a non-null value in every feature column this model needs.
    # (A previous `result_df.dropna(...).copy()` statement here discarded its
    # result, so it only cost time and memory; it has been removed.)
    feature_columns = scalar_features + vector_features
    mask = ~prediction_df[feature_columns].isnull().any(axis=1)

    # Feature matrix: the scalar columns first, followed by each vector column
    # expanded into one matrix column per element. Building from a list of blocks
    # also works when `vector_features` is empty.
    feature_blocks = [prediction_df.loc[mask, scalar_features].values]
    feature_blocks.extend(
        np.vstack(prediction_df.loc[mask, col].values) for col in vector_features
    )
    X_ref = np.hstack(feature_blocks)

    if features_to_exclude is not None:
        # Drop the matrix columns whose positional indices are listed for exclusion.
        feature_mask = np.ones(X_ref.shape[1], dtype=bool)
        feature_mask[np.array(features_to_exclude, dtype=int)] = False
        X = X_ref[:, feature_mask]
        # Each listed index must have removed exactly one column (no duplicates).
        assert len(features_to_exclude) == X_ref.shape[1] - X.shape[1]
    else:
        X = X_ref

    # One prediction column per model; rows outside `mask` stay NaN.
    model_columns = []
    for model_idx, model in enumerate(models):
        model_column = f"pred_{unique_id}_{model_type}_{model_idx}"
        prediction_df.loc[mask, model_column] = model.predict(X)
        model_columns.append(model_column)

    # Combined prediction: row-wise mean over the per-model columns. The assignment
    # aligns on the index, so rows outside `mask` are NaN here as well.
    pred_column = f"pred_{unique_id}_{model_type}"
    prediction_df[pred_column] = prediction_df.loc[mask, model_columns].mean(axis=1)