## Summary

---

## Imports

In [None]:
import sys
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from scipy import stats
from sklearn import metrics, model_selection
from tqdm.auto import tqdm

In [None]:
# Use fully qualified option names: the short patterns "max_rows" / "max_columns"
# also match the `styler.render.*` options on newer pandas releases, which makes
# `set_option` raise OptionError ("Pattern matched multiple keys").
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)

## Parameters

In [None]:
# Absolute path of this notebook's working directory (created on first run),
# located directly under the current working directory.
NOTEBOOK_DIR = Path.cwd().joinpath("45_validate_models").resolve()
NOTEBOOK_DIR.mkdir(exist_ok=True)

NOTEBOOK_DIR

In [None]:
# Put the `src` directory that sits two levels above the notebook directory at the
# front of the import path (once), so that `helpers` can be imported from it.
src_dir = str(NOTEBOOK_DIR.parents[1] / "src")
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

import helpers

## Load results

In [None]:
# Combined results written by the `37_cagi6_sherloc_combine_results` step,
# which lives next to this notebook's directory.
input_file = (
    NOTEBOOK_DIR.parent
    / "37_cagi6_sherloc_combine_results"
    / "combined-results.parquet"
)

input_file

In [None]:
# Read the combined results into pandas, then show a two-row preview and the row count.
result_df = pq.read_table(input_file).to_pandas()

display(result_df.iloc[:2])
print(result_df.shape[0])

In [None]:
# Ordinal encoding of the `effect` label: negative values for the benign labels,
# positive values for the pathogenic labels, 0 for "Uncertain significance".
# Labels that are not listed here map to NaN.
effect_map = {
    "Benign": -2,
    "Likely benign": -1,
    "Uncertain significance": 0,
    "Likely pathogenic": 1,
    "Pathogenic": 2,
}

result_df["effect_score"] = result_df["effect"].map(effect_map)

## Exploratory data analysis

In [None]:
# Count how many rows carry each `effect` label.
result_df["effect"].value_counts()

In [None]:
# `effect_map` is defined once in the "Load results" section; the mapping literal is
# not repeated here so that there is a single source of truth for the encoding.
# Re-deriving the column keeps this cell safe to re-run on its own.
result_df["effect_score"] = result_df["effect"].map(effect_map)

# `.map` turns labels that are missing from `effect_map` into NaN; include the NaN
# bucket in the counts so that silently unmapped rows are visible.
result_df["effect_score"].value_counts(dropna=False).sort_index()

In [None]:
# Individual scores to compare against the encoded effect label.
score_columns = [
    "el2_score",
    "proteinsolver_core_score_change",
    "protbert_core_score_change",
    "rosetta_dg_change",
    #     "alphafold_core_scores_residue_plddt_wt",
    #     "alphafold_core_scores_protein_plddt_wt",
    #     "alphafold_core_scores_protein_max_predicted_aligned_error_wt",
    #     "alphafold_core_scores_proten_ptm_wt",
]

# Keep only rows where every score and the effect label are available.
df = result_df.dropna(subset=[*score_columns, "effect_score"])
# df = df[df["effect_score"].isin([-1, 1])].reset_index(drop=True)

# For each score, print: column name, Spearman rho against `effect_score`, and the
# ROC AUC for separating `effect_score > 0` from all remaining rows.
for col in score_columns:
    rho, _ = stats.spearmanr(df["effect_score"], df[col])
    auc = metrics.roc_auc_score(df["effect_score"] > 0, df[col])
    print(col, rho, auc)

## Combine

In [None]:
# Preview the first two rows of `result_df`.
result_df.head(2)

## Load ML models and make predictions

In [None]:
def training_loop(
    X,
    y,
    *,
    weights,
    groups,
    param,
    n_splits=6,
    progressbar=False,
    num_boost_round=1000,
):
    """Train one LightGBM model per group-wise CV fold and collect held-out predictions.

    Args:
        X: Feature matrix; rows are selected with integer index arrays (``X[idx]``).
        y: Labels, one per row of ``X``; indexed the same way as ``X``.
        weights: Per-row sample weights; indexed the same way as ``X``.
        groups: pandas Series with one group label per row (``.iloc`` is used on it).
            ``GroupKFold`` keeps all rows of a group in the same fold.
            NOTE(review): the per-group sizes handed to LightGBM are listed in order
            of first appearance, which presumably requires rows of the same group to
            be contiguous — confirm that callers sort by group.
        param: Parameter dict forwarded to ``lgb.train``.
        n_splits: Number of ``GroupKFold`` splits.
        progressbar: Show a tqdm progress bar over the folds when True.
        num_boost_round: Number of boosting rounds forwarded to ``lgb.train``.

    Returns:
        Tuple ``(models, preds)``: the list of trained models (one per fold) and a
        float64 array of length ``len(y)`` in which each row holds the prediction of
        the model for which that row was in the test split (NaN if never predicted).
    """
    models = []
    preds = np.full(len(y), np.nan, dtype=np.float64)
    gkf = model_selection.GroupKFold(n_splits=n_splits)
    for train_index, test_index in tqdm(
        gkf.split(X, y, groups=groups),
        total=n_splits,
        disable=not progressbar,
    ):
        # Size of every training group, ordered by the group's first appearance.
        groups_training = groups.iloc[train_index]
        value_counts = groups_training.value_counts()
        group_adj = groups_training.drop_duplicates().map(value_counts)

        dtrain = lgb.Dataset(
            X[train_index],
            label=y[train_index],
            weight=weights[train_index],
            group=group_adj,
        )
        model = lgb.train(param, dtrain, num_boost_round=num_boost_round)
        preds[test_index] = model.predict(X[test_index])
        models.append(model)
    return models, preds

In [None]:
# Models to validate, as `(unique_id, model_type)` pairs. The backtick label above each
# group is the original author's description of that model's feature set; entries
# that are commented out are skipped by the cells below.
# NOTE(review): two different ids carry the label `base` + `AFwt`, and two carry
# `base` + `AFwt` + `AFmut` — confirm what distinguishes them.
model_infos = [
    # `base` + `AFwt`
    #     ("7f9826be", "initial"),
    ("7f9826be", "optimized"),
    # `base` + `AFwt`
    #     ("fd28687b", "initial"),
    ("fd28687b", "optimized"),
    # `base` + `AFwt` + `AFmut`
    #     ("900500fe", "initial"),
    #     ("900500fe", "optimized"),
    # `base` + `AFwt` + `AFmut`
    #     ("be3bdad5", "initial"),
    #     ("be3bdad5", "optimized"),
    # `base` + `EL2` + `AFwt`
    #     ("6999e5aa", "initial"),
    ("6999e5aa", "optimized"),
    # `base` + `EL2` + `AFwt` + `AFmut` [no opt]
    #     ("4df6fd79", "initial"),
    # `base`
    #     ("0d59c727", "initial"),
    ("0d59c727", "optimized"),
    # `base - rosetta`
    #     ("eabf01fe", "initial"),
    ("eabf01fe", "optimized"),
    # `base - rosetta - ps`
    ("a7b1c747", "optimized"),
]

In [None]:
# Gather every feature column used by any of the selected models.
features = []
for unique_id, model_type in tqdm(model_infos):
    scalar_features, vector_features = helpers.load_features(
        NOTEBOOK_DIR.parent, unique_id
    )
    features.extend(scalar_features + vector_features)

# Reference set: rows with a known effect score other than 0.
has_effect_score = result_df["effect_score"].notnull()
is_nonzero_effect = result_df["effect_score"] != 0
prediction_ref_df = result_df[has_effect_score & is_nonzero_effect]

# Evaluation set: reference rows for which all collected features are present,
# ordered by protein id.
required_columns = list(set(features))
has_all_features = prediction_ref_df[required_columns].notnull().all(axis=1)
prediction_df = prediction_ref_df[has_all_features].sort_values("protein_id").copy()

num_lost = len(prediction_ref_df) - len(prediction_df)
print(f"Lost {num_lost} rows due to missing features.")

In [None]:
# For every selected model: rebuild its feature matrix, retrain it with group-wise
# cross-validation, and store the held-out predictions in `pred_<unique_id>_<model_type>`.
for unique_id, model_type in tqdm(model_infos):
    scalar_features, vector_features = helpers.load_features(
        NOTEBOOK_DIR.parent, unique_id
    )
    features_to_exclude = helpers.load_features_to_exclude(
        NOTEBOOK_DIR.parent, unique_id, model_type
    )
    best_parameters = helpers.load_best_parameters(
        NOTEBOOK_DIR.parent, unique_id, model_type
    )

    # Assemble the matrix block by block: scalar columns first, then each vector
    # column expanded to one matrix column per element. Empty blocks are skipped so
    # that a model without vector features does not fail inside `np.hstack`, and a
    # model without scalar features does not pick up an object-dtype empty block.
    feature_blocks = []
    if len(scalar_features) > 0:
        feature_blocks.append(prediction_df[scalar_features].to_numpy())
    feature_blocks.extend(
        np.vstack(prediction_df[col].to_numpy()) for col in vector_features
    )
    X_ref = np.hstack(feature_blocks)

    # `features_to_exclude` holds column indices into `X_ref` that are dropped.
    if features_to_exclude is not None:
        feature_mask = np.ones(X_ref.shape[1], dtype=bool)
        feature_mask[np.array(features_to_exclude, dtype=int)] = False
        X = X_ref[:, feature_mask]
        assert len(features_to_exclude) == X_ref.shape[1] - X.shape[1]
    else:
        X = X_ref

    effect_score = prediction_df["effect_score"].to_numpy()

    # Rows with effect_score == 0 would be labelled from `el2_score` and strongly
    # down-weighted. With the current construction of `prediction_df` (rows with
    # effect_score == 0 are filtered out) this mask is all False, so this branch
    # has no effect.
    low_confidence_mask = effect_score == 0

    y = (effect_score > 0).astype(int)
    y[low_confidence_mask] = (
        (prediction_df.loc[low_confidence_mask, "el2_score"] > 2)
        .to_numpy()
        .astype(int)
    )
    #     y = prediction_df["effect_score"].values

    # Half weight for the "likely" labels (+/-1); full weight for the +/-2 labels.
    weights = np.ones(len(prediction_df), dtype=np.float64)
    weights[effect_score == 1] = 0.5
    weights[effect_score == -1] = 0.5
    weights[low_confidence_mask] = 0.05

    protein_ids = prediction_df["protein_id"]

    models, preds = training_loop(
        X,
        y,
        weights=weights,
        groups=protein_ids,
        param=best_parameters | {"objective": "binary"},
    )

    pred_column = f"pred_{unique_id}_{model_type}"
    prediction_df[pred_column] = preds

# Disabled alternative: average the predictions of all fold models over the full `X`.
# Note that each fold model would then also be scoring rows it was trained on.
#     for model_idx, model in enumerate(models):
#         prediction_df[f"pred_{unique_id}_{model_type}_{model_idx}"] = model.predict(X)

#     prediction_df[pred_column] = prediction_df[
#         [
#             f"pred_{unique_id}_{model_type}_{model_idx}"
#             for model_idx in range(len(models))
#         ]
#     ].mean(axis=1)

In [None]:
# One prediction column per selected model.
cols = [f"pred_{unique_id}_{model_type}" for unique_id, model_type in model_infos]

# Evaluate on rows whose effect score is not 0.
df = prediction_df[prediction_df["effect_score"].ne(0)]
is_positive = df["effect_score"] > 0

# Per model: Spearman rho against the effect score, ROC AUC and average precision
# for separating positive from non-positive effect scores.
model_stats = []
for col in cols:
    spearman_rho, _ = stats.spearmanr(df["effect_score"], df[col])
    auc = metrics.roc_auc_score(is_positive, df[col])
    precision = metrics.average_precision_score(is_positive, df[col])
    model_stats.append((col, len(df), spearman_rho, auc, precision))

model_stats_df = pd.DataFrame(
    model_stats,
    columns=["col", "num_rows", "spearman_rho", "auc", "precision"],
)

In [None]:
# Distribution of the predictions stored for the first model in `cols`.
# The explicit figure/axes interface plus labels lets the plot stand on its own;
# `plt.show()` replaces the bare `None` that was only there to hide the repr.
fig, ax = plt.subplots()
ax.hist(prediction_df[cols[0]], bins=100)
ax.set(
    title=f"Distribution of {cols[0]}",
    xlabel=cols[0],
    ylabel="Number of rows",
)
plt.show()

---

```csv
col	num_rows	spearman_rho	auc	precision
7	pred_be3bdad5_optimized	17253	0.660275	0.948133	0.905960
5	pred_900500fe_optimized	17253	0.661431	0.948289	0.905941
1	pred_7f9826be_optimized	17253	0.658835	0.946360	0.903161
3	pred_fd28687b_optimized	17253	0.660638	0.946663	0.902854
9	pred_6999e5aa_optimized	17253	0.656857	0.944983	0.900826
12	pred_0d59c727_optimized	17253	0.637160	0.935759	0.888277
14	pred_eabf01fe_optimized	17253	0.629048	0.930742	0.880559
```

Results with the `regression_l1` objective:

```csv
col	num_rows	spearman_rho	auc	precision
3	pred_be3bdad5_optimized	17253	0.637726	0.934685	0.881122
2	pred_900500fe_optimized	17253	0.634744	0.932937	0.879540
4	pred_6999e5aa_optimized	17253	0.634151	0.932642	0.878677
0	pred_7f9826be_optimized	17253	0.632483	0.930091	0.873714
1	pred_fd28687b_optimized	17253	0.629686	0.928276	0.872713
5	pred_0d59c727_optimized	17253	0.603354	0.914350	0.855177
6	pred_eabf01fe_optimized	17253	0.608284	0.914428	0.853633
```

Results with the `regression_l1` objective on the full dataset:

```csv
col	num_rows	spearman_rho	auc	precision
2	pred_900500fe_optimized	17253	0.630979	0.928408	0.876794
3	pred_be3bdad5_optimized	17253	0.624719	0.925131	0.869726
4	pred_6999e5aa_optimized	17253	0.625411	0.923101	0.868643
1	pred_fd28687b_optimized	17253	0.623880	0.923605	0.866927
0	pred_7f9826be_optimized	17253	0.619522	0.919911	0.860476
5	pred_0d59c727_optimized	17253	0.582443	0.899871	0.836196
6	pred_eabf01fe_optimized	17253	0.584318	0.900190	0.835659
```

---

In [None]:
# Show the "optimized" models, best average precision first.
is_optimized = model_stats_df["col"].str.endswith("_optimized")
model_stats_df[is_optimized].sort_values(by="precision", ascending=False)

In [None]:
# pq.write_table(pa.Table.from_pandas(model_stats_df, preserve_index=False), NOTEBOOK_DIR.joinpath("model-stats.parquet"))