## Summary

---

## Imports

In [None]:
import functools
import json
import sys
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from elaspic2.plugins.msa_analysis import analyze_msa
from scipy import stats
from sklearn import metrics, model_selection
from sklearn.decomposition import PCA
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm

In [None]:
# Raise the display limits so wide / long frames are rendered in full.
# The fully-qualified "display." keys are required: the bare "max_columns" /
# "max_rows" patterns also match "styler.render.max_columns" / "...max_rows"
# in pandas versions that define those options, and set_option raises
# OptionError when a pattern matches more than one option.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

## Parameters

In [None]:
# Directory holding this notebook's outputs, resolved against the current
# working directory and created if it does not exist yet.
NOTEBOOK_DIR = Path("40_cagi6_sherloc_submission").resolve()
NOTEBOOK_DIR.mkdir(exist_ok=True)

# Echo the resolved path as the cell output.
NOTEBOOK_DIR

In [None]:
# Put the `src` directory located two levels above NOTEBOOK_DIR at the front
# of sys.path (only once, so re-running the cell does not add duplicates).
src_dir = str(NOTEBOOK_DIR.parents[1].joinpath("src"))
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

# Must stay after the sys.path tweak above; presumably `helpers` lives in that
# `src` directory — TODO confirm.
import helpers

## Load data

In [None]:
# Tab-separated submission template stored next to this notebook's directory.
submission_template_file = (
    NOTEBOOK_DIR.parent / "30_cagi6_sherloc" / "submission_template.tsv"
)
submission_template_df = pd.read_csv(submission_template_file, sep="\t")

# Preview the first two rows; the cell output is the number of rows.
display(submission_template_df.head(2))
len(submission_template_df)

### `training_df`

In [None]:
# Path of the training table inside the `30_cagi6_sherloc` sibling directory.
training_file = (
    NOTEBOOK_DIR.parent
    / "30_cagi6_sherloc"
    / "CAGI6-Sherloc-clinical-classification"
    / "final_train_070821.txt"
)

In [None]:
# Read the tab-separated training table (all rows, before any filtering).
training_all_df = pd.read_csv(training_file, sep="\t")

# Preview the first two rows; the cell output is the number of rows.
display(training_all_df.head(2))
len(training_all_df)

In [None]:
# Sanity check: no `hgvs` value of the submission template may also occur in
# the training table.
assert set(submission_template_df["hgvs"]).isdisjoint(set(training_all_df["hgvs"]))

In [None]:
# Keep only the rows flagged in the "Subcategory Missense" column.
# NOTE(review): this indexing assumes the column is boolean — confirm.
training_df = training_all_df[training_all_df["Subcategory Missense"]]

# Preview the first two rows; the cell output is the number of rows kept.
display(training_df.head(2))
len(training_df)

### `testing_df`

In [None]:
# Path of the test table inside the `30_cagi6_sherloc` sibling directory.
testing_file = (
    NOTEBOOK_DIR.parent
    / "30_cagi6_sherloc"
    / "CAGI6-Sherloc-clinical-classification"
    / "final_test_070821.txt"
)

In [None]:
# Read the tab-separated test table (all rows, before any filtering).
testing_all_df = pd.read_csv(testing_file, sep="\t")

# Preview the first two rows; the cell output is the number of rows.
display(testing_all_df.head(2))
len(testing_all_df)

In [None]:
# Sanity check: the submission template and the test table must contain
# exactly the same set of `hgvs` values.
assert set(submission_template_df["hgvs"]) == set(testing_all_df["hgvs"])

In [None]:
# Keep only the rows flagged in the "Subcategory Missense" column.
# NOTE(review): this indexing assumes the column is boolean — confirm.
testing_df = testing_all_df[testing_all_df["Subcategory Missense"]]

# Preview the first two rows; the cell output is the number of rows kept.
display(testing_df.head(2))
len(testing_df)

### `submission_template_df`

In [None]:
# Same template path that is already defined at the top of "Load data".
submission_template_file = (
    NOTEBOOK_DIR.parent / "30_cagi6_sherloc" / "submission_template.tsv"
)

In [None]:
# Re-read the tab-separated submission template (the same file is also loaded
# at the top of the "Load data" section).
submission_template_df = pd.read_csv(submission_template_file, sep="\t")

# Preview the first two rows; the cell output is the number of rows.
display(submission_template_df.head(2))
len(submission_template_df)

### `validation_df`

In [None]:
# Path of the validation variant list inside the `30_cagi6_sherloc` directory.
validation_file = NOTEBOOK_DIR.parent / "30_cagi6_sherloc" / "validation_variants.tsv"

In [None]:
# Read the tab-separated validation variants, then expose the "HGVS.c" column
# under the name `hgvs_g`, which is what the mapping cells below expect.
validation_all_df = pd.read_csv(validation_file, sep="\t")
validation_all_df = validation_all_df.rename(columns={"HGVS.c": "hgvs_g"})

# Preview the last two rows; the cell output is the number of rows.
display(validation_all_df.tail(2))
len(validation_all_df)

In [None]:
def map_g_to_c(str_g, mapper, parser=None):
    """Yield the coding-level projection of a genomic HGVS variant per transcript.

    Args:
        str_g: HGVS variant string; it is parsed with
            ``parser.parse_hgvs_variant``.
        mapper: Object providing ``relevant_transcripts(var_g)`` and
            ``g_to_c(var_g, tx_ac)`` (an ``hgvs`` AssemblyMapper in this
            notebook).
        parser: Object providing ``parse_hgvs_variant``. When omitted, the
            notebook-level ``hp`` parser is used, which keeps existing
            two-argument calls working.

    Yields:
        The result of ``mapper.g_to_c`` for every relevant transcript that can
        be mapped. Transcripts for which the mapper raises ``HGVSUsageError``
        or ``HGVSInvalidIntervalError`` are skipped.

    Raises:
        NameError: If ``parser`` is omitted and no global ``hp`` exists.
    """
    # Imported lazily, like the other hgvs imports in this notebook.
    from hgvs.exceptions import HGVSInvalidIntervalError, HGVSUsageError

    if parser is None:
        # Backward-compatible fallback on the parser created by the mapping
        # cell; pass `parser` explicitly to avoid relying on that hidden state.
        parser = hp  # noqa: F821

    var_g = parser.parse_hgvs_variant(str_g)
    for tx_ac in mapper.relevant_transcripts(var_g):
        try:
            var_c = mapper.g_to_c(var_g, tx_ac)
        except (HGVSUsageError, HGVSInvalidIntervalError):
            # This transcript cannot represent the variant; try the next one.
            continue
        yield var_c

In [None]:
# Parquet cache of the genomic -> coding HGVS mapping of the validation
# variants. It is read when present and otherwise computed and written.
validation_mapping_file = NOTEBOOK_DIR.parent.joinpath(
    "30_cagi6_sherloc", "validation-mapping-grch37.parquet"
)

if validation_mapping_file.is_file():
    validation_mapping_df = pq.read_table(validation_mapping_file).to_pandas()
else:
    # hgvs is only imported on a cache miss.
    import hgvs.assemblymapper
    import hgvs.dataproviders.uta
    import hgvs.parser

    # `hp` is read as a global by `map_g_to_c`, so the name must not change.
    hp = hgvs.parser.Parser()
    hdp = hgvs.dataproviders.uta.connect()
    mapper = hgvs.assemblymapper.AssemblyMapper(
        hdp, assembly_name="GRCh37", normalize=False
    )

    # One (hgvs_g, hgvs) row per transcript-level projection of each variant.
    results = []
    for tup in tqdm(validation_all_df.itertuples(), total=len(validation_all_df)):
        for var_c in map_g_to_c(tup.hgvs_g, mapper):
            # Fixed: the tuple field is `hgvs_g` (the renamed column); there
            # is no `str_g` attribute, so the old code raised AttributeError.
            results.append((tup.hgvs_g, str(var_c)))
    validation_mapping_df = pd.DataFrame(results, columns=["hgvs_g", "hgvs"])
    pq.write_table(
        pa.Table.from_pandas(validation_mapping_df, preserve_index=False),
        validation_mapping_file,
    )

In [None]:
# Preview the mapping table and print its number of rows.
display(validation_mapping_df.head(2))
print(len(validation_mapping_df))

In [None]:
# Attach the coding-level `hgvs` identifiers to the validation variants.
# A left join keeps variants without any mapping (their `hgvs` is NaN) and
# repeats variants that have several mappings.
validation_df = pd.merge(
    validation_all_df,
    validation_mapping_df,
    how="left",
    on=["hgvs_g"],
)

In [None]:
# Preview the merged table, then print its row count and the number of
# distinct `hgvs_g` values (the two differ when a variant has several mappings).
display(validation_df.tail(2))
print(len(validation_df))
print(len(validation_df["hgvs_g"].unique()))

## Load results

In [None]:
# Parquet file with the combined results, in a sibling notebook directory.
input_file = (
    NOTEBOOK_DIR.parent / "37_cagi6_sherloc_combine_results" / "combined-results.parquet"
)

# Echo the path as the cell output.
input_file

In [None]:
# Load the combined results table into pandas.
result_df = pq.read_table(input_file).to_pandas()

# Preview the first two rows and print the number of rows.
display(result_df.head(2))
print(len(result_df))

In [None]:
# Ordinal encoding of the `effect` labels: negative = benign side,
# positive = pathogenic side, 0 = uncertain.
effect_map = {
    "Uncertain significance": 0,
    "Likely benign": -1,
    "Benign": -2,
    "Likely pathogenic": 1,
    "Pathogenic": 2,
}

# Labels missing from `effect_map` become NaN.
result_df["effect_score"] = result_df["effect"].map(effect_map)

## Exploratory data analysis

In [None]:
# Group the columns of `result_df` by the tool prefix in their name.
# NOTE(review): `proteinsolver_columnms` is misspelled ("columnms"); the name
# is left as is because other cells may reference it.
proteinsolver_columnms = [c for c in result_df if c.startswith("proteinsolver_")]
protbert_columns = [c for c in result_df if c.startswith("protbert_")]
rosetta_columns = [c for c in result_df if c.startswith("rosetta_")]
alphafold_columns = [c for c in result_df if c.startswith("alphafold_")]

In [None]:
# Number of rows per `effect` label.
result_df["effect"].value_counts()

In [None]:
# NOTE(review): this cell repeats the `effect_map` / `effect_score` cell of
# the "Load results" section verbatim; re-running it is harmless.
# Ordinal encoding of the `effect` labels: negative = benign side,
# positive = pathogenic side, 0 = uncertain.
effect_map = {
    "Uncertain significance": 0,
    "Likely benign": -1,
    "Benign": -2,
    "Likely pathogenic": 1,
    "Pathogenic": 2,
}

# Labels missing from `effect_map` become NaN.
result_df["effect_score"] = result_df["effect"].map(effect_map)

In [None]:
# Individual scores to compare against the ordinal `effect_score`.
score_columns = [
    "el2_score",
    "proteinsolver_core_score_change",
    "protbert_core_score_change",
    "rosetta_dg_change",
    #     "alphafold_core_scores_residue_plddt_wt",
    #     "alphafold_core_scores_protein_plddt_wt",
    #     "alphafold_core_scores_protein_max_predicted_aligned_error_wt",
    #     "alphafold_core_scores_proten_ptm_wt",
]

# Only rows with every score and a label are evaluated.
df = result_df.dropna(subset=[*score_columns, "effect_score"])
# df = df[df["effect_score"].isin([-1, 1])].reset_index(drop=True)

# Per score: Spearman correlation with the ordinal label, and ROC AUC for
# separating positive labels (effect_score > 0) from the rest.
for col in score_columns:
    corr = stats.spearmanr(df["effect_score"], df[col])
    auc = metrics.roc_auc_score(df["effect_score"] > 0, df[col])
    print(col, corr[0], auc)

## Load ML models and make predictions

In [None]:
# Work on a copy so the prediction columns added below do not touch `result_df`.
prediction_df = result_df.copy()

In [None]:
# (unique_id, model_type) pairs identifying the saved model sets to apply.
model_infos = [
    ("f6be01c3", "optimized"),
    ("7f9826be", "optimized"),
    ("900500fe", "optimized"),
]

for unique_id, model_type in tqdm(model_infos):
    # Everything needed to rebuild the model inputs comes from `helpers`.
    scalar_features, vector_features = helpers.load_features(
        NOTEBOOK_DIR.parent, unique_id
    )
    features_to_exclude = helpers.load_features_to_exclude(
        NOTEBOOK_DIR.parent, unique_id, model_type
    )
    best_parameters = helpers.load_best_parameters(
        NOTEBOOK_DIR.parent, unique_id, model_type
    )
    models = helpers.load_best_models(NOTEBOOK_DIR.parent, unique_id, model_type)

    # Rows that have a value for every required feature.
    mask = prediction_df[scalar_features + vector_features].notnull().all(axis=1)

    # Scalar columns first, followed by each vector column expanded to 2-D.
    X_ref = np.c_[
        prediction_df.loc[mask, scalar_features].values,
        np.hstack(
            [np.vstack(prediction_df.loc[mask, col].values) for col in vector_features]
        ),
    ]

    if features_to_exclude is None:
        X = X_ref
    else:
        # Drop the excluded columns of the assembled matrix by position.
        feature_mask = np.ones(X_ref.shape[1], dtype=bool)
        feature_mask[np.array(features_to_exclude, dtype=int)] = False
        X = X_ref[:, feature_mask]
        assert len(features_to_exclude) == X_ref.shape[1] - X.shape[1]

    # One prediction column per model, then their row-wise mean.
    pred_column = f"pred_{unique_id}_{model_type}"
    for model_idx, model in enumerate(models):
        prediction_df.loc[mask, f"{pred_column}_{model_idx}"] = model.predict(X)

    prediction_df[pred_column] = prediction_df.loc[
        mask,
        [f"{pred_column}_{model_idx}" for model_idx in range(len(models))],
    ].mean(axis=1)

## Evaluate predictions

### See validation score

### Submit predictions to leaderboard

In [None]:
# Prediction column submitted to the leaderboard.
pred_column = "pred_f6be01c3_optimized"

# `is_missense` is True for rows whose genomic variant has a coding-level
# `hgvs`. The lambda parameter is not named `df`, so it does not shadow the
# notebook-level `df`.
validation_submission_df = validation_df.assign(
    is_missense=lambda frame: frame["hgvs"].notnull()
).merge(
    prediction_df.rename(columns={pred_column: "pred"}),
    left_on=["hgvs"],
    right_on=["mutation_id"],
    how="left",
)

# The left join must not duplicate validation rows.
assert len(validation_submission_df) == len(
    validation_df
), "merge with prediction_df duplicated validation rows"

# Fallback score for variants without any prediction.
median = np.nanmedian(validation_submission_df["pred"])
print(f"{median=}")

# One row per genomic variant: the NaN-skipping mean of its predictions.
# The string "mean" replaces `np.nanmean`: pandas already dispatched that
# callable to the built-in group mean and emits a FutureWarning for it in 2.x.
out = (
    validation_submission_df.groupby("hgvs_g")
    .agg({"pred": "mean", "is_missense": "max"})
    .fillna(median)
    .reset_index()
)

# Divide by the maximum; `median_adj` is the fallback value on that scale.
median_adj = median / out["pred"].max()
out["pred"] = out["pred"] / out["pred"].max()

out.head()

In [None]:
# Distribution of the submitted scores, leaving out rows equal to the
# median fallback value.
plt.hist(out.loc[out["pred"] != median_adj, "pred"], bins=100)
None

In [None]:
# Write the leaderboard submission: tab-separated, without the index.
out[["hgvs_g", "pred"]].to_csv(
    NOTEBOOK_DIR / "submission-valid-9.tsv",
    sep="\t",
    index=False,
)

### Final submission

In [None]:
# Score columns in submission order: the column at position i is written to
# `strokach_modelnumber_{i + 1}.tsv`. The order must match the numbering in
# `strokach_desc.md` (4 = ELASPIC2, 5 = ProteinSolver, 6 = ProtBert).
# Fixed: ProtBert used to be listed before ProteinSolver, so files 5 and 6
# were swapped relative to that description.
submission_score_columns = [
    "pred_f6be01c3_optimized",  # modelnumber_1
    "pred_7f9826be_optimized",  # modelnumber_2
    "pred_900500fe_optimized",  # modelnumber_3
    "el2_score",  # modelnumber_4
    "proteinsolver_core_score_change",  # modelnumber_5
    "protbert_core_score_change",  # modelnumber_6
    #     "rosetta_dg_change",
]

In [None]:
# Pairwise correlation matrix between the scores that will be submitted.
prediction_df[submission_score_columns].corr()

In [None]:
# `mutation_id` is the join key of the merge below; duplicate ids would
# duplicate template rows.
assert prediction_df["mutation_id"].is_unique

In [None]:
# Attach every submitted score to the template rows. The left join keeps
# template rows without a prediction (their scores are NaN).
final_submission_df = pd.merge(
    submission_template_df,
    prediction_df[["mutation_id", *submission_score_columns]],
    how="left",
    left_on=["hgvs"],
    right_on=["mutation_id"],
)

# The join must neither drop nor duplicate template rows.
assert len(final_submission_df) == len(submission_template_df)

In [None]:
# Directory that collects the files of the final submission.
output_dir = NOTEBOOK_DIR / "submission"
output_dir.mkdir(exist_ok=True)

# Echo the path as the cell output.
output_dir

In [None]:
%%file {output_dir}/strokach_desc.md
# Submission for CAGI6—Sherloc challenge

**Please note:**

Due to technical limitations and time constraints, we did not make predictions for all missense mutations.
It would be great if you could also try evaluating our submission using solely those mutations for which we made predictions (mutations without a prediction have a comment: "No prediction (dummy score)").

## Overview

- `strokach_modelnumber_1.tsv` → Predictions made using ELASPIC2 with AlphaFold [4] features for wildtype protein (trained using both Sherloc and humsavar data).
- `strokach_modelnumber_2.tsv` → Predictions made using ELASPIC2 with AlphaFold [4] features for wildtype protein (trained only using Sherloc data).
- `strokach_modelnumber_3.tsv` → Predictions made using ELASPIC2 with AlphaFold [4] features for wildtype and mutant proteins (trained only using Sherloc data).
- `strokach_modelnumber_4.tsv` → Predictions made using ELASPIC2 [1].
- `strokach_modelnumber_5.tsv` → Predictions made using ProteinSolver [2].
- `strokach_modelnumber_6.tsv` → Predictions made using ProtBert [3].
<!-- - `strokach_modelnumber_6.tsv` → Predictions made using Rosetta's cartesian_ddg protocol [5]. -->

## References

- [1] Strokach et al. (2021). _ELASPIC2 (EL2): Combining Contextualized Language Models and Graph Neural Networks to Predict Effects of Mutations._ https://doi.org/10.1016/j.jmb.2021.166810
- [2] Strokach et al. (2020). _Fast and Flexible Protein Design Using Deep Graph Neural Networks._ https://doi.org/10.1016/j.cels.2020.08.016
- [3] Elnaggar et al. (2020). _ProtTrans: Towards Cracking the Language of Life’s Code Through Self-Supervised Deep Learning and High Performance Computing._ https://doi.org/10.1101/2020.07.12.199554
- [4] Jumper et al. (2021). _Highly accurate protein structure prediction with AlphaFold._ https://doi.org/10.1038/s41586-021-03819-2
- [5] Park et al. (2016). _Simultaneous Optimization of Biomolecular Energy Functions on Features from Small Molecules and Macromolecules._ https://doi.org/10.1021/acs.jctc.6b00819

In [None]:
# fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2)

In [None]:
# Class cutoffs for the ML prediction columns, keyed by the model id embedded
# in the column name. They are applied after the optional rescaling below.
# NOTE(review): the provenance of these values is not shown in this notebook.
pred_cutoffs = {
    "f6be01c3": 0.3570548221184613,
    "7f9826be": 0.2830188679245283,
    "900500fe": 0.2830188679245283,
}
# Cutoff for every score column that is not an ML prediction.
default_cutoff = 0.5

# Write one `strokach_modelnumber_{k}.tsv` per score column, numbered by the
# column's position in `submission_score_columns`.
for submission_idx, score_column in enumerate(submission_score_columns):
    submission_df = final_submission_df.copy()
    submission_df["score"] = submission_df[score_column]

    # Rows without a prediction; they get a dummy score and a comment below.
    # (Previously the column name was printed twice; once is enough.)
    comment_mask = submission_df["score"].isnull()
    print(score_column, comment_mask.sum())

    # Min-max rescale scores that fall outside the [0, 1] range.
    if submission_df["score"].min() < 0 or submission_df["score"].max() > 1:
        print(f"Reweighting submission for {score_column}.")
        submission_df["score"] = submission_df["score"] - submission_df["score"].min()
        submission_df["score"] = submission_df["score"] / submission_df["score"].max()
    assert (
        submission_df["score"].min() >= 0 and submission_df["score"].max() <= 1
    ), f"scores of {score_column!r} are not within [0, 1] (or are all missing)"

    if score_column.startswith("pred_"):
        matching_cutoffs = [
            value for model_id, value in pred_cutoffs.items() if model_id in score_column
        ]
        if not matching_cutoffs:
            # ValueError is still an Exception, as raised before, but now it
            # names the offending column.
            raise ValueError(f"No cutoff defined for prediction column {score_column!r}.")
        cutoff = matching_cutoffs[0]
    else:
        cutoff = default_cutoff

    # Missing scores are set to the cutoff itself, so their class is 0.
    submission_df["score"] = submission_df["score"].fillna(cutoff)
    submission_df["class"] = (submission_df["score"] > cutoff).astype(int)

    submission_df.loc[comment_mask, "comment"] = "No prediction (dummy score)"

    display(submission_df.head(2))

    output_file = output_dir.joinpath(f"strokach_modelnumber_{submission_idx + 1}.tsv")
    submission_df[["hgvs", "score", "class", "comment"]].to_csv(
        output_file, sep="\t", index=False
    )

In [None]:
# Pack the submission directory into `<name>.tar.gz` next to it; the `cd`
# makes the archive contain paths relative to the parent directory.
!cd {output_dir.parent} && tar -czf {output_dir.name}.tar.gz {output_dir.name}