## Summary

---

## Imports

In [None]:
import functools
import json
import pickle
import shutil
import tempfile
import uuid
from collections import Counter
from pathlib import Path

import elaspic2 as el2
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import optuna.integration.lightgbm as olgb
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from scipy import stats
from sklearn import metrics, model_selection
from sklearn.decomposition import PCA
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm

In [None]:
# Widen pandas' display limits so wide / long result frames are shown in full.
# Fully-qualified option names are used on purpose: the bare "max_columns" /
# "max_rows" patterns match more than one option key in recent pandas
# (e.g. "styler.render.max_columns") and raise OptionError.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

## Parameters

In [None]:
# Keep an already-defined UNIQUE_ID (e.g. one set before this cell runs);
# otherwise generate a fresh 8-character identifier for this run.
if "UNIQUE_ID" not in globals():
    UNIQUE_ID = uuid.uuid4().hex[:8]

UNIQUE_ID

In [None]:
# Output directory for this notebook (resolved against the current working
# directory); every run writes into its own UNIQUE_ID sub-directory.
NOTEBOOK_DIR = Path("38_cagi6_sherloc_train_model").resolve()
(NOTEBOOK_DIR / UNIQUE_ID).mkdir(parents=True, exist_ok=True)

NOTEBOOK_DIR

In [None]:
# Selects which AlphaFold-derived features are added further down:
# None -> none, "wt" -> the "*_wt" features, "wt+mut" -> "*_wt" and "*_change".
USE_ALPHAFOLD = "wt"
assert USE_ALPHAFOLD in (None, "wt", "wt+mut")

USE_ALPHAFOLD

## Load results

In [None]:
# Combined results live in a sibling directory of NOTEBOOK_DIR.
input_file = (
    NOTEBOOK_DIR.parent / "37_cagi6_sherloc_combine_results" / "combined-results.parquet"
)

input_file

In [None]:
# Read the combined results into a DataFrame and show a two-row preview
# together with the total number of rows.
result_df = pq.read_table(input_file).to_pandas()

display(result_df.iloc[:2])
print(result_df.shape[0])

## Exploratory data analysis

In [None]:
# Group the result columns by the prefix of their name.
proteinsolver_columnms = [
    name for name in result_df.columns if name.startswith("proteinsolver_")
]
protbert_columns = [name for name in result_df.columns if name.startswith("protbert_")]
rosetta_columns = [name for name in result_df.columns if name.startswith("rosetta_")]
alphafold_columns = [name for name in result_df.columns if name.startswith("alphafold_")]

In [None]:
# Number of rows per distinct value of the "effect" column.
result_df["effect"].value_counts()

In [None]:
# Ordinal encoding of the "effect" labels, from -2 ("Benign") to +2 ("Pathogenic").
# Labels missing from this mapping become NaN in "effect_score".
effect_map = {
    "Benign": -2,
    "Likely benign": -1,
    "Uncertain significance": 0,
    "Likely pathogenic": 1,
    "Pathogenic": 2,
}

result_df["effect_score"] = result_df["effect"].map(effect_map)

In [None]:
# Columns evaluated one at a time as stand-alone predictors of the effect score.
score_columns = [
    "el2_score",
    "proteinsolver_core_score_change",
    "protbert_core_score_change",
    "rosetta_dg_change",
    "alphafold_core_scores_residue_plddt_wt",
    "alphafold_core_scores_proten_ptm_wt",
    #         "alphafold_core_scores_residue_plddt_change",
    #         "alphafold_core_scores_protein_max_predicted_aligned_error_wt",
]

# Keep rows where every score and the label are present, then drop the
# zero-score ("Uncertain significance") rows.
df = (
    result_df.dropna(subset=[*score_columns, "effect_score"])
    .loc[lambda frame: frame["effect_score"] != 0]
    .reset_index(drop=True)
)

# For each column print: Spearman correlation with the ordinal score, ROC AUC
# and average precision for separating positive from negative effect scores.
for col in score_columns:
    corr = stats.spearmanr(df["effect_score"], df[col])
    auc = metrics.roc_auc_score(df["effect_score"] > 0, df[col])
    precision = metrics.average_precision_score(df["effect_score"] > 0, df[col])
    print(col, corr[0], auc, precision)

In [None]:
# Second selection of score columns; only used here to report missing values.
score_columns = [
    "el2_score",
    "proteinsolver_core_score_change",
    "protbert_core_score_change",
    "msa_KL",
    "rosetta_dg_change",
    "alphafold_core_scores_residue_plddt_wt",
    "alphafold_core_scores_residue_plddt_change",
    #         "alphafold_core_scores_protein_plddt_wt",
    #         "alphafold_core_scores_protein_max_predicted_aligned_error_wt",
    #         "alphafold_core_scores_proten_ptm_wt",
]

# Print "<column> <number of missing values>" for every selected column.
for column in score_columns:
    print(f"{column} {result_df[column].isna().sum()}")

## Train ML models

### Prepare dataset

In [None]:
# Rosetta columns are matched by substring here (any column containing
# "rosetta"); MSA columns are matched by the "msa" prefix.
rosetta_columns = [name for name in result_df.columns if "rosetta" in name]
msa_columns = [name for name in result_df.columns if name.startswith("msa")]

len(rosetta_columns), len(msa_columns)

In [None]:
# Scalar AlphaFold features, selected according to USE_ALPHAFOLD.
alphafold_columns = []

if USE_ALPHAFOLD and "wt" in USE_ALPHAFOLD:
    alphafold_columns.extend(
        [
            "alphafold_core_scores_residue_plddt_wt",
            "alphafold_core_scores_proten_ptm_wt",
        ]
    )

if USE_ALPHAFOLD and "mut" in USE_ALPHAFOLD:
    alphafold_columns.extend(
        [
            "alphafold_core_scores_residue_plddt_change",
        ]
    )

In [None]:
# One-value-per-row model inputs: core scores plus the MSA, Rosetta and
# (optionally) AlphaFold column groups collected above.
scalar_features = [
    # "el2_score",
    "protbert_core_score_wt",
    "protbert_core_score_change",
    "proteinsolver_core_score_wt",
    "proteinsolver_core_score_change",
    *msa_columns,
    *rosetta_columns,
    *alphafold_columns,
]

# Persist the feature list alongside the models of this run.
with (NOTEBOOK_DIR / UNIQUE_ID / "scalar-features.json").open("wt") as fout:
    json.dump(scalar_features, fout)

In [None]:
# Array-valued model inputs; each column holds one vector per row.
vector_features = [
    "aa_wt_onehot",
    "aa_mut_onehot",
    "protbert_core_features_residue_wt",
    "protbert_core_features_residue_change",
    "proteinsolver_core_features_residue_wt",
    "proteinsolver_core_features_residue_change",
]


if USE_ALPHAFOLD and "wt" in USE_ALPHAFOLD:
    vector_features.extend(
        [
            "alphafold_core_features_residue_experimentally_resolved_wt",  # 0.19 [37]
            "alphafold_core_features_residue_predicted_lddt_wt",  # 0.17 [50]
            "alphafold_core_features_residue_msa_first_row_wt",  # 0.17 [256]
            "alphafold_core_features_residue_single_wt",  # 0.20 [384]
            "alphafold_core_features_residue_structure_module_wt",  # 0.18 [384]
        ]
    )

if USE_ALPHAFOLD and "mut" in USE_ALPHAFOLD:
    vector_features.extend(
        [
            "alphafold_core_features_residue_experimentally_resolved_change",  # 0.11 [37]
            "alphafold_core_features_residue_predicted_lddt_change",  # 0.04 [50]
            "alphafold_core_features_residue_msa_first_row_change",  # 0.21 [256]
            "alphafold_core_features_residue_single_change",  # 0.15 [384]
            "alphafold_core_features_residue_structure_module_change",  # 0.05 [384]
        ]
    )

# Persist the feature list alongside the models of this run.
with (NOTEBOOK_DIR / UNIQUE_ID / "vector-features.json").open("wt") as fout:
    json.dump(vector_features, fout)

In [None]:
# Build the training table: complete rows only, one row per (protein, mutation),
# sorted by protein so that rows of the same protein are contiguous (required
# for the LightGBM `group` sizes computed below).
df = (
    result_df.dropna(subset=scalar_features + vector_features + ["effect_score"])
    .drop_duplicates(subset=["protein_id", "mutation"])
    .sort_values("protein_id")
)
# Drop zero-score rows *before* resetting the index, so that df's index is
# 0..n-1 and matches the row positions of X / y / preds.
df = df[df["effect_score"] != 0].reset_index(drop=True)

# protein_map = {k: i for i, k in enumerate(df["protein_id"].unique())}
# groups = df["protein_id"].map(protein_map).values

# Number of rows per protein, in order of first appearance in df.
value_counts = df["protein_id"].value_counts()
groups = df["protein_id"].drop_duplicates().map(value_counts)
assert groups.sum() == len(df)

# Feature matrix: scalar columns followed by all vector columns, flattened
# side by side.
X_ref = np.c_[
    df[scalar_features].values,
    np.hstack([np.vstack(df[col].values) for col in vector_features]),
]
X = X_ref
# X = X[:, important_features]

# Positional boolean mask (plain NumPy array) of zero-score rows. With the
# filter above it is all False; it is kept because later cells index with it.
low_confidence_mask = (df["effect_score"] == 0).values

# Binary label: 1 for positive effect scores, 0 for negative ones. Rows in the
# low-confidence mask (if any) fall back to thresholding el2_score at 2.
y = (df["effect_score"] > 0).values.astype(int)
y[low_confidence_mask] = (df.loc[low_confidence_mask, "el2_score"] > 2).values.astype(int)
# y = df["effect_score"].values

assert X.shape[0] == len(y) == len(df) == len(low_confidence_mask)

# weights = np.ones(len(df), dtype=np.float64)
# weights[df["effect_score"] == 1] = 0.5
# weights[df["effect_score"] == -1] = 0.5
# weights[low_confidence_mask] = 0.05

In [None]:
# Number of samples per binary label in y.
Counter(y)

### Tune hyperparameters

In [None]:
# LightGBM parameters shared by every training / tuning run in this notebook.
CONST_PARAM = dict(
    objective="binary",
    metric="binary_logloss",
    is_unbalance=True,
)

In [None]:
def training_loop(X, y, param, n_splits=6, progressbar=False, groups=None):
    """Train one LightGBM model per group-wise CV fold and collect out-of-fold predictions.

    Args:
        X: Feature matrix (NumPy array); rows are selected positionally per fold.
        y: Label array aligned with the rows of ``X``.
        param: LightGBM parameter dict passed to ``lgb.train``.
        n_splits: Number of ``GroupKFold`` splits.
        progressbar: Show a tqdm progress bar over the folds when True.
        groups: One group label per row; rows sharing a label never end up in
            both the training and the test part of a fold. Defaults to the
            notebook-level ``df["protein_id"]`` (the previous, implicit behavior).

    Returns:
        ``(models, preds)``: the list of trained boosters (one per fold) and a
        float array of ``len(y)`` holding, for every row, the prediction of the
        model whose test fold contained that row.
    """
    if groups is None:
        # Backward-compatible default: fall back to the notebook-global frame.
        groups = df["protein_id"]

    models = []
    # NaN-initialised so that rows never assigned to a test fold stay visible.
    preds = np.full(len(y), np.nan, dtype=np.float64)
    gkf = GroupKFold(n_splits=n_splits)
    for train_index, test_index in tqdm(
        gkf.split(X, y, groups=groups),
        total=n_splits,
        disable=not progressbar,
    ):
        X_training, X_testing = X[train_index], X[test_index]
        y_training = y[train_index]
        # weights_training, weights_testing = weights[train_index], weights[test_index]

        dtrain = lgb.Dataset(
            X_training,
            label=y_training,
            # weight=weights_training,
        )
        model = lgb.train(param, dtrain)
        preds[test_index] = model.predict(X_testing)
        models.append(model)
    return models, preds

In [None]:
def objective(trial, X, y, low_confidence_mask):
    """Optuna objective: cross-validated average precision of a LightGBM model.

    Args:
        trial: Optuna trial used to sample the LightGBM hyperparameters.
        X: Feature matrix passed to ``training_loop``.
        y: Binary label array aligned with ``X``.
        low_confidence_mask: Boolean mask (array-like, one entry per row) of
            rows to leave out when scoring.

    Returns:
        Average precision of the out-of-fold predictions on the rows that are
        not flagged as low confidence.
    """
    # ``suggest_float`` replaces the deprecated ``suggest_loguniform`` /
    # ``suggest_uniform`` helpers; sampling ranges are unchanged.
    param = CONST_PARAM | {
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 64),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 60),
    }
    _, preds = training_loop(X, y, param)
    # Positional boolean mask, regardless of whether a Series or array was passed.
    high_confidence = ~np.asarray(low_confidence_mask, dtype=bool)
    return metrics.average_precision_score(y[high_confidence], preds[high_confidence])

In [None]:
# Initial hyperparameter search on the full feature matrix.
# `groups` holds the number of rows per protein (see the dataset cell), which
# lets GroupKFold keep all rows of a protein in the same fold.
dtrain = lgb.Dataset(X, label=y, group=groups)

# NOTE(review): the tuner is given a temporary `model_dir`, and the best
# booster is fetched while that directory still exists — presumably the
# boosters are stored there; confirm against the Optuna docs before moving
# `get_best_booster()` out of the `with` block.
with tempfile.TemporaryDirectory() as model_dir:
    tuner = olgb.LightGBMTunerCV(
        CONST_PARAM | {"verbosity": -1},
        dtrain,
        verbose_eval=200,
        early_stopping_rounds=250,
        folds=GroupKFold(n_splits=6),
        num_boost_round=1000,
        model_dir=model_dir,
        time_budget=60 * 60 * 2,  # presumably seconds (2 h) — TODO confirm
        return_cvbooster=True,
    )
    tuner.run()
    # `booster.boosters` is iterated in a later cell to save one model per fold.
    booster = tuner.get_best_booster()

In [None]:
# Keep the tuned parameters in `param`; the feature-elimination loop below
# trains with `param` until it re-tunes and overwrites it.
param = tuner.best_params

print(tuner.best_score)
print(param)

In [None]:
# Store the initially tuned parameters next to the other artifacts of this run.
with (NOTEBOOK_DIR / UNIQUE_ID / "best-parameters-starting.json").open("wt") as fout:
    json.dump(param, fout)

In [None]:
# Write every booster of the best CV run to its own text file ...
for bst_idx, bst in enumerate(booster.boosters):
    bst.save_model(str(NOTEBOOK_DIR / UNIQUE_ID / f"model-starting-{bst_idx}.txt"))

# ... and store the training dataset in LightGBM's binary format.
_ = dtrain.save_binary(str(NOTEBOOK_DIR / UNIQUE_ID / "training-data.bin"))

### Feature elimination

In [None]:
def get_feature_index(num_features, features_to_exclude):
    """Yield the first ``num_features`` non-negative integers not in ``features_to_exclude``.

    This maps column positions of a reduced feature matrix back to the column
    indices of the full matrix: the k-th yielded value is the original index
    of the k-th column that has not been excluded.

    Args:
        num_features: How many indices to yield.
        features_to_exclude: Iterable of excluded indices (a dict works too;
            only its keys are used). It is not modified.

    Yields:
        Non-excluded indices in ascending order.
    """
    excluded = set(features_to_exclude)
    candidate = 0
    # Single forward scan: every candidate is visited once, instead of
    # re-walking the already-consumed indices for each yielded value.
    for _ in range(num_features):
        while candidate in excluded:
            candidate += 1
        yield candidate
        candidate += 1


list(get_feature_index(10, {1, 2, 3, 9, 10}))

In [None]:
# Iterative feature elimination: train on the remaining features, record the
# cross-validated statistics, drop the least important features and repeat
# until every column of X_ref has been excluded.
feature_elimination_stats_file = NOTEBOOK_DIR.joinpath(
    UNIQUE_ID, "feature-elimination-stats.pickle"
)
# Create the stats file up front so the backup copy made in each round has a
# source file on the first round as well.
feature_elimination_stats_file.touch()

# Maps X_ref column index -> round in which that column was eliminated.
features_to_exclude = {}

fe_round = -1
fe_round_stats = []
# Feature-count cutoffs at which hyperparameters are re-tuned; the value is the
# tuned parameter dict, or None while that cutoff has not been handled yet.
best_params = {3000: None, 2000: None, 1000: None, 500: None}
highest_precision = None
while len(features_to_exclude) < X_ref.shape[1]:
    fe_round += 1

    # Apply feature elimination mask
    feature_mask = np.ones(X_ref.shape[1], dtype=bool)
    feature_mask[np.array(list(features_to_exclude), dtype=int)] = False
    X = X_ref[:, feature_mask]
    assert len(features_to_exclude) == X_ref.shape[1] - X.shape[1]

    # Retune parameters
    # NOTE(review): every untuned cutoff above the current feature count
    # triggers its own tuning run, so crossing several cutoffs in one round
    # tunes several times; only the last result stays in `param`.
    # NOTE(review): when no cutoff triggers, `param` keeps its previous value
    # (initially the one assigned in the earlier tuning cell).
    for cutoff, best_param in list(best_params.items()):
        if X.shape[1] < cutoff and best_param is None:
            dtrain = lgb.Dataset(X, label=y, group=groups)
            tuner = olgb.LightGBMTunerCV(
                CONST_PARAM | {"verbosity": -1},
                dtrain,
                verbose_eval=200,
                early_stopping_rounds=250,
                folds=GroupKFold(n_splits=6),
                num_boost_round=1000,
                time_budget=60 * 60 * 2,
            )
            tuner.run()
            best_params[cutoff] = tuner.best_params
            param = tuner.best_params
            with NOTEBOOK_DIR.joinpath(UNIQUE_ID, f"best-parameters-{cutoff}.json").open(
                "wt"
            ) as fout:
                json.dump(param, fout)

    # Re-train models
    models, preds = training_loop(X, y, param, progressbar=True)

    # Calculate model statistics
    # (computed on the rows not flagged in low_confidence_mask)
    corr = stats.spearmanr(y[~low_confidence_mask], preds[~low_confidence_mask])
    auc = metrics.roc_auc_score(y[~low_confidence_mask], preds[~low_confidence_mask])
    precision = metrics.average_precision_score(
        y[~low_confidence_mask], preds[~low_confidence_mask]
    )

    # Save stats
    # Tuple layout: (round, spearman, auc, average precision,
    #                n excluded, n remaining, excluded feature indices)
    round_stats = (
        fe_round,
        corr[0],
        auc,
        precision,
        len(features_to_exclude),
        X.shape[1],
        list(features_to_exclude),
    )
    print(round_stats[:-1])
    fe_round_stats.append(round_stats)
    # Back up the previous stats file before overwriting it with the full
    # history of all rounds so far.
    shutil.copyfile(
        feature_elimination_stats_file, feature_elimination_stats_file.with_suffix(".pickle.bak")
    )
    with feature_elimination_stats_file.open("wb") as fout:
        pickle.dump(fe_round_stats, fout, pickle.HIGHEST_PROTOCOL)

    # Save best models
    # Models are written only for rounds that improve on the best average
    # precision seen so far; file names carry the round and fold index.
    if highest_precision is None or precision > highest_precision:
        highest_precision = precision
        for model_idx, model in enumerate(models):
            model.save_model(
                str(NOTEBOOK_DIR.joinpath(UNIQUE_ID, f"model-{fe_round}-{model_idx}.txt"))
            )

    # Find new features to eliminate
    # Importances are summed over the per-fold models.
    feature_importance_split = np.vstack(
        [model.feature_importance("split") for model in models]
    ).sum(axis=0)
    feature_importance_gain = np.vstack([model.feature_importance("gain") for model in models]).sum(
        axis=0
    )

    # "feature_idx" translates positions in the reduced matrix X back to
    # column indices of X_ref.
    feature_df = pd.DataFrame(
        {
            "feature_idx": list(
                get_feature_index(len(feature_importance_split), features_to_exclude)
            ),
            "feature_importance_split": feature_importance_split,
            "feature_importance_gain": feature_importance_gain,
        }
    )

    # Drop every feature that was never used in a split; if there is none,
    # drop the lowest-ranked ~1% (at least one), ordered by split count and
    # then by gain.
    if (feature_df["feature_importance_split"] == 0).any():
        feature_df = feature_df[feature_df["feature_importance_split"] == 0]
    else:
        num_features_to_drop = max(1, len(feature_df) // 100)
        feature_df = feature_df.sort_values(
            ["feature_importance_split", "feature_importance_gain"], ascending=True
        ).iloc[:num_features_to_drop]
    new_features_to_exclude = feature_df["feature_idx"].values.tolist()

    for feature_idx in new_features_to_exclude:
        assert feature_idx not in features_to_exclude
        features_to_exclude[feature_idx] = fe_round

In [None]:
# Marker printed once the notebook has run to the end.
print("Done")