## Summary

---

## Imports

In [None]:
import json
import pickle
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from scipy import stats
from sklearn import metrics
from sklearn.model_selection import GroupKFold
from tqdm.auto import tqdm

In [None]:
# Let wide/long frames render in full when inspected.
# The fully-qualified "display.*" keys are used on purpose: the bare
# "max_columns"/"max_rows" patterns also match "styler.render.max_columns"/
# "styler.render.max_rows" in recent pandas, which makes set_option raise
# OptionError ("Pattern matched multiple keys").
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

## Parameters

In [None]:
# Run identifier: names this notebook's output sub-directory and selects the
# matching sub-directory of "38_cagi6_sherloc_train_model" to read inputs from.
UNIQUE_ID = "7f9826be"

UNIQUE_ID

In [None]:
# Directory for this notebook's outputs, resolved against the current working
# directory. The per-run sub-directory is created up front (no error if it
# already exists) so later cells can write into it.
NOTEBOOK_DIR = Path("39_cagi6_sherloc_finetune_model").resolve()
(NOTEBOOK_DIR / UNIQUE_ID).mkdir(parents=True, exist_ok=True)

NOTEBOOK_DIR

In [None]:
# Name of the dataset this notebook works on.
# NOTE(review): not referenced by any other cell visible here — confirm it is still needed.
DATASET_NAME = "cagi6-sherloc"

DATASET_NAME

## Load data

### Results

In [None]:
# Parquet file with the combined results, located in a sibling notebook
# directory next to NOTEBOOK_DIR.
input_file = (
    NOTEBOOK_DIR.parent
    / "37_cagi6_sherloc_combine_results"
    / "combined-results.parquet"
)

input_file

In [None]:
# Read the combined results into pandas, then preview the first two rows and
# report the number of rows.
result_df = pq.read_table(input_file).to_pandas()

display(result_df.iloc[:2])
print(result_df.shape[0])

In [None]:
# Ordinal encoding of the "effect" labels, from most benign (-2) to most
# pathogenic (+2); "Uncertain significance" sits at 0.
effect_map = {
    "Benign": -2,
    "Likely benign": -1,
    "Uncertain significance": 0,
    "Likely pathogenic": 1,
    "Pathogenic": 2,
}

# Labels that are not keys of effect_map come out as NaN.
effect_labels = result_df["effect"]
result_df["effect_score"] = effect_labels.map(effect_map)

### Features

In [None]:
# Both feature-name lists were written by the training notebook for the same
# UNIQUE_ID; read them from its output directory.
trained_model_dir = NOTEBOOK_DIR.parent / "38_cagi6_sherloc_train_model" / UNIQUE_ID

with (trained_model_dir / "scalar-features.json").open("rb") as fin:
    scalar_features = json.load(fin)

with (trained_model_dir / "vector-features.json").open("rb") as fin:
    vector_features = json.load(fin)

len(scalar_features), len(vector_features)

### Stats

In [None]:
# Load the feature-elimination statistics written by the training notebook for
# this UNIQUE_ID. `data` is turned into a DataFrame in the "Find best row" section.
# NOTE(review): pickle.load executes arbitrary code from the file — only load
# pickles produced by a trusted run of the training notebook.
with NOTEBOOK_DIR.parent.joinpath(
    "38_cagi6_sherloc_train_model", UNIQUE_ID, "feature-elimination-stats.pickle"
).open("rb") as fin:
    data = pickle.load(fin)

### Training dataframe

In [None]:
# Build the training frame: rows with every feature and a label present, one
# row per (protein, mutation), ordered by protein, without "Uncertain
# significance" rows (effect_score == 0).
# The effect_score filter is applied after de-duplication and sorting (same
# order of operations as before) but *before* reset_index, so the frame ends
# up with a contiguous 0..n-1 index that lines up with the positional numpy
# arrays (X, y, weights) built from it below.
required_columns = scalar_features + vector_features + ["effect_score"]
training_df = (
    result_df.dropna(subset=required_columns)
    .drop_duplicates(subset=["protein_id", "mutation"])
    .sort_values("protein_id")
    .loc[lambda frame: frame["effect_score"] != 0]
    .reset_index(drop=True)
)

# Number of rows per protein, looked up once per distinct protein.
# NOTE(review): `groups` has one entry per unique protein (not one per row), so
# it cannot be passed as `groups=` to GroupKFold; the cross-validation code uses
# training_df["protein_id"] directly. Confirm whether `groups` is still needed.
value_counts = training_df["protein_id"].value_counts()
groups = training_df["protein_id"].drop_duplicates().map(value_counts)

# Feature matrix: scalar columns followed by each vector column expanded to
# one matrix column per vector element.
X_ref = np.c_[
    training_df[scalar_features].values,
    np.hstack([np.vstack(training_df[col].values) for col in vector_features]),
]
X = X_ref

# Rows with effect_score == 0 would be "low confidence". With the filter above
# this mask is all False, but it is kept (as a plain boolean array, since it is
# used to index numpy arrays) because later cells evaluate on ~low_confidence_mask.
low_confidence_mask = (training_df["effect_score"] == 0).to_numpy()

# Binary target: 1 for (likely) pathogenic, 0 for (likely) benign.
# Low-confidence rows, if any, are labelled from el2_score instead.
y = (training_df["effect_score"] > 0).values.astype(int)
y[low_confidence_mask] = (
    training_df.loc[low_confidence_mask, "el2_score"] > 2
).values.astype(int)

# Sample weights: full weight for Benign/Pathogenic, half weight for the
# "Likely ..." labels, and a small weight for low-confidence rows.
weights = np.ones(len(training_df), dtype=np.float64)
weights[(training_df["effect_score"] == 1).to_numpy()] = 0.5
weights[(training_df["effect_score"] == -1).to_numpy()] = 0.5
weights[low_confidence_mask] = 0.05

## Find best row

In [None]:
# Column names for the records loaded from the feature-elimination pickle,
# in the order the fields appear in each record.
feature_elimination_columns = [
    "round",
    "spearman_r",
    "auc",
    "precision",
    "num_excluded_features",
    "num_present_features",
    "features_to_exclude",
]

feature_elimination_df = pd.DataFrame(data, columns=feature_elimination_columns)

In [None]:
# Plot precision against elimination round, leaving out the last 24 rows.
# NOTE(review): the reason for the 24-row cutoff is not recorded in this
# notebook — confirm and document it.
df = feature_elimination_df.iloc[:-24]

# Explicit axes with labels so the figure can be read on its own; the trailing
# semicolon suppresses the text repr of the last call.
fig, ax = plt.subplots()
ax.plot(df["round"], df["precision"])
ax.set_xlabel("round")
ax.set_ylabel("precision")
ax.set_title("Precision by feature-elimination round");

In [None]:
# Same data as the previous plot, with the number of features still present on
# the x axis. Explicit axes with labels so the figure can be read on its own;
# the trailing semicolon suppresses the text repr of the last call.
fig, ax = plt.subplots()
ax.plot(df["num_present_features"], df["precision"])
ax.set_xlabel("num_present_features")
ax.set_ylabel("precision")
ax.set_title("Precision by number of features present");

In [None]:
# Pick the elimination round with the highest precision. The maximum is taken
# over all rounds (not the truncated `df` used for plotting), and the result
# is a position, hence .iloc.
precision_by_round = feature_elimination_df["precision"]
best_row_idx = np.argmax(precision_by_round)

best_row = feature_elimination_df.iloc[best_row_idx]

best_row

## Load best models

In [None]:
# Drop the columns that the best elimination round excluded: start from an
# all-True column mask and switch off the excluded column positions.
excluded_columns = np.asarray(best_row["features_to_exclude"], dtype=int)
feature_mask = np.full(X_ref.shape[1], True)
feature_mask[excluded_columns] = False
X = X_ref[:, feature_mask]
# Sanity check: exactly one column was removed per excluded feature.
assert X_ref.shape[1] - X.shape[1] == len(best_row["features_to_exclude"])

In [None]:
# Hyperparameters written by the training notebook for this UNIQUE_ID; they
# are enqueued as the first trial of the tuning study below.
best_parameters_file = (
    NOTEBOOK_DIR.parent
    / "38_cagi6_sherloc_train_model"
    / UNIQUE_ID
    / "best-parameters-500.json"
)
with best_parameters_file.open("rb") as fin:
    best_parameters = json.load(fin)

best_parameters

In [None]:
# Raise both regularisation strengths to at least 1e-8, the lower bound of the
# log-scale search range used for them in `objective`.
for reg_name in ("lambda_l1", "lambda_l2"):
    best_parameters[reg_name] = max(best_parameters[reg_name], 1e-8)

## Tune hyperparameters

In [None]:
# LightGBM settings shared by every training run; the tuned hyperparameters
# are merged on top of these.
CONST_PARAM = dict(
    objective="binary",
    metric="average_precision",
    verbose=-1,
)

In [None]:
def training_loop(
    X,
    y,
    weights,
    param,
    n_splits=6,
    progressbar=False,
    groups=None,
    num_boost_round=1000,
):
    """Train one LightGBM model per grouped CV fold and collect out-of-fold predictions.

    Args:
        X: Feature matrix, indexed positionally by row.
        y: Target array with one entry per row of ``X``.
        weights: Per-row sample weights passed to the LightGBM training set.
        param: LightGBM parameter dict passed to ``lgb.train``.
        n_splits: Number of ``GroupKFold`` folds.
        progressbar: Show a tqdm progress bar over the folds when True.
        groups: Per-row group labels; rows sharing a label never end up in
            both the training and the test side of a fold. Defaults to
            ``training_df["protein_id"]`` (the notebook-level frame), which
            is what this function used before the parameter existed.
        num_boost_round: Number of boosting rounds for each model.

    Returns:
        ``(models, preds)``: the list of trained boosters (one per fold) and
        a float array where each row holds the prediction made by the model
        that did not see that row during training.
    """
    if groups is None:
        groups = training_df["protein_id"]

    models = []
    # NaN-initialised so that any row left without a prediction is detectable.
    preds = np.full(len(y), np.nan, dtype=np.float64)
    folds = GroupKFold(n_splits=n_splits).split(X, y, groups=groups)
    for train_index, test_index in tqdm(
        folds, total=n_splits, disable=not progressbar
    ):
        dtrain = lgb.Dataset(
            X[train_index], label=y[train_index], weight=weights[train_index]
        )
        model = lgb.train(param, dtrain, num_boost_round=num_boost_round)
        preds[test_index] = model.predict(X[test_index])
        models.append(model)
    return models, preds

In [None]:
def objective(trial, X, y, weights, low_confidence_mask):
    """Optuna objective: cross-validated average precision of one LightGBM configuration.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature matrix passed to ``training_loop``.
        y: Binary target array.
        weights: Per-row sample weights.
        low_confidence_mask: Boolean mask of rows to leave out of the score;
            they are still used for training.

    Returns:
        Average precision of the out-of-fold predictions on the rows where
        ``low_confidence_mask`` is False (to be maximised).
    """
    # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform;
    # names and ranges are unchanged, so previously tuned parameters stay valid.
    param = CONST_PARAM | {
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 300),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.0, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.0, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 7),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 200),
    }
    _, preds = training_loop(X, y, weights, param)
    scored_rows = ~low_confidence_mask
    return metrics.average_precision_score(y[scored_rows], preds[scored_rows])

In [None]:
# Seed the sampler so that Restart & Run All repeats the same search.
# TPESampler is the sampler create_study uses by default for a single-objective
# study, so only the seeding changes, not the search algorithm.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="maximize", sampler=sampler)
# Start from the parameters found by the training notebook so the search can
# only match or improve on them.
study.enqueue_trial(best_parameters)
study.optimize(
    lambda trial: objective(trial, X, y, weights, low_confidence_mask),
    n_trials=200,
)

print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.params)

In [None]:
# Keep the hyperparameters of the best trial for the final cross-validated fit.
best_trial = study.best_trial
final_best_parameters = best_trial.params

print("Number of finished trials:", len(study.trials))
print("Best trial:", final_best_parameters)

## Validation 

In [None]:
# Refit one model per fold with the tuned hyperparameters and score the
# out-of-fold predictions on the rows that are not low-confidence.
tuned_param = CONST_PARAM | final_best_parameters
models, preds = training_loop(X, y, weights, tuned_param, progressbar=True)

scored_rows = ~low_confidence_mask
y_scored, preds_scored = y[scored_rows], preds[scored_rows]

corr = stats.spearmanr(y_scored, preds_scored)
auc = metrics.roc_auc_score(y_scored, preds_scored)
precision = metrics.average_precision_score(y_scored, preds_scored)
print(corr[0], auc, precision)

## Save best parameters and model

In [None]:
output_dir = NOTEBOOK_DIR.joinpath(UNIQUE_ID)

# Save the *tuned* hyperparameters: the models written below were trained with
# final_best_parameters, so saving the pre-tuning `best_parameters` would leave
# a parameter file that does not describe the saved models.
with output_dir.joinpath("best-parameters.json").open("wt") as fout:
    json.dump(final_best_parameters, fout)

# Column positions (in the full feature matrix) that were dropped before
# training. Coerced to plain ints so numpy integer types cannot break json.dump.
with output_dir.joinpath("best-features-to-exclude.json").open("wt") as fout:
    json.dump([int(idx) for idx in best_row["features_to_exclude"]], fout)

# One model file per cross-validation fold.
for model_idx, model in enumerate(models):
    model.save_model(str(output_dir.joinpath(f"best-model-{model_idx}.txt")))