## Summary

```bash
export NOTEBOOK_PATH="$(realpath 39_cagi6_sherloc_finetune_model.ipynb)"

export UNIQUE_ID="7f2ea4f1"
sbatch --export NOTEBOOK_PATH,UNIQUE_ID --job-name="finetune-${UNIQUE_ID}" --time 24:00:00 --ntasks-per-node 40 --mem=32G --array=1-1 ../scripts/run_notebook_cpu.sh
```

---

## Imports

In [None]:
import json
import os
import pickle
import sys
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from scipy import stats
from sklearn import metrics, model_selection
from tqdm.auto import tqdm

In [None]:
# Show wide / long frames in full when they are displayed further down.
# Fully-qualified option names are required: the bare "max_columns" / "max_rows"
# patterns also match "styler.render.max_columns" / "styler.render.max_rows" in
# pandas >= 1.4, which makes set_option raise OptionError (pattern matched
# multiple keys).
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

## Parameters

In [None]:
# Output directory for this notebook. The relative name is resolved against the
# current working directory, so the notebook must be run from the folder that
# contains it for the sibling-notebook paths used below to line up.
NOTEBOOK_DIR = Path("39_cagi6_sherloc_finetune_model").resolve()
NOTEBOOK_DIR.mkdir(exist_ok=True)

NOTEBOOK_DIR

In [None]:
# Put the `src` directory that sits two levels above NOTEBOOK_DIR at the front
# of sys.path (once), so project modules can be imported.
src_dir = str(NOTEBOOK_DIR.parents[1].joinpath("src"))
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

# Imported here rather than in the imports cell: presumably `helpers` lives in
# src/ and is only importable after the sys.path tweak above.
import helpers

In [None]:
# Number of Optuna trials to run in this process (lowered below for debug runs).
N_TRIALS = 100

N_TRIALS

In [None]:
# UNIQUE_ID names the per-run sub-directories read from and written to below.
# SLURM_ARRAY_TASK_ID is read only to detect whether we run as an array task.
UNIQUE_ID = os.getenv("UNIQUE_ID")
TASK_ID = os.getenv("SLURM_ARRAY_TASK_ID")

UNIQUE_ID, TASK_ID

In [None]:
# No array task id -> treat this as an interactive / debug run.
DEBUG = TASK_ID is None

if DEBUG:
    # Debug runs use a fixed run id (overriding any UNIQUE_ID from the
    # environment) and only a couple of trials so the notebook finishes quickly.
    UNIQUE_ID = "40bd478e"
    N_TRIALS = 2
else:
    # Batch runs must be told which run directory to use.
    assert UNIQUE_ID is not None

UNIQUE_ID, N_TRIALS, DEBUG

In [None]:
# Per-run output directory: <NOTEBOOK_DIR>/<UNIQUE_ID>.
NOTEBOOK_DIR.joinpath(UNIQUE_ID).mkdir(parents=True, exist_ok=True)

In [None]:
# SQLite file used as the Optuna storage backend for this run's study.
optuna_db_file = NOTEBOOK_DIR.joinpath(UNIQUE_ID, "optuna.db").resolve()

optuna_db_file

## Load data

### Results

In [None]:
# Both inputs are written by sibling notebooks into folders next to NOTEBOOK_DIR.
results_root = NOTEBOOK_DIR.parent

input_file_1 = results_root / "37_cagi6_sherloc_combine_results" / "combined-results.parquet"
input_file_2 = results_root / "37_humsavar_combine_results" / "combined-results.parquet"

input_file_1, input_file_2

In [None]:
# Load the two combined-results tables into pandas.
result_1_df = pq.read_table(input_file_1).to_pandas()
result_2_df = pq.read_table(input_file_2).to_pandas()

In [None]:
# How many rows of the first table have no protein_id?
result_1_df["protein_id"].isnull().sum()

In [None]:
# Drop those rows. Only the first table is filtered here; the second table is
# left as loaded.
result_1_df = result_1_df.dropna(subset=["protein_id"])

In [None]:
# Columns present in both tables vs. columns present in exactly one of them.
common_columns = set(result_1_df) & set(result_2_df)
mismatched_columns = set(result_1_df) ^ set(result_2_df)

len(common_columns), len(mismatched_columns)

In [None]:
# Inspect which columns will be lost when the tables are combined.
mismatched_columns

In [None]:
# Stack the two result tables, keeping only the columns they share.
# A set cannot be used as a column indexer (deprecated in pandas 1.5, TypeError
# in pandas 2.0) and its iteration order is arbitrary, so convert it to a sorted
# list: this works on every pandas version and gives a reproducible column order.
shared_columns = sorted(common_columns)
result_df = pd.concat(
    [result_1_df[shared_columns], result_2_df[shared_columns]],
    ignore_index=True,
)
# Release the per-source frames; only the combined table is used from here on.
del result_1_df, result_2_df

display(result_df.head(2))
print(len(result_df))

In [None]:
# Clinical-significance labels grouped by the numeric score they map to:
# 0 = uncertain, -1 = (likely) benign, 1 = (likely) pathogenic.
labels_by_score = {
    0: ("Uncertain significance", "US"),
    -1: ("Likely benign", "Benign", "LB/B"),
    1: ("Likely pathogenic", "Pathogenic", "LP/P"),
}

# Flatten into the label -> score lookup.
effect_map = {
    label: score for score, labels in labels_by_score.items() for label in labels
}

# Labels that are not in the lookup become NaN in effect_score.
result_df["effect_score"] = result_df["effect"].map(effect_map)

In [None]:
# Class balance of the mapped scores (rows with unmapped labels are NaN and not counted).
result_df["effect_score"].value_counts()

### Features

In [None]:
# Feature definitions written by the training notebook for this run.
train_model_dir = NOTEBOOK_DIR.parent.joinpath("38_cagi6_sherloc_train_model", UNIQUE_ID)

# Read the three JSON files as bytes and decode them, in the same order as the
# names they are bound to.
scalar_features, vector_features, column_group_map = (
    json.loads(train_model_dir.joinpath(json_name).read_bytes())
    for json_name in (
        "scalar-features.json",
        "vector-features.json",
        "column-group-map.json",
    )
)

len(scalar_features), len(vector_features), len(column_group_map)

### Stats

In [None]:
# Feature-elimination statistics written by the training notebook for this run.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles produced by trusted runs of that notebook.
with NOTEBOOK_DIR.parent.joinpath(
    "38_cagi6_sherloc_train_model", UNIQUE_ID, "feature-elimination-stats.pickle"
).open("rb") as fin:
    feature_elimination_stats = pickle.load(fin)

### Training dataframe

In [None]:
# A row is usable only if every model feature and the label are present.
required_columns = scalar_features + vector_features + ["effect_score"]

training_df = result_df.dropna(subset=required_columns)
# Keep the first record for each (protein, mutation) pair.
training_df = training_df.drop_duplicates(subset=["protein_id", "mutation"])
# Order by protein and renumber the rows.
training_df = training_df.sort_values("protein_id").reset_index(drop=True)
# Discard variants scored 0 (uncertain significance). This runs after the index
# reset, so the resulting index has gaps.
training_df = training_df[training_df["effect_score"].ne(0)]

In [None]:
# Expand the columns listed in vector_features using the project helper.
# This rebinds column_group_map, replacing the value loaded from
# column-group-map.json earlier in the notebook.
# NOTE(review): helpers.expand_arrays is not visible here; the commented-out cell
# that follows looks like its inline predecessor (one scalar column per array
# element) - confirm against the helper.
training_df, column_group_map, expanded_vector_features = helpers.expand_arrays(
    training_df, vector_features
)

In [None]:
# row = next(training_df[vector_features].itertuples(index=False))._asdict()

# column_group_map = {}
# expanded_vector_features = []
# for column, data in row.items():
#     new_columns = [f"{column}_{i}" for i in range(len(data))]
#     training_df[new_columns] = np.vstack(training_df[column].values)
#     del training_df[column]
#     column_group_map |= {nc: column for nc in new_columns}
#     expanded_vector_features += new_columns

In [None]:
# training_df = training_df.copy()

In [None]:
# Number of training rows per protein, listed in the order in which each protein
# first appears in training_df.
# NOTE(review): `groups` is not referenced again in the visible notebook - the
# cross-validation below is driven by `protein_ids` instead; confirm it is still needed.
value_counts = training_df["protein_id"].value_counts()
groups = training_df["protein_id"].drop_duplicates().map(value_counts).values

# Per-row protein id, passed as the `groups` argument of training_loop further down.
protein_ids = training_df["protein_id"]

In [None]:
# Class balance of the training set (score 0 was filtered out above).
training_df["effect_score"].value_counts()

In [None]:
# Number of training rows.
len(training_df)

## Find best row

In [None]:
# Column names for the records in feature_elimination_stats, in field order.
feature_elimination_columns = [
    "round",
    "spearman_r",
    "auc",
    "precision",
    "num_present_features",
    "present_features",
]

# One row per feature-elimination round.
feature_elimination_df = pd.DataFrame(
    feature_elimination_stats, columns=feature_elimination_columns
)

In [None]:
# Plot all but the last 24 elimination rounds.
# NOTE(review): the reason for cutting exactly 24 rounds is not recorded here -
# presumably the final rounds distort the plot scale; confirm. The cut applies
# to the plots only; best-row selection below uses the full table.
df = feature_elimination_df.iloc[:-24]

# Precision as a function of elimination round.
plt.plot(df["round"], df["precision"])

In [None]:
# Precision as a function of the number of features still present.
plt.plot(df["num_present_features"], df["precision"])

In [None]:
# Position of the elimination round with the highest precision value, taken
# over the full table (not the truncated `df` used for plotting).
best_row_idx = np.argmax(feature_elimination_df["precision"])

# Positional lookup, matching the positional result of argmax.
best_row = feature_elimination_df.iloc[best_row_idx]

best_row

In [None]:
# sorted(best_row["present_features"])

## Load best models

In [None]:
# Feature matrix restricted to the features that survived in the best round.
selected_features = best_row["present_features"]
X = training_df[selected_features].to_numpy()

# Binary target: 1 where effect_score is positive, 0 otherwise.
y = training_df["effect_score"].gt(0).to_numpy().astype(int)

In [None]:
# Checkpoint labels used in the "best-parameters-<N>.json" file names, in
# descending order.
# NOTE(review): presumably these are the feature counts at which the training
# notebook saved tuned hyperparameters - confirm against that notebook.
checkpoint_intervals = np.array([3500, 2000, 1000, 500])

# Checkpoints at or above the best row's feature count. Because the array is
# descending, the last eligible entry is the smallest checkpoint that still
# covers the selected number of features.
eligible_checkpoints = checkpoint_intervals[
    (checkpoint_intervals - best_row["num_present_features"]) >= 0
]

# If the best row has more features than the largest checkpoint, nothing is
# eligible and indexing [-1] would raise IndexError. Fall back to the label
# "starting", which makes the loader below read "best-parameters-starting.json".
checkpoint_to_load = (
    eligible_checkpoints[-1] if len(eligible_checkpoints) > 0 else "starting"
)

checkpoint_to_load, best_row["num_present_features"], checkpoint_intervals

In [None]:
# Hyperparameter files written by the training notebook for this run.
parameters_dir = NOTEBOOK_DIR.parent.joinpath("38_cagi6_sherloc_train_model", UNIQUE_ID)

try:
    # Preferred: the parameters saved at the selected checkpoint.
    best_parameters = json.loads(
        parameters_dir.joinpath(f"best-parameters-{checkpoint_to_load}.json").read_bytes()
    )
except FileNotFoundError as e:
    # That checkpoint was never written: report it and use the starting parameters.
    print(e)
    best_parameters = json.loads(
        parameters_dir.joinpath("best-parameters-starting.json").read_bytes()
    )

best_parameters

In [None]:
# Raise both regularisation strengths to at least 1e-8, the lower bound of the
# log-uniform search range used for them in `objective`.
for reg_name in ("lambda_l1", "lambda_l2"):
    if best_parameters[reg_name] < 1e-8:
        best_parameters[reg_name] = 1e-8

## Tune hyperparameters

In [None]:
# LightGBM parameters held fixed for every run; they are merged with the tuned
# parameters via `CONST_PARAM | {...}` before training.
CONST_PARAM = {
    "objective": "binary",
    #     "metric": "binary_logloss",
    "metric": "average_precision",
    # "is_unbalance": True,
}

In [None]:
def training_loop(
    X, y, groups, param, n_splits=6, progressbar=False, num_boost_round=1000
):
    """Train one LightGBM model per GroupKFold split and collect out-of-fold predictions.

    Args:
        X: Feature array with one row per sample; must support integer-array
            indexing (callers pass a numpy array).
        y: Label array aligned with the rows of ``X``.
        groups: Group label per sample, forwarded to ``GroupKFold.split`` so the
            folds are formed by group.
        param: LightGBM parameter dict passed to ``lgb.train``.
        n_splits: Number of cross-validation folds.
        progressbar: Show a tqdm progress bar over the folds when True.
        num_boost_round: Boosting rounds per fold. Defaults to 1000, the value
            that used to be hard-coded.

    Returns:
        ``(models, preds)``: the trained boosters in fold order, and a float64
        array of length ``len(y)`` where each entry is filled from the fold in
        which that sample was in the test split.
    """
    models = []
    # Start from NaN so an entry that is never written is easy to spot.
    preds = np.full(len(y), np.nan, dtype=np.float64)
    gkf = model_selection.GroupKFold(n_splits=n_splits)
    for train_index, test_index in tqdm(
        gkf.split(X, y, groups=groups),
        total=n_splits,
        disable=not progressbar,
    ):
        dtrain = lgb.Dataset(X[train_index], label=y[train_index])
        model = lgb.train(param, dtrain, num_boost_round=num_boost_round)
        preds[test_index] = model.predict(X[test_index])
        models.append(model)
    return models, preds

In [None]:
def objective(trial, X, y, groups=None):
    """Optuna objective: cross-validated average precision of one sampled configuration.

    Args:
        trial: Optuna trial used to sample the LightGBM hyperparameters.
        X: Feature array passed through to ``training_loop``.
        y: Binary label array aligned with ``X``.
        groups: Group label per sample for the grouped cross-validation.
            Defaults to the notebook-level ``protein_ids``, which is what this
            function always used before the parameter existed.

    Returns:
        Average precision of the out-of-fold predictions (the study maximises it).
    """
    if groups is None:
        groups = protein_ids

    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # names, ranges and log-scaling of every parameter are unchanged.
    # NOTE(review): LightGBM documents feature_fraction and bagging_fraction as
    # requiring values > 0.0; the 0.0 lower bounds are kept as-is so the search
    # space matches trials already stored in the study.
    param = CONST_PARAM | {
        "verbosity": -1,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 300),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.0, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.0, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 7),
        "min_data_in_leaf": trial.suggest_int(
            "min_data_in_leaf", 5, 200
        ),  # aka: min_child_samples
    }
    # Only the out-of-fold predictions are needed to score the trial; the
    # accuracy / ROC-AUC values previously computed here were never used.
    _, preds = training_loop(X, y, groups, param)
    return metrics.average_precision_score(y, preds)

In [None]:
# Create the study in the run's SQLite file, or reopen it if it already exists.
study = optuna.create_study(
    study_name=UNIQUE_ID,
    storage=f"sqlite:///{optuna_db_file}",
    load_if_exists=True,
    direction="maximize",
)

# Hand-picked configuration that is queued alongside the loaded parameters.
manual_seed_parameters = {
    "lambda_l1": 0.004151913200216491,
    "lambda_l2": 2.5281035276022037e-05,
    "num_leaves": 227,
    "feature_fraction": 0.46198848429912065,
    "bagging_fraction": 0.899175197391076,
    "bagging_freq": 1,
    "min_data_in_leaf": 161,
}

# Queue the known configurations so they are evaluated before sampled ones.
# These are enqueued again every time this cell runs against an existing study.
for seed_parameters in (best_parameters, manual_seed_parameters):
    study.enqueue_trial(seed_parameters)

study.optimize(lambda trial: objective(trial, X, y), n_trials=N_TRIALS)

print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.params)

In [None]:
# Parameters of the best trial stored in the study so far, which may include
# trials from earlier runs that used the same database.
final_best_parameters = study.best_trial.params

print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.params)

## Validation 

In [None]:
# Retrain one model per fold with the best parameters and collect out-of-fold
# predictions; `models` is what gets saved at the end of the notebook.
models, preds = training_loop(
    X, y, protein_ids, CONST_PARAM | final_best_parameters, progressbar=True
)

# Score the out-of-fold predictions: Spearman correlation, ROC AUC and average precision.
corr = stats.spearmanr(y, preds)
auc = metrics.roc_auc_score(y, preds)
precision = metrics.average_precision_score(y, preds)
print(corr[0], auc, precision)

## Save best parameters and model

In [None]:
# Nothing is written for debug runs.
if not DEBUG:
    run_output_dir = NOTEBOOK_DIR.joinpath(UNIQUE_ID)

    # JSON artifacts: the tuned hyperparameters and the selected feature names.
    json_artifacts = {
        "best-parameters.json": final_best_parameters,
        "best-features.json": best_row["present_features"],
    }
    for artifact_name, payload in json_artifacts.items():
        with run_output_dir.joinpath(artifact_name).open("wt") as fout:
            json.dump(payload, fout)

    # One LightGBM model file per cross-validation fold.
    for model_idx, model in enumerate(models):
        model_file = run_output_dir.joinpath(f"best-model-{model_idx}.txt")
        model.save_model(str(model_file))