## Summary

```bash
sbatch --array=1-46%1 --time=3:00:00 --export=NOTEBOOK_PATH="$(realpath 05_feature_elimination.ipynb)",COI=core ../scripts/run_notebook_cpu.sh

sbatch --array=1-136%1 --time=3:00:00 --export=NOTEBOOK_PATH="$(realpath 05_feature_elimination.ipynb)",COI=interface ../scripts/run_notebook_cpu.sh
```

---

## Imports

In [None]:
import os
import shlex
import subprocess
import tempfile
from pathlib import Path
import optuna
import concurrent.futures
import itertools
import lightgbm
import lightgbm as lgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import json
import socket

import math
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.model_selection import PredefinedSplit
from tqdm.notebook import tqdm
import multiprocessing as mp

In [None]:
# Use the fully qualified option name. The bare "max_columns" pattern is matched
# as a regex against all option keys and becomes ambiguous (raising OptionError)
# in pandas versions that also define "styler.render.max_columns".
pd.set_option("display.max_columns", 1000)

## Parameters

In [None]:
# Working directory of this notebook, resolved relative to the current working
# directory. The per-step feature-column and stats JSON files are read from and
# written to this directory further down.
NOTEBOOK_DIR = Path("05_feature_elimination").resolve()
# Create it on first use; a pre-existing directory is fine.
NOTEBOOK_DIR.mkdir(exist_ok=True)

NOTEBOOK_DIR

In [None]:
# Job configuration is passed in through environment variables (see the sbatch
# commands in the summary). Everything is None when a variable is not set.
COI = os.environ.get("COI")

_raw_task_id = os.environ.get("SLURM_ARRAY_TASK_ID")
# ORIGINAL_ARRAY_TASK_COUNT takes precedence over SLURM's own task count.
_raw_task_count = os.environ.get("ORIGINAL_ARRAY_TASK_COUNT") or os.environ.get("SLURM_ARRAY_TASK_COUNT")

TASK_ID = None if _raw_task_id is None else int(_raw_task_id)
TASK_COUNT = None if _raw_task_count is None else int(_raw_task_count)

TASK_ID, TASK_COUNT

In [None]:
# Without a SLURM array task id the notebook runs in debug mode and falls back
# to fixed settings; otherwise the environment-provided settings are validated.
DEBUG = TASK_ID is None

if not DEBUG:
    assert COI in ["core", "interface"]
    assert TASK_ID is not None
    assert TASK_COUNT is not None
else:
    COI, TASK_ID, TASK_COUNT = "interface", 1, 136

COI, TASK_ID, TASK_COUNT

In [None]:
# Prefer the externally configured output location; otherwise fall back to the
# directory that contains this notebook's working directory.
_datapkg_output_dir = os.getenv("DATAPKG_OUTPUT_DIR")
OUTPUT_DIR = (
    Path(_datapkg_output_dir).joinpath("elaspic-v2").resolve()
    if _datapkg_output_dir is not None
    else NOTEBOOK_DIR.parent
)
OUTPUT_DIR.mkdir(exist_ok=True)

OUTPUT_DIR

In [None]:
# Number of CPUs this process may use.
if "scinet" in socket.gethostname():
    # Hosts whose name contains "scinet" use a fixed count.
    CPU_COUNT = 40
elif hasattr(os, "sched_getaffinity"):
    # Respect the CPU affinity mask (e.g. one restricted by the job scheduler).
    CPU_COUNT = max(1, len(os.sched_getaffinity(0)))
else:
    # os.sched_getaffinity is not available on every platform; fall back to the
    # total CPU count (os.cpu_count() may return None).
    CPU_COUNT = os.cpu_count() or 1

CPU_COUNT

In [None]:
# Point temporary files at the job-local scratch directory when SLURM provides one.
if (slurm_tmpdir := os.getenv("SLURM_TMPDIR")) is not None:
    os.environ["TMPDIR"] = slurm_tmpdir
    # tempfile caches its directory after the first lookup; clear the cache so
    # the new TMPDIR is honored even if something already called gettempdir().
    tempfile.tempdir = None

print(tempfile.gettempdir())

## Load data

In [None]:
# NOTE(review): the file carries a ".parquet" extension but is parsed as JSON
# here — confirm against the notebook that writes it.
pca_columns_path = NOTEBOOK_DIR.parent / "04_train_model" / f"pca-columns-{COI}.parquet"
with pca_columns_path.open("rt") as fin:
    pca_columns = json.load(fin)

In [None]:
# Sequence table produced by the 04_train_model step for the selected COI.
sequence_df = pq.read_table(
    NOTEBOOK_DIR.parent / "04_train_model" / f"sequences-{COI}.parquet"
).to_pandas()

In [None]:
# Full training table produced by the 04_train_model step for the selected COI.
input_train_df = pq.read_table(
    NOTEBOOK_DIR.parent / "04_train_model" / f"input-train-{COI}.parquet"
).to_pandas()

In [None]:
# Held-out test table produced by the 04_train_model step for the selected COI.
input_test_df = pq.read_table(
    NOTEBOOK_DIR.parent / "04_train_model" / f"input-test-{COI}.parquet"
).to_pandas()

In [None]:
# Load the six cross-validation folds as (train, test) DataFrame pairs.
train_test_splits = []
for idx in range(6):
    xval_train_path = NOTEBOOK_DIR.parent / "04_train_model" / f"xval-train-{COI}-{idx}.parquet"
    xval_test_path = NOTEBOOK_DIR.parent / "04_train_model" / f"xval-test-{COI}-{idx}.parquet"
    train_test_splits.append(
        (
            pq.read_table(xval_train_path).to_pandas(),
            pq.read_table(xval_test_path).to_pandas(),
        )
    )

### Optimize labels

In [None]:
# The commented-out snippet below selects candidate columns by suffix
# (presumably how the initial, step-0 column list was produced — TODO confirm).
# feature_columns = [
#     c
#     for c in list(train_test_splits[0][0])
#     if (c.endswith("_wt") or c.endswith("_mut") or c.endswith("_change") or c.endswith("_pc"))
#     and not (c.endswith("dg_change") or c.startswith("rosetta_"))
# ]

# Start from the feature list stored under the previous step index (TASK_ID - 1);
# this notebook saves its own reduced list under TASK_ID at the end.
prev_feature_columns_path = NOTEBOOK_DIR / f"feature-columns-{COI}-{TASK_ID - 1}.json"
with prev_feature_columns_path.open("rt") as fin:
    feature_columns = json.load(fin)

In [None]:
def get_label(df):
    """Convert raw mutation effects into non-negative ranking labels.

    Effects are rescaled per ``effect_type`` so that the different measurement
    types share a comparable range, clipped to ``[-5, 5]`` and mapped linearly
    onto ``[0, 1000]`` (``effect * 100 + 500``, rounded to the nearest integer).

    Args:
        df: DataFrame with an ``effect`` column (numeric) and an
            ``effect_type`` column (string). The frame is not modified.

    Returns:
        Float numpy array of integer-valued labels, one per row of ``df``.

    Raises:
        AssertionError: If a "Deleteriousness score" effect lies outside
            ``[-5, 5]``.
    """
    # Work on a float copy: in-place scaling by 0.8 fails on an integer array,
    # and the caller's frame must stay untouched.
    effect = df["effect"].to_numpy(dtype=float, copy=True)
    effect_type = df["effect_type"]

    # All ΔΔG variants (e.g. "ΔΔG (from affinity)") share one scale factor.
    # `na=False` keeps rows with a missing effect type out of the mask instead
    # of producing a non-boolean mask.
    is_ddg = effect_type.str.startswith("ΔΔG", na=False).to_numpy(dtype=bool)
    effect[is_ddg] *= 0.8

    # Scale factors for effect types that are matched exactly.
    exact_scale_factors = {
        "Deleteriousness class": 1,
        "Stability score change": 5,
        "Deep mutation scan": 4,
    }
    for type_name, factor in exact_scale_factors.items():
        effect[(effect_type == type_name).to_numpy(dtype=bool)] *= factor

    # Deleteriousness scores are used unscaled but must already be in range.
    is_score = (effect_type == "Deleteriousness score").to_numpy(dtype=bool)
    if is_score.any():
        assert effect[is_score].min() >= -5 and effect[is_score].max() <= 5

    return np.rint(np.clip(effect, -5, 5) * 100 + 500)

In [None]:
# Number of training rows per effect type (these are the types that get_label rescales).
input_train_df["effect_type"].value_counts()

### Optimize groups

In [None]:
def get_group(df, max_group_size=100):
    """Compute consecutive query-group sizes from the ``unique_id`` column.

    Rows sharing a ``unique_id`` form one group. Groups larger than
    ``max_group_size`` are split into the smallest number of near-equal
    subgroups that each fit within the limit.

    Args:
        df: DataFrame whose ``unique_id`` column is sorted in non-decreasing
            order, so that rows of one id are contiguous.
        max_group_size: Upper bound on a group's size; a falsy value (``None``
            or ``0``) disables splitting.

    Returns:
        Numpy array of group sizes, in row order, summing to ``len(df)``.

    Raises:
        AssertionError: If ``unique_id`` is not sorted.
    """
    unique_ids = df["unique_id"]
    assert unique_ids.is_monotonic_increasing

    # An empty frame has no groups (and `max()` below would fail on it).
    if len(df) == 0:
        return np.array([], dtype=int)

    # Rows per id, listed in order of first appearance.
    counts = unique_ids.value_counts()
    groups = [int(count) for count in counts.loc[unique_ids.unique()]]

    if max_group_size:
        split_groups = []
        for group in groups:
            num_subgroups = math.ceil(group / max_group_size)
            # Spread the rows as evenly as possible; the first `remainder`
            # subgroups receive one extra row.
            base, remainder = divmod(group, num_subgroups)
            split_groups.extend([base + 1] * remainder)
            split_groups.extend([base] * (num_subgroups - remainder))
        groups = split_groups

    assert sum(groups) == len(df), (sum(groups), len(df))
    assert not max_group_size or max(groups) <= max_group_size
    return np.array(groups)

In [None]:
# Maximum ranking-group size passed to get_group. Both COI values currently use
# the same limit (presumably kept separate so they can be tuned independently).
max_group_size = 100 if COI == "core" else 100

### Train model

In [None]:
def train_model(input, feature_columns, param):
    """Train a LightGBM booster on one (train, validation) pair.

    Args:
        input: Tuple ``(train_df, test_df)`` of DataFrames; the second frame is
            used as the validation set.
        feature_columns: Names of the columns used as model features.
        param: LightGBM parameter dictionary passed to ``lgb.train``.

    Returns:
        The booster returned by ``lgb.train`` after 100 boosting rounds.

    Labels come from ``get_label`` and group sizes from ``get_group`` using the
    module-level ``max_group_size``.
    NOTE(review): ``verbose_eval`` is not accepted by ``lgb.train`` in every
    LightGBM release — confirm against the pinned version.
    """
    fit_df, eval_df = input

    def build_dataset(frame, reference=None):
        # Features, ranking labels and query groups for a single frame.
        return lgb.Dataset(
            frame[feature_columns],
            label=get_label(frame),
            group=get_group(frame, max_group_size=max_group_size),
            reference=reference,
        )

    train_ds = build_dataset(fit_df)
    # The validation set references the training set.
    valid_ds = build_dataset(eval_df, reference=train_ds)

    return lgb.train(
        param,
        train_ds,
        valid_sets=[valid_ds],
        num_boost_round=100,
        verbose_eval=False,
    )

In [None]:
# (protein, ligand) sequence pairs of all "skempi++" training entries.
skempi_unique_ids = set(
    input_train_df.loc[input_train_df["dataset"] == "skempi++", "unique_id"].unique()
)
_skempi_rows = sequence_df[sequence_df["unique_id"].isin(skempi_unique_ids)]
skempi_sequences = {
    tuple(pair) for pair in _skempi_rows[["protein_sequence", "ligand_sequence"]].values
}

# "skempi-v2" ids whose sequence pair does not occur among the "skempi++" entries.
_skempi_v2_candidates = set(
    input_train_df.loc[input_train_df["dataset"] == "skempi-v2", "unique_id"].unique()
)
_skempi_v2_rows = sequence_df[sequence_df["unique_id"].isin(_skempi_v2_candidates)]
skempi_v2_unique_ids = {
    uid
    for uid, pseq, lseq in _skempi_v2_rows[["unique_id", "protein_sequence", "ligand_sequence"]].values
    if (pseq, lseq) not in skempi_sequences
}


def get_aggregate_spearmanr(result_df, datasets):
    """Average the per-dataset Spearman correlation of ``effect`` vs ``ddg_pred``.

    Args:
        result_df: DataFrame with ``dataset``, ``effect_type``, ``rev``,
            ``unique_id``, ``effect`` and ``ddg_pred`` columns.
        datasets: Iterable of entries whose first two items are the dataset
            name and the effect type; any further items are ignored.

    Returns:
        Unweighted mean of the Spearman correlations, one per entry.

    Only rows with ``rev == False`` are used. For "skempi-v2", rows are further
    restricted to the module-level ``skempi_v2_unique_ids``. Rows missing
    either ``effect`` or ``ddg_pred`` are dropped before correlating.
    """
    correlations = []
    for dataset_name, effect_type, *_ignored in datasets:
        selected = result_df["dataset"] == dataset_name
        selected = selected & (result_df["effect_type"] == effect_type)
        selected = selected & (result_df["rev"] == False)
        subset = result_df[selected]

        if dataset_name == "skempi-v2":
            subset = subset[subset["unique_id"].isin(skempi_v2_unique_ids)]

        subset = subset.dropna(subset=["effect", "ddg_pred"])

        rho, _pvalue = stats.spearmanr(subset["effect"], subset["ddg_pred"])
        correlations.append(rho)
    return sum(correlations) / len(correlations)

In [None]:
# Evaluation configuration, selected by COI.
# Each `datasets_eval` entry is [dataset name, effect type, columns].
# get_aggregate_spearmanr only reads the first two items of an entry; the third
# is presumably the list of score columns available for that dataset — TODO confirm.
if COI == "core":
    # Score columns for datasets that have every predictor.
    columns_full = [
        "ddg_pred",
        "elaspic_score",
        "foldx_score",
        "rosetta_dg_change",
    ]

    datasets_eval = [
        ["protherm++", "ΔΔG", columns_full],
        ["humsavar", "Deleteriousness class", columns_full],
        ["clinvar", "Deleteriousness class", columns_full],
        ["cosmic", "Deleteriousness class", columns_full],
        ["taipale", "ΔΔG", columns_full],
        # ["taipale_gpca", "ΔΔG", columns_full],
        # ["cagi5_frataxin", "ΔΔG", ["ddg_pred"]],
        ["rocklin-2017-core", "Stability score change", ["ddg_pred", "rosetta_dg_change"]],
        ["dunham_2020_tianyu", "Deep mutation scan", ["ddg_pred", "rosetta_dg_change"]],
        # ["protherm-dagger-core", "ΔΔG", ["ddg_pred", "rosetta_dg_change"]],
    ]
else:
    # Same layout for the non-core case, with the complex-level Rosetta column.
    columns_full = [
        "ddg_pred",
        "elaspic_score",
        "foldx_score",
        "rosetta_complex_dg_change",
    ]

    datasets_eval = [
        ["skempi++", "ΔΔG", columns_full],
        ["humsavar", "Deleteriousness class", columns_full],
        ["clinvar", "Deleteriousness class", columns_full],
        ["cosmic", "Deleteriousness class", columns_full],
        ["ab_bind", "ΔΔG", ["ddg_pred", "elaspic_score", "foldx_score"]],
        # ["taipale", "ΔΔG", eval_columns],
        ["skempi-v2", "ΔΔG (from affinity)", ["ddg_pred", "rosetta_complex_dg_change"]],
        # ["skempi-v2", "ΔΔG (from Kon/Koff)", ["ddg_pred", "rosetta_complex_dg_change"]],
        ["dunham_2020_tianyu", "Deep mutation scan", ["ddg_pred", "rosetta_complex_dg_change"]],
    ]


In [None]:
# LightGBM settings shared by every trial (not tuned by Optuna).
const_param = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "verbosity": -1,
    # NDCG truncation position (presumably chosen larger than any group so the
    # whole group is evaluated — TODO confirm).
    "eval_at": 1_000_000,
    # One gain per possible label value: get_label produces labels in 0..1000.
    "label_gain": [np.log2(2 + i) for i in range(0, 1_001)],
    "force_col_wise": True,
    # Use the CPU count detected in the parameters section instead of a
    # hard-coded 40, so runs on other hosts do not oversubscribe their CPUs.
    "num_threads": CPU_COUNT,
}

In [None]:
def objective(trial):
    """Optuna objective: cross-validated aggregate Spearman correlation.

    Samples LightGBM hyperparameters from ``trial``, trains one model per fold
    of the module-level ``train_test_splits`` using ``feature_columns``, and
    scores the concatenated held-out predictions with
    ``get_aggregate_spearmanr`` on ``datasets_eval``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The aggregate Spearman correlation (higher is better).
    """
    # The number of boosting rounds is fixed inside train_model and not tuned.
    param = dict(const_param)
    param["num_leaves"] = trial.suggest_int("num_leaves", 2, 512)  # 256
    param["min_data_in_leaf"] = trial.suggest_int("min_data_in_leaf", 5, 200)  # 100
    param["lambda_l1"] = trial.suggest_loguniform("lambda_l1", 1e-8, 10.0)
    param["lambda_l2"] = trial.suggest_loguniform("lambda_l2", 1e-8, 10.0)
    param["feature_fraction"] = trial.suggest_uniform("feature_fraction", 0.4, 1.0)
    param["bagging_fraction"] = trial.suggest_uniform("bagging_fraction", 0.4, 1.0)
    param["bagging_freq"] = trial.suggest_int("bagging_freq", 1, 7)

    fold_results = []
    for fold_train_df, fold_test_df in train_test_splits:
        # Folds must not share clusters between training and held-out rows.
        assert not (set(fold_train_df["cluster_id"]) & set(fold_test_df["cluster_id"]))
        booster = train_model((fold_train_df, fold_test_df), feature_columns, param)

        predictions = booster.predict(
            fold_test_df[feature_columns], num_iteration=booster.best_iteration
        )
        # `assign` returns a new frame, leaving the shared fold data untouched.
        fold_results.append(fold_test_df.assign(ddg_pred=predictions))

    combined_df = pd.concat(fold_results, ignore_index=True)
    return get_aggregate_spearmanr(combined_df, datasets_eval)


# Hyperparameter search: 100 trials of `objective`, run two at a time, keeping
# the parameters with the highest aggregate Spearman correlation.
start_time = time.perf_counter()
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, n_jobs=2)
# Wall-clock duration of the whole search, in seconds.
print(f"Elaspsed: {time.perf_counter() - start_time}.")

In [None]:
# Everything that gets saved to the per-task stats JSON at the end of the notebook.
RESULTS = {}
RESULTS["best_score"] = study.best_value
RESULTS["const_params"] = const_param
RESULTS["best_params"] = study.best_params
RESULTS["feature_columns"] = feature_columns
# Filled in by the feature elimination loop as [feature name, score] pairs.
RESULTS["feature_elimination_stats"] = []

In [None]:
# Final LightGBM parameters: the fixed settings overlaid with the best trial's values.
param = {
    **const_param,
    **study.best_params,
    # NOTE(review): hard-coded thread count that overrides const_param's value — confirm.
    "num_threads": 80,
}

# Reset the accumulator so that re-running this cell does not append a second
# copy of every row, which would corrupt the ranking and the saved stats.
RESULTS["feature_elimination_stats"] = []

# Leave-one-feature-out: retrain on every fold without one feature and record
# the aggregate Spearman correlation obtained without it.
for i, feature_to_eliminate in enumerate(feature_columns):
    print(i, feature_to_eliminate, end=" ")

    feature_columns_elim = [c for c in feature_columns if c != feature_to_eliminate]
    # Exactly one column must have been removed (this also catches duplicate names).
    assert len(feature_columns_elim) == len(feature_columns) - 1

    result_dfs = []
    for train_df, test_df in train_test_splits:
        # Folds must not share clusters between training and held-out rows.
        assert not set(train_df["cluster_id"]) & set(test_df["cluster_id"])
        bst = train_model((train_df, test_df), feature_columns_elim, param)

        # Copy before adding predictions so the shared fold data stays unchanged.
        test_df = test_df.copy()
        test_df["ddg_pred"] = bst.predict(
            test_df[feature_columns_elim], num_iteration=bst.best_iteration
        )
        result_dfs.append(test_df)
    result_df = pd.concat(result_dfs, ignore_index=True)

    score = get_aggregate_spearmanr(result_df, datasets_eval)
    print(score)

    RESULTS["feature_elimination_stats"].append([feature_to_eliminate, score])

In [None]:
# Rank features by the score obtained without them: the top row is the feature
# whose removal leaves the highest aggregate correlation.
feature_elimination_stats = pd.DataFrame(
    RESULTS["feature_elimination_stats"], columns=["feature_name", "score"]
)
feature_elimination_stats = feature_elimination_stats.sort_values("score", ascending=False)

feature_elimination_stats.head()

In [None]:
# The highest-ranked row names the feature to drop in this step.
top_row = feature_elimination_stats.iloc[0]

final_feature_to_eliminate = top_row.loc["feature_name"]
print(final_feature_to_eliminate)

In [None]:
# Feature list for the next step: the current list minus the eliminated feature.
feature_columns_new = [
    name for name in feature_columns if not (name == final_feature_to_eliminate)
]
assert len(feature_columns) - len(feature_columns_new) == 1

feature_columns_new

In [None]:
# Save the reduced feature list under this task's index; the notebook loads the
# list stored under TASK_ID - 1 as its starting point.
next_feature_columns_path = NOTEBOOK_DIR / f"feature-columns-{COI}-{TASK_ID}.json"
with next_feature_columns_path.open("wt") as fout:
    json.dump(feature_columns_new, fout)

In [None]:
# Save this step's scores, parameters and per-feature elimination stats.
stats_path = NOTEBOOK_DIR / f"stats-{COI}-{TASK_ID}.json"
with stats_path.open("wt") as fout:
    json.dump(RESULTS, fout)