## Summary

---

## Imports

In [None]:
import concurrent.futures
import itertools
import json
import math
import multiprocessing as mp
import os
import shlex
import subprocess
import tempfile
import time
from pathlib import Path

import lightgbm
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from IPython.display import SVG
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.model_selection import PredefinedSplit
from tqdm.notebook import tqdm

In [None]:
# Show up to 1000 columns when displaying DataFrames.
# The fully-qualified option name is used because the bare "max_columns"
# pattern matches more than one option in recent pandas releases, which makes
# set_option raise OptionError.
pd.set_option("display.max_columns", 1000)

## Parameters

In [None]:
# Directory that holds this notebook's outputs (figures are saved here).
# Resolved against the current working directory and created if missing.
NOTEBOOK_DIR = Path("06_validate_model").resolve()
NOTEBOOK_DIR.mkdir(exist_ok=True)

# Display the resolved path.
NOTEBOOK_DIR

In [None]:
# Selects which input files and evaluation settings are used; the code below
# only distinguishes COI == "core" from any other value.
COI = "interface"

In [None]:
# Resolve the output directory: the "elaspic-v2" subdirectory of
# $DATAPKG_OUTPUT_DIR when that variable is set, otherwise the parent of
# NOTEBOOK_DIR.
if "DATAPKG_OUTPUT_DIR" in os.environ:
    OUTPUT_DIR = Path(os.environ["DATAPKG_OUTPUT_DIR"]).joinpath("elaspic-v2").resolve()
else:
    OUTPUT_DIR = NOTEBOOK_DIR.parent
# parents=True so that a not-yet-existing $DATAPKG_OUTPUT_DIR does not make
# the mkdir fail with FileNotFoundError.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Display the resolved path.
OUTPUT_DIR

In [None]:
# When SLURM_TMPDIR is set, export it as TMPDIR so that tempfile picks it up.
if (slurm_tmpdir := os.getenv("SLURM_TMPDIR")) is not None:
    os.environ["TMPDIR"] = slurm_tmpdir

# Show which temporary directory tempfile resolved to.
print(tempfile.gettempdir())

## Load data

In [None]:
# Load the names of the columns that are PCA-transformed further below.
# NOTE(review): the file carries a ".parquet" suffix but is parsed as JSON
# text here -- confirm that 04_train_model really writes JSON to this path.
with NOTEBOOK_DIR.parent.joinpath("04_train_model", f"pca-columns-{COI}.parquet").open("rt") as fin:
    pca_columns = json.load(fin)

In [None]:
# Sequence table from 04_train_model; the "unique_id", "protein_sequence" and
# "ligand_sequence" columns are used further below.
sequence_df = pq.read_table(
    NOTEBOOK_DIR.parent.joinpath("04_train_model", f"sequences-{COI}.parquet")
).to_pandas()

In [None]:
# Training input table written by 04_train_model.
input_train_df = pq.read_table(
    NOTEBOOK_DIR.parent.joinpath("04_train_model", f"input-train-{COI}.parquet")
).to_pandas()

In [None]:
# Held-out test input table written by 04_train_model.
input_test_df = pq.read_table(
    NOTEBOOK_DIR.parent.joinpath("04_train_model", f"input-test-{COI}.parquet")
).to_pandas()

In [None]:
# Load the cross-validation folds written by 04_train_model as a list of
# (train_df, test_df) pairs, one pair per fold index 0..5.
train_test_splits = []
for idx in range(6):
    train_df = pq.read_table(
        NOTEBOOK_DIR.parent.joinpath("04_train_model", f"xval-train-{COI}-{idx}.parquet")
    ).to_pandas()
    test_df = pq.read_table(
        NOTEBOOK_DIR.parent.joinpath("04_train_model", f"xval-test-{COI}-{idx}.parquet")
    ).to_pandas()
    train_test_splits.append((train_df, test_df))

### Data processing

In [None]:
def get_label(df):
    """Convert measured effects into whole-number ranking labels.

    Each row's ``effect`` is multiplied by a factor chosen from its
    ``effect_type``, clipped to ``[-5, 5]`` and mapped linearly onto
    ``[0, 1000]`` via ``rint(effect * 100 + 500)``.

    Args:
        df: DataFrame with a numeric ``effect`` column and a string
            ``effect_type`` column.

    Returns:
        A float NumPy array holding one whole-number label per row of ``df``.

    Raises:
        AssertionError: If any "Deleteriousness score" effect lies outside
            ``[-5, 5]``.
    """
    # Always work on a float copy: scaling an integer array in place by a
    # fractional factor raises a casting error, and the caller's frame must
    # not be modified.
    effect = df["effect"].to_numpy(dtype=float, copy=True)
    effect_type = df["effect_type"]

    # Every effect type whose name starts with "ΔΔG" shares one scale.
    # na=False leaves rows with a missing effect type unscaled instead of
    # producing a non-boolean mask.
    mask = effect_type.str.startswith("ΔΔG", na=False).to_numpy(dtype=bool)
    effect[mask] *= 0.8

    # Exact-match effect types and their multipliers.
    for name, factor in (
        ("Deleteriousness class", 1),
        ("Stability score change", 5),
        ("Deep mutation scan", 4),
    ):
        mask = (effect_type == name).to_numpy(dtype=bool)
        effect[mask] *= factor

    # Deleteriousness scores are used unscaled, so they must already be in range.
    mask = (effect_type == "Deleteriousness score").to_numpy(dtype=bool)
    if mask.any():
        assert effect[mask].min() >= -5 and effect[mask].max() <= 5

    return np.rint(np.clip(effect, -5, 5) * 100 + 500)

In [None]:
def get_group(df, max_group_size=100):
    """Return the sizes of the consecutive ``unique_id`` groups in ``df``.

    Groups larger than ``max_group_size`` are split into the smallest number
    of near-equal subgroups that all fit within the limit; when the split is
    uneven, the leading subgroups receive the one extra row each.

    Args:
        df: DataFrame whose ``unique_id`` column is sorted in increasing order.
        max_group_size: Upper bound on a single group size. A falsy value
            (``None`` or ``0``) disables splitting.

    Returns:
        NumPy array of group sizes, in row order, summing to ``len(df)``.

    Raises:
        AssertionError: If ``unique_id`` is not monotonically increasing.
    """
    assert df["unique_id"].is_monotonic_increasing
    vc = df["unique_id"].value_counts()
    sizes = [int(vc[uid]) for uid in df["unique_id"].unique()]

    if max_group_size:
        groups = []
        for size in sizes:
            if size <= max_group_size:
                groups.append(size)
                continue
            num_subgroups = math.ceil(size / max_group_size)
            base, remainder = divmod(size, num_subgroups)
            # The first `remainder` subgroups absorb the leftover rows.
            groups.extend([base + 1] * remainder + [base] * (num_subgroups - remainder))
    else:
        groups = sizes

    assert sum(groups) == len(df), (sum(groups), len(df))
    # default=0 keeps an empty frame from crashing in max().
    assert not max_group_size or max(groups, default=0) <= max_group_size
    return np.array(groups)

### Machine learning

In [None]:
# Evaluation configuration.
#
# columns_full: score columns compared against each other in the plots
#   ("ddg_pred" is the prediction produced in this notebook).
# datasets_eval: one [dataset, effect_type, eval_columns] entry per evaluation
#   dataset; the first two items select rows of the result table and the third
#   lists the score columns evaluated on that dataset.
if COI == "core":
    columns_full = [
        "ddg_pred",
        "elaspic_score",
        "foldx_score",
        "rosetta_dg_change",
        #
        "proteinsolver_core_score_change",
        "protbert_core_score_change",
    ]

    datasets_eval = [
        ["protherm++", "ΔΔG", columns_full],
        ["humsavar", "Deleteriousness class", columns_full],
        ["clinvar", "Deleteriousness class", columns_full],
        ["cosmic", "Deleteriousness class", columns_full],
        ["taipale", "ΔΔG", columns_full],
        # ["taipale_gpca", "ΔΔG", columns_full],
        # ["cagi5_frataxin", "ΔΔG", ["ddg_pred"]],
        [
            "rocklin-2017-core",
            "Stability score change",
            [
                "ddg_pred",
                "rosetta_dg_change",
                "proteinsolver_core_score_change",
                "protbert_core_score_change",
            ],
        ],
        [
            "dunham_2020_tianyu",
            "Deep mutation scan",
            [
                "ddg_pred",
                "rosetta_dg_change",
                "proteinsolver_core_score_change",
                "protbert_core_score_change",
            ],
        ],
        # ["protherm-dagger-core", "ΔΔG", ["ddg_pred", "rosetta_dg_change"]],
    ]
else:
    # Any COI other than "core" uses the interface configuration.
    columns_full = [
        "ddg_pred",
        "elaspic_score",
        "foldx_score",
        "rosetta_complex_dg_change",
        #
        "proteinsolver_interface_score_change",
        "protbert_interface_score_change",
    ]

    datasets_eval = [
        ["skempi++", "ΔΔG", columns_full],
        ["humsavar", "Deleteriousness class", columns_full],
        ["clinvar", "Deleteriousness class", columns_full],
        ["cosmic", "Deleteriousness class", columns_full],
        ["ab_bind", "ΔΔG", columns_full],
        # ["taipale", "ΔΔG", eval_columns],
        [
            "skempi-v2",
            "ΔΔG (from affinity)",
            [
                "ddg_pred",
                "rosetta_complex_dg_change",
                "proteinsolver_interface_score_change",
                "protbert_interface_score_change",
            ],
        ],
        # ["skempi-v2", "ΔΔG (from Kon/Koff)", ["ddg_pred", "rosetta_complex_dg_change"]],
        [
            "dunham_2020_tianyu",
            "Deep mutation scan",
            [
                "ddg_pred",
                "rosetta_complex_dg_change",
                "proteinsolver_interface_score_change",
                "protbert_interface_score_change",
            ],
        ],
    ]

In [None]:
# unique_ids of the training rows that belong to the "skempi++" dataset ...
skempi_unique_ids = set(
    input_train_df[input_train_df["dataset"] == "skempi++"]["unique_id"].unique()
)
# ... and the (protein_sequence, ligand_sequence) pairs those ids map to.
skempi_sequences = set(
    tuple(s)
    for s in sequence_df[sequence_df["unique_id"].isin(skempi_unique_ids)][
        ["protein_sequence", "ligand_sequence"]
    ].values
)

# unique_ids of the "skempi-v2" training rows, then narrowed to those whose
# sequence pair does not also occur in "skempi++".
skempi_v2_unique_ids = set(
    input_train_df[input_train_df["dataset"] == "skempi-v2"]["unique_id"].unique()
)
skempi_v2_unique_ids = {
    uid
    for uid, pseq, lseq in sequence_df[sequence_df["unique_id"].isin(skempi_v2_unique_ids)][
        ["unique_id", "protein_sequence", "ligand_sequence"]
    ].values
    if (pseq, lseq) not in skempi_sequences
}


def get_aggregate_spearmanr(result_df, datasets):
    """Average the per-dataset Spearman correlation of effect vs. prediction.

    For every ``(dataset, effect_type, ...)`` entry, the rows of ``result_df``
    with that dataset and effect type and ``rev == False`` are selected, rows
    missing ``effect`` or ``ddg_pred`` are dropped, and Spearman's rho between
    the two columns is computed. "skempi-v2" rows are additionally limited to
    the ids in the module-level ``skempi_v2_unique_ids``.

    Args:
        result_df: DataFrame with ``dataset``, ``effect_type``, ``rev``,
            ``unique_id``, ``effect`` and ``ddg_pred`` columns.
        datasets: Iterable of sequences whose first two items are the dataset
            name and the effect type; further items are ignored.

    Returns:
        The unweighted mean of the per-dataset correlations.
    """

    def dataset_rho(dataset, effect_type):
        selector = (
            (result_df["dataset"] == dataset)
            & (result_df["effect_type"] == effect_type)
            & (result_df["rev"] == False)
        )
        subset = result_df[selector]
        if dataset == "skempi-v2":
            subset = subset[subset["unique_id"].isin(skempi_v2_unique_ids)]
        subset = subset.dropna(subset=["effect", "ddg_pred"])
        return stats.spearmanr(subset["effect"], subset["ddg_pred"])[0]

    corrs = [dataset_rho(dataset, effect_type) for dataset, effect_type, *_ in datasets]
    return sum(corrs) / len(corrs)

In [None]:
def train_model(input, feature_columns, param):
    """Fit a LightGBM booster on one train/validation split.

    Args:
        input: ``(train_df, test_df)`` pair of DataFrames; each is turned into
            an ``lgb.Dataset`` with labels from ``get_label`` and group sizes
            from ``get_group``.
        feature_columns: Names of the columns used as model features.
        param: LightGBM parameter dict passed to ``lgb.train``.

    Returns:
        The booster returned by ``lgb.train`` after 100 boosting rounds.
    """
    fit_df, holdout_df = input

    def build_dataset(frame, **extra):
        # Features, labels and group sizes all come from the same frame.
        return lgb.Dataset(
            frame[feature_columns],
            label=get_label(frame),
            group=get_group(frame),
            **extra,
        )

    fit_ds = build_dataset(fit_df)
    # The validation set references the training set, as in the LightGBM API.
    holdout_ds = build_dataset(holdout_df, reference=fit_ds)

    return lgb.train(
        param,
        fit_ds,
        valid_sets=[holdout_ds],
        num_boost_round=100,
        verbose_eval=False,
    )

### Load feature elimination stats

In [None]:
# Collect the per-task results written by 05_feature_elimination, reading
# consecutively numbered files (starting at 1) until one is missing.
STATS = []

for task_id in itertools.count(1):
    stats_file = NOTEBOOK_DIR.parent.joinpath(
        "05_feature_elimination", f"stats-{COI}-{task_id}.json"
    )
    if not stats_file.is_file():
        # Report the first task id for which no file exists, then stop.
        print(task_id)
        break
    with stats_file.open("rt") as fin:
        STATS.append(json.load(fin))

In [None]:
# Plot the best score of every feature-elimination task against the number of
# features it used, and mark the best-scoring task.
# NOTE(review): plt.cm.get_cmap is deprecated in newer matplotlib releases --
# confirm against the installed version.
cmap = plt.cm.get_cmap("tab20")

fg, ax = plt.subplots(figsize=(4, 2.8))

num_features_list = [len(s["feature_columns"]) for s in STATS]
best_score_list = [s["best_score"] for s in STATS]

# NOTE(review): for "core" the first nine scores are multiplied by 0.88 before
# plotting and before the best task is chosen. The reason is not visible in
# this file -- confirm it is intended. Also requires len(STATS) >= 9.
if COI == "core":
    for i in range(9):
        best_score_list[i] = best_score_list[i] * 0.88

# Index of the task with the highest (possibly rescaled) score.
best_idx = best_score_list.index(max(best_score_list))
best_num_features = num_features_list[best_idx]

ax.plot(num_features_list, best_score_list, color=cmap(6))
xlim = ax.get_xlim()
ylim = ax.get_ylim()
# Reverse the x axis so the number of features decreases from left to right.
ax.set_xlim(xlim[1], xlim[0])
# The dashed line overshoots the y range on purpose; set_ylim below restores it.
ax.vlines(best_num_features, ylim[0] - 0.1, ylim[1] + 1, color="k", linewidth=1, linestyle="--")
ax.set_ylim(*ylim)
ax.set_xlabel("Number of features")
ax.set_ylabel("Average Spearman's ρ")
fg.subplots_adjust(top=0.95, right=0.96, bottom=0.16, left=0.17)
fg.savefig(NOTEBOOK_DIR.joinpath(f"feature-elimination-{COI}.svg"), dpi=300)
fg.savefig(NOTEBOOK_DIR.joinpath(f"feature-elimination-{COI}.png"), dpi=300)
fg.savefig(NOTEBOOK_DIR.joinpath(f"feature-elimination-{COI}.pdf"), dpi=300)

In [None]:
# Display the saved feature-elimination figure.
SVG(NOTEBOOK_DIR.joinpath(f"feature-elimination-{COI}.svg"))

In [None]:
# Retrain one model per cross-validation split with the feature set and
# parameters of the best feature-elimination task, and collect the held-out
# predictions of every split into result_df.
best_stats = STATS[best_idx]
assert len(best_stats["feature_columns"]) == best_num_features

param = {
    **best_stats["const_params"],
    **best_stats["best_params"],
    "num_threads": 80,
    "verbosity": 1,
}

feature_columns = best_stats["feature_columns"]

start_time = time.perf_counter()
bsts = []
result_dfs = []
for split_idx, (train_df, test_df) in enumerate(train_test_splits):
    print(split_idx, len(train_df), len(test_df))

    # The train and test folds must not share any cluster.
    assert not set(train_df["cluster_id"]) & set(test_df["cluster_id"])
    bst = train_model((train_df, test_df), feature_columns, param)
    bsts.append(bst)

    # Copy so the prediction column is not written into the stored fold frame.
    test_df = test_df.copy()
    test_df["ddg_pred"] = bst.predict(test_df[feature_columns], num_iteration=bst.best_iteration)
    result_dfs.append(test_df)
result_df = pd.concat(result_dfs, ignore_index=True)
# Message typo fixed ("Elaspsed" -> "Elapsed").
print(f"Elapsed: {time.perf_counter() - start_time}.")

In [None]:
# Aggregate cross-validation score of the retrained models; the trailing
# comments record previously observed values.
score = get_aggregate_spearmanr(result_df, datasets_eval)
score
# Interface: 0.3409818176172705 (0.31630386556943485)
# Core: 0.4147783573795669

In [None]:
# Check that retraining reproduces the score stored by feature elimination.
# NOTE(review): this is an exact float comparison -- confirm bit-for-bit
# reproducibility is really expected.
assert score == best_stats["best_score"]

In [None]:
# Display the selected feature columns.
feature_columns

### Retune machine learning model

### Evaluate

In [None]:
# Number of principal components written out per PCA-transformed column.
n_components = 10

# The PCA projection of the test set does not depend on the booster, so it is
# computed once up front instead of once per cross-validation model.
for column in pca_columns:
    pickle_file = NOTEBOOK_DIR.parent.joinpath(
        "05_feature_elimination", f"pca-{column}-{COI}.pickle"
    )
    # NOTE(review): torch.load unpickles arbitrary objects; only load files
    # produced by this project. Recent PyTorch releases may also refuse
    # non-tensor pickles by default -- confirm against the installed version.
    pca = torch.load(pickle_file.as_posix())

    # Stack the per-row arrays into one 2-D matrix before projecting.
    values = np.vstack(input_test_df[column].values)
    values_out = pca.transform(values)
    for i in range(n_components):
        new_column = f"{column}_{i}_pc"
        input_test_df[new_column] = values_out[:, i]

# One prediction column per cross-validation model.
for split_idx, bst in enumerate(tqdm(bsts)):
    input_test_df[f"ddg_pred_{split_idx}"] = bst.predict(
        input_test_df[feature_columns], num_iteration=bst.best_iteration
    )

# The final prediction is the mean over all trained models; the count comes
# from bsts rather than a hard-coded 6.
input_test_df["ddg_pred"] = input_test_df[
    [f"ddg_pred_{split_idx}" for split_idx in range(len(bsts))]
].mean(axis=1)

## Load updated data

In [None]:
# Updated ("v2") training input table, used below to refresh score columns.
new_input_train_df = pq.read_table(
    NOTEBOOK_DIR.parent.joinpath("04_train_model", f"input-train-{COI}.v2.parquet")
).to_pandas()

In [None]:
# Updated ("v2") test input table, used below to refresh score columns.
new_input_test_df = pq.read_table(
    NOTEBOOK_DIR.parent.joinpath("04_train_model", f"input-test-{COI}.v2.parquet")
).to_pandas()

In [None]:
# Replace the comparison score columns of result_df with the values from the
# updated (v2) training inputs, matched on (unique_id, mutation).
merge_keys = ["unique_id", "mutation"]
for score_column in columns_full:
    print(score_column)
    if score_column in ["ddg_pred"]:
        # The model's own prediction is never overwritten.
        continue
    # One row per key on the right-hand side guarantees that the left merge
    # yields exactly one output row per result_df row, in result_df order.
    updated = new_input_train_df[merge_keys + [score_column]].drop_duplicates(
        subset=merge_keys
    )
    # Only the key columns are needed on the left; this avoids copying the
    # whole frame on every iteration.
    merged = result_df[merge_keys].merge(updated, on=merge_keys, how="left")
    # Assign by position: the merge output carries a fresh RangeIndex, so an
    # index-aligned assignment would silently misplace values whenever
    # result_df's index is not 0..n-1.
    result_df[score_column] = merged[score_column].to_numpy()

In [None]:
# Replace the comparison score columns of input_test_df with the values from
# the updated (v2) test inputs, matched on (unique_id, mutation).
merge_keys = ["unique_id", "mutation"]
for score_column in columns_full:
    print(score_column)
    if score_column in ["ddg_pred"]:
        # The model's own prediction is never overwritten.
        continue
    # One row per key on the right-hand side guarantees that the left merge
    # yields exactly one output row per input_test_df row, in the same order.
    updated = new_input_test_df[merge_keys + [score_column]].drop_duplicates(
        subset=merge_keys
    )
    # Only the key columns are needed on the left; this avoids copying the
    # whole frame on every iteration.
    merged = input_test_df[merge_keys].merge(updated, on=merge_keys, how="left")
    # Assign by position: the merge output carries a fresh RangeIndex, so an
    # index-aligned assignment would silently misplace values whenever
    # input_test_df's index is not 0..n-1.
    input_test_df[score_column] = merged[score_column].to_numpy()

In [None]:
def compute_spearman_ci(rho, n, confidence=0.95):
    """Approximate a confidence interval for a correlation coefficient.

    The coefficient is mapped through ``arctanh`` (Fisher z-transformation),
    a normal interval with standard error ``1 / sqrt(n - 3)`` is built around
    it, and the bounds are mapped back with ``tanh``.

    Args:
        rho: Correlation coefficient.
        n: Number of observations the coefficient was computed from.
        confidence: Two-sided confidence level of the interval.

    Returns:
        ``(lower, upper)`` bounds. Both are NaN when ``n <= 3``, where the
        standard error is undefined.
    """
    # https://stackoverflow.com/a/30393477/2063031
    if n <= 3:
        # sqrt(n - 3) would be zero or imaginary; report "no interval".
        return np.nan, np.nan
    z = np.arctanh(rho)
    sigma = 1 / ((n - 3) ** 0.5)
    cint = z + np.array([-1, 1]) * sigma * stats.norm.ppf((1 + confidence) / 2)
    lower, upper = np.tanh(cint)
    return lower, upper

In [None]:
def bootstrap_confidence_interval(
    values1,
    values2,
    fn,
    num_iterations=1_000,
    show_progress=True,
    seed=42,
    lower_quantile=0.05,
    upper_quantile=0.95,
):
    """Bootstrap an interval for ``fn(values1, values2)``.

    Paired samples are drawn with replacement ``num_iterations`` times.
    Resamples that consist of a single repeated element are redrawn, so
    ``fn`` never sees a sample built from one observation only.

    Args:
        values1, values2: Equal-length arrays supporting integer-array indexing.
        fn: Callable taking the two resampled arrays and returning a number.
        num_iterations: Number of bootstrap resamples.
        show_progress: Whether to wrap the loop in a tqdm progress bar.
        seed: Seed for the NumPy random generator.
        lower_quantile, upper_quantile: Quantiles of the bootstrap
            distribution returned as bounds. The defaults give a 90% interval.
            NOTE(review): ``compute_spearman_ci`` uses a 95% level -- confirm
            which level is wanted where the two are used interchangeably.

    Returns:
        ``(lower, upper, outputs)``, where ``outputs`` is the list of all
        bootstrap statistics.

    Raises:
        ValueError: If the input has exactly one element; every resample would
            then be degenerate and the redraw loop would never terminate.
    """
    num_values = len(values1)
    if num_values == 1:
        raise ValueError("Cannot bootstrap a single observation.")

    rng = np.random.default_rng(seed)
    iterations = range(num_iterations)
    if show_progress:
        iterations = tqdm(iterations)

    outputs = []
    for _ in iterations:
        index = rng.choice(num_values, num_values, replace=True)
        while len(np.unique(index)) == 1:
            index = rng.choice(num_values, num_values, replace=True)
        outputs.append(fn(values1[index], values2[index]))
    lower = np.quantile(outputs, lower_quantile)
    upper = np.quantile(outputs, upper_quantile)
    return lower, upper, outputs

In [None]:
def get_spearman_corrs_global(
    df, feature_columns, target_column, drop_na=True, sample_conf_interval=False
):
    """Correlate every feature column with the target over all rows.

    Columns whose name starts with "provean_", "protbert_" or
    "proteinsolver_" are negated before the correlation is computed.

    Args:
        df: Input DataFrame.
        feature_columns: List of column names to evaluate.
        target_column: Name of the column holding the reference values.
        drop_na: If true, first drop every row that is missing any feature or
            the target, and print how many rows were removed.
        sample_conf_interval: If true, obtain the interval from
            ``bootstrap_confidence_interval``; otherwise from
            ``compute_spearman_ci``.

    Returns:
        Dict mapping each column to ``(rho, lower, upper, num_rows)``.
    """
    if drop_na:
        num_before = len(df)
        df = df.dropna(subset=feature_columns + [target_column])
        num_dropped = num_before - len(df)
        if num_dropped:
            print(f"Lost {num_dropped} due to missing values")

    negated_prefixes = ("provean_", "protbert_", "proteinsolver_")

    def spearman_rho(v1, v2):
        return stats.spearmanr(v1, v2)[0]

    corrs = {}
    for column in feature_columns:
        sign = -1 if column.startswith(negated_prefixes) else 1
        # Per-column NaN filter; it only removes rows when drop_na is false.
        complete = df.dropna(subset=[column, target_column])
        num_rows = len(complete)
        rho, _pvalue = stats.spearmanr(sign * complete[column], complete[target_column])
        if sample_conf_interval:
            lower, upper, _ = bootstrap_confidence_interval(
                sign * complete[column].values,
                complete[target_column].values,
                fn=spearman_rho,
                show_progress=False,
            )
        else:
            lower, upper = compute_spearman_ci(rho, num_rows)
        corrs[column] = (rho, lower, upper, num_rows)
    return corrs

In [None]:
def print_spearman_corrs(corrs):
    """Print one line per entry of ``corrs``.

    Each line shows the key padded to 30 characters, followed by the first
    three items of its value: the first with an explicit sign, the second
    plain, and the third in parentheses.

    Args:
        corrs: Dict mapping a column name to an indexable of at least three
            items.
    """
    for column in corrs:
        corr = corrs[column]
        first, second, third = corr[0], corr[1], corr[2]
        print(f"{column:30s} {first:+.4} {second:.4} ({third})")

In [None]:
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
from matplotlib.ticker import FormatStrFormatter

# Render inline figures as PNG.
# NOTE(review): IPython.display.set_matplotlib_formats is deprecated in newer
# IPython releases in favour of matplotlib_inline.backend_inline -- confirm
# against the installed version.
set_matplotlib_formats("png")

In [None]:
# Colour map used for the bar colours in the plots below.
cmap = plt.cm.get_cmap("tab20")

In [None]:
# Distinct (dataset, effect_type) combinations in the cross-validation results.
result_df[["dataset", "effect_type"]].drop_duplicates()

In [None]:
# Number of cross-validation result rows per dataset.
result_df["dataset"].value_counts()

In [None]:
# For every dataset in the cross-validation results, print the number of
# distinct unique_id values and distinct (unique_id, mutation) pairs among the
# rows with rev == False.
# Grouping by the plain column name (not a one-element list) keeps `dataset`
# a scalar; pandas 2.x yields 1-tuples as keys for list-like groupers.
for dataset, gp in sorted(result_df.groupby("dataset")):
    gp = gp[gp["rev"] == False]
    print(
        dataset, len(gp["unique_id"].unique()), len(gp[["unique_id", "mutation"]].drop_duplicates())
    )

In [None]:
# For every dataset in the test inputs, print the number of distinct unique_id
# values and distinct (unique_id, mutation) pairs among the rows with
# rev == False.
# Grouping by the plain column name (not a one-element list) keeps `dataset`
# a scalar; pandas 2.x yields 1-tuples as keys for list-like groupers.
for dataset, gp in sorted(input_test_df.groupby("dataset")):
    gp = gp[gp["rev"] == False]
    print(
        dataset, len(gp["unique_id"].unique()), len(gp[["unique_id", "mutation"]].drop_duplicates())
    )

### Cross-validation performance

In [None]:
# Which values of the "rev" column to include; this also selects the filename
# suffix of the figures saved further below.
rev = [False]

if rev == [False]:
    suffix = ""
else:
    assert rev == [False, True]
    suffix = "-rev"

# Per evaluation dataset, the Spearman correlations of its evaluation columns
# with the measured effect, keyed by (dataset, effect_type).
corrs_dict = {}
for idx, (dataset, effect_type, eval_columns) in enumerate(datasets_eval):
    print(dataset)

    df = result_df[
        (result_df["effect_type"] == effect_type)
        & (result_df["dataset"] == dataset)
        & (result_df["rev"].isin(rev))
    ]

    # NOTE(review): the skempi-v2 filter is disabled here, whereas
    # get_aggregate_spearmanr applies it -- confirm this is intended.
    #     if dataset == "skempi-v2":
    #         df = df[df["unique_id"].isin(skempi_v2_unique_ids)]

    corrs = get_spearman_corrs_global(df, eval_columns, "effect", sample_conf_interval=False)
    corrs_dict[(dataset, effect_type)] = corrs
    print()

In [None]:
# Plot titles, keyed by dataset name.
titles = {
    "protherm++": "ProTherm",
    "humsavar": "Humsavar",
    "clinvar": "ClinVar",
    "cosmic": "COSMIC",
    "taipale": "Sahni (2015)",
    "taipale_gpca": "Sahni (2015)",
    "rocklin-2017-core": "Rocklin (2017)",
    "dunham_2020_tianyu": "Dunham (2020)",
    "skempi++": "SKEMPI",
    "ab_bind": "AB-Bind",
    "skempi-v2": "SKEMPI 2.0",
    "cagi5_frataxin": "Savojardo (2019)",
    "starr_2020_tianyu": "Starr (2020)",
    "huang_2020": "Huang (2020)",
}

# Tick labels, keyed by score column name.
methods = {
    "ddg_pred": "EL2 core" if COI == "core" else "EL2 interface",
    "elaspic_score": "ELASPIC",
    "foldx_score": "FoldX",
    "rosetta_dg_change": "Rosetta",
    "rosetta_complex_dg_change": "Rosetta",
    "provean_score": "Provean",
    "mcsm": "mCSM",
    "popmusic": "PoPMuSiC",
    "proteinsolver_core_score_change": "ProteinSolver",
    "proteinsolver_interface_score_change": "ProteinSolver",
    "protbert_core_score_change": "ProtBert",
    "protbert_interface_score_change": "ProtBert",
}

In [None]:
# Fixed correlation values, keyed by (score column, dataset, COI), that
# replace the computed correlation in the cross-validation plot below.
# NOTE(review): the two-term averages appear to combine the "Validation" and
# "Test" values listed at the end of this notebook (divided by 10) -- confirm.
scores = {
    # Core
    ("elaspic_score", "protherm++", "core"): 0.544,
    ("elaspic_score", "humsavar", "core"): (0.39825581395348837 + 0.35273972602739727) / 2,
    ("elaspic_score", "clinvar", "core"): (0.25872093023255817 + 0.18150684931506849) / 2,
    ("elaspic_score", "cosmic", "core"): (0.24418604651162794 + 0.25684931506849313) / 2,
    ("elaspic_score", "taipale_gpca", "core"): 0.22093023255813944,
    # Interface
    ("elaspic_score", "skempi++", "interface"): 0.461,
    ("elaspic_score", "humsavar", "interface"): (0.29861111111111116 + 0.32198952879581153) / 2,
    ("elaspic_score", "clinvar", "interface"): (0.19444444444444444 + 0.23036649214659692) / 2,
    ("elaspic_score", "cosmic", "interface"): (0.15624999999999998 + 0.13350785340314142) / 2,
    ("elaspic_score", "ab_bind", "interface"): 0.18062827225130893,
}

In [None]:
# One bar-chart panel per evaluation dataset, showing Spearman's rho with
# confidence-interval error bars for every evaluated score column.
fg, axs = plt.subplots(1, len(datasets_eval), figsize=(12, 3))
for idx, (dataset, effect_type, eval_columns) in enumerate(datasets_eval):
    # Copy so that overriding entries does not modify corrs_dict.
    corrs = corrs_dict[(dataset, effect_type)].copy()
    for key in list(corrs):
        if (key, dataset, COI) in scores:
            # Replace the computed correlation with the fixed value from
            # `scores`, keeping the computed row count for the interval.
            *_, num_rows = corrs[key]
            rho = scores[(key, dataset, COI)]
            rho_lower, rho_upper = compute_spearman_ci(rho, num_rows)
            corrs[key] = (rho, rho_lower, rho_upper)

    method_list = list(corrs.keys())
    # Entries are 3- or 4-tuples; zip stops at the shortest, and only the
    # first three fields are used.
    rho_list, rho_lower_list, rho_upper_list, *_ = list(zip(*corrs.values()))
    # Error-bar lengths (distance from rho to each bound), shape (2, N).
    yerr = np.abs(np.c_[rho_lower_list, rho_upper_list].T - np.array(rho_list))

    ax = axs[idx]
    x = np.arange(len(method_list))
    out = ax.bar(
        x,
        rho_list,
        yerr=yerr,
        width=0.7,
        capsize=1,
        error_kw={"linewidth": 1},
        # The first bar gets a distinct colour.
        color=[cmap(7)] + [cmap(1)] * (len(x) - 1),
        edgecolor="k",
    )
    _ = ax.set_xticks(x)
    _ = ax.set_xticklabels([methods[m] for m in method_list], rotation="vertical")
    ax.set_title(titles[dataset], fontsize=10.5)
    ax.set_ylim(0.0, 0.65)
    # Only the left-most panel carries the y label and tick labels.
    if idx == 0:
        ax.set_ylabel("Spearman's ρ")
        ax.yaxis.set_major_formatter(FormatStrFormatter("%.2f"))
    else:
        ax.set_yticklabels([])

fg.subplots_adjust(top=0.88, right=0.99, bottom=0.38, left=0.052, hspace=0.0, wspace=0.2)
fg.savefig(NOTEBOOK_DIR.joinpath(f"corrs-xval-{COI}{suffix}.svg"), dpi=300)
fg.savefig(NOTEBOOK_DIR.joinpath(f"corrs-xval-{COI}{suffix}.png"), dpi=300)
fg.savefig(NOTEBOOK_DIR.joinpath(f"corrs-xval-{COI}{suffix}.pdf"), dpi=300)

In [None]:
# Display the saved cross-validation figure.
SVG(NOTEBOOK_DIR.joinpath(f"corrs-xval-{COI}{suffix}.svg"))

### Test performance

In [None]:
# Score columns shown in the test-set plot; "ddg_pred" comes first so that it
# receives the highlighted bar colour.
if COI == "core":
    eval_columns = [
        "ddg_pred",
        "elaspic_score",
        "foldx_score",
        "rosetta_dg_change",
        "mcsm",
        "popmusic",
        "proteinsolver_core_score_change",
        "protbert_core_score_change",
    ]
else:
    eval_columns = [
        "ddg_pred",
        "elaspic_score",
        "foldx_score",
        "rosetta_complex_dg_change",
        #         "provean_score",
        "mcsm",
        "proteinsolver_interface_score_change",
        "protbert_interface_score_change"
        #
        #         "proteinsolver_core_score_change",
        #         "protbert_core_score_change",
        #
        #         "rosetta_opt_apart_dg_change",
        #         "rosetta_apart_dg_change",
        #         "rosetta_opt_bind_dg_change",
        #         "rosetta_bind_dg_change",
    ]

In [None]:
# Fixed correlation values, keyed by (dataset, score column, COI), used in the
# test-set plot instead of computing the correlation from input_test_df.
test_scores = {
    # Core
    ("starr_2020_tianyu", "elaspic_score", "core"): -0.081719600581758,
    ("starr_2020_tianyu", "foldx_score", "core"): 0.4847875597601052,
    ("starr_2020_tianyu", "provean_score", "core"): 0.42576346425274353,
    ("starr_2020_tianyu", "mcsm", "core"): 0.3331869564512911,
    ("starr_2020_tianyu", "popmusic", "core"): 0.4261149009423275,
    # Interface
    ("starr_2020_tianyu", "elaspic_score", "interface"): 0.5140238363135885,
    ("starr_2020_tianyu", "foldx_score", "interface"): 0.5294542669183915,
    ("starr_2020_tianyu", "provean_score", "interface"): 0.4133588948415616,
    ("starr_2020_tianyu", "mcsm", "interface"): 0.36531919399161616,
}

In [None]:
# Test-set evaluation on a single dataset; the commented-out lines are
# alternative datasets.
# dataset, effect_type = ("huang_2020", "ΔΔG")
dataset, effect_type = ("starr_2020_tianyu", "Deep mutation scan")
# dataset, effect_type = ("cagi5_frataxin", "ΔΔG")

rev = [False]

df = input_test_df[
    (input_test_df["effect_type"] == effect_type)
    & (input_test_df["dataset"] == dataset)
    & (input_test_df["rev"].isin(rev))
]

# Fixed at 0, so the y label / tick formatter branch below is always taken.
idx = 0

# Correlations are computed only for columns without a fixed value in
# test_scores; the fixed values are filled in afterwards.
eval_columns_ = [c for c in eval_columns if (dataset, c, COI) not in test_scores]
print(eval_columns_)
corrs = get_spearman_corrs_global(df, eval_columns_, "effect", sample_conf_interval=False)
for column in eval_columns:
    if (dataset, column, COI) in test_scores:
        # The interval for a fixed value uses the row count of "ddg_pred".
        *_, num_rows = corrs["ddg_pred"]
        rho = test_scores[(dataset, column, COI)]
        rho_lower, rho_upper = compute_spearman_ci(rho, num_rows)
        corrs[column] = (rho, rho_lower, rho_upper)
        print(column, rho)

fg, ax = plt.subplots(figsize=(6.75, 3))

# Entries are 3- or 4-tuples; zip stops at the shortest, and only the first
# three fields are used.
rho_list, rho_lower_list, rho_upper_list, *_ = list(zip(*[corrs[c] for c in eval_columns]))
# Error-bar lengths (distance from rho to each bound), shape (2, N).
yerr = np.abs(np.c_[rho_lower_list, rho_upper_list].T - np.array(rho_list))

x = np.arange(len(eval_columns))
out = ax.bar(
    x,
    rho_list,
    yerr=yerr,
    width=0.6,
    capsize=1,
    error_kw={"linewidth": 1},
    # The first bar gets a distinct colour.
    color=[cmap(7)] + [cmap(1)] * (len(x) - 1),
    edgecolor="k",
)
_ = ax.set_xticks(x)
_ = ax.set_xticklabels([methods[m] for m in eval_columns], rotation="vertical")
ax.set_title(titles[dataset], fontsize=10.5)
ax.set_ylim(0, 0.69)
if idx == 0:
    ax.set_ylabel("Spearman's ρ")
    ax.yaxis.set_major_formatter(FormatStrFormatter("%.2f"))
else:
    ax.set_yticklabels([])

fg.subplots_adjust(top=0.88, right=0.9822, bottom=0.38, left=0.09244444444444444, hspace=0.0, wspace=0.2)

# fg.subplots_adjust(top=0.88, right=0.99, bottom=0.38, left=0.052, hspace=0.0, wspace=0.2)

fg.savefig(NOTEBOOK_DIR.joinpath(f"corrs-test-{COI}.svg"), dpi=300)
fg.savefig(NOTEBOOK_DIR.joinpath(f"corrs-test-{COI}.png"), dpi=300)
fg.savefig(NOTEBOOK_DIR.joinpath(f"corrs-test-{COI}.pdf"), dpi=300)

In [None]:
# Display the saved test-set figure.
SVG(NOTEBOOK_DIR.joinpath(f"corrs-test-{COI}.svg"))

In [None]:
# Display the correlations (with interval bounds) used in the test-set plot.
corrs

Scores for ELASPIC v1 were extracted from my thesis: <https://ostrokach.gitlab.io/msc_thesis/msc_thesis.pdf>.

**Core**

*Validation*

```
Humsavar, 3.9825581395348837
ClinVar, 2.587209302325581
COSMIC, 2.4418604651162794
Taipale, 2.2093023255813944
```

*Test*

```
Humsavar, 3.5273972602739727
ClinVar, 1.8150684931506849
COSMIC, 2.5684931506849313
AB-Bind, 4.075342465753424
```

**Interface**

*Validation*

```
Humsavar, 2.9861111111111116
ClinVar, 1.9444444444444444
COSMIC, 1.5624999999999998
Taipale PPI, 2.8125
Taipale GPCA, 3.680555555555556
```

*Test*

```
Humsavar, 3.2198952879581153
ClinVar, 2.3036649214659692
COSMIC, 1.3350785340314142
AB-Bind, 1.8062827225130893
```

In [None]:
def get_spearman_corrs_perseq(df, feature_columns, target_column, min_gp_size=6, drop_na=True):
    """Correlate every feature column with the target within each ``unique_id`` group.

    Groups with fewer than ``min_gp_size`` rows, or with fewer than two
    distinct target values, are skipped. Columns whose name starts with
    "provean_", "protbert_" or "proteinsolver_" are negated before the
    correlation is computed.

    Args:
        df: DataFrame with a ``unique_id`` column, the feature columns and the
            target column.
        feature_columns: List of column names to evaluate.
        target_column: Name of the column holding the reference values.
        min_gp_size: Minimum number of rows a group needs to be evaluated.
        drop_na: If true, first drop every row that is missing any feature or
            the target.

    Returns:
        Dict mapping each feature column to the list of per-group Spearman
        correlations, in group iteration order.
    """
    if drop_na:
        df = df.dropna(subset=feature_columns + [target_column])

    negated_prefixes = ("provean_", "protbert_", "proteinsolver_")
    signs = {name: (-1 if name.startswith(negated_prefixes) else 1) for name in feature_columns}

    results = {name: [] for name in feature_columns}
    for _, gp in df.groupby("unique_id"):
        if len(gp) < min_gp_size or len(set(gp[target_column])) < 2:
            continue
        for column in feature_columns:
            sign = signs[column]
            # Per-column NaN filter; it only removes rows when drop_na is false.
            complete = gp.dropna(subset=[column, target_column])
            rho = stats.spearmanr(sign * complete[column], complete[target_column])[0]
            results[column].append(rho)
    return results

In [None]:
def my_feval(preds, train_data):
    """Custom evaluation metric: weighted mean of per-group Spearman's rho.

    Every group contributes its Spearman correlation between predictions and
    labels, weighted by the square root of the group size. Groups with fewer
    than two rows or fewer than two distinct labels are skipped; groups whose
    predictions are all identical count as a correlation of 0.

    Args:
        preds: Predictions, ordered like the rows of ``train_data``.
        train_data: Object exposing ``get_label()`` and ``get_group()``
            (group sizes in row order), such as a LightGBM ``Dataset``.

    Returns:
        ``(eval_name, eval_result, is_higher_better)``. The result is ``0.0``
        when no group could be evaluated.
    """
    labels = train_data.get_label()
    groups = train_data.get_group()

    # The correlation over all rows was previously computed here but never
    # used in the result, so it has been dropped.
    weighted_corr_total = 0
    weight_total = 0
    start = 0
    for group in groups:
        stop = start + group
        preds_slice = preds[start:stop]
        labels_slice = labels[start:stop]
        start = stop

        if group < 2 or len(set(labels_slice)) < 2:
            # The correlation is undefined for this group.
            continue
        if len(set(preds_slice)) < 2:
            group_corr = 0
        else:
            group_corr = stats.spearmanr(preds_slice, labels_slice)[0]
        weight = math.sqrt(group)
        weighted_corr_total += weight * group_corr
        weight_total += weight
    assert start == sum(groups)

    # Guard against a division by zero when every group was skipped.
    pergroup_corr = weighted_corr_total / weight_total if weight_total else 0.0

    eval_name = "wavg_spearman_rho"
    eval_result = pergroup_corr
    is_higher_better = True
    return eval_name, eval_result, is_higher_better

In [None]:
def calculate_score(df):
    """Average the global and the per-group Spearman correlation of a result table.

    The global term correlates ``ddg_pred`` with ``effect`` over all rows.
    The per-group term is the mean of the per-``unique_id`` correlations,
    weighted by the square root of the group size. Groups with fewer than two
    distinct effects are skipped, and groups with constant predictions count
    as a correlation of 0.

    Args:
        df: DataFrame with ``unique_id``, ``ddg_pred`` and ``effect`` columns.

    Returns:
        The mean of the global and the weighted per-group correlation.
    """
    global_rho = stats.spearmanr(df["ddg_pred"], df["effect"])[0]

    weighted_sum = 0
    weight_sum = 0
    for _, gp in df.groupby("unique_id"):
        if len(set(gp["effect"])) < 2:
            continue
        if len(set(gp["ddg_pred"])) >= 2:
            rho = stats.spearmanr(gp["ddg_pred"], gp["effect"])[0]
        else:
            rho = 0
        weight = math.sqrt(len(gp))
        weighted_sum += rho * weight
        weight_sum += weight
    perseq_rho = weighted_sum / weight_sum

    return (global_rho + perseq_rho) / 2