## Summary

---

## Imports

In [None]:
import os
import shlex
import subprocess
import tempfile
from pathlib import Path

import lightgbm
import lightgbm as lgb
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.model_selection import PredefinedSplit
from tqdm.notebook import tqdm

In [None]:
# Use the fully-qualified option name: the bare "max_columns" pattern is ambiguous in
# pandas versions that also define "styler.render.max_columns" and then raises OptionError.
pd.set_option("display.max_columns", 1000)

## Parameters

In [None]:
NOTEBOOK_DIR = Path("04_train_model_core_norosetta").resolve()
NOTEBOOK_DIR.mkdir(exist_ok=True)

NOTEBOOK_DIR

In [None]:
COI = "core"

In [None]:
# Root directory that holds the outputs of every pipeline step. When DATAPKG_OUTPUT_DIR
# is set, outputs live in its "elaspic-v2" subdirectory; otherwise next to NOTEBOOK_DIR.
if "DATAPKG_OUTPUT_DIR" in os.environ:
    OUTPUT_DIR = Path(os.environ["DATAPKG_OUTPUT_DIR"]).joinpath("elaspic-v2").resolve()
else:
    OUTPUT_DIR = NOTEBOOK_DIR.parent
# parents=True: the DATAPKG_OUTPUT_DIR root itself may not exist yet, in which case a
# plain mkdir() would fail with FileNotFoundError.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

OUTPUT_DIR

In [None]:
# When SLURM_TMPDIR is set, point temporary files at that directory.
if (slurm_tmpdir := os.getenv("SLURM_TMPDIR")) is not None:
    os.environ["TMPDIR"] = slurm_tmpdir
    # tempfile caches its first answer in `tempfile.tempdir`; clear the cache so the new
    # TMPDIR takes effect even if something already called gettempdir() in this kernel.
    tempfile.tempdir = None

print(tempfile.gettempdir())

In [None]:
# Pick the datasets that match the class of interest. Any COI other than "core"
# must be "interface".
if COI != "core":
    assert COI == "interface"
    datasets = [
        "elaspic-training-set-interface",
        #     "skempi-v2-interface",
        #     "intact-mutations-interface",
        #     "dunham-2020-interface",
        #     "starr-2020-interface",
    ]
else:
    datasets = [
        "elaspic-training-set-core",
        "protherm-dagger-core",
        "rocklin-2017-core",
        "dunham-2020-core",
        "starr-2020-core",
        "cagi5-frataxin-core",
        "huang-2020-core",
    ]

In [None]:
feature_generators = [
    "02_run_rosetta_ddg",
    "02_run_proteinsolver",
    "02_run_protbert",
]

### Load data

In [None]:
def expand_mutations(df):
    """Turn one row per protein into one row per mutation.

    Each input row carries parallel sequences in ``mutation`` and ``effect`` (and,
    when the columns exist, in ``provean_score``, ``foldx_score`` and
    ``elaspic_score``). The row-level fields ``unique_id``, ``dataset``, ``name``
    and ``effect_type`` are repeated for every mutation of that row.

    Returns:
        A new DataFrame with one row per (input row, mutation position).
    """
    per_mutation_columns = ("provean_score", "foldx_score", "elaspic_score")

    def _records(row):
        # Optional score columns are copied only when the input frame has them.
        available = [name for name in per_mutation_columns if hasattr(row, name)]
        for position, mutation in enumerate(row.mutation):
            record = {
                "unique_id": row.unique_id,
                "dataset": row.dataset,
                "name": row.name,
                "mutation": mutation,
                "effect": row.effect[position],
                "effect_type": row.effect_type,
            }
            for name in available:
                record[name] = getattr(row, name)[position]
            yield record

    return pd.DataFrame([record for row in df.itertuples() for record in _records(row)])

In [None]:
def add_mutation_complement(df):
    """Append the reverse (mut → wt) counterpart of every mutation.

    The forward rows are tagged ``rev=False``. Each reverse row (``rev=True``)
    has the first and last character of ``mutation`` exchanged, the sign of
    ``effect`` / ``provean_score`` / ``foldx_score`` / ``elaspic_score`` flipped
    (for the columns that exist), and every ``*_wt`` column swapped with its
    ``*_mut`` partner.

    Returns:
        A new DataFrame with the forward rows followed by the reverse rows and a
        fresh integer index; the input frame is left untouched.
    """
    forward = df.copy()
    forward["rev"] = False

    reverse = forward.copy()
    reverse["rev"] = True
    mutation = reverse["mutation"]
    reverse["mutation"] = mutation.str[-1] + mutation.str[1:-1] + mutation.str[0]

    sign_flipped = ["effect", "provean_score", "foldx_score", "elaspic_score"]
    for name in [c for c in sign_flipped if c in reverse]:
        reverse[name] = -reverse[name]

    for wt_name in [c for c in reverse if c.endswith("_wt")]:
        mut_name = wt_name[:-3] + "_mut"
        # Read both columns before writing either so the swap is clean.
        mut_values, wt_values = reverse[mut_name].copy(), reverse[wt_name].copy()
        reverse[wt_name] = mut_values
        reverse[mut_name] = wt_values

    return pd.concat([forward, reverse], ignore_index=True)

In [None]:
tmp_df = pd.DataFrame(
    [[0, "M1A", 1.234, "wt score", "mut score"], [1, "M2C", -0.05, "wt score 2", "mut score 2"]],
    columns=["unique_id", "mutation", "effect", "feature_wt", "feature_mut"],
)

tmp2_df = add_mutation_complement(tmp_df)

display(tmp_df)
display(tmp2_df)

In [None]:
def merge_feature_dfs(feature_dfs):
    """Outer-join several feature tables on ``(unique_id, mutation, rev)``.

    Args:
        feature_dfs: Either a sequence of DataFrames or a dict whose values are
            DataFrames (e.g. ``{feature_generator_name: feature_df}``). Dict
            values are merged in insertion order.

    Returns:
        The merged DataFrame, or ``None`` when ``feature_dfs`` is empty. Label-like
        columns (``effect``, ``effect_type`` and the baseline scores) are dropped
        from every table before merging.

    Raises:
        AssertionError: If a table has duplicate ``(unique_id, mutation)`` pairs.
    """
    label_columns = ["effect", "effect_type", "provean_score", "foldx_score", "elaspic_score"]

    def _clean_df(df):
        assert len(df) == len(df[["unique_id", "mutation"]].drop_duplicates())
        # drop() returns a new frame, so the caller's table is never modified.
        return df.drop(columns=[c for c in label_columns if c in df])

    if not feature_dfs:
        return None

    # The loading loop collects tables in a dict keyed by generator name; indexing a
    # dict with [0] / [1:] raises, so normalise to a plain list first.
    if isinstance(feature_dfs, dict):
        feature_dfs = list(feature_dfs.values())

    df = _clean_df(feature_dfs[0])
    for other_df in feature_dfs[1:]:
        df = df.merge(_clean_df(other_df), how="outer", on=["unique_id", "mutation", "rev"])
    return df

In [None]:
# For every dataset: load the mutation table, expand it to one row per mutation
# (forward and reverse), and collect the feature tables written by each generator.
input_data = {}
for dataset_name in datasets:
    input_file = OUTPUT_DIR.joinpath("01_load_data", f"{dataset_name}.parquet")
    pfile = pq.ParquetFile(input_file)
    # Feature files are looked up per task id, with one task per parquet row group.
    task_count = pfile.num_row_groups
    df = pfile.read().to_pandas(integer_object_nulls=True)
    expanded_df = (
        add_mutation_complement(expand_mutations(df))
        .drop_duplicates(subset=["unique_id", "mutation"])
        .sort_values(["unique_id", "mutation"])
    )
    sequence_df = df[["unique_id", "protein_sequence", "ligand_sequence"]].drop_duplicates()

    features = {}
    for feature_generator in feature_generators:
        output_dir = OUTPUT_DIR.joinpath(feature_generator)
        feature_dfs = []
        for task_id in range(1, task_count + 1):
            if feature_generator in ["02_run_rosetta_ddg"]:
                # wt → mut
                output_file_wt2mut = output_dir.joinpath(
                    f"{dataset_name}-wt2mut-{task_id}-{task_count}.parquet"
                )
                if not output_file_wt2mut.is_file():
                    print(f"File {output_file_wt2mut} is missing. Skipping...")
                    continue
                feature_wt2mut_df = pq.read_table(output_file_wt2mut).to_pandas(
                    integer_object_nulls=True
                )
                feature_wt2mut_df["rev"] = False
                feature_dfs.append(feature_wt2mut_df)

                # mut → wt
                output_file_mut2wt = output_dir.joinpath(
                    f"{dataset_name}-mut2wt-{task_id}-{task_count}.parquet"
                )
                if not output_file_mut2wt.is_file():
                    print(f"File {output_file_mut2wt} is missing. Skipping...")
                    continue
                feature_mut2wt_df = pq.read_table(output_file_mut2wt).to_pandas(
                    integer_object_nulls=True
                )
                # The mut → wt files carry negated ids; flip them back to join with wt → mut.
                assert feature_mut2wt_df["unique_id"].min() < 0
                feature_mut2wt_df["unique_id"] = -feature_mut2wt_df["unique_id"]
                feature_mut2wt_df["rev"] = True
                feature_dfs.append(feature_mut2wt_df)
            else:
                output_file = output_dir.joinpath(f"{dataset_name}-{task_id}-{task_count}.parquet")
                if not output_file.is_file():
                    print(f"File {output_file} is missing. Skipping...")
                    continue
                feature_df = pq.read_table(output_file).to_pandas(integer_object_nulls=True)
                # These files only hold the forward direction; derive the reverse rows here.
                feature_df = add_mutation_complement(feature_df)
                feature_dfs.append(feature_df)

        if not feature_dfs:
            print(
                f"No data collected for dataset {dataset_name} and feature generator {feature_generator}."
            )
            continue
        feature_df = pd.concat(feature_dfs, ignore_index=True)
        features[feature_generator] = feature_df
    input_data[dataset_name] = {
        "expanded_df": expanded_df,
        "sequence_df": sequence_df,
        # merge_feature_dfs indexes its argument positionally ([0], [1:]), so hand it the
        # list of tables rather than the dict keyed by generator name.
        "feature_df": merge_feature_dfs(list(features.values())),
    }

### Merge together

In [None]:
# Stack the per-dataset tables, keeping only datasets for which a merged feature
# table exists (merge_feature_dfs returned something other than None).
expanded_df = pd.concat(
    [d["expanded_df"] for d in input_data.values() if d["feature_df"] is not None]
)

sequence_df = pd.concat(
    [d["sequence_df"] for d in input_data.values() if d["feature_df"] is not None]
)

features_df = pd.concat(
    [d["feature_df"] for d in input_data.values() if d["feature_df"] is not None]
).sort_values(["unique_id", "mutation"])
# The loading loop negates the (negative) ids of the Rosetta mut → wt files, so no
# negative id should be left at this point.
assert features_df["unique_id"].min() >= 0
len(features_df)

In [None]:
expanded_df.head()

In [None]:
features_df.head()

In [None]:
features_nn_df = features_df.dropna(subset=[c for c in features_df if not c.startswith("rosetta_")])
print(
    f"Lost {len(features_df) - len(features_nn_df):,} out of {len(features_df):,} rows due to missing values."
)

len(features_nn_df)

In [None]:
input_df = expanded_df.merge(features_nn_df, on=["unique_id", "mutation", "rev"], validate="1:1")
assert len(input_df) == len(features_nn_df)
print(
    f"Lost {len(expanded_df) - len(input_df):,} out of {len(expanded_df):,} rows due to missing features."
)

# Correct the sign on some features
for dataset, effect_type in [
#     ("cagi4_sumo_ligase", "Deleteriousness score"),
#     ("benedix_et_al", "ΔΔG"),
#     ("hiv_escape_mutations", "ΔΔG"),
]:
    mask = (input_df["dataset"] == dataset) & (input_df["effect_type"] == effect_type)
    if mask.any():
        print(dataset, effect_type)
        input_df.loc[mask, "effect"] = -input_df.loc[mask, "effect"]

len(input_df)

In [None]:
# Datasets excluded from training. Start from an empty set so the filter below is
# well-defined for every COI (the name used to exist only when COI == "core").
datasets_to_drop = set()
if COI == "core":
    datasets_to_drop = {
        "cagi4_sumo_ligase",
        "benedix_et_al",
        "hiv_escape_mutations",
        "ab_bind",
        "skempiskempi",  # NOTE(review): possibly a typo for another dataset name; confirm
        "taipale_ppi",
    }

input_df = input_df[~input_df["dataset"].isin(datasets_to_drop)]

len(input_df)

In [None]:
input_df.head()

### Feature engineering

In [None]:
display(input_df.head())
print(len(input_df))

In [None]:
assert not input_df["foldx_score"].isnull().any()
assert not input_df["effect"].isnull().any()

In [None]:
input_df["effect_type"].value_counts()

In [None]:
input_df["dataset"].unique()

In [None]:
# For every "<stem>_mut" feature add "<stem>_change" = mut - wt. Features whose first
# value is a list / array are additionally queued for PCA (wt, mut and change columns).
pca_columns = []

for column in [c for c in input_df if c.endswith("_mut")]:
    print(column)
    column_wt = column[:-4] + "_wt"
    column_change = column[:-4] + "_change"
    value_sample = input_df[column].iloc[0]
    input_df[column_change] = input_df[column] - input_df[column_wt]
    if isinstance(value_sample, (list, np.ndarray)):
        pca_columns.extend([column_wt, column, column_change])
    # del input_df[column]

pca_columns

In [None]:
for (dataset, effect_type), gp in input_df.groupby(["dataset", "effect_type"]):
    gp = gp.copy()
    gp_sub = gp.dropna(subset=["effect", "protbert_core_score_change"])
    corr1 = stats.spearmanr(gp_sub["effect"], gp_sub["protbert_core_score_change"])
    gp_sub = gp_sub[gp_sub["rev"] == False]
    corr2 = stats.spearmanr(gp_sub["effect"], gp_sub["protbert_core_score_change"])
    if corr1[0] > 0 or corr2[0] > 0:
        print(dataset, effect_type)
        for column in [
            "provean_score",
            "foldx_score",
            "elaspic_score",
            "rosetta_dg_change",
            "protbert_core_score_change",
            "proteinsolver_core_score_change",
        ]:
            gp_sub = gp.dropna(subset=["effect", column])
            corr = stats.spearmanr(gp_sub["effect"], gp_sub[column])
            print(f"{column:30s} {corr[0]:+.4} {corr[1]:.4}")
            gp_sub = gp_sub[gp_sub["rev"] == False]
            corr = stats.spearmanr(gp_sub["effect"], gp_sub[column])
            print(f"{column:30s} {corr[0]:+.4} {corr[1]:.4}")
        print()

In [None]:
for (dataset, effect_type), gp in input_df.groupby(["dataset", "effect_type"]):
    gp = gp.dropna(subset=["effect", "provean_score"])
    assert len(gp)
    corr = stats.spearmanr(gp["effect"], gp["provean_score"])
    assert corr[0] <= 0, (dataset, effect_type)

In [None]:
df = input_df[
    (input_df["effect_type"] == "ΔΔG")
    & (input_df["dataset"] == "hiv_escape_mutations")
    & (input_df["rev"] == False)
].dropna()

df = input_df[
#     (input_df["effect_type"] == "Deleteriousness score")
    (input_df["dataset"] == "cagi4_sumo_ligase")
#     & (input_df["dataset"] == "cagi4_sumo_ligase")
    & (input_df["rev"] == False)
].dropna(subset=["effect", "provean_score"])

# df = input_df[(input_df["effect_type"] == "ΔΔG") & (input_df["rev"] == False)]

for column in ["provean_score", "foldx_score", "elaspic_score", "rosetta_dg_change"]:
    corr = stats.spearmanr(df["effect"], df[column])
    print(f"{column:30s} {corr[0]:+.4} {corr[1]:.4}")

In [None]:
input_df["dataset"].unique()

### Clustering

In [None]:
def obtain_clusters(input_sequences, min_seq_id=0.3):
    """Cluster protein sequences with ``mmseqs easy-cluster``.

    Args:
        input_sequences: DataFrame with ``unique_id`` and ``protein_sequence``
            columns; one FASTA record is written per row.
        min_seq_id: Value passed to the ``--min-seq-id`` option of mmseqs.

    Returns:
        DataFrame with the columns ``cluster_id`` and ``unique_id``, read from the
        ``result_cluster.tsv`` file that mmseqs writes.

    Raises:
        subprocess.CalledProcessError: If mmseqs exits with a non-zero status (its
            stderr is printed first).
        AssertionError: If a ``unique_id`` appears more than once in the result.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        input_dir = Path(tmp_dir, "input")
        input_dir.mkdir()

        output_dir = Path(tmp_dir, "output")
        output_dir.mkdir()

        scratch_dir = Path(tmp_dir, "scratch")
        scratch_dir.mkdir()

        fasta_file = input_dir.joinpath("input.fasta")
        with fasta_file.open("wt") as fout:
            for tup in input_sequences.itertuples():
                fout.write(f">{tup.unique_id}\n{tup.protein_sequence}\n")

        # Build the argument vector directly instead of quoting a string and re-splitting
        # it, so that unusual characters in the temporary path cannot break the command.
        command = [
            "mmseqs",
            "easy-cluster",
            "--min-seq-id",
            str(min_seq_id),
            str(fasta_file),
            str(output_dir.joinpath("result")),
            str(scratch_dir),
        ]
        print(" ".join(command))

        try:
            subprocess.run(command, capture_output=True, check=True, text=True)
        except subprocess.CalledProcessError as error:
            # The output is captured, so surface stderr before propagating the failure.
            print(error.stderr)
            raise

        # Read the result while the temporary directory still exists.
        cluster_df = pd.read_csv(
            output_dir.joinpath("result_cluster.tsv"), sep="\t", names=["cluster_id", "unique_id"]
        )
        assert len(cluster_df) == len(cluster_df["unique_id"].unique())

    return cluster_df

In [None]:
input_sequences = sequence_df.merge(input_df[["unique_id"]].drop_duplicates())

len(input_sequences)

In [None]:
cluster_df = obtain_clusters(input_sequences)

In [None]:
cluster_df.head()

In [None]:
if "cluster_id" in input_df:
    del input_df["cluster_id"]

input_df = input_df.merge(cluster_df, on="unique_id", how="outer", validate="m:1")
assert input_df["cluster_id"].notnull().all()

### Cross-validation folds and model training

In [None]:
import heapq
from dataclasses import dataclass, field
from typing import Any


def map_to_test_fold(df, n_folds=10):
    """Greedily assign sequence clusters to cross-validation folds.

    Rows are split by ``effect_type`` into three groups that are processed in order:
    "ΔΔG", "Deleteriousness score", "Deleteriousness class". Within each group the
    clusters are visited in ``groupby("cluster_id")`` order and each one is given to
    the fold that currently holds the fewest rows of that group. A cluster that
    already received a fold in an earlier group keeps it.

    Args:
        df: DataFrame with ``effect_type`` and ``cluster_id`` columns.
        n_folds: Number of folds to distribute the clusters over.

    Returns:
        Dict mapping ``cluster_id`` to a fold index in ``range(n_folds)``.

    Raises:
        AssertionError: If ``df`` contains an ``effect_type`` other than the three
            listed above.
    """

    @dataclass(order=True)
    class PrioritizedItem:
        # Only `priority` (number of rows already in the fold) takes part in ordering.
        priority: int
        idx: int = field(compare=False)
        data: Any = field(compare=False)

    def _assign_group(group_df, mapping):
        """Balance the rows of one effect-type group over the folds, updating ``mapping``."""
        queue = [PrioritizedItem(0, i, []) for i in range(n_folds)]
        for cluster_id, gp in group_df.groupby("cluster_id"):
            if cluster_id in mapping:
                # Fold fixed by an earlier group: only account for the extra rows.
                fold_idx = mapping[cluster_id]
                item = next(item for item in queue if item.idx == fold_idx)
                item.priority += len(gp)
                item.data.append(cluster_id)
                # The priority changed in place, so the heap invariant must be restored.
                heapq.heapify(queue)
            else:
                item = heapq.heappop(queue)
                item.priority += len(gp)
                item.data.append(cluster_id)
                heapq.heappush(queue, item)

        for item in queue:
            for cluster_id in item.data:
                if cluster_id in mapping:
                    assert mapping[cluster_id] == item.idx
                else:
                    mapping[cluster_id] = item.idx

    ddg_df = df[df["effect_type"] == "ΔΔG"]
    score_df = df[df["effect_type"] == "Deleteriousness score"]
    other_df = df[df["effect_type"] == "Deleteriousness class"]
    assert len(ddg_df) + len(score_df) + len(other_df) == len(df)

    mapping = {}
    for group_df in (ddg_df, score_df, other_df):
        _assign_group(group_df, mapping)
    return mapping

In [None]:
cluster_id_to_test_fold_mapping = map_to_test_fold(input_df)
input_df["test_fold"] = input_df["cluster_id"].map(cluster_id_to_test_fold_mapping)
assert input_df["test_fold"].notnull().all()
assert len(input_df["test_fold"].unique()) == 10

In [None]:
def get_label(df):
    """Convert ``effect`` values into relevance labels in the range 0–10000.

    "Deleteriousness class" effects are multiplied by 3 first. All effects are then
    clipped to [-5, 5], scaled by 1000, shifted by 5000 and rounded to the nearest
    whole number.

    Raises:
        AssertionError: If ``df`` has no "Deleteriousness class" rows, or if a
            "Deleteriousness score" effect lies outside [-5, 5].
    """
    effect_type = df["effect_type"]
    effect = df["effect"].values.copy()  # work on a copy; the frame is not modified

    class_rows = effect_type == "Deleteriousness class"
    assert class_rows.any()
    effect[class_rows] *= 3

    score_effects = effect[effect_type == "Deleteriousness score"]
    if score_effects.size:
        assert score_effects.min() >= -5 and score_effects.max() <= 5

    clipped = np.clip(effect, -5, 5)
    return np.rint(clipped * 1000 + 5000)

In [None]:
input_df["label"] = get_label(input_df)

In [None]:
def get_group(df):
    """Return the number of rows per ``unique_id``, in order of first appearance.

    NOTE(review): the result is passed as ``group=`` to ``lgb.Dataset``; this
    presumably requires the rows of each ``unique_id`` to be contiguous in ``df``,
    which this function does not check. Confirm against the caller.
    """
    ids = df["unique_id"]
    rows_per_id = ids.value_counts()
    first_seen_order = ids.unique()
    return np.array([rows_per_id[uid] for uid in first_seen_order])

In [None]:
NOTEBOOK_DIR

In [None]:
# Number of principal components kept for every array-valued feature column.
n_components = 10

train_test_splits = []

ps = PredefinedSplit(input_df["test_fold"])
# One (train_df, test_df) pair per test fold. The fold index and the component index use
# distinct names: the inner loop used to reuse `i`, so from the second PCA column onwards
# the cache file name carried the last component index instead of the fold index, and
# every fold loaded a PCA that had been fitted on another fold's training rows.
for fold_idx, (train, test) in enumerate(tqdm(ps.split(), total=ps.get_n_splits())):
    train_df = input_df.iloc[train].copy()
    test_df = input_df.iloc[test].copy()
    assert not set(train_df["cluster_id"]) & set(test_df["cluster_id"])

    for column in pca_columns:
        train_values = np.vstack(train_df[column].values)
        test_values = np.vstack(test_df[column].values)

        # The PCA is fitted on the training rows only and cached per (column, fold).
        # NOTE(review): cache files written before the index fix may hold a PCA fitted on
        # the wrong fold; delete the existing "*_pca*.pickle" files before re-running.
        # NOTE(review): torch.load unpickles the file, so only load caches you trust.
        pickle_file = NOTEBOOK_DIR.joinpath(f"{column}_pca{fold_idx}.pickle")
        if pickle_file.is_file():
            pca = torch.load(pickle_file)
        else:
            pca = PCA(n_components=n_components)
            pca.fit(train_values)
            torch.save(pca, pickle_file)

        train_values_out = pca.transform(train_values)
        test_values_out = pca.transform(test_values)
        for component_idx in range(n_components):
            new_column = f"{column}_{component_idx}_pca"
            train_df[new_column] = train_values_out[:, component_idx]
            test_df[new_column] = test_values_out[:, component_idx]
        # The array-valued source column is replaced by its scalar components.
        del train_df[column], test_df[column]
    train_test_splits.append((train_df, test_df))

In [None]:
# Model inputs: every wt / mut / change / pca column of the first training frame,
# except anything ending in "dg_change" or starting with "rosetta_".
feature_columns = [
    c
    for c in train_test_splits[0][0].columns
    if c.endswith(("_wt", "_mut", "_change", "_pca"))
    and not c.endswith("dg_change")
    and not c.startswith("rosetta_")
]

# feature_columns = [f for f in feature_columns if not f.startswith("rosetta_")]
feature_columns

In [None]:
other_columns = [c for c in list(input_df) if c not in feature_columns]

other_columns

In [None]:
def cross_validate(train_test_splits, param):
    """Train one LightGBM ranker per split and collect out-of-fold predictions.

    Args:
        train_test_splits: Iterable of ``(train_df, test_df)`` pairs. The frames are
            not modified.
        param: LightGBM parameter dict passed to ``lgb.train``.

    Returns:
        The test frames of all splits concatenated (with a fresh index), each
        extended with a ``ddg_pred`` column holding the model's predictions.

    Raises:
        AssertionError: If a split shares a ``cluster_id`` between train and test.
    """
    result_dfs = []
    for train_df, test_df in train_test_splits:
        assert not set(train_df["cluster_id"]) & set(test_df["cluster_id"])

        train_ds = lgb.Dataset(
            train_df[feature_columns],
            label=train_df["label"],
            group=get_group(train_df),
        )

        valid_ds = lgb.Dataset(
            test_df[feature_columns],
            label=test_df["label"],
            group=get_group(test_df),
            reference=train_ds,
        )

        # NOTE(review): `verbose_eval` may not be accepted by newer LightGBM releases;
        # confirm against the installed version.
        bst = lgb.train(
            param, train_ds, num_boost_round=100, valid_sets=[valid_ds], verbose_eval=10000
        )

        # Predictions go on a copy: the same split frames are reused across calls
        # (e.g. once per Optuna trial) and must not accumulate columns from earlier runs.
        fold_result_df = test_df.copy()
        fold_result_df["ddg_pred"] = bst.predict(
            test_df[feature_columns], num_iteration=bst.best_iteration
        )
        result_dfs.append(fold_result_df)
    result_df = pd.concat(result_dfs, ignore_index=True)
    return result_df

In [None]:
import optuna

In [None]:
def objective(trial):
    """Optuna objective: cross-validate one hyper-parameter sample and score it.

    The score is the Spearman correlation between ``effect`` and ``ddg_pred`` over
    the forward (``rev == False``) rows of the "protherm++" dataset.
    """
    param = {
        "objective": "lambdarank",
        "metric": "ndcg",
        "verbosity": -1,
        "eval_at": 1_000_000,
        # One gain per possible label; get_label produces values from 0 to 10000.
        "label_gain": [(np.log2(i + 1) + 1) for i in range(0, 10_001)],
        "force_col_wise": True,
        #
        # NOTE(review): cross_validate also passes num_boost_round=100 to lgb.train.
        "num_boost_round": 100,
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }
    result_df = cross_validate(train_test_splits, param)
    # NOTE(review): "protherm++" is not among the names in `datasets`; confirm that the
    # `dataset` column really contains it, otherwise this selection is empty.
    df = result_df[(result_df["dataset"] == "protherm++") & (result_df["rev"] == False)].dropna(
        subset=["ddg_pred", "effect"]
    )
    corr = stats.spearmanr(df["effect"], df["ddg_pred"])[0]
    return corr


# Maximise the correlation over 100 trials.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

In [None]:
param = {
    #
    "objective": "lambdarank",
    #     "objective": "rank_xendcg",
    "metric": "ndcg",
    "eval_at": 1_000_000,
    "label_gain": [(np.log2(i + 1) + 1) for i in range(0, 10_001)],
    "force_col_wise": True,
    #
    "max_bin": 255,
    #     "num_trees": 100,  # aka num_boost_round
    "learning_rate": 0.1,
}

result_df = cross_validate(train_test_splits, param)

In [None]:
# NOTE(review): this negates the column in place, so running the cell a second time
# restores the original sign. Run it exactly once per `result_df`.
result_df["provean_score"] = -result_df["provean_score"]

In [None]:
# NOTE(review): no top-level `bst` is assigned before this cell (the booster created in
# cross_validate is local to that function), so on a fresh kernel this presumably raises
# NameError. Confirm, then remove or adapt.
bst.best_score  # 0.9979310001302695

In [None]:
len(input_df)

In [None]:
result_df.head()

In [None]:
if COI == "core":
    eval_columns = [
        "ddg_pred",
        "provean_score",
        "protbert_core_score_change",
        "proteinsolver_core_score_change",
        "foldx_score",
        "elaspic_score",
        "rosetta_dg_change",
    ]
else:
    eval_columns = [
        "ddg_pred",
        "protbert_core_score_change",
        "proteinsolver_core_score_change",
        "provean_score",
        "foldx_score",
        "elaspic_score",
        "rosetta_opt_apart_dg_change",
        "rosetta_apart_dg_change",
        "rosetta_complex_dg_change",
        "rosetta_opt_bind_dg_change",
        "rosetta_bind_dg_change",
    ]

In [None]:
def get_spearman_corrs_global(df, feature_columns, target_column):
    """Spearman correlation of each feature column with the target over all rows.

    Rows with a missing value in either the feature or the target are dropped
    separately for every feature.

    Returns:
        Dict mapping each column to ``(correlation, p-value, number of rows used)``.
    """

    def _correlate(column):
        paired = df.dropna(subset=[column, target_column])
        result = stats.spearmanr(paired[column], paired[target_column])
        return (result[0], result[1], len(paired))

    return {column: _correlate(column) for column in feature_columns}

In [None]:
def get_spearman_corrs_perseq(df, feature_columns, target_column, min_gp_size=6):
    """Spearman correlation of each feature with the target, computed per ``unique_id``.

    Rows with a missing value in any feature or in the target are dropped first.
    Groups with fewer than ``min_gp_size`` rows, or with fewer than two distinct
    target values, are skipped.

    Returns:
        Dict mapping each feature column to the list of per-group correlations.
    """
    complete = df.dropna(subset=feature_columns + [target_column])
    usable_groups = [
        gp
        for _, gp in complete.groupby("unique_id")
        if len(gp) >= min_gp_size and len(set(gp[target_column])) >= 2
    ]
    return {
        column: [stats.spearmanr(gp[column], gp[target_column])[0] for gp in usable_groups]
        for column in feature_columns
    }

In [None]:
def print_spearman_corrs(corrs):
    """Print one line per column: name, correlation, p-value and (row count).

    ``corrs`` maps a column name to a ``(correlation, p-value, count)`` triple, as
    returned by ``get_spearman_corrs_global``.
    """
    for column in corrs:
        entry = corrs[column]
        rho, pvalue, n_rows = entry[0], entry[1], entry[2]
        print(f"{column:30s} {rho:+.4} {pvalue:.4} ({n_rows})")

In [None]:
print_spearman_corrs(get_spearman_corrs_global(result_df, eval_columns, "effect"))

In [None]:
print_spearman_corrs(
    get_spearman_corrs_global(
        result_df[
            #
            (result_df["dataset"] == "protherm++")
            & (result_df["rev"] == False)
        ],
        eval_columns,
        "effect",
    )
)

In [None]:
print_spearman_corrs(
    get_spearman_corrs_global(
        result_df[
            #
            (result_df["dataset"] == "taipale_gpca")
            & (result_df["rev"] == False)
        ],
        eval_columns,
        "effect",
    )
)

In [None]:
result_df["dataset"].unique()

In [None]:
print_spearman_corrs(
    get_spearman_corrs_global(result_df[result_df["effect_type"] == "Deleteriousness score"], eval_columns, "effect")
)

In [None]:
result_df["dataset"].unique()

In [None]:
print_spearman_corrs(
    get_spearman_corrs_global(
        result_df[
            #
            (result_df["dataset"] == "taipale_ppi")
            & (result_df["rev"] == False)
        ],
        eval_columns,
        "effect",
    )
)

In [None]:
# Dataset names of interest (kept as a note; as bare identifiers they are undefined
# and raised NameError): skempiskempi, ab_bind, taipale_ppi

In [None]:
print_spearman_corrs(
    get_spearman_corrs_global(result_df[result_df["effect_type"] == "ΔΔG"], eval_columns, "effect")
)

In [None]:
result_df[result_df["effect_type"] == "ΔΔG"]["dataset"].unique()

In [None]:
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats

set_matplotlib_formats("png")

In [None]:
FIGURE_OUTPUT_DIR = Path(f"05_model_validation_{COI}").resolve()
FIGURE_OUTPUT_DIR.mkdir(exist_ok=True)

FIGURE_OUTPUT_DIR

In [None]:
# plt.get_cmap works across matplotlib versions; matplotlib.cm.get_cmap has been
# removed from recent releases.
cmap = plt.get_cmap("tab20")

In [None]:
result_df["dataset"].unique()

In [None]:
COI

In [None]:
dataset = "humsavar"
rev = [False, True]

effect_type = {"skempi++": "ΔΔG", "cagi4_sumo_ligase": "Deleteriousness score"}.get(
    dataset, "Deleteriousness class"
)

suffix = f"-{dataset}"
if rev != [False, True]:
    assert rev == [False]
    suffix += "-norev"


df = result_df[
    (result_df["effect_type"] == effect_type)
    & (result_df["dataset"] == dataset)
    & (result_df["rev"].isin(rev))
]


corrs = get_spearman_corrs_global(df, eval_columns, "effect")
fg, ax = plt.subplots(figsize=(6, 6))
x = np.arange(len(corrs))
y = [c[0] for c in corrs.values()]
out = ax.bar(x, y, color=cmap(1), edgecolor="k")
_ = ax.set_xticks(x)
_ = ax.set_xticklabels(corrs.keys(), rotation="vertical")
ax.set_ylabel("Spearman's ρ")
ax.set_title("Global correlations")
fg.subplots_adjust(top=0.95, right=0.98, bottom=0.38)
fg.savefig(FIGURE_OUTPUT_DIR.joinpath(f"corrs-global{suffix}.svg"), dpi=300)
fg.savefig(FIGURE_OUTPUT_DIR.joinpath(f"corrs-global{suffix}.png"), dpi=300)
fg.savefig(FIGURE_OUTPUT_DIR.joinpath(f"corrs-global{suffix}.pdf"), dpi=300)


per_sequence_stats = get_spearman_corrs_perseq(df, eval_columns, "effect", min_gp_size=6)
fg, ax = plt.subplots(figsize=(6, 6))
out = ax.boxplot(
    per_sequence_stats.values(),
    patch_artist=True,
    boxprops={"facecolor": cmap(1)},
    medianprops={"color": cmap(0)},
)
bp = ax.set_xticklabels(per_sequence_stats.keys(), rotation="vertical")
ax.set_ylabel("Spearman's ρ")
ax.set_title("Per-protein correlations")
fg.subplots_adjust(top=0.95, right=0.98, bottom=0.38)
fg.savefig(FIGURE_OUTPUT_DIR.joinpath(f"corrs-perseq{suffix}.svg"), dpi=300)
fg.savefig(FIGURE_OUTPUT_DIR.joinpath(f"corrs-perseq{suffix}.png"), dpi=300)
fg.savefig(FIGURE_OUTPUT_DIR.joinpath(f"corrs-perseq{suffix}.pdf"), dpi=300)

In [None]:
(df["rev"] == True).sum()

In [None]:
from IPython.display import SVG

SVG(FIGURE_OUTPUT_DIR.joinpath(f"corrs-global{suffix}.svg"))

In [None]:
!ls {FIGURE_OUTPUT_DIR}

In [None]:
# Bar chart of global correlations and box plot of per-protein correlations for the
# forward (rev == False) ΔΔG rows of the "skempi++" dataset.
df = result_df[
    (result_df["effect_type"] == "ΔΔG")
    & (result_df["dataset"] == "skempi++")
    & (result_df["rev"].isin([False]))
]

corrs = get_spearman_corrs_global(df, eval_columns, "effect")
fg, ax = plt.subplots()
x = np.arange(len(corrs))
y = [c[0] for c in corrs.values()]
out = ax.bar(x, y, color=cmap(1), edgecolor="k")
_ = ax.set_xticks(x)
_ = ax.set_xticklabels(corrs.keys(), rotation="vertical")
ax.set_ylabel("Spearman's ρ")
ax.set_title("Global correlations")
# NOTE(review): only the SVG name carries the "-norev" suffix; the PNG and PDF of the
# same figure do not. Confirm which name is intended.
fg.savefig(FIGURE_OUTPUT_DIR.joinpath("corrs-global-skempi-norev.svg"), dpi=300)
fg.savefig(FIGURE_OUTPUT_DIR.joinpath("corrs-global-skempi.png"), dpi=300)
fg.savefig(FIGURE_OUTPUT_DIR.joinpath("corrs-global-skempi.pdf"), dpi=300)

# NOTE(review): this uses the full `result_df`, not the skempi subset `df` selected
# above, although the figure is saved under a "skempi" name. Confirm the intended input.
per_sequence_stats = get_spearman_corrs_perseq(result_df, eval_columns, "effect", min_gp_size=6)
fg, ax = plt.subplots()
out = ax.boxplot(
    per_sequence_stats.values(),
    patch_artist=True,
    boxprops={"facecolor": cmap(1)},
    medianprops={"color": cmap(0)},
)
bp = ax.set_xticklabels(per_sequence_stats.keys(), rotation="vertical")
ax.set_ylabel("Spearman's ρ")
ax.set_title("Per-protein correlations")
fg.savefig(FIGURE_OUTPUT_DIR.joinpath("corrs-perseq-skempi.svg"), dpi=300)
fg.savefig(FIGURE_OUTPUT_DIR.joinpath("corrs-perseq-skempi.png"), dpi=300)
fg.savefig(FIGURE_OUTPUT_DIR.joinpath("corrs-perseq-skempi.pdf"), dpi=300)

In [None]:
# `print_spearman_stats` is not defined in the visible notebook; compute the global
# correlations and print them with the two helpers that are defined.
print_spearman_corrs(
    get_spearman_corrs_global(
        result_df[
            (result_df["effect_type"] == "Deleteriousness class")
            & (result_df["rev"].isin([True, False]))
        ],
        eval_columns,
        "effect",
    )
)
# 0.488

In [None]:
result_df[
    (result_df["effect_type"] == "Deleteriousness class") & (result_df["rev"].isin([True, False]))
]["dataset"].unique()

In [None]:
# `print_spearman_stats` is not defined in the visible notebook; compute the global
# correlations and print them with the two helpers that are defined.
print_spearman_corrs(
    get_spearman_corrs_global(
        result_df[
            (result_df["effect_type"] == "Deleteriousness score")
            & (result_df["rev"].isin([True, False]))
        ],
        eval_columns,
        "effect",
    )
)
# 0.4128

In [None]:
# `print_spearman_stats` is not defined in the visible notebook; use the compute + print
# helper pair instead.
print_spearman_corrs(
    get_spearman_corrs_global(result_df, ["ddg_pred", "rosetta_dg_change"], "label")
)  # 0.4646

In [None]:
# `print_spearman_stats` is not defined in the visible notebook; use the compute + print
# helper pair instead.
print_spearman_corrs(
    get_spearman_corrs_global(
        result_df[result_df["effect_type"] == "Deleteriousness score"], eval_columns, "label"
    )
)  # 0.4077

In [None]:
# `print_spearman_stats` is not defined in the visible notebook; use the compute + print
# helper pair instead.
print_spearman_corrs(
    get_spearman_corrs_global(result_df[result_df["effect_type"] == "ΔΔG"], eval_columns, "effect")
)

In [None]:
def compute_per_sequence_stats(df, feature_columns, target_column, min_gp_size=6):
    """Per-``unique_id`` Spearman correlations of each feature with the target.

    This used to be a line-for-line copy of ``get_spearman_corrs_perseq`` (defined in an
    earlier cell); it now delegates to it so the two cannot drift apart.

    Returns:
        Dict mapping each feature column to the list of per-group correlations.
    """
    return get_spearman_corrs_perseq(
        df, feature_columns, target_column, min_gp_size=min_gp_size
    )

In [None]:
import matplotlib.pyplot as plt

In [None]:
per_sequence_stats = compute_per_sequence_stats(result_df, eval_columns, "effect", 6)

fg, ax = plt.subplots()

out = ax.boxplot(per_sequence_stats.values())
_ = ax.set_xticklabels(per_sequence_stats.keys(), rotation="vertical")
# ax.set_ylim(-1, 1)
# fg.tight_layout()

In [None]:
per_sequence_stats_ddg = compute_per_sequence_stats(
    result_df[result_df["effect_type"] == "Deleteriousness class"], eval_columns, "effect", 18
)

fg, ax = plt.subplots()

out = ax.boxplot(per_sequence_stats_ddg.values())
_ = ax.set_xticklabels(per_sequence_stats_ddg.keys(), rotation="vertical")
# ax.set_ylim(-1, 1)
# fg.tight_layout()

In [None]:
per_sequence_stats_ddg = compute_per_sequence_stats(
    result_df[result_df["effect_type"] == "Deleteriousness score"], eval_columns, "effect", 18
)

fg, ax = plt.subplots()

out = ax.boxplot(per_sequence_stats_ddg.values())
_ = ax.set_xticklabels(per_sequence_stats_ddg.keys(), rotation="vertical")
# ax.set_ylim(-1, 1)
# fg.tight_layout()

In [None]:
out.keys()

In [None]:
palette = ["r", "g", "b", "y"]
for x, val, c in zip(xs, vals, palette):
    plt.scatter(x, val, alpha=0.4, color=c)
plt.show()

In [None]:
# The `np.int` alias has been removed from NumPy; the builtin `int` is the same dtype.
train_df[(train_df["effect"] * 1_000).astype(int) > 300_000]

In [None]:
import matplotlib.pyplot as plt

_ = plt.hist(input_df["effect"], bins=100, range=(-5, 5))

In [None]:
param = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "ndcg_eval_at": 1000000000000,
    "max_bin": 255,
}


bst = lgb.train(param, train_ds, num_boost_round=100, valid_sets=[valid_ds])

In [None]:
ypred = bst.predict(test_df.drop(columns_to_drop, axis=1), num_iteration=bst.best_iteration)

In [None]:
ypred = bst.predict(test_df.drop(columns_to_drop, axis=1), num_iteration=bst.best_iteration)
test_df = test_df.copy()
test_df["ddg_pred"] = ypred

In [None]:
stats.spearmanr(test_df["effect"], test_df["ddg_pred"])

In [None]:
stats.spearmanr(test_df["effect"], test_df["foldx_score"])

In [None]:
stats.spearmanr(test_df["effect"], test_df["provean_score"])