## Summary

---

## Imports

In [None]:
import os
import shlex
import subprocess
import tempfile
from pathlib import Path
import optuna
import concurrent.futures
import itertools
import lightgbm
import json
import lightgbm as lgb
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import math
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.model_selection import PredefinedSplit
from tqdm.notebook import tqdm
import multiprocessing as mp

In [None]:
# Show up to 1000 columns when rendering wide DataFrames. The fully qualified
# option name is used because the bare "max_columns" pattern also matches
# "styler.render.max_columns" on newer pandas, which raises an OptionError.
pd.set_option("display.max_columns", 1000)

## Parameters

In [None]:
# Directory (resolved against the current working directory) that holds this
# notebook's artifacts; it is created on first use.
NOTEBOOK_DIR = Path("04_train_model").resolve()
NOTEBOOK_DIR.mkdir(exist_ok=True)

NOTEBOOK_DIR

In [None]:
# Selects which data family this run processes; the cells below handle exactly
# two values, "core" and "interface".
COI = "interface"

In [None]:
# Version tag embedded in the names of the files written in "Save results".
DATASET_VERSION = "v2"

In [None]:
# Root directory for pipeline outputs: "<DATAPKG_OUTPUT_DIR>/elaspic-v2" when the
# environment variable is set, otherwise the parent of the notebook directory.
OUTPUT_DIR = (
    Path(os.getenv("DATAPKG_OUTPUT_DIR")).joinpath("elaspic-v2").resolve()
    if "DATAPKG_OUTPUT_DIR" in os.environ
    else NOTEBOOK_DIR.parent
)
OUTPUT_DIR.mkdir(exist_ok=True)

OUTPUT_DIR

In [None]:
# When running under SLURM, direct temporary files to the job-local scratch
# directory.
if (slurm_tmpdir := os.getenv("SLURM_TMPDIR")) is not None:
    os.environ["TMPDIR"] = slurm_tmpdir
    # tempfile caches the directory it picked on first use; clear the cache so
    # the new TMPDIR is honoured even if gettempdir() was already called.
    tempfile.tempdir = None

print(tempfile.gettempdir())

In [None]:
# Names of the datasets to load for the selected COI. Each name is used below
# to locate "<name>.parquet" inside the "01_load_data" output folder.
if COI != "core":
    assert COI == "interface"
    datasets = [
        "elaspic-training-set-interface",
        "skempi-v2-interface",
        # "intact-mutations-interface",  (currently disabled)
        "dunham-2020-interface",
        "starr-2020-interface",
    ]
else:
    datasets = [
        "elaspic-training-set-core",
        "protherm-dagger-core",
        "rocklin-2017-core",
        "dunham-2020-core",
        "starr-2020-core",
        "cagi5-frataxin-core",
        "huang-2020-core",
    ]

In [None]:
# Names of the upstream steps whose per-task parquet outputs are collected as
# features; each name is also the sub-folder of OUTPUT_DIR that is searched.
feature_generators = [
    "02_run_rosetta_ddg",
    "02_run_proteinsolver",
    "02_run_protbert",
]

### Load data

In [None]:
def expand_mutations(df):
    """Flatten per-protein rows into one row per mutation.

    Every input row carries parallel sequences in ``mutation`` and ``effect``
    (and optionally in ``provean_score``, ``foldx_score`` and
    ``elaspic_score``). Element ``i`` of those sequences becomes one output
    row, while ``unique_id``, ``dataset``, ``name`` and ``effect_type`` are
    copied unchanged from the parent row.

    Returns:
        A new DataFrame with one row per (input row, mutation index) pair.
    """
    optional_columns = ("provean_score", "foldx_score", "elaspic_score")

    records = []
    for parent in df.itertuples():
        for position, mutation in enumerate(parent.mutation):
            record = dict(
                unique_id=parent.unique_id,
                dataset=parent.dataset,
                name=parent.name,
                mutation=mutation,
                effect=parent.effect[position],
                effect_type=parent.effect_type,
            )
            # Optional score columns are only carried over when the input has them.
            for score_column in optional_columns:
                if hasattr(parent, score_column):
                    record[score_column] = getattr(parent, score_column)[position]
            records.append(record)
    return pd.DataFrame(records)

In [None]:
def add_mutation_complement(df):
    """Append the reverse (mutant -> wild type) counterpart of every mutation.

    The input is left untouched. The result holds the original rows flagged
    with ``rev=False`` followed by mirrored rows flagged with ``rev=True``,
    in which:

    * the first and last characters of ``mutation`` are swapped
      (``"M1A"`` becomes ``"A1M"``),
    * ``effect`` and the ``provean_score`` / ``foldx_score`` /
      ``elaspic_score`` columns, when present, are negated,
    * every ``*_wt`` column is exchanged with its ``*_mut`` partner.

    Returns:
        A DataFrame twice as long as ``df`` with a fresh integer index.
    """
    forward = df.copy()
    forward["rev"] = False

    reverse = forward.copy()
    reverse["rev"] = True

    mutation = reverse["mutation"]
    reverse["mutation"] = mutation.str[-1] + mutation.str[1:-1] + mutation.str[0]

    for signed_column in ("effect", "provean_score", "foldx_score", "elaspic_score"):
        if signed_column in reverse:
            reverse[signed_column] = -reverse[signed_column]

    for wt_column in [c for c in reverse if c.endswith("_wt")]:
        mut_column = wt_column[:-3] + "_mut"
        # Read both sides before writing so that the swap is lossless.
        mut_values = reverse[mut_column].copy()
        wt_values = reverse[wt_column].copy()
        reverse[wt_column] = mut_values
        reverse[mut_column] = wt_values

    return pd.concat([forward, reverse], ignore_index=True)

In [None]:
# Small worked example: two mutations with one wt/mut feature pair, shown before
# and after add_mutation_complement() to eyeball the mirroring.
tmp_df = pd.DataFrame(
    [[0, "M1A", 1.234, "wt score", "mut score"], [1, "M2C", -0.05, "wt score 2", "mut score 2"]],
    columns=["unique_id", "mutation", "effect", "feature_wt", "feature_mut"],
)

tmp2_df = add_mutation_complement(tmp_df)

display(tmp_df)
display(tmp2_df)

In [None]:
def merge_feature_dfs(feature_dfs):
    """Outer-join a list of feature tables into one wide table.

    Each table is first stripped of its label/score columns (``effect``,
    ``effect_type``, ``provean_score``, ``foldx_score``, ``elaspic_score``)
    and checked to hold at most one row per ``(unique_id, mutation)``. The
    tables are then merged left to right on
    ``["unique_id", "mutation", "rev"]``.

    Returns:
        The merged DataFrame, or ``None`` when ``feature_dfs`` is empty.
    """
    label_columns = ["effect", "effect_type", "provean_score", "foldx_score", "elaspic_score"]

    def _without_labels(frame):
        assert len(frame) == len(frame[["unique_id", "mutation"]].drop_duplicates())
        present = [column for column in label_columns if column in frame]
        return frame.drop(columns=present)

    if not feature_dfs:
        return None

    merged = _without_labels(feature_dfs[0])
    for frame in feature_dfs[1:]:
        merged = merged.merge(
            _without_labels(frame), how="outer", on=["unique_id", "mutation", "rev"]
        )
    return merged

In [None]:
# For every dataset: load the mutation table, expand it to one row per mutation
# (forward and reverse), then collect the outputs of each feature generator.
# Results are stored per dataset in `input_data`.
input_data = {}
for dataset_name in datasets:
    input_file = OUTPUT_DIR.joinpath("01_load_data", f"{dataset_name}.parquet")
    pfile = pq.ParquetFile(input_file)
    # The number of row groups is used as the number of feature-generation tasks.
    task_count = pfile.num_row_groups
    df = pfile.read().to_pandas(integer_object_nulls=True)
    expanded_df = (
        add_mutation_complement(expand_mutations(df))
#         expand_mutations(df)
        .drop_duplicates(subset=["unique_id", "mutation"])
        .sort_values(["unique_id", "mutation"])
    )
#     expanded_df["rev"] = False
    sequence_df = df[["unique_id", "protein_sequence", "ligand_sequence"]].drop_duplicates()

    # Every feature row must correspond to one of these (unique_id, mutation, rev) keys.
    keys = set(tuple(x) for x in expanded_df[["unique_id", "mutation", "rev"]].values)
    
    features = {}
    for feature_generator in feature_generators:
        # NOTE(review): `output_dir` is not used below; paths are rebuilt from OUTPUT_DIR.
        output_dir = OUTPUT_DIR.joinpath(feature_generator)
        feature_dfs = []
        for task_id in range(1, task_count + 1):
            output_file_template = "{dataset_name}-{task_prefix}{task_id}{task_suffix}-{task_count}.parquet"

            # Rosetta results come as two files per task, one per direction; the
            # other generators have a single file that is mirrored in code below.
            if feature_generator in ["02_run_rosetta_ddg"]:
                task_prefix_rev_list = [("wt2mut-", False), ("mut2wt-", True)]
            else:
                task_prefix_rev_list = [("", None)]

            for (task_prefix, rev) in task_prefix_rev_list:
                output_file_kwargs = dict(
                    dataset_name=dataset_name,
                    task_prefix=task_prefix,
                    task_id=task_id,
                    task_count=task_count,
                )
                output_file = OUTPUT_DIR.joinpath(
                    feature_generator,
                    output_file_template.format(task_suffix="", **output_file_kwargs)
                ).resolve()
                if output_file.is_file():
                    feature_df = pq.read_table(output_file).to_pandas(integer_object_nulls=True)
                else:
                    # No single file for this task: look for up to 20 sub-task files
                    # whose task id carries a letter suffix ("a", "b", ...).
                    subtask_feature_dfs = []
                    subtask_missing_files = []
                    for subtask_idx in range(20):
                        subtask_output_file = OUTPUT_DIR.joinpath(
                            feature_generator,
                            output_file_template.format(task_suffix=string.ascii_lowercase[subtask_idx], **output_file_kwargs)
                        ).resolve()
                        if subtask_output_file.is_file():
                            feature_df = pq.read_table(subtask_output_file).to_pandas(integer_object_nulls=True)
                            subtask_feature_dfs.append(feature_df)
                        else:
                            subtask_missing_files.append(subtask_output_file)
                    if subtask_feature_dfs:
                        feature_df = pd.concat(subtask_feature_dfs, ignore_index=True)
                        # Missing sub-task files are only reported when at least one was found.
                        if subtask_missing_files:
                            for subtask_missing_file in subtask_missing_files:
                                print(f"File {subtask_missing_file} is missing. Skipping...")
                    else:
                        print(f"File {output_file} is missing. Skipping...")
                        continue

                if feature_df.empty:
                    print(f"File {output_file} contains no data. Skipping...")
                    continue
                
                # Direction-specific files get an explicit `rev` flag; direction-agnostic
                # files are mirrored with add_mutation_complement().
                if rev in [True, False]:
                    feature_df["rev"] = rev
                else:
                    feature_df = add_mutation_complement(feature_df)
                # NOTE(review): presumably the mut2wt files store negated unique_ids,
                # which are flipped back here -- confirm against the generator.
                if rev is True:
                    feature_df["unique_id"] = -feature_df["unique_id"].values
                assert not set(tuple(x) for x in feature_df[["unique_id", "mutation", "rev"]].values) - keys, (dataset_name, feature_generator, task_id)    
                feature_dfs.append(feature_df)

        if not feature_dfs:
            print(
                f"No data collected for dataset {dataset_name} and feature generator {feature_generator}."
            )
            continue

        final_feature_df = pd.concat(feature_dfs, ignore_index=True)
        features[feature_generator] = final_feature_df

    # "feature_df" is None when no generator produced any data for this dataset.
    input_data[dataset_name] = {
        "expanded_df": expanded_df,
        "sequence_df": sequence_df,
        "feature_df": merge_feature_dfs(list(features.values())),
    }

### Merge together

In [None]:
# Stack the per-dataset tables, skipping datasets for which no features were found.
expanded_df = pd.concat(
    [d["expanded_df"] for d in input_data.values() if d["feature_df"] is not None]
)

sequence_df = pd.concat(
    [d["sequence_df"] for d in input_data.values() if d["feature_df"] is not None]
)

features_df = pd.concat(
    [d["feature_df"] for d in input_data.values() if d["feature_df"] is not None]
).sort_values(["unique_id", "mutation"])
# Sanity check: no negative unique_ids may remain in the merged features.
assert features_df["unique_id"].min() >= 0
len(features_df)

In [None]:
# Join labels with features ("wn" = with nulls: rows lacking features are kept
# by the outer join). The assert verifies that every row has a `dataset` value,
# i.e. no feature row is missing from expanded_df.
input_wn_df = expanded_df.merge(features_df, on=["unique_id", "mutation", "rev"], validate="1:1", how="outer")
# assert len(input_wn_df) == len(features_df), (len(expanded_df), len(features_df), len(input_wn_df))
assert input_wn_df["dataset"].notnull().all()
print(
    f"Lost {len(expanded_df) - len(features_df):,} out of {len(expanded_df):,} rows due to missing features."
)

# Correct the sign on some features
# (negates `effect` for the listed dataset / effect_type pairs; "-ΔΔG" rows are
# additionally relabelled as "ΔΔG").
for dataset, effect_type in [
    ("protherm-dagger-core", "-ΔΔG"),
    ("rocklin-2017-core", "Stability score change"),
    ("dunham_2020_tianyu", "Deep mutation scan"),
    ("starr_2020_tianyu", "Deep mutation scan"),
]:
    mask = (input_wn_df["dataset"] == dataset) & (input_wn_df["effect_type"] == effect_type)
    if mask.any():
        print(f"Reversing sign for {dataset} ({effect_type})...")
        input_wn_df.loc[mask, "effect"] = -input_wn_df.loc[mask, "effect"]
        if effect_type == "-ΔΔG":
            input_wn_df.loc[mask, "effect_type"] = "ΔΔG"

len(input_wn_df)

In [None]:
columns = [c for c in input_wn_df if c.startswith("protbert_")]
input_wn_df[columns].isnull().sum()  # 194

In [None]:
columns = [c for c in input_wn_df if c.startswith("proteinsolver_")]
input_wn_df[columns].isnull().sum()  # 308

In [None]:
columns = [c for c in input_wn_df if c.startswith("rosetta_")]
input_wn_df[columns].isnull().sum().head()  # 79,025

### Remove rows with missing values

In [None]:
# Keep only rows that have every ProtBert and ProteinSolver feature; other
# feature columns (e.g. Rosetta) may still contain nulls afterwards.
input_df = input_wn_df.dropna(
    subset=[c for c in input_wn_df if c.startswith(("protbert_", "proteinsolver_"))]
)
print(
    f"Lost {len(input_wn_df) - len(input_df):,} out of {len(input_wn_df):,} rows due to missing features."
)

# Rows without a measured effect cannot be used as training labels.
_before = len(input_df)
input_df = input_df[input_df["effect"].notnull()]
print(
    f"Lost {_before - len(input_df):,} out of {_before:,} rows due to missing effect values."
)

# Detach from input_wn_df so that later column assignments work on an independent frame.
input_df = input_df.copy()

len(input_df)  # Core: 642160

In [None]:
assert not input_df["effect"].isnull().any()

### Feature engineering

In [None]:
def assign_delta(input_df, column, column_ref, column_change):
    """Store ``column - column_ref`` in ``input_df[column_change]`` (in place).

    Args:
        input_df: DataFrame that is modified in place.
        column: Name of the column holding the "new" values.
        column_ref: Name of the column holding the reference values.
        column_change: Name of the column that receives the difference.

    Returns:
        ``True`` when the first value of ``column`` is array-like (a list or
        ``numpy.ndarray``), ``False`` otherwise, including for an empty frame.
    """
    values = input_df[column]
    ref_values = input_df[column_ref]

    # An empty frame has no sample value to inspect; treat it as scalar data.
    sample = values.iloc[0] if len(input_df) else None
    is_vector = isinstance(sample, (list, np.ndarray))

    if isinstance(sample, list):
        # Plain Python lists do not support "-"; subtract them as arrays.
        values = values.map(np.asarray)
        ref_values = ref_values.map(np.asarray)

    input_df[column_change] = values - ref_values
    return is_vector


# Names of array-valued columns that are later compressed with PCA.
pca_columns = []
# Pass 1: for every "<prefix>_mut" column add "<prefix>_change" = mut - wt.
# Array-valued pairs contribute both the wt column and the change column.
for column in sorted(input_df):
    if column.endswith("_mut") and "_core2interface_" not in column:
        print(column, "(wt → mut)")
        column_ref = column[:-4] + "_wt"
        column_change = column[:-4] + "_change"
        if assign_delta(input_df, column, column_ref, column_change):
            pca_columns.extend([column_ref, column_change])

# Pass 2: for every non-"_mut" "*_interface_*" column (this includes the
# "_change" columns created in pass 1) add "*_core2interface_*" = interface - core.
for column in sorted(input_df):
    if "_interface_" in column and not column.endswith("_mut"):
        print(column, "(core → interface)")
        column_ref = column.replace("_interface_", "_core_")
        column_change = column.replace("_interface_", "_core2interface_")
        if assign_delta(input_df, column, column_ref, column_change):
            pca_columns.extend([column_change])

pca_columns

### Remove invalid datasets

In [None]:
input_df["dataset"].value_counts()
# CORE
# cosmic                  469802
# ...

In [None]:
# Values of the `dataset` column that are excluded from this run; the set
# depends on the COI.
# NOTE(review): "skempiskempi" looks like it may be a typo -- confirm against
# the values shown by value_counts() above.
if COI == "core":
    datasets_to_drop = {
        "cagi4_sumo_ligase",
        "benedix_et_al",
        "hiv_escape_mutations",
        "ab_bind",
        "skempiskempi",
        "taipale_ppi",
        # "cosmic",
    }
else:
    datasets_to_drop = {
        "cagi4_sumo_ligase",
        "benedix_et_al",
        "hiv_escape_mutations",
        "taipale",
    }

input_df = input_df[~input_df["dataset"].isin(datasets_to_drop)]

input_df["dataset"].value_counts()

In [None]:
# Diagnostic: report every (dataset, effect_type) group in which `effect`
# correlates positively with the ProtBert score change, either over all rows
# or over forward (rev == False) rows only. For each such group two lines are
# printed per reference column: all rows first, then forward rows only.
for (dataset, effect_type), gp in input_df.groupby(["dataset", "effect_type"]):
    gp = gp.copy()
    gp_sub = gp.dropna(subset=["effect", "protbert_core_score_change"])
    corr1 = stats.spearmanr(gp_sub["effect"], gp_sub["protbert_core_score_change"])
    gp_sub = gp_sub[gp_sub["rev"] == False]
    corr2 = stats.spearmanr(gp_sub["effect"], gp_sub["protbert_core_score_change"])
    if corr1[0] > 0 or corr2[0] > 0:
        print(dataset, effect_type)
        for column in [
            "provean_score",
            "foldx_score",
            "elaspic_score",
            "protbert_core_score_change",
            "proteinsolver_core_score_change",
        ]:
            gp_sub = gp.dropna(subset=["effect", column])
            corr = stats.spearmanr(gp_sub["effect"], gp_sub[column])
            print(f"{column:30s} {corr[0]:+.4} {corr[1]:.4}")
            gp_sub = gp_sub[gp_sub["rev"] == False]
            corr = stats.spearmanr(gp_sub["effect"], gp_sub[column])
            print(f"{column:30s} {corr[0]:+.4} {corr[1]:.4}")
        print()

In [None]:
# Hard check of the sign convention: within every (dataset, effect_type) group
# the Spearman correlation between `effect` and the ProtBert score change must
# not be positive, and the group must not be empty after dropping nulls.
for (dataset, effect_type), gp in input_df.groupby(["dataset", "effect_type"]):
    gp = gp.dropna(subset=["effect", "protbert_core_score_change"])
    assert len(gp)
    corr = stats.spearmanr(gp["effect"], gp["protbert_core_score_change"])
    assert corr[0] <= 0, (dataset, effect_type)

### Remove duplicates

In [None]:
# (protein_sequence, ligand_sequence) pairs that occur in the humsavar dataset;
# used below to drop overlapping entries from clinvar and cosmic.
humsavar_unique_ids = set(input_df[input_df["dataset"] == "humsavar"]["unique_id"].unique())
humsavar_sequences = set(tuple(s) for s in sequence_df[sequence_df["unique_id"].isin(humsavar_unique_ids)][["protein_sequence", "ligand_sequence"]].values)
len(input_df)  # 638184

In [None]:
# Keep only clinvar entries whose sequence pair does not already occur in humsavar.
clinvar_unique_ids = set(input_df[input_df["dataset"] == "clinvar"]["unique_id"].unique())
_before = len(clinvar_unique_ids)
clinvar_unique_ids = {
    uid for uid, pseq, lseq
    in sequence_df[sequence_df["unique_id"].isin(clinvar_unique_ids)][["unique_id", "protein_sequence", "ligand_sequence"]].values
    if (pseq, lseq) not in humsavar_sequences
}
print(f"Removed {_before - len(clinvar_unique_ids)} clinvar unique ids.")

# Rows from other datasets are untouched; clinvar rows survive only if their id was kept.
input_df = input_df[(input_df["dataset"] != "clinvar") | (input_df["unique_id"].isin(clinvar_unique_ids))]
len(input_df)  # 617500

In [None]:
# Sequence pairs of the clinvar entries that survived the previous step.
clinvar_sequences = set(tuple(s) for s in sequence_df[sequence_df["unique_id"].isin(clinvar_unique_ids)][["protein_sequence", "ligand_sequence"]].values)

# Keep only cosmic entries whose sequence pair occurs in neither humsavar nor clinvar.
cosmic_unique_ids = set(input_df[input_df["dataset"] == "cosmic"]["unique_id"].unique())
_before = len(cosmic_unique_ids)
cosmic_unique_ids = {
    uid for uid, pseq, lseq
    in sequence_df[sequence_df["unique_id"].isin(cosmic_unique_ids)][["unique_id", "protein_sequence", "ligand_sequence"]].values
    if (pseq, lseq) not in humsavar_sequences and (pseq, lseq) not in clinvar_sequences
}
print(f"Removed {_before - len(cosmic_unique_ids)} cosmic unique ids.")

input_df = input_df[(input_df["dataset"] != "cosmic") | (input_df["unique_id"].isin(cosmic_unique_ids))]
len(input_df)  # 516344

In [None]:
input_df["dataset"].value_counts()

### Cluster by sequence identity

In [None]:
def obtain_clusters(input_sequences, min_seq_id=0.3):
    """Cluster protein sequences with ``mmseqs easy-cluster``.

    Args:
        input_sequences: DataFrame with ``unique_id`` and ``protein_sequence``
            columns; one FASTA record is written per row.
        min_seq_id: Value passed to the ``--min-seq-id`` option.

    Returns:
        DataFrame with ``cluster_id`` and ``unique_id`` columns read from the
        ``result_cluster.tsv`` file written by mmseqs. Each ``unique_id`` is
        asserted to appear exactly once.

    Raises:
        subprocess.CalledProcessError: If mmseqs exits with a non-zero status;
            its captured stderr is printed before the error propagates.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        input_dir = Path(tmp_dir, "input")
        input_dir.mkdir()

        output_dir = Path(tmp_dir, "output")
        output_dir.mkdir()

        scratch_dir = Path(tmp_dir, "scratch")
        scratch_dir.mkdir()

        with input_dir.joinpath("input.fasta").open("wt") as fout:
            for tup in input_sequences.itertuples():
                fout.write(f">{tup.unique_id}\n{tup.protein_sequence}\n")

        # Human-readable form of the command, kept for the notebook log only.
        system_command = f"mmseqs easy-cluster --min-seq-id {min_seq_id} '{input_dir}/input.fasta' '{output_dir}/result' '{scratch_dir}'"
        print(system_command)

        # Pass the arguments as a list so that paths containing quotes or
        # spaces cannot be mis-split.
        command = [
            "mmseqs",
            "easy-cluster",
            "--min-seq-id",
            str(min_seq_id),
            f"{input_dir}/input.fasta",
            f"{output_dir}/result",
            f"{scratch_dir}",
        ]
        try:
            subprocess.run(command, capture_output=True, check=True)
        except subprocess.CalledProcessError as error:
            # The output is captured, so surface stderr to make failures diagnosable.
            print(error.stderr.decode(errors="replace"))
            raise

        cluster_df = pd.read_csv(
            output_dir.joinpath("result_cluster.tsv"), sep="\t", names=["cluster_id", "unique_id"]
        )
        assert len(cluster_df) == len(cluster_df["unique_id"].unique())

    return cluster_df

In [None]:
input_sequences = sequence_df.merge(input_df[["unique_id"]].drop_duplicates())

len(input_sequences)  # CORE: 13779

In [None]:
cluster_df = obtain_clusters(input_sequences)

In [None]:
cluster_df.head()

In [None]:
# Drop any cluster assignment left over from an earlier run of this cell.
if "cluster_id" in input_df:
    del input_df["cluster_id"]

# Attach the cluster of each unique_id; the assert fails if any row ends up
# without a cluster.
input_df = input_df.merge(cluster_df, on="unique_id", how="outer", validate="m:1")
assert input_df["cluster_id"].notnull().all()

### Extract out independent test data

In [None]:
# Values of the `dataset` column held out as an independent test set.
if COI == "core":
    test_datasets = {
        "starr_2020_tianyu",
        "huang_2020",
        "cagi5_frataxin",
    }
else:
    test_datasets = {
        "starr_2020_tianyu",
    }
input_test_df = input_df[input_df["dataset"].isin(test_datasets)].copy()

print(input_test_df["dataset"].unique())
print(len(input_test_df))

In [None]:
# test_cluster_ids = set(input_test_df["cluster_id"])  # TODO: 

# Everything not held out for testing is used for training / cross-validation.
# NOTE(review): clusters shared with the test set are not excluded here (see
# the TODO above this statement).
input_train_df = input_df[~input_df["dataset"].isin(test_datasets)].copy()

print(input_train_df["dataset"].unique())
print(len(input_train_df))

### Train / validation split

In [None]:
import heapq
from dataclasses import dataclass, field
from typing import Any


def _update_mapping(df, mapping, num_folds):
    """Greedily assign the clusters of ``df`` to ``num_folds`` folds.

    Clusters are visited in ``groupby("cluster_id")`` order. A cluster already
    present in ``mapping`` stays in its fold (its rows still count towards that
    fold's size); any other cluster goes to the fold that currently holds the
    fewest rows. ``mapping`` is updated in place and returned.
    """

    @dataclass(order=True)
    class _Fold:
        # Only `priority` (number of rows assigned so far) takes part in ordering.
        priority: int
        idx: int = field(compare=False)
        data: Any = field(compare=False)

    heap = [_Fold(0, fold_idx, []) for fold_idx in range(num_folds)]
    for cluster_id, members in df.groupby("cluster_id"):
        pinned = cluster_id in mapping
        if pinned:
            target_idx = mapping[cluster_id]
            fold = next(candidate for candidate in heap if candidate.idx == target_idx)
        else:
            fold = heapq.heappop(heap)

        fold.priority += len(members)
        fold.data.append(cluster_id)

        if pinned:
            # The fold was modified in place, so the heap order must be rebuilt.
            heapq.heapify(heap)
        else:
            heapq.heappush(heap, fold)

    for fold in heap:
        for cluster_id in fold.data:
            if cluster_id not in mapping:
                mapping[cluster_id] = fold.idx
            else:
                assert mapping[cluster_id] == fold.idx

    return mapping


def map_to_test_fold(input_df, effect_types, num_folds):
    """Map every ``cluster_id`` in ``input_df`` to a test-fold index.

    Rows are processed one effect type at a time, in the order given by
    ``effect_types``, and each effect type starts with empty fold sizes, so
    rows are balanced across folds within each effect type. A cluster keeps
    the fold it received the first time it was seen.

    Raises:
        AssertionError: If the rows selected by ``effect_types`` do not add up
            to ``len(input_df)``.
    """
    per_type_frames = []
    for effect_type in effect_types:
        per_type_frames.append(input_df[input_df["effect_type"] == effect_type])
    assert sum(len(frame) for frame in per_type_frames) == len(input_df)

    cluster_to_fold = {}
    for frame in per_type_frames:
        cluster_to_fold = _update_mapping(frame, cluster_to_fold, num_folds)
    return cluster_to_fold

In [None]:
input_train_df["effect_type"].unique()

In [None]:
# Number of cross-validation folds; currently the same for both COI values.
if COI == "core":
    num_folds = 6
else:
    num_folds = 6

In [None]:
# Assign every sequence cluster to one of `num_folds` folds. The list must
# cover every effect_type present in input_train_df, otherwise map_to_test_fold
# raises an AssertionError.
cluster_id_to_test_fold_mapping = map_to_test_fold(
    input_train_df,
    ["ΔΔG", "ΔΔG (from Kon/Koff)", "ΔΔG (from affinity)",
     "Stability score change",
     "Deep mutation scan",
     "Deleteriousness score", "Deleteriousness class"], num_folds=num_folds)
input_train_df["test_fold"] = input_train_df["cluster_id"].map(cluster_id_to_test_fold_mapping)
# Every row must have a fold, and every fold must be used at least once.
assert input_train_df["test_fold"].notnull().all()
assert len(input_train_df["test_fold"].unique()) == num_folds

In [None]:
input_train_df["test_fold"].value_counts()

### Train PCA models

In [None]:
# Compress every array-valued column into `n_components` principal components.
# A PCA model is fitted once per column and cached on disk; later runs reuse
# the cached model instead of refitting.
n_components = 10
for column in pca_columns:
    print(column)
    # Stack the per-row arrays into a single 2-D matrix (one row per sample).
    values = np.vstack(input_train_df[column].values)

    pickle_file = NOTEBOOK_DIR.joinpath(f"pca-{column}-{COI}.pickle")
    # NOTE(review): torch.save / torch.load are used here as a generic pickler
    # for the scikit-learn object; recent torch releases may refuse to load
    # arbitrary objects by default -- confirm with the installed version.
    if pickle_file.is_file():
        pca = torch.load(pickle_file)
    else:
        pca = PCA(n_components=n_components)
        pca.fit(values)
        torch.save(pca, pickle_file)

    values_out = pca.transform(values)
    # One scalar column per principal component: "<column>_<i>_pc".
    for i in range(n_components):
        new_column = f"{column}_{i}_pc"
        input_train_df[new_column] = values_out[:, i]

In [None]:
# Build one (train, validation) pair per fold from the precomputed fold ids.
train_test_splits = []
ps = PredefinedSplit(input_train_df["test_fold"])
# The progress-bar total is the number of folds (previously it was the
# unrelated PCA `n_components`).
for split_idx, (train, test) in enumerate(tqdm(ps.split(), total=ps.get_n_splits())):
    # Shuffle and then order by unique_id, so that rows of the same unique_id
    # are contiguous (get_group() requires this).
    train_df = input_train_df.iloc[train].sample(frac=1.0, replace=False).sort_values(["unique_id"]).copy()
    test_df = input_train_df.iloc[test].sample(frac=1.0, replace=False).sort_values(["unique_id"]).copy()
    # No sequence cluster may appear on both sides of a split.
    assert not set(train_df["cluster_id"]) & set(test_df["cluster_id"])

    # Drop array-valued columns (judged by the first training row).
    first_row = train_df.iloc[0]
    for column in list(train_df):
        value = first_row[column]
        if isinstance(value, (list, tuple, np.ndarray)):
            del train_df[column], test_df[column]

    train_test_splits.append((train_df, test_df))

### Save results

In [None]:
NOTEBOOK_DIR

In [None]:
# Persist the list of PCA-compressed column names as JSON.
# NOTE(review): the file carries a ".parquet" extension although its content is
# JSON text; it is left unchanged because other steps may read this exact path.
with NOTEBOOK_DIR.parent.joinpath("04_train_model", f"pca-columns-{COI}.{DATASET_VERSION}.parquet").open("wt") as fout:
    json.dump(pca_columns, fout)

In [None]:
# Save the sequence table (unique_id, protein_sequence, ligand_sequence).
output_file = NOTEBOOK_DIR.parent.joinpath("04_train_model", f"sequences-{COI}.{DATASET_VERSION}.parquet")

pq.write_table(
    pa.Table.from_pandas(sequence_df, preserve_index=False),
    output_file,
    row_group_size=1_000,
)

In [None]:
# Save the training table, including the fold assignment and PCA columns added above.
output_file = NOTEBOOK_DIR.parent.joinpath("04_train_model", f"input-train-{COI}.{DATASET_VERSION}.parquet")

pq.write_table(
    pa.Table.from_pandas(input_train_df, preserve_index=False),
    output_file,
    row_group_size=10_000,
)

In [None]:
# Save the held-out test table as it stands at this point of the notebook.
output_file = NOTEBOOK_DIR.parent.joinpath("04_train_model", f"input-test-{COI}.{DATASET_VERSION}.parquet")

pq.write_table(
    pa.Table.from_pandas(input_test_df, preserve_index=False),
    output_file,
    row_group_size=10_000,
)

In [None]:
# for idx, (train_df, test_df) in enumerate(train_test_splits):
#     print(idx)

#     output_file = NOTEBOOK_DIR.parent.joinpath("04_train_model", f"xval-train-{COI}-{idx}.{DATASET_VERSION}.parquet")
#     pq.write_table(
#         pa.Table.from_pandas(train_df, preserve_index=False),
#         output_file,
#         row_group_size=10_000,
#     )
    
#     output_file = NOTEBOOK_DIR.parent.joinpath("04_train_model", f"xval-test-{COI}-{idx}.{DATASET_VERSION}.parquet")
#     pq.write_table(
#         pa.Table.from_pandas(test_df, preserve_index=False),
#         output_file,
#         row_group_size=10_000,
#     )

### Optimize labels

In [None]:
# Model inputs: every wt / mut / change / principal-component column of the
# first training split, except Rosetta columns and columns ending in "dg_change".
feature_columns = [
    c
    for c in list(train_test_splits[0][0])
    if c.endswith(("_wt", "_mut", "_change", "_pc"))
    and not c.endswith("dg_change")
    and not c.startswith("rosetta_")
]

# feature_columns

In [None]:
other_columns = [c for c in list(train_test_splits[0][0]) if c not in feature_columns]

# other_columns

In [None]:
def get_label(df):
    """Convert ``effect`` values into relevance labels for ranking.

    Effects are first rescaled per ``effect_type`` (ΔΔG variants x0.8,
    "Deleteriousness class" x1, "Stability score change" x5,
    "Deep mutation scan" x4; "Deleteriousness score" is left as is but must
    already lie within [-5, 5]). The result is clipped to [-5, 5], mapped
    linearly onto [0, 1000] and rounded to the nearest integer value.

    Returns:
        A NumPy array with one label per row of ``df``.
    """
    effect_type = df["effect_type"]
    scaled = df["effect"].values.copy()

    def _rescale(selector, factor):
        scaled[selector] *= factor

    _rescale(effect_type.str.startswith("ΔΔG"), 0.8)
    _rescale(effect_type == "Deleteriousness class", 1)
    _rescale(effect_type == "Stability score change", 5)

    # Deleteriousness scores are used unscaled, so they must already be in range.
    is_score = effect_type == "Deleteriousness score"
    if is_score.any():
        assert scaled[is_score].min() >= -5 and scaled[is_score].max() <= 5

    _rescale(effect_type == "Deep mutation scan", 4)

    clipped = np.clip(scaled, -5, 5)
    return np.rint(clipped * 100 + 500)

In [None]:
input_train_df["effect_type"].value_counts()

In [None]:
_ = plt.hist(get_label(input_train_df), bins=100)

In [None]:
_ = plt.hist(get_label(input_train_df[input_train_df["effect_type"] == 'Deleteriousness score']), bins=100)

In [None]:
_ = plt.hist(get_label(input_train_df[input_train_df["effect_type"] == 'Stability score change']), bins=100)

In [None]:
_ = plt.hist(get_label(input_train_df[input_train_df["effect_type"] == 'Deep mutation scan']), bins=100)

In [None]:
_ = plt.hist(get_label(input_train_df[input_train_df["effect_type"] == 'ΔΔG']), bins=100)

In [None]:
_ = plt.hist(get_label(input_train_df[input_train_df["effect_type"] == 'ΔΔG (from affinity)']), bins=100)

In [None]:
_ = plt.hist(get_label(input_train_df[input_train_df["effect_type"] == 'ΔΔG (from Kon/Koff)']), bins=100)

### Optimize groups

In [None]:
def assert_get_group_valid(df):
    """Check the row ordering of ``df`` with respect to ``unique_id`` and ``rev``.

    The frame must be sorted by ``unique_id``. In addition, for every pair of
    consecutive rows:

    * a reverse row (``rev`` truthy) must share its ``unique_id`` with the row
      before it;
    * a forward row must not directly follow a reverse row of the same
      ``unique_id``.

    Raises:
        AssertionError: If any of the conditions is violated.
    """
    assert df["unique_id"].is_monotonic_increasing

    rows = df[["unique_id", "rev"]].values
    for (prev_id, prev_rev), (unique_id, rev) in zip(rows[:-1], rows[1:]):
        if rev:
            assert unique_id == prev_id
        else:
            assert unique_id != prev_id or not prev_rev, (unique_id, rev, (prev_id, prev_rev))

In [None]:
def get_group(df, max_group_size=100):
    """Return consecutive query-group sizes for a frame sorted by ``unique_id``.

    Each ``unique_id`` forms one group. Groups larger than ``max_group_size``
    are split into the smallest possible number of nearly equal sub-groups
    whose sizes differ by at most one, with the larger ones first.

    Args:
        df: DataFrame with a ``unique_id`` column sorted in increasing order.
        max_group_size: Upper bound on the size of any returned group; a falsy
            value (``None`` or ``0``) disables splitting.

    Returns:
        Integer NumPy array of group sizes that sums to ``len(df)``; empty for
        an empty frame.

    Raises:
        AssertionError: If ``unique_id`` is not monotonically increasing.
    """
    assert df["unique_id"].is_monotonic_increasing
    vc = df["unique_id"].value_counts()
    # Sizes in order of appearance, which (given the sort) is increasing unique_id.
    groups = [int(vc[uid]) for uid in df["unique_id"].unique()]

    if max_group_size:
        old_groups, groups = groups, []
        for group in old_groups:
            if group <= max_group_size:
                groups.append(group)
                continue
            num_subgroups = math.ceil(group / max_group_size)
            # Spread the rows evenly; the first `remainder` sub-groups get one extra.
            num_per_group, remainder = divmod(group, num_subgroups)
            groups.extend([num_per_group + 1] * remainder)
            groups.extend([num_per_group] * (num_subgroups - remainder))

    assert sum(groups) == len(df), (sum(groups), len(df))
    # `not groups` guards max() against an empty frame.
    assert not max_group_size or not groups or max(groups) <= max_group_size
    return np.array(groups, dtype=np.int64)

In [None]:
# Largest query group passed to LightGBM; currently the same for both COI values.
if COI == "core":
    max_group_size = 100
else:
    max_group_size = 100

In [None]:
plt.hist(np.clip(get_group(input_train_df.sort_values(["unique_id"]), max_group_size), 0, max_group_size), bins=100)
None

### Train model

In [None]:
def train_model(input, param, early_stopping_rounds=10):
    """Train a LightGBM booster on one cross-validation split.

    Args:
        input: ``(train_df, test_df)`` pair; both frames must be sorted by
            ``unique_id`` because query groups are derived with ``get_group``.
        param: LightGBM parameter dictionary.
        early_stopping_rounds: Accepted for interface compatibility; it is
            currently not forwarded to ``lgb.train`` (early stopping and the
            custom ``feval`` are disabled).

    Returns:
        The booster trained for a fixed 100 boosting rounds, with ``test_df``
        registered as the validation set.
    """
    train_df, test_df = input

    def _to_dataset(frame, **extra):
        return lgb.Dataset(
            frame[feature_columns],
            label=get_label(frame),
            group=get_group(frame, max_group_size=max_group_size),
            **extra,
        )

    train_ds = _to_dataset(train_df)
    valid_ds = _to_dataset(test_df, reference=train_ds)

    return lgb.train(
        param,
        train_ds,
        valid_sets=[valid_ds],
        num_boost_round=100,
        verbose_eval=False,
    )

In [None]:
# Sequence pairs that occur in the "skempi++" training data ...
skempi_unique_ids = set(input_train_df[input_train_df["dataset"] == "skempi++"]["unique_id"].unique())
skempi_sequences = set(tuple(s) for s in sequence_df[sequence_df["unique_id"].isin(skempi_unique_ids)][["protein_sequence", "ligand_sequence"]].values)

# ... and the "skempi-v2" ids whose sequence pair is NOT among them. Only these
# ids are used when "skempi-v2" is evaluated in get_aggregate_spearmanr().
skempi_v2_unique_ids = set(input_train_df[input_train_df["dataset"] == "skempi-v2"]["unique_id"].unique())
skempi_v2_unique_ids = {
    uid for uid, pseq, lseq
    in sequence_df[sequence_df["unique_id"].isin(skempi_v2_unique_ids)][["unique_id", "protein_sequence", "ligand_sequence"]].values
    if (pseq, lseq) not in skempi_sequences
}


def get_aggregate_spearmanr(result_df, datasets):
    """Average the per-dataset Spearman correlation of ``effect`` vs ``ddg_pred``.

    Args:
        result_df: Frame with ``dataset``, ``effect_type``, ``rev``,
            ``unique_id``, ``effect`` and ``ddg_pred`` columns.
        datasets: Iterable of entries whose first two items are the dataset
            name and the effect type; any further items are ignored.

    Returns:
        The unweighted mean of the correlations, each computed on forward
        (``rev == False``) rows with non-null ``effect`` and ``ddg_pred``. For
        "skempi-v2" only ids listed in ``skempi_v2_unique_ids`` are used.
    """

    def _dataset_corr(dataset, effect_type):
        selected = (
            (result_df["dataset"] == dataset)
            & (result_df["effect_type"] == effect_type)
            & (result_df["rev"] == False)
        )
        subset = result_df[selected]
        if dataset == "skempi-v2":
            subset = subset[subset["unique_id"].isin(skempi_v2_unique_ids)]
        subset = subset.dropna(subset=["effect", "ddg_pred"])
        return stats.spearmanr(subset["effect"], subset["ddg_pred"])[0]

    corrs = [_dataset_corr(dataset, effect_type) for dataset, effect_type, *_ in datasets]
    return sum(corrs) / len(corrs)

In [None]:
# Evaluation configuration. Each entry of `datasets_eval` is
# [dataset, effect_type, columns]; get_aggregate_spearmanr() reads only the
# first two items. `columns_full` lists the predictor columns used for datasets
# that have all reference scores.
if COI == "core":
    columns_full = [
        "ddg_pred",
        "elaspic_score",
        "foldx_score",
        "rosetta_dg_change",
    ]

    datasets_eval = [
        ["protherm++", "ΔΔG", columns_full],
        ["humsavar", "Deleteriousness class", columns_full],
        ["clinvar", "Deleteriousness class", columns_full],
        ["cosmic", "Deleteriousness class", columns_full],
        ["taipale", "ΔΔG", columns_full],
        # ["taipale_gpca", "ΔΔG", columns_full],
        # ["cagi5_frataxin", "ΔΔG", ["ddg_pred"]],
        ["rocklin-2017-core", "Stability score change", ["ddg_pred", "rosetta_dg_change"]],
        ["dunham_2020_tianyu", "Deep mutation scan", ["ddg_pred", "rosetta_dg_change"]],
        # ["protherm-dagger-core", "ΔΔG", ["ddg_pred", "rosetta_dg_change"]],
    ]
else:
    columns_full = [
        "ddg_pred",
        "elaspic_score",
        "foldx_score",
        "rosetta_complex_dg_change",
    ]

    datasets_eval = [
        ["skempi++", "ΔΔG", columns_full],
        ["humsavar", "Deleteriousness class", columns_full],
        ["clinvar", "Deleteriousness class", columns_full],
        ["cosmic", "Deleteriousness class", columns_full],
        ["ab_bind", "ΔΔG", ["ddg_pred", "elaspic_score", "foldx_score"]],
        # ["taipale", "ΔΔG", eval_columns],
        ["skempi-v2", "ΔΔG (from affinity)", ["ddg_pred", "rosetta_complex_dg_change"]],
        # ["skempi-v2", "ΔΔG (from Kon/Koff)", ["ddg_pred", "rosetta_complex_dg_change"]],
        ["dunham_2020_tianyu", "Deep mutation scan", ["ddg_pred", "rosetta_complex_dg_change"]],
    ]


In [None]:
# LightGBM settings shared by every run (learning-to-rank with NDCG).
const_param = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "verbosity": -1,
    "eval_at": 1_000_000,
    # One gain value per possible label: get_label() yields values in 0..1000.
    "label_gain": [np.log2(2 + i) for i in range(0, 1_001)],
    "force_col_wise": True,
    "num_threads": 40,
}

In [None]:
# Previously tuned hyper-parameters for each COI.
if COI =="core":
    best_params = {'num_leaves': 131, 'lambda_l1': 0.06090843013079758, 'lambda_l2': 1.682306739340599, 'feature_fraction': 0.6427647079708247, 'bagging_fraction': 0.5908679308527225, 'bagging_freq': 6, 'min_child_samples': 47}
else:
    best_params = {
        'max_bin': 511,
        'num_leaves': 64,
        'min_data_in_leaf': 168,
        'lambda_l1': 1.8149466697376564e-05,
        'lambda_l2': 4.3022548294881256e-07,
        'feature_fraction': 0.6326206839855546,
        'bagging_fraction': 0.7398095524057099,
        'bagging_freq': 6,
    }
    
# NOTE(review): this assignment unconditionally replaces the tuned values
# selected above, so the if/else currently has no effect -- confirm this is intended.
best_params = {"max_bin": 255, "learning_rate": 0.1}

In [None]:
# Final parameter set: later entries override earlier ones, so "num_threads"
# and "verbosity" given here replace the values from const_param.
param = {
    **const_param,
    # **{"max_bin": 255, "learning_rate": 0.1, "force_col_wise": True},
    **best_params,
    # **study.best_params,
    "num_threads": 80,
    "verbosity": 1,
}

# Train one booster per split and collect out-of-fold predictions in `result_df`.
start_time = time.perf_counter()
bsts = []
result_dfs = []
for split_idx, (train_df, test_df) in enumerate(train_test_splits):
    print(split_idx, len(train_df), len(test_df))

    # No sequence cluster may appear on both sides of a split.
    assert not set(train_df["cluster_id"]) & set(test_df["cluster_id"])
    bst = train_model((train_df, test_df), param, early_stopping_rounds=10)
    bsts.append(bst)

    test_df = test_df.copy()
    test_df["ddg_pred"] = bst.predict(
        test_df[feature_columns], num_iteration=bst.best_iteration
    )
    result_dfs.append(test_df)
result_df = pd.concat(result_dfs, ignore_index=True)
print(f"Elaspsed: {time.perf_counter() - start_time}.")

In [None]:
score = get_aggregate_spearmanr(result_df, datasets_eval)
score
# Interface: 0.325
# Core: 0.3565635315814614

In [None]:
len(feature_columns)

In [None]:
import json

# Write the feature list used above for the current COI; the path is relative
# to the current working directory.
with open(f"05_feature_elimination/feature-columns-{COI}-0.json", "wt") as fout:
    json.dump(feature_columns, fout)

In [None]:
import json

# Read the saved feature list back and report how many columns it holds.
# NOTE(review): the path hard-codes "interface" rather than using COI — confirm
# this is intended when running the notebook for the core model.
saved_feature_columns = json.loads(
    Path("05_feature_elimination/feature-columns-interface-0.json").read_text()
)
print(len(saved_feature_columns))

In [None]:
# Project each raw column listed in pca_columns onto its saved PCA components.
# The projection does not depend on the booster, so it is computed once up
# front instead of being repeated for every split.
for column in pca_columns:
    pickle_file = NOTEBOOK_DIR.joinpath(f"pca-{column}-{COI}.pickle")
    # NOTE(review): torch.load unpickles arbitrary objects — only load files
    # produced by this project.
    pca = torch.load(pickle_file)

    # Each cell of the column holds one vector; stack them into a 2-D array.
    values = np.vstack(input_test_df[column].values)
    values_out = pca.transform(values)
    for i in range(n_components):
        new_column = f"{column}_{i}_pc"
        input_test_df[new_column] = values_out[:, i]

# Score the test set with every cross-validation booster, one column per split.
for split_idx, bst in enumerate(tqdm(bsts, total=len(bsts))):
    print(split_idx)

    input_test_df[f"ddg_pred_{split_idx}"] = bst.predict(
        input_test_df[feature_columns], num_iteration=bst.best_iteration
    )

In [None]:
# Combine the per-split predictions into a single column by taking the row-wise
# maximum. The column list follows the number of trained boosters instead of a
# hard-coded split count.
# NOTE(review): max (rather than mean) is kept from the original — confirm intent.
split_pred_columns = [f"ddg_pred_{i}" for i in range(len(bsts))]
input_test_df["ddg_pred"] = input_test_df[split_pred_columns].max(axis=1)

In [None]:
def get_spearman_corrs_global(df, feature_columns, target_column, drop_na=True):
    """Correlate every feature column with the target over the whole frame.

    Columns whose name starts with ``provean_``, ``protbert_`` or
    ``proteinsolver_`` are negated before the correlation is computed.

    Args:
        df: Frame holding the feature columns and the target column.
        feature_columns: List of column names to evaluate.
        target_column: Name of the column to correlate against.
        drop_na: When true, first drop rows with a missing value in any of the
            feature columns or in the target.

    Returns:
        Dict mapping column name to ``(rho, p_value, n_rows)`` where ``n_rows``
        is the number of rows used for that column.
    """
    flipped_prefixes = ("provean_", "protbert_", "proteinsolver_")

    if drop_na:
        df = df.dropna(subset=feature_columns + [target_column])

    def _correlate(column):
        # Rows missing either value cannot contribute to this column's score.
        usable = df.dropna(subset=[column, target_column])
        sign = -1 if column.startswith(flipped_prefixes) else 1
        result = stats.spearmanr(sign * usable[column], usable[target_column])
        return (result[0], result[1], len(usable))

    return {column: _correlate(column) for column in feature_columns}

In [None]:
def get_spearman_corrs_perseq(df, feature_columns, target_column, min_gp_size=6, drop_na=True):
    """Correlate every feature column with the target separately per ``unique_id``.

    Groups with fewer than ``min_gp_size`` rows, or with fewer than two distinct
    target values, are skipped. Columns whose name starts with ``provean_``,
    ``protbert_`` or ``proteinsolver_`` are negated before correlating.

    Args:
        df: Frame with a ``unique_id`` column, the feature columns and the target.
        feature_columns: List of column names to evaluate.
        target_column: Name of the column to correlate against.
        min_gp_size: Minimum number of rows a group needs to be evaluated.
        drop_na: When true, first drop rows with a missing value in any of the
            feature columns or in the target.

    Returns:
        Dict mapping column name to the list of per-group Spearman coefficients.
    """
    flipped_prefixes = ("provean_", "protbert_", "proteinsolver_")

    if drop_na:
        df = df.dropna(subset=feature_columns + [target_column])

    results = {}
    for name in feature_columns:
        results[name] = []

    for _, group in df.groupby("unique_id"):
        large_enough = len(group) >= min_gp_size
        if large_enough and len(set(group[target_column])) >= 2:
            for name in feature_columns:
                sign = -1 if name.startswith(flipped_prefixes) else 1
                usable = group.dropna(subset=[name, target_column])
                rho = stats.spearmanr(sign * usable[name], usable[target_column])[0]
                results[name].append(rho)
    return results

In [None]:
def print_spearman_corrs(corrs):
    """Print one line per column: name, signed coefficient, p-value and row count.

    Args:
        corrs: Dict mapping column name to a sequence whose first three items
            are the coefficient, the p-value and the number of rows.
    """
    for name, entry in corrs.items():
        rho, p_value, n_rows = entry[0], entry[1], entry[2]
        print(f"{name:30s} {rho:+.4} {p_value:.4} ({n_rows})")

In [None]:
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats

# Select PNG as the inline figure format.
# NOTE(review): newer IPython releases deprecate this helper in favour of
# matplotlib_inline.backend_inline.set_matplotlib_formats — confirm against the
# installed version.
set_matplotlib_formats("png")

In [None]:
# Directory that receives the validation figures; one directory per mutation
# type (COI), created on first use.
FIGURE_OUTPUT_DIR = Path(f"05_model_validation_{COI}").resolve()
FIGURE_OUTPUT_DIR.mkdir(exist_ok=True)

FIGURE_OUTPUT_DIR

In [None]:
# Colour map shared by the figures below. plt.get_cmap is used because
# matplotlib.cm.get_cmap was deprecated and later removed from matplotlib.
cmap = plt.get_cmap("tab20")

In [None]:
# List the distinct (dataset, effect_type) combinations present in the results.
result_df[["dataset", "effect_type"]].drop_duplicates()

In [None]:
def get_spearman_corrs_global_xxx(df, feature_columns, target_column, drop_na=True):
    """Global Spearman correlation on data augmented with its negated mirror image.

    For every column the feature and target vectors are each extended with
    their own negation before the correlation is computed. Columns whose name
    starts with ``provean_``, ``protbert_`` or ``proteinsolver_`` are negated
    first.

    Args:
        df: Frame holding the feature columns and the target column.
        feature_columns: List of column names to evaluate.
        target_column: Name of the column to correlate against.
        drop_na: When true, first drop rows with a missing value in any of the
            feature columns or in the target.

    Returns:
        Dict mapping column name to ``(rho, p_value, n_rows)``; ``n_rows`` counts
        the rows before mirroring.
    """
    flipped_prefixes = ("provean_", "protbert_", "proteinsolver_")

    if drop_na:
        df = df.dropna(subset=feature_columns + [target_column])

    corrs = {}
    for name in feature_columns:
        usable = df.dropna(subset=[name, target_column])
        sign = -1 if name.startswith(flipped_prefixes) else 1

        signed_feature = sign * usable[name].values
        target = usable[target_column]
        # Append the mirrored points (-x, -y) to the original (x, y) pairs.
        mirrored_feature = np.hstack([signed_feature, -signed_feature])
        mirrored_target = np.hstack([target, -target])

        result = stats.spearmanr(mirrored_feature, mirrored_target)
        corrs[name] = (result[0], result[1], len(usable))
    return corrs

In [None]:
from matplotlib.ticker import FormatStrFormatter

# Which rows to include: forward mutations only, or forward and reverse.
rev = [False]

if rev == [False]:
    suffix = ""
else:
    assert rev == [False, True]
    suffix = "-rev"

# One column of panels per evaluation dataset: global correlations on top,
# per-protein distributions below. squeeze=False keeps `axs` two-dimensional
# even when there is only a single dataset, so axs[row, idx] always works.
fg, axs = plt.subplots(2, len(datasets_eval), figsize=(12, 8), squeeze=False)

for idx, (dataset, effect_type, eval_columns) in enumerate(datasets_eval):
    df = result_df[
        (result_df["effect_type"] == effect_type)
        & (result_df["dataset"] == dataset)
        & (result_df["rev"].isin(rev))
    ]

    if dataset == "skempi-v2":
        df = df[df["unique_id"].isin(skempi_v2_unique_ids)]

    corrs = get_spearman_corrs_global(df, eval_columns, "effect")
    per_sequence_stats = get_spearman_corrs_perseq(df, eval_columns, "effect", min_gp_size=8)

    # Top row: bar chart of the global coefficients (tick labels left blank;
    # the bottom panel carries the column names).
    ax = axs[0, idx]
    x = np.arange(len(corrs))
    y = [c[0] for c in corrs.values()]
    out = ax.bar(x, y, color=cmap(1), edgecolor="k")
    _ = ax.set_xticks(x)
    _ = ax.set_xticklabels([""] * len(x), rotation="vertical")
    ax.set_title(f"{dataset}")
    ax.set_ylim(-0.025, 0.825)
    if idx == 0:
        ax.set_ylabel("Global Spearman's ρ")
        ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

    # Bottom row: box plot of the per-protein coefficients.
    ax = axs[1, idx]
    out = ax.boxplot(
        per_sequence_stats.values(),
        patch_artist=True,
        boxprops={"facecolor": cmap(1)},
        medianprops={"color": cmap(0)},
    )
    bp = ax.set_xticklabels(per_sequence_stats.keys(), rotation="vertical")
    ax.set_ylim(-1.05, 1.05)
    if idx == 0:
        ax.set_ylabel("Per-protein Spearman's ρ")
        ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

fg.subplots_adjust(top=0.95, right=0.98, bottom=0.38)
# Save the same figure in every required format.
for extension in ("svg", "png", "pdf"):
    fg.savefig(FIGURE_OUTPUT_DIR.joinpath(f"corrs-xval-{COI}{suffix}.{extension}"), dpi=300)

In [None]:
# Measured effect against predicted ddG for the most recently selected `df`:
# forward rows (red), forward rows with both axes negated (green) and
# reverse rows (blue).
forward_rows = df[df["rev"] == False]
reverse_rows = df[df["rev"] == True]

plt.plot(forward_rows["effect"], forward_rows["ddg_pred"], 'r.', alpha=0.3)
plt.plot(-forward_rows["effect"], -forward_rows["ddg_pred"], 'g.', alpha=0.3)
plt.plot(reverse_rows["effect"], reverse_rows["ddg_pred"], 'b.', alpha=0.3)
plt.xlabel("effect")

In [None]:
# Columns compared against the measured effect in the figures below.
# Commented-out entries are alternative predictors that can be toggled on.
# NOTE(review): with the current toggles both branches activate the same three
# columns — confirm the interface branch is meant to use the "*_core_*" scores.
if COI == "core":
    eval_columns = [
        "ddg_pred",
#         "elaspic_score",
#         "foldx_score",
#         "rosetta_dg_change",
#         "provean_score",
        "protbert_core_score_change",
        "proteinsolver_core_score_change",
    ]
else:
    eval_columns = [
        "ddg_pred",
#         "elaspic_score",
#         "foldx_score",
#         "rosetta_complex_dg_change",
#         "provean_score",
        "protbert_core_score_change",
        "proteinsolver_core_score_change",
        #
#         "rosetta_opt_apart_dg_change",
#         "rosetta_apart_dg_change",
#         "rosetta_opt_bind_dg_change",
#         "rosetta_bind_dg_change",
    ]

In [None]:
from matplotlib.ticker import FormatStrFormatter

# Dataset / effect type to plot. Exactly one selection is active; the others
# are kept as commented toggles (the first one used to be a live assignment
# that was immediately overwritten by the second).
# dataset, effect_type = ("huang_2020", "ΔΔG")
dataset, effect_type = ("starr_2020_tianyu", "Deep mutation scan")
# dataset, effect_type = ("cagi5_frataxin", "ΔΔG")

# Which rows to include: forward and reverse mutations, or forward only.
rev = [False, True]

# df = result_df[
#     (result_df["effect_type"] == effect_type)
#     & (result_df["dataset"] == dataset)
#     & (result_df["rev"].isin(rev))
# ]

df = input_test_df[
    (input_test_df["effect_type"] == effect_type)
    & (input_test_df["dataset"] == dataset)
    & (input_test_df["rev"].isin(rev))
]

# File-name suffix: dataset name, plus a marker when reverse rows are excluded.
suffix = f"-{dataset}"
if rev != [False, True]:
    assert rev == [False]
    suffix += "-norev"

corrs = get_spearman_corrs_global(df, eval_columns, "effect")
per_sequence_stats = get_spearman_corrs_perseq(df, eval_columns, "effect", min_gp_size=6)

fg, axs = plt.subplots(2, 1, figsize=(3, 8))

# Top panel: bar chart of the global coefficients (tick labels left blank; the
# bottom panel carries the column names).
ax = axs[0]
x = np.arange(len(corrs))
y = [c[0] for c in corrs.values()]
out = ax.bar(x, y, color=cmap(1), edgecolor="k")
_ = ax.set_xticks(x)
_ = ax.set_xticklabels([""] * len(x), rotation="vertical")
ax.set_ylabel("Global Spearman's ρ")
ax.set_title(f"{dataset} - {effect_type}")
ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

# Bottom panel: box plot of the per-protein coefficients.
ax = axs[1]
out = ax.boxplot(
    per_sequence_stats.values(),
    patch_artist=True,
    boxprops={"facecolor": cmap(1)},
    medianprops={"color": cmap(0)},
)
bp = ax.set_xticklabels(per_sequence_stats.keys(), rotation="vertical")
ax.set_ylabel("Per-protein Spearman's ρ")
ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

fg.subplots_adjust(top=0.95, right=0.98, bottom=0.38)
# Save the same figure in every required format.
for extension in ("svg", "png", "pdf"):
    fg.savefig(FIGURE_OUTPUT_DIR.joinpath(f"corrs-perseq{suffix}.{extension}"), dpi=300)

In [None]:
# Measured effect against predicted ddG for the forward rows of `df`; the
# mirrored and reverse overlays are kept as commented toggles.
plt.plot(df[df["rev"] == False]["effect"], df[df["rev"] == False]["ddg_pred"], 'r.', alpha=0.3)
# plt.plot(-df[df["rev"] == False]["effect"], -df[df["rev"] == False]["ddg_pred"], 'g.', alpha=0.3)
# plt.plot(df[df["rev"] == True]["effect"], df[df["rev"] == True]["ddg_pred"], 'b.', alpha=0.3)

In [None]:
# Compare predictions on forward rows with predictions on the matching reverse
# rows. Forward rows are re-labelled by swapping the first and last character
# of the mutation string and their prediction is negated, then joined to the
# reverse rows on the mutation label.
# .copy() makes df2 an independent frame so the column assignments below do not
# operate on a slice of `df` (avoids pandas' SettingWithCopyWarning).
# NOTE(review): the join uses "mutation" only — identical labels coming from
# different proteins would be cross-matched; confirm `df` holds a single protein.
df2 = df[df["rev"] == False][["mutation", "ddg_pred"]].copy()
df2["mutation"] = df2["mutation"].str[-1] + df2["mutation"].str[1:-1] + df2["mutation"].str[0]
df2["ddg_pred"] = -df2["ddg_pred"]
df2 = df2.merge(df[df["rev"] == True][["mutation", "ddg_pred"]], on=["mutation"])
df2.head()

In [None]:
# Rank agreement between the two prediction columns produced by the merge
# (suffix _x: left frame, suffix _y: right frame).
stats.spearmanr(df2["ddg_pred_x"], df2["ddg_pred_y"])

In [None]:
# Measured effect against predicted ddG: forward rows in red, reverse rows in blue.
forward_rows = df[df["rev"] == False]
reverse_rows = df[df["rev"] == True]

plt.plot(forward_rows["effect"], forward_rows["ddg_pred"], 'r.', alpha=0.3)
plt.plot(reverse_rows["effect"], reverse_rows["ddg_pred"], 'b.', alpha=0.3)

In [None]:
# Distribution of the measured effect over the forward rows of `df`.
plt.hist(df[df["rev"] == False]["effect"], bins=40)

In [None]:
def my_feval(preds, train_data):
    """Custom evaluation metric: group-weighted average of per-group Spearman's rho.

    Predictions and labels are split into consecutive slices using the group
    sizes reported by ``train_data``. Each evaluable group contributes its
    Spearman coefficient weighted by the square root of its size.

    Groups with fewer than two rows or with constant labels are skipped; groups
    with constant predictions count as a correlation of 0.

    Args:
        preds: Array of predictions, ordered like the labels.
        train_data: Object exposing ``get_label()`` and ``get_group()``.

    Returns:
        Tuple ``(eval_name, eval_result, is_higher_better)``. ``eval_result``
        is 0.0 when no group could be evaluated (previously this raised
        ``ZeroDivisionError``).
    """
    labels = train_data.get_label()
    groups = train_data.get_group()

    # The previously computed whole-dataset correlation was never part of the
    # returned value, so that (costly) computation has been dropped.
    weighted_corr_total = 0
    weight_total = 0
    start = 0
    for group in groups:
        stop = start + group
        preds_slice = preds[start:stop]
        labels_slice = labels[start:stop]
        start = stop

        # Rank correlation is undefined for a single row or constant labels.
        if group < 2 or len(set(labels_slice)) < 2:
            continue
        if len(set(preds_slice)) < 2:
            # Constant predictions carry no ranking information.
            group_corr = 0
        else:
            group_corr = stats.spearmanr(preds_slice, labels_slice)[0]

        weight = math.sqrt(group)
        weighted_corr_total += weight * group_corr
        weight_total += weight
    # Every prediction must have been assigned to exactly one group.
    assert start == sum(groups)

    if weight_total > 0:
        pergroup_corr = weighted_corr_total / weight_total
    else:
        pergroup_corr = 0.0

    eval_name = "wavg_spearman_rho"
    eval_result = pergroup_corr
    is_higher_better = True
    return eval_name, eval_result, is_higher_better

In [None]:
def calculate_score(df):
    """Average of the global and the per-``unique_id`` Spearman correlation.

    The per-group part is the mean of each group's Spearman coefficient between
    ``ddg_pred`` and ``effect``, weighted by the square root of the group size.
    Groups with constant ``effect`` are skipped; groups with constant
    ``ddg_pred`` count as a correlation of 0.

    Degenerate inputs are scored as 0 instead of failing: a constant
    ``ddg_pred`` or ``effect`` column gives a global correlation of 0
    (previously NaN), and a frame without any evaluable group gives a per-group
    correlation of 0 (previously ``ZeroDivisionError``).

    Args:
        df: Frame with ``unique_id``, ``ddg_pred`` and ``effect`` columns.

    Returns:
        ``(global_rho + per_group_rho) / 2``.
    """
    if len(set(df["ddg_pred"])) < 2 or len(set(df["effect"])) < 2:
        corr_global = 0
    else:
        corr_global = stats.spearmanr(df["ddg_pred"], df["effect"])[0]

    perseq_score = 0
    perseq_weight = 0
    for _, gp in df.groupby("unique_id"):
        # Rank correlation is undefined when every measured effect is equal.
        if len(set(gp["effect"])) < 2:
            continue
        if len(set(gp["ddg_pred"])) < 2:
            # Constant predictions carry no ranking information.
            corr = 0
        else:
            corr = stats.spearmanr(gp["ddg_pred"], gp["effect"])[0]
        weight = math.sqrt(len(gp))
        perseq_score += corr * weight
        perseq_weight += weight

    corr_perseq = perseq_score / perseq_weight if perseq_weight > 0 else 0

    return (corr_global + corr_perseq) / 2

In [None]:
# Forward-only ΔΔG rows of the "skempi++" dataset.
df = result_df[
    (result_df["effect_type"] == "ΔΔG")
    & (result_df["dataset"] == "skempi++")
    & (result_df["rev"].isin([False]))
]

# Figure 1: global correlation of every evaluated column with the measured effect.
corrs = get_spearman_corrs_global(df, eval_columns, "effect")
fg, ax = plt.subplots()
x = np.arange(len(corrs))
y = [c[0] for c in corrs.values()]
out = ax.bar(x, y, color=cmap(1), edgecolor="k")
_ = ax.set_xticks(x)
_ = ax.set_xticklabels(corrs.keys(), rotation="vertical")
ax.set_ylabel("Spearman's ρ")
ax.set_title("Global correlations")
# All three formats share one file stem (the SVG previously used a different
# "-norev" stem from the PNG and PDF).
for extension in ("svg", "png", "pdf"):
    fg.savefig(FIGURE_OUTPUT_DIR.joinpath(f"corrs-global-skempi.{extension}"), dpi=300)

# Figure 2: per-protein correlations, computed on the same filtered rows as the
# global figure (previously the unfiltered result_df was used, so the plot
# saved under the "skempi" name mixed in every other dataset).
per_sequence_stats = get_spearman_corrs_perseq(df, eval_columns, "effect", min_gp_size=6)
fg, ax = plt.subplots()
out = ax.boxplot(
    per_sequence_stats.values(),
    patch_artist=True,
    boxprops={"facecolor": cmap(1)},
    medianprops={"color": cmap(0)},
)
bp = ax.set_xticklabels(per_sequence_stats.keys(), rotation="vertical")
ax.set_ylabel("Spearman's ρ")
ax.set_title("Per-protein correlations")
for extension in ("svg", "png", "pdf"):
    fg.savefig(FIGURE_OUTPUT_DIR.joinpath(f"corrs-perseq-skempi.{extension}"), dpi=300)

In [None]:
# Correlations on the "Deleteriousness class" rows, forward and reverse.
# NOTE(review): print_spearman_stats is not defined in the visible part of the
# notebook (only print_spearman_corrs is) — confirm it exists.
print_spearman_stats(
    result_df[
        (result_df["effect_type"] == "Deleteriousness class")
        & (result_df["rev"].isin([True, False]))
    ],
    eval_columns,
    "effect",
)
# 0.488

In [None]:
# Datasets that contribute "Deleteriousness class" rows.
result_df[
    (result_df["effect_type"] == "Deleteriousness class") & (result_df["rev"].isin([True, False]))
]["dataset"].unique()

In [None]:
# Correlations on the "Deleteriousness score" rows, forward and reverse.
# NOTE(review): print_spearman_stats is not defined in the visible part of the
# notebook — confirm it exists.
print_spearman_stats(
    result_df[
        (result_df["effect_type"] == "Deleteriousness score")
        & (result_df["rev"].isin([True, False]))
    ],
    eval_columns,
    "effect",
)
# 0.4128

In [None]:
# Correlations of two predictors against the "label" column over all rows.
# NOTE(review): print_spearman_stats is not defined in the visible part of the
# notebook — confirm it exists.
print_spearman_stats(result_df, ["ddg_pred", "rosetta_dg_change"], "label")  # 0.4646

In [None]:
# Correlations against the "label" column on the "Deleteriousness score" rows.
# NOTE(review): print_spearman_stats is not defined in the visible part of the
# notebook — confirm it exists.
print_spearman_stats(result_df[result_df["effect_type"] == "Deleteriousness score"], eval_columns, "label")  # 0.4077

In [None]:
# Correlations against the measured effect on the "ΔΔG" rows.
# NOTE(review): print_spearman_stats is not defined in the visible part of the
# notebook — confirm it exists.
print_spearman_stats(result_df[result_df["effect_type"] == "ΔΔG"], eval_columns, "effect")

In [None]:
def compute_per_sequence_stats(df, feature_columns, target_column, min_gp_size=6):
    """Spearman correlation of each feature with the target, per ``unique_id``.

    Rows with a missing value in any feature column or in the target are
    dropped first. Groups with fewer than ``min_gp_size`` rows, or with fewer
    than two distinct target values, are skipped. No sign flipping is applied.

    Args:
        df: Frame with a ``unique_id`` column, the feature columns and the target.
        feature_columns: List of column names to evaluate.
        target_column: Name of the column to correlate against.
        min_gp_size: Minimum number of rows a group needs to be evaluated.

    Returns:
        Dict mapping column name to the list of per-group Spearman coefficients.
    """
    complete_rows = df.dropna(subset=feature_columns + [target_column])

    results = {}
    for name in feature_columns:
        results[name] = []

    for _, group in complete_rows.groupby("unique_id"):
        target = group[target_column]
        if len(group) >= min_gp_size and len(set(target)) >= 2:
            for name in feature_columns:
                rho = stats.spearmanr(group[name], group[target_column])[0]
                results[name].append(rho)
    return results

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Per-protein correlations over all result rows (groups of at least 6 rows),
# shown as one box per evaluated column.
per_sequence_stats = compute_per_sequence_stats(result_df, eval_columns, "effect", 6)

fg, ax = plt.subplots()

out = ax.boxplot(per_sequence_stats.values())
_ = ax.set_xticklabels(per_sequence_stats.keys(), rotation="vertical")
# ax.set_ylim(-1, 1)
# fg.tight_layout()

In [None]:
# Per-protein correlations restricted to "Deleteriousness class" rows (groups
# of at least 18 rows). Note the variable keeps the "_ddg" name although the
# filter selects a different effect type.
per_sequence_stats_ddg = compute_per_sequence_stats(
    result_df[result_df["effect_type"] == "Deleteriousness class"], eval_columns, "effect", 18
)

fg, ax = plt.subplots()

out = ax.boxplot(per_sequence_stats_ddg.values())
_ = ax.set_xticklabels(per_sequence_stats_ddg.keys(), rotation="vertical")
# ax.set_ylim(-1, 1)
# fg.tight_layout()

In [None]:
# Per-protein correlations restricted to "Deleteriousness score" rows (groups
# of at least 18 rows). Note the variable keeps the "_ddg" name although the
# filter selects a different effect type.
per_sequence_stats_ddg = compute_per_sequence_stats(
    result_df[result_df["effect_type"] == "Deleteriousness score"], eval_columns, "effect", 18
)

fg, ax = plt.subplots()

out = ax.boxplot(per_sequence_stats_ddg.values())
_ = ax.set_xticklabels(per_sequence_stats_ddg.keys(), rotation="vertical")
# ax.set_ylim(-1, 1)
# fg.tight_layout()

In [None]:
# Inspect which entries the most recent boxplot call returned.
out.keys()

In [None]:
# Scatter each (x, val) pair in its own colour; zip stops at the shortest input,
# so at most four series are drawn.
# NOTE(review): `xs` and `vals` are not defined in the visible part of the
# notebook — confirm they exist before running this cell.
palette = ["r", "g", "b", "y"]
for x, val, c in zip(xs, vals, palette):
    plt.scatter(x, val, alpha=0.4, color=c)
plt.show()

In [None]:
# Rows whose effect, scaled by 1000 and truncated to an integer, exceeds 300000.
# The builtin int replaces the np.int alias, which was removed from NumPy.
train_df[(train_df["effect"] * 1_000).astype(int) > 300_000]

In [None]:
import matplotlib.pyplot as plt

# Distribution of the effect column, restricted to the range [-5, 5].
_ = plt.hist(input_df["effect"], bins=100, range=(-5, 5))

In [None]:
# Stand-alone LambdaRank training run with a minimal parameter set.
# NOTE(review): this rebinds the global `param` used by the cross-validation
# cell, and the "ndcg_eval_at" cutoff is far larger than the 1_000_000 used for
# "eval_at" in the parameter dict defined earlier in the notebook — confirm the
# installed LightGBM version accepts it.
param = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "ndcg_eval_at": 1000000000000,
    "max_bin": 255,
}


# train_ds / valid_ds are built elsewhere in the notebook.
bst = lgb.train(param, train_ds, num_boost_round=100, valid_sets=[valid_ds])

In [None]:
# Predict on the test split after removing the columns listed in columns_to_drop.
ypred = bst.predict(test_df.drop(columns_to_drop, axis=1), num_iteration=bst.best_iteration)

In [None]:
# Predict on the test split and attach the result as "ddg_pred"; the frame is
# copied first so the original test_df object is left unmodified.
ypred = bst.predict(test_df.drop(columns_to_drop, axis=1), num_iteration=bst.best_iteration)
test_df = test_df.copy()
test_df["ddg_pred"] = ypred

In [None]:
# Rank correlation between the measured effect and the model prediction.
stats.spearmanr(test_df["effect"], test_df["ddg_pred"])

In [None]:
# Rank correlation between the measured effect and the foldx_score column.
stats.spearmanr(test_df["effect"], test_df["foldx_score"])

In [None]:
# Rank correlation between the measured effect and the provean_score column.
stats.spearmanr(test_df["effect"], test_df["provean_score"])