## Summary

---

## Imports

In [None]:
import functools
from pathlib import Path

import elaspic2 as el2
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from scipy import stats
from sklearn import metrics, model_selection
from sklearn.decomposition import PCA
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm

In [None]:
# Show up to 1000 columns / rows when displaying DataFrames.
# The fully-qualified option names are required: the bare "max_columns" and
# "max_rows" patterns match more than one registered option in recent pandas
# releases (e.g. "styler.render.max_columns"), which raises an OptionError.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

## Parameters

In [None]:
# Output directory for this notebook. The relative path is resolved against the
# current working directory; sibling notebook folders are later located through
# `NOTEBOOK_DIR.parent`.
NOTEBOOK_DIR = Path("38_cagi6_sherloc_train_model").resolve()
# `exist_ok=True` makes re-running this cell safe once the directory exists.
NOTEBOOK_DIR.mkdir(exist_ok=True)

NOTEBOOK_DIR

## Load data

In [None]:
# Tab-separated submission template stored next to the sibling notebook's data.
submission_template_file = (
    NOTEBOOK_DIR.parent / "30_cagi6_sherloc" / "submission_template.tsv"
)

submission_template_df = pd.read_csv(submission_template_file, delimiter="\t")

display(submission_template_df.head(2))
len(submission_template_df)

### `training_df`

In [None]:
training_file = NOTEBOOK_DIR.parent.joinpath(
    "30_cagi6_sherloc",
    "CAGI6-Sherloc-clinical-classification",
    "final_train_070821.txt",
)

In [None]:
training_all_df = pd.read_csv(training_file, sep="\t")

display(training_all_df.head(2))
len(training_all_df)

In [None]:
assert not set(submission_template_df["hgvs"]) & set(training_all_df["hgvs"])

In [None]:
training_df = training_all_df[training_all_df["Subcategory Missense"]]

display(training_df.head(2))
len(training_df)

### `testing_df`

In [None]:
testing_file = NOTEBOOK_DIR.parent.joinpath(
    "30_cagi6_sherloc", "CAGI6-Sherloc-clinical-classification", "final_test_070821.txt"
)

In [None]:
testing_all_df = pd.read_csv(testing_file, sep="\t")

display(testing_all_df.head(2))
len(testing_all_df)

In [None]:
assert not set(submission_template_df["hgvs"]) ^ set(testing_all_df["hgvs"])

In [None]:
testing_df = testing_all_df[testing_all_df["Subcategory Missense"]]

display(testing_df.head(2))
len(testing_df)

### `validation_df`

In [None]:
validation_file = NOTEBOOK_DIR.parent.joinpath(
    "30_cagi6_sherloc", "validation_variants.tsv"
)

In [None]:
validation_all_df = pd.read_csv(validation_file, sep="\t").rename(
    columns={"HGVS.c": "hgvs_g"}
)

display(validation_all_df.tail(2))
len(validation_all_df)

In [None]:
def map_g_to_c(str_g, mapper):
    """Yield the transcript-level projections of a genomic HGVS string.

    `str_g` is parsed with the notebook-level `hp` parser, which therefore has
    to exist by the time the generator is first advanced. Every transcript
    returned by `mapper.relevant_transcripts` is tried in turn; transcripts for
    which `mapper.g_to_c` raises `HGVSUsageError` or `HGVSInvalidIntervalError`
    are skipped silently.
    """
    from hgvs.exceptions import HGVSInvalidIntervalError, HGVSUsageError

    skippable_errors = (HGVSUsageError, HGVSInvalidIntervalError)
    genomic_variant = hp.parse_hgvs_variant(str_g)
    for transcript_accession in mapper.relevant_transcripts(genomic_variant):
        try:
            coding_variant = mapper.g_to_c(genomic_variant, transcript_accession)
        except skippable_errors:
            pass
        else:
            yield coding_variant

In [None]:
validation_mapping_file = NOTEBOOK_DIR.parent.joinpath(
    "30_cagi6_sherloc", "validation-mapping-grch37.parquet"
)

if validation_mapping_file.is_file():
    # Reuse the mapping cached by a previous run.
    validation_mapping_df = pq.read_table(validation_mapping_file).to_pandas()
else:
    # `hgvs` is only needed when the cache has to be (re)built, so it is
    # imported lazily inside this branch.
    import hgvs.assemblymapper
    import hgvs.dataproviders.uta
    import hgvs.parser

    # `hp` has to be a notebook-level name: `map_g_to_c` reads it as a global.
    hp = hgvs.parser.Parser()
    hdp = hgvs.dataproviders.uta.connect()
    mapper = hgvs.assemblymapper.AssemblyMapper(
        hdp, assembly_name="GRCh37", normalize=False
    )

    results = []
    for tup in tqdm(validation_all_df.itertuples(), total=len(validation_all_df)):
        for var_c in map_g_to_c(tup.hgvs_g, mapper):
            # Key every mapping by the same `hgvs_g` value that was just mapped.
            # The previous code read `tup.str_g`, which is not the column this
            # loop iterates over, while the result is stored under (and later
            # merged on) "hgvs_g".
            results.append((tup.hgvs_g, str(var_c)))
    validation_mapping_df = pd.DataFrame(results, columns=["hgvs_g", "hgvs"])
    # Persist the mapping so that later runs take the cached branch above.
    pq.write_table(
        pa.Table.from_pandas(validation_mapping_df, preserve_index=False),
        validation_mapping_file,
    )

In [None]:
display(validation_mapping_df.head(2))
print(len(validation_mapping_df))

In [None]:
validation_df = (
    validation_all_df
    #
    .merge(validation_mapping_df, on=["hgvs_g"], how="left")
)

In [None]:
display(validation_df.tail(2))
print(len(validation_df))
print(len(validation_df["hgvs_g"].unique()))

## Load results

In [None]:
DATASET_NAME = "cagi6-sherloc"

DATASET_NAME

In [None]:
input_file = NOTEBOOK_DIR.parent.joinpath(
    "37_cagi6_sherloc_combine_results", "combined-results.parquet"
)

input_file

In [None]:
result_df = pq.read_table(input_file).to_pandas()

display(result_df.head(2))
print(len(result_df))

## Exploratory data analysis

In [None]:
proteinsolver_columnms = [c for c in result_df if c.startswith("proteinsolver_")]
protbert_columns = [c for c in result_df if c.startswith("protbert_")]
rosetta_columns = [c for c in result_df if c.startswith("rosetta_")]
alphafold_columns = [c for c in result_df if c.startswith("alphafold_")]

In [None]:
result_df["effect"].value_counts()

In [None]:
effect_map = {
    "Uncertain significance": 0,
    "Likely benign": -1,
    "Benign": -2,
    "Likely pathogenic": 1,
    "Pathogenic": 2,
}

result_df["effect_score"] = result_df["effect"].map(effect_map)

In [None]:
score_columns = [
    "el2_score",
    "proteinsolver_core_score_change",
    "protbert_core_score_change",
    "rosetta_dg_change",
    #     "alphafold_core_scores_residue_plddt_wt",
    #     "alphafold_core_scores_protein_plddt_wt",
    #     "alphafold_core_scores_protein_max_predicted_aligned_error_wt",
    #     "alphafold_core_scores_proten_ptm_wt",
]

df = result_df.dropna(subset=score_columns + ["effect_score"])
df = df[df["effect_score"] != 0].reset_index(drop=True)

for col in score_columns:
    corr = stats.spearmanr(df["effect_score"], df[col])
    auc = metrics.roc_auc_score(df["effect_score"] > 0, df[col])
    print(col, corr[0], auc)

## Train ML model

### Prepare dataset

In [None]:
rosetta_columns = [c for c in result_df if "rosetta" in c]
msa_columns = [c for c in result_df if c.startswith("msa")]

len(rosetta_columns), len(msa_columns)

In [None]:
scalar_features = [
    #     "el2_score",
    "proteinsolver_core_score_wt",
    "proteinsolver_core_score_change",
    "protbert_core_score_wt",
    "protbert_core_score_change",
] + msa_columns
vector_features = [
    "aa_wt_onehot",
    "aa_mut_onehot",
    "protbert_core_features_residue_wt",
    "protbert_core_features_residue_change",
    "proteinsolver_core_features_residue_wt",
    "proteinsolver_core_features_residue_change",
    #     "protbert_core_features_protein_wt",
    #     "protbert_core_features_protein_change",
    #     "proteinsolver_core_features_protein_wt",
    #     "proteinsolver_core_features_protein_change",
    #
    # AlphaFold
    #     "alphafold_core_features_residue_experimentally_resolved_wt",  # 0.19 [37]
    #     "alphafold_core_features_residue_experimentally_resolved_change",  # 0.11 [37]
    #     "alphafold_core_features_residue_predicted_lddt_wt",  # 0.17 [50]
    #     "alphafold_core_features_residue_predicted_lddt_change",  # 0.04 [50]
    #     "alphafold_core_features_residue_msa_first_row_wt",  # 0.17 [256]
    #     "alphafold_core_features_residue_msa_first_row_change",  # 0.21 [256]
    #     "alphafold_core_features_residue_single_wt",  # 0.20 [384]
    #     "alphafold_core_features_residue_single_change",  # 0.15 [384]
    #     "alphafold_core_features_residue_structure_module_wt",  # 0.18 [384]
    #     "alphafold_core_features_residue_structure_module_change",  # 0.05 [384]
]

#             "rosetta_dg_change",
#             "alphafold_core_features_residue_experimentally_resolved_wt",

df = (
    result_df.dropna(
        subset=scalar_features
        + vector_features
        + [
            "effect_score",
        ]
    )
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)
df = df[df["effect_score"] != 0].reset_index(drop=True)


# Full feature matrix: the scalar feature columns followed by every vector
# feature stacked row-wise and concatenated column-wise.
X_ref = np.c_[
    df[scalar_features].values,
    np.hstack([np.vstack(df[col].values) for col in vector_features]),
]
# `X_ref` is kept as the untouched reference; `X` is the matrix used for training.
X = X_ref
# X = X[:, important_features]

# Rows with effect_score == 0 are removed from `df` earlier in this cell, so as
# written this mask selects nothing and the assignments that use it are no-ops.
# They only take effect when that filter is disabled.
low_confidence_mask = df["effect_score"] == 0

# Binary target: 1 for a positive effect_score, 0 otherwise.
y = (df["effect_score"] > 0).values.astype(int)
# Low-confidence rows are labelled from `el2_score` (positive when > 2) instead.
y[low_confidence_mask] = (df[low_confidence_mask]["el2_score"] > 2).values.astype(int)
# y = df["effect_score"].values

# Sample weights: 1.0 by default, 0.5 for effect_score of +1 / -1 and 0.05 for
# low-confidence rows.
weights = np.ones(len(df), dtype=np.float64)
weights[df["effect_score"] == 1] = 0.5
weights[df["effect_score"] == -1] = 0.5
weights[low_confidence_mask] = 0.05

In [None]:
df["effect_score"].value_counts()

In [None]:
from collections import Counter

Counter(y)

### Hyperparameter optimization

In [None]:
def training_loop(X, y, weights, param, n_splits=6, progressbar=False, groups=None):
    """Train one LightGBM model per group-wise CV fold and collect held-out predictions.

    Args:
        X: 2-D feature matrix, indexed by row position.
        y: Label array aligned with the rows of `X`.
        weights: Per-row sample weights aligned with the rows of `X`.
        param: Parameter dict passed unchanged to `lgb.train`.
        n_splits: Number of `GroupKFold` folds.
        progressbar: Show a tqdm progress bar over the folds when True.
        groups: Group label for every row of `X`; rows sharing a label are kept
            on the same side of each split. When omitted, the notebook-level
            `df["protein_id"]` is used, which is what this function always did
            before the parameter existed.

    Returns:
        `(models, preds)`: the fitted boosters (one per fold) and a float array
        holding, for each row, the prediction made by the model of the fold in
        which that row was held out.
    """
    if groups is None:
        # Backward-compatible default. Passing `groups` explicitly is safer,
        # because `df` is reassigned by other cells of this notebook and may no
        # longer line up with `X` when this function is called.
        groups = df["protein_id"]

    models = []
    # Start from NaN so rows that never receive a prediction are easy to spot.
    preds = np.full(len(y), np.nan, dtype=np.float64)
    gkf = GroupKFold(n_splits=n_splits)
    for train_index, test_index in tqdm(
        gkf.split(X, y, groups=groups),
        total=n_splits,
        disable=not progressbar,
    ):
        dtrain = lgb.Dataset(
            X[train_index], label=y[train_index], weight=weights[train_index]
        )
        model = lgb.train(param, dtrain)
        preds[test_index] = model.predict(X[test_index])
        models.append(model)
    return models, preds

In [None]:
def objective(trial, X, y, weights, low_confidence_mask):
    """Optuna objective returning the cross-validated ROC AUC.

    Args:
        trial: Optuna trial used to sample the LightGBM hyperparameters.
        X, y, weights: Training data forwarded to `training_loop`.
        low_confidence_mask: Boolean mask aligned with `y`; rows where it is
            True are used for training but excluded from the score.

    Returns:
        ROC AUC of the held-out predictions on the rows not flagged by
        `low_confidence_mask`.
    """
    param = {
        "objective": "binary",
        "metric": "auc",  # 'binary_logloss',
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "num_leaves": trial.suggest_int("num_leaves", 2, 64),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 60),
    }
    # Only the held-out predictions are needed; the fitted models are discarded.
    _, preds = training_loop(X, y, weights, param)
    confident = ~np.asarray(low_confidence_mask, dtype=bool)
    return metrics.roc_auc_score(y[confident], preds[confident])

In [None]:
# study = optuna.create_study(direction="maximize")
# study.optimize(lambda trial: objective(trial, X, y, weights, low_confidence_mask), n_trials=100)

# print("Number of finished trials:", len(study.trials))
# print("Best trial:", study.best_trial.params)

In [None]:
# print("Number of finished trials:", len(study.trials))
# print("Best trial:", study.best_trial.params)

```text
Number of finished trials: 100
Best trial: {'lambda_l1': 5.233219055743764, 'lambda_l2': 0.00037944276593632404, 'num_leaves': 64, 'feature_fraction': 0.5395906681722488, 'bagging_fraction': 0.9467711664385872, 'bagging_freq': 5, 'min_child_samples': 15}

```

### Checkpoint model

In [None]:
param = {
    "objective": "binary",
    "metric": "auc",  # 'binary_logloss',
    "lambda_l1": 5.233219055743764,
    "lambda_l2": 0.00037944276593632404,
    "num_leaves": 64,
    "feature_fraction": 0.5395906681722488,
    "bagging_fraction": 0.9467711664385872,
    "bagging_freq": 5,
    "min_child_samples": 15,
}


models, preds = training_loop(X, y, weights, param, progressbar=True)

corr = stats.spearmanr(y[~low_confidence_mask], preds[~low_confidence_mask])
auc = metrics.roc_auc_score(y[~low_confidence_mask], preds[~low_confidence_mask])
print(corr[0], auc)

In [None]:
# Compare effect of weighting when using EL2 to label uncertain mutations
# (weights for: 1, -1, 0)
# 0.634985235753114 0.9065879781245562 (0.5, 0.5, 0.5)
# 0.6461637973791433 0.9137457331619272 (0.5, 0.5, 0.1)
# 0.6499893944121155 0.9161953047033986 (0.5, 0.5, 0.05) ★
# 0.6496451717121724 0.9159748951509965 (0.5, 0.5, 0.01)

### Feature elimination

In [None]:
def get_feature_index(num_features, features_to_exclude):
    """Yield the original indices of the features that have not been excluded.

    Produces, in ascending order, the first `num_features` non-negative integers
    that are not contained in `features_to_exclude`. This maps column positions
    of a feature matrix from which the excluded columns were removed back to
    their positions in the full matrix.

    Args:
        num_features: Number of indices to yield (the number of kept features).
        features_to_exclude: Iterable of excluded original indices; when a dict
            is passed, its keys are used.

    Yields:
        int: the next non-excluded index.
    """
    excluded = set(features_to_exclude)
    candidate = 0
    for _ in range(num_features):
        # A single forward scan: `candidate` never moves backwards, so indices
        # that were already skipped or yielded are not examined again.
        while candidate in excluded:
            candidate += 1
        yield candidate
        candidate += 1


list(get_feature_index(10, {1, 2, 3, 9, 10}))

In [None]:
# Maps the X_ref column index of every eliminated feature to the round in which
# it was eliminated.
features_to_exclude = {}

fe_round = -1
# One (round, spearman correlation, AUC, number of excluded features) tuple per round.
fe_round_stats = []
while len(features_to_exclude) < X_ref.shape[1]:
    fe_round += 1

    # Apply feature elimination mask
    feature_mask = np.ones(X_ref.shape[1], dtype=bool)
    feature_mask[np.array(list(features_to_exclude), dtype=int)] = False
    X = X_ref[:, feature_mask]

    # Re-train models
    models, preds = training_loop(X, y, weights, param, progressbar=True)

    # Calculate model statistics
    mask = df["effect_score"] != 0
    corr = stats.spearmanr(y[mask], preds[mask])
    auc = metrics.roc_auc_score(y[mask], preds[mask])

    # Save stats
    print(
        f"Performance at round {fe_round}:",
        corr[0],
        auc,
        len(features_to_exclude),
        X_ref.shape[1] - X.shape[1],
    )
    fe_round_stats.append((fe_round, corr[0], auc, len(features_to_exclude)))

    # Find new features to eliminate
    # Importances are summed over the per-fold models.
    feature_importance_split = np.vstack(
        [model.feature_importance("split") for model in models]
    ).sum(axis=0)
    feature_importance_gain = np.vstack(
        [model.feature_importance("gain") for model in models]
    ).sum(axis=0)

    # The importance arrays are indexed by column position in the masked `X`;
    # `get_feature_index` translates those positions back to X_ref indices.
    feature_df = pd.DataFrame(
        {
            "feature_idx": list(
                get_feature_index(len(feature_importance_split), features_to_exclude)
            ),
            "feature_importance_split": feature_importance_split,
            "feature_importance_gain": feature_importance_gain,
        }
    )

    # Keep the features with the lowest split importance and, among those, the
    # ones with the lowest gain importance; all ties are eliminated together.
    min_split = feature_df["feature_importance_split"].min()
    feature_df = feature_df[feature_df["feature_importance_split"] == min_split]
    min_gain = feature_df["feature_importance_gain"].min()
    feature_df = feature_df[feature_df["feature_importance_gain"] == min_gain]
    new_features_to_exclude = feature_df["feature_idx"].values.tolist()

    for feature_idx in new_features_to_exclude:
        assert feature_idx not in features_to_exclude
        features_to_exclude[feature_idx] = fe_round

    # NOTE(review): this unconditional break stops the loop after the first
    # round, so only round 0 is ever evaluated — presumably work in progress;
    # confirm before relying on `fe_round_stats`.
    break
#     if (feature_df["feature_importance_split"] == 0

In [None]:
(feature_importance_split == 0).sum()

### Checkpoint model

In [None]:
feature_df

In [None]:
# Play with weights
# # Remove 0 from training set
# 0.6445921081338579 0.9127408076311398
# 0.6449644074377229 0.9129791957737381 (weights: 0.5, 0.5) [0.6519079506754358 0.9174252378639102]
# 0.6445443685870316 0.9127102393712581 (weights: 0.4, 0.4)
# 0.6455538992488725 0.9133566551362649 (weights: 0.3, 0.3) [0.9171164733046693]
# 0.6452602748674968 0.9131686435817019 (weights: 0.2, 0.2)
# 0.6443443914762861 0.9125821913961814 (weights: 0.1, 0.1)


# # Keep 0 in training set
# 0.6279471936537289 0.9020828498944281
# 0.6443655135154522 0.9125957161157062 (weights: 1.0, 1.0, 0.05)
# 0.6472835979832177 0.9144642039931771 (weights: 1.0, 1.0, 0.01) ★
# 0.6460811745050736 0.9136942764200848 (weights: 0.5, 0.5, 0.01)
# 0.6440994599372241 0.9124253585091959 (weights: 0.5, 0.5, 0.05)

In [None]:
# Rows with Rosetta features
# 0.6511097888905689 0.9069517496895931  (no Rosetta)
# 0.6660185149966301 0.9162698896991949  (all Rosetta columns)

In [None]:
# Rows with AF features
# 0.6658121877163097 0.9006706783116625  (no AF)
# 0.6996884055147549 0.9210566180922061  (without AF changes)
# 0.7041584812816902 0.9237466083367306  (with AF changes)

In [None]:
# Rows with both AF and Rosetta features
# 0.6598501937428695 0.8865273801086971
# 0.6724664771013962 0.8939177529532045  (with Rosetta)
# 0.7044603810803847 0.9126591582020247  (with AF and Rosetta)

In [None]:
# Score every row of `result_df` that has all model inputs available.
prediction_df = result_df.dropna(subset=scalar_features + vector_features).copy()

# Same layout as the training matrix: scalar columns first, then the
# concatenated vector features.
scalar_block = prediction_df[scalar_features].values
vector_block = np.hstack(
    [np.vstack(prediction_df[col].values) for col in vector_features]
)
X = np.c_[scalar_block, vector_block]

# One prediction column per fold model; the final score is their mean.
fold_pred_columns = []
for model_idx, model in enumerate(models):
    fold_column = f"pred_{model_idx}"
    prediction_df[fold_column] = model.predict(X)
    fold_pred_columns.append(fold_column)

prediction_df["pred"] = prediction_df[fold_pred_columns].mean(axis=1)

In [None]:
# Attach the model predictions to every (genomic variant, transcript variant)
# pair. `is_missense` is True when the variant has an `hgvs` value. The lambda
# argument is named `frame` so that it does not shadow the notebook-level `df`.
validation_submission_df = validation_df.assign(
    is_missense=lambda frame: frame["hgvs"].notnull()
).merge(prediction_df, left_on=["hgvs"], right_on=["mutation_id"], how="left")

# The left merge must not duplicate rows.
assert len(validation_submission_df) == len(validation_df)

# Average the predictions of each genomic variant. The "mean" aggregation skips
# missing values, and a variant without any prediction ends up as NaN, which is
# then replaced by 0.0. The string form avoids handing a NumPy callable to
# `agg`, which recent pandas versions warn about.
out = (
    validation_submission_df.groupby("hgvs_g")
    .agg({"pred": "mean", "is_missense": "max"})
    .fillna(0.0)
    .reset_index()
)

# Rescale so that the largest score is 1. The rescaling is skipped when no
# prediction is positive, because dividing by zero would turn every score into NaN.
max_pred = out["pred"].max()
if max_pred > 0:
    out["pred"] = out["pred"] / max_pred

out.head()

In [None]:
plt.hist(out[out["pred"] > 0]["pred"], bins=100)
None

In [None]:
out[["hgvs_g", "pred"]].to_csv(
    NOTEBOOK_DIR.joinpath("submission-valid-5.tsv"), index=False, sep="\t"
)

In [None]:
# 0.6516511403259807 0.9172607988182891 (without protein features)
# 0.6511556042045329 0.9169435005200913 (with protein features)

In [None]:
# 0.6253383274259249 0.9004123585248509

# 0.21795514450655545 0.8581850558176667
# regression all: 0.22461305628504954 0.8288936050495531

In [None]:
model

In [None]:
clf.importance_type

In [None]:
clf.feature_importances_

In [None]:
clf.booster_.feature_importance()

In [None]:
important_features = clf.booster_.feature_importance() > 0

In [None]:
from collections import Counter

import matplotlib.pyplot as plt

importantces = np.array(clf.booster_.feature_importance("gain"))
importantces = np.clip(importantces, -1, 100)

plt.hist(importantces, bins=100)
None

In [None]:
plt.plot(clf.feature_importances_, importantces, "r.")

In [None]:
import sys

paths = [
    "/gpfs/fs0/scratch/p/pmkim/strokach/workspace/shap-0.39.0",
    "/gpfs/fs0/scratch/p/pmkim/strokach/workspace/shap-hypetune",
    "/gpfs/fs0/scratch/p/pmkim/strokach/workspace/slicer-0.0.7",
]
for p in paths:
    if p not in sys.path:
        sys.path.insert(0, p)

from lightgbm import LGBMClassifier
from scipy import stats
from shaphypetune import BoostBoruta, BoostRFE, BoostSearch

In [None]:
param_grid = {
    "n_estimators": 150,
    "learning_rate": [0.2, 0.1],
    "num_leaves": [25, 30, 35],
    "max_depth": [10, 12],
}

model = BoostBoruta(
    LGBMClassifier(),
    param_grid=param_grid,
    max_iter=100,
    perc=100,
    importance_type="shap_importances",
    train_importance=False,
)
model.fit(
    X_training,
    y_training,
    eval_set=[(X_testing, y_testing)],
    early_stopping_rounds=6,
    verbose=0,
)

In [None]:
import lightgbm as lgb
from sklearn import model_selection
from sklearn.decomposition import PCA
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler

score_columns = [
    "el2_score",
    "proteinsolver_core_score_change",
    "protbert_core_score_change",
    #     "rosetta_dg_change",
    "alphafold_core_scores_residue_plddt_wt",
    #     "alphafold_core_scores_protein_plddt_wt",
    #     "alphafold_core_scores_protein_max_predicted_aligned_error_wt",
    #     "alphafold_core_scores_proten_ptm_wt",
]

col = "protbert_core_features_residue_change"

df = result_df.dropna(subset=score_columns + [col, "effect_score"]).sample(
    frac=1, random_state=0
)
df = df[df["effect_score"].isin([-1, 1])].reset_index(drop=True)

amino_acids = list("ARNDCEQGHILKMFPSTWYV")

# One-hot encode the wild-type and the mutant residue of every mutation.
# Assumes `mutation` strings look like "A123G" (wild-type residue, position,
# mutant residue) — TODO confirm. The wild-type residue is the first character
# and the mutant residue the last one; the previous code used the first
# character for both columns, so "aa_mut_onehot" duplicated "aa_wt_onehot".
df["aa_wt_onehot"] = pd.get_dummies(df["mutation"].str[0]).apply(list, axis=1)
df["aa_mut_onehot"] = pd.get_dummies(df["mutation"].str[-1]).apply(list, axis=1)

df["preds"] = np.nan
df["pca_preds"] = np.nan

X = np.vstack(df[col].values)

Z = None
Z = np.c_[np.vstack(df["aa_wt_onehot"].values), np.vstack(df["aa_mut_onehot"].values)]
Z = np.c_[
    df[
        ["el2_score", "proteinsolver_core_score_change", "protbert_core_score_change"]
    ].values,
    Z,
]

y = (df["effect_score"] > 0).values.astype(int)


def select_features(X_training, X_testing):
    """Standardise both matrices and project them onto PCA components.

    The scaler and the PCA are fitted on `X_training` only and then applied to
    both inputs, so no statistics of `X_testing` enter the fit.

    Returns:
        The transformed `(X_training, X_testing)` pair.
    """
    scaler = StandardScaler().fit(X_training)
    scaled_training = scaler.transform(X_training)
    scaled_testing = scaler.transform(X_testing)

    # A fractional `n_components` selects components by explained variance.
    pca = PCA(0.9, svd_solver="full").fit(scaled_training)
    return pca.transform(scaled_training), pca.transform(scaled_testing)


params = dict()


# Group-wise cross-validation comparing two feature representations per fold:
# the raw vector feature `X` and its PCA-reduced version (see `select_features`).
# In both cases the extra features `Z` are appended when `Z` is not None.
n_splits = 6
gkf = GroupKFold(n_splits=n_splits)
for train_index, test_index in tqdm(
    gkf.split(X, y, groups=df["protein_id"]), total=n_splits
):
    # Variant 1: raw features.
    X_training, X_testing = X[train_index], X[test_index]
    y_training, y_testing = y[train_index], y[test_index]
    if Z is not None:
        Z_training, Z_testing = Z[train_index], Z[test_index]
        X_training = np.c_[X_training, Z_training]
        X_testing = np.c_[X_testing, Z_testing]

    clf = lgb.LGBMClassifier(**params)
    clf.fit(X_training, y_training, eval_set=[(X_testing, y_testing)], verbose=False)
    # `df` was re-indexed with reset_index(drop=True) above, so the positional
    # `test_index` values are also valid `.loc` labels.
    df.loc[test_index, "preds"] = clf.predict_proba(X_testing)[:, 1]

    # Variant 2: PCA-reduced features; `Z` is appended after the reduction and
    # is therefore not scaled or projected.
    X_training, X_testing = select_features(X[train_index], X[test_index])
    y_training, y_testing = y[train_index], y[test_index]
    if Z is not None:
        Z_training, Z_testing = Z[train_index], Z[test_index]
        X_training = np.c_[X_training, Z_training]
        X_testing = np.c_[X_testing, Z_testing]

    clf = lgb.LGBMClassifier(**params)
    clf.fit(X_training, y_training, eval_set=[(X_testing, y_testing)], verbose=False)
    df.loc[test_index, "pca_preds"] = clf.predict_proba(X_testing)[:, 1]

# Held-out performance with the raw features.
corr = stats.spearmanr(df["effect_score"], df["preds"])
auc = metrics.roc_auc_score(df["effect_score"] > 0, df["preds"])
print(corr[0], auc)

# Held-out performance with the PCA-reduced features.
corr = stats.spearmanr(df["effect_score"], df["pca_preds"])
auc = metrics.roc_auc_score(df["effect_score"] > 0, df["pca_preds"])
print(corr[0], auc)

In [None]:
# 0.6585175611306802 0.88576012420753
# 0.6547985025461474 0.8835814968157445

In [None]:
# `X_training` only holds the rows selected by `train_index` in the last
# cross-validation fold, so the labels have to be restricted to the same rows.
# Comparing against the full `df["effect_score"]` column fails because the two
# inputs have different lengths.
effect_training = df["effect_score"].values[train_index]
assert len(effect_training) == X_training.shape[0]

# Rank every column of `X_training` by how well it tracks the effect score on
# its own.
evals = []
for i in tqdm(range(X_training.shape[1])):
    x1 = X_training[:, i]

    corr = stats.spearmanr(effect_training, x1)
    auc = metrics.roc_auc_score(effect_training > 0, x1)
    evals.append((i, *corr, auc))

evals_df = pd.DataFrame(evals, columns=["i", "corr", "pvalue", "auc"])
# Sort by absolute correlation; the helper column is dropped again afterwards.
evals_df["corr_abs"] = evals_df["corr"].abs()
evals_df = evals_df.sort_values("corr_abs", ascending=False)
del evals_df["corr_abs"]

display(evals_df.head(10))

In [None]:
# Split first, then fit: the classifier was previously fitted on `X_train` /
# `y_train` before the split below had defined them, which fails on a fresh
# kernel.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=0
)
clf = lgb.LGBMClassifier().fit(X_train, y_train)

In [None]:
train_result_df.head()

In [None]:
import matplotlib.pyplot as plt

plt.hist(x1, bins=100)
plt.hist(x2, bins=100)

None

In [None]:
x = df["alphafold_core_features_residue_experimentally_resolved_wt"].apply(np.mean)

corr = stats.spearmanr(df["effect_score"], x)
auc = metrics.roc_auc_score(df["effect_score"] > 0, x)

corr, auc

In [None]:
import lightgbm as lgb

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

col = "alphafold_core_features_residue_msa_first_row_change"

x = np.vstack(df[col].values)
x = StandardScaler().fit_transform(x)
x = PCA(0.9, svd_solver="full").fit_transform(x)

evals = []
for i in tqdm(range(x.shape[1])):
    x1 = x[:, i]

    corr = stats.spearmanr(df["effect_score"], x1)
    auc = metrics.roc_auc_score(df["effect_score"] > 0, x1)
    evals.append((i, *corr, auc))

evals_df = pd.DataFrame(evals, columns=["i", "corr", "pvalue", "auc"])
evals_df["corr_abs"] = evals_df["corr"].abs()
evals_df = evals_df.sort_values("corr_abs", ascending=False)
del evals_df["corr_abs"]

display(evals_df.head(10))

In [None]:
x.shape

In [None]:
# col = "alphafold_core_features_residue_experimentally_resolved_wt"  # 0.19 [37]
# col = "alphafold_core_features_residue_experimentally_resolved_change"  # 0.11 [37]
# col = "alphafold_core_features_residue_predicted_lddt_wt"  # 0.17 [50]
# col = "alphafold_core_features_residue_predicted_lddt_change"  # 0.04 [50]
# col = "alphafold_core_features_residue_msa_first_row_wt"  # 0.17 [256]
# col = "alphafold_core_features_residue_msa_first_row_change"  # 0.21 [256]
# col = "alphafold_core_features_residue_single_wt"  # 0.20 [384]
# col = "alphafold_core_features_residue_single_change"  # 0.15 [384]
# col = "alphafold_core_features_residue_structure_module_wt"  # 0.18 [384]
col = "alphafold_core_features_residue_structure_module_change"  # 0.05 [384]

col = "protbert_core_features_residue_change"  # 0.13 [1024]

# col = "alphafold_core_features_protein_experimentally_resolved_wt"  #
# col = "alphafold_core_features_protein_predicted_lddt_wt"  #
# col = "alphafold_core_features_protein_msa_first_row_wt"  #
# col = "alphafold_core_features_protein_single_wt"  #
# col = "alphafold_core_features_protein_structure_module_wt"  #

# Take the vector length from the frame that is evaluated below. The previous
# `result_df[col][0]` was a label lookup on the unfiltered frame, whose row 0 is
# not guaranteed to be among the rows kept in `df`.
num_features = len(df[col].iloc[0])

# Rank every dimension of the vector feature `col` by how well it tracks the
# effect score on its own.
evals = []
for i in tqdm(range(num_features)):
    # `.str[i]` picks the i-th element of each row's vector.
    x1 = df[col].str[i]

    corr = stats.spearmanr(df["effect_score"], x1)
    auc = metrics.roc_auc_score(df["effect_score"] > 0, x1)
    evals.append((i, *corr, auc))

evals_df = pd.DataFrame(evals, columns=["i", "corr", "pvalue", "auc"])
# Sort by absolute correlation; the helper column is dropped again afterwards.
evals_df["corr_abs"] = evals_df["corr"].abs()
evals_df = evals_df.sort_values("corr_abs", ascending=False)
del evals_df["corr_abs"]

display(evals_df.head(10))

In [None]:
scores_proten_ptm

In [None]:
validation_df.head()

In [None]:
validation_submission_df = validation_df.assign(
    is_missense=lambda df: df["hgvs"].notnull()
).merge(result_df, left_on=["hgvs"], right_on=["mutation_id"], how="left")

assert len(validation_submission_df) == len(validation_df)

In [None]:
# Average `el2_score` over the transcript-level variants of each genomic
# variant. The "mean" aggregation skips missing values, and a variant without
# any score ends up as NaN, which is then replaced by 0.0. The string form
# avoids handing a NumPy callable to `agg`, which recent pandas versions warn
# about.
out = (
    validation_submission_df.groupby("hgvs_g")
    .agg({"el2_score": "mean", "is_missense": "max"})
    .fillna(0.0)
    .reset_index()
)

out.head()

In [None]:
out[["hgvs_g", "el2_score"]].to_csv(
    NOTEBOOK_DIR.joinpath("submission-valid.tsv"), index=False, sep="\t"
)

In [None]:
validation_submission_df.head()

In [None]:
df.columns