In [1]:
import joblib

import pandas as pd
import numpy as np
from scipy.stats import spearmanr

from predictability.models import RITARegressor
from predictability.utils import sel_kfold
from predictability.constants import BINARY_RESIDUE_FEATURES, PROJECT_ROOT, DATA_ROOT

In [2]:
# Output directory for this notebook's results (results/erk2/singles/rita under the project root).
results_dir = PROJECT_ROOT / "results/erk2/singles/rita"
# parents=True so the cell also works on a fresh checkout where the intermediate
# "results/erk2/singles" folders have not been created yet.
results_dir.mkdir(exist_ok=True, parents=True)

# Loading data

In [5]:
# Read the erk2 singles table from the project data directory.
singles_csv_path = DATA_ROOT / "erk2/singles.csv"
data = pd.read_csv(singles_csv_path)

In [7]:
# Instantiate the RITA regressor with its default arguments; this single instance is
# used below both to embed the sequences and to fit/predict in every cross-validation fold.
rita_model = RITARegressor()

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


In [8]:
# Per-sequence embeddings are cached on disk and only computed when the cache file is
# missing. Delete the file to force the embeddings to be regenerated.
embeddings_path = DATA_ROOT / "erk2/single_sequence_embeddings.joblib"
if embeddings_path.exists():
    # NOTE(review): joblib files are pickle-based; only load caches written by this project.
    sequence_embeddings = joblib.load(embeddings_path)
else:
    sequence_embeddings = dict(zip(data["sequence"], rita_model.embed(data)))
    joblib.dump(sequence_embeddings, embeddings_path)

# Build the matrix by looking up each row's sequence, so that row i of `embeddings`
# always belongs to row i of `data`. This does not depend on the dict's insertion order
# or on the sequences being unique, and it passes np.vstack a real list (a dict_values
# view is not a sequence). A sequence missing from a stale cache raises KeyError.
embeddings = np.vstack([sequence_embeddings[sequence] for sequence in data["sequence"]])
# Positional pointer from each row of `data` into `embeddings`; it survives the
# filtering and index resets applied to `data` further down.
data["embedding_index"] = np.arange(len(data))

# Running models

In [10]:
# Cross-validated evaluation of the RITA regressor, run separately on the rows that do
# (belongs=1) and do not (belongs=0) carry each binary residue feature.
# NOTE: `property` shadows the Python builtin of the same name; the name is kept so that
# any other cell referring to it keeps working.
property = "DMS_score"
experiment_results_dir = results_dir
experiment_results_dir.mkdir(exist_ok=True, parents=True)
# One entry per (feature, belongs, fold) is appended to each list; the lists stay aligned.
experiment_results = {
    "fold": [],
    "feature": [],
    "belongs": [],
    "spearman_val": [],
    "model": []
}
for feature in BINARY_RESIDUE_FEATURES:
    print(f"Feature: {feature}")
    for belongs in [0, 1]:
        # Rows of this feature group that have a target value. `subset` is passed as a
        # list because a bare string is only accepted by newer pandas releases.
        df = data[data[feature] == belongs].dropna(subset=[property]).reset_index()
        print("Belongs:", ["no", "yes"][belongs])
        print(f"Total samples: {len(df)}")
        for i, (train_inds, test_inds) in enumerate(sel_kfold(df, position_col="residue_number", k=10)):
            # Re-label every row for the current fold.
            split_ids = np.empty(len(df), dtype=object)
            split_ids[train_inds] = "train"
            split_ids[test_inds] = "valid"
            df["split"] = split_ids

            # Filter once per fold instead of repeating the same boolean mask five times.
            train_df = df[df["split"] == "train"]
            valid_df = df[df["split"] == "valid"]

            # "embedding_index" points into the global embedding matrix built from `data`.
            fold_train_embeddings = embeddings[train_df["embedding_index"].values]
            fold_valid_embeddings = embeddings[valid_df["embedding_index"].values]

            rita_model.fit(train_df, property, embeddings=fold_train_embeddings)
            predictions_val = rita_model.predict(valid_df, embeddings=fold_valid_embeddings)
            # spearmanr returns (correlation, p-value); only the correlation is recorded.
            spearman_val = spearmanr(valid_df[property].values, predictions_val)[0]

            experiment_results["fold"].append(i)
            experiment_results["feature"].append(feature)
            experiment_results["belongs"].append(belongs)
            experiment_results["spearman_val"].append(spearman_val)
            experiment_results["model"].append("RITARegressor")

2023-09-01 11:57:47.543 | INFO     | unpredictability.models:fit:197 - Fitting ridge


Feature: is_buried
Belongs: no
Total samples: 3358


2023-09-01 11:57:48.229 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:57:49.093 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:57:49.972 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:57:50.771 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:57:51.638 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:57:52.362 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:57:53.155 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:57:53.932 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:57:54.704 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:57:55.754 | INFO     | unpredictability.models:fit:197 - Fitting ridge


Belongs: yes
Total samples: 3357


2023-09-01 11:57:56.533 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:57:57.264 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:57:57.992 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:57:58.727 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:57:59.456 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:00.172 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:00.957 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:01.692 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:02.394 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:03.227 | INFO     | unpredictability.models:fit:197 - Fitting ridge


Feature: is_connected
Belongs: no
Total samples: 3870


2023-09-01 11:58:04.144 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:05.167 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:06.206 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:07.132 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:08.059 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:08.835 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:09.728 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:10.772 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:11.584 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:12.650 | INFO     | unpredictability.models:fit:197 - Fitting ridge


Belongs: yes
Total samples: 2845


2023-09-01 11:58:13.222 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:13.843 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:14.345 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:14.938 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:15.532 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:16.147 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:16.687 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:17.262 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:17.847 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:18.660 | INFO     | unpredictability.models:fit:197 - Fitting ridge


Feature: is_close_to_as
Belongs: no
Total samples: 3360


2023-09-01 11:58:19.628 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:20.442 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:21.188 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:21.893 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:22.624 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:23.325 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:24.068 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:24.763 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:25.507 | INFO     | unpredictability.models:fit:197 - Fitting ridge


Belongs: yes
Total samples: 3355


2023-09-01 11:58:26.457 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:27.121 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:27.852 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:28.585 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:29.374 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:30.086 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:30.846 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:31.572 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:32.304 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:33.014 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:33.787 | INFO     | unpredictability.models:fit:197 - Fitting ridge


Feature: is_secondary
Belongs: no
Total samples: 2735


2023-09-01 11:58:34.301 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:34.867 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:35.463 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:35.988 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:36.533 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:37.074 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:37.611 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:38.158 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:38.676 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:39.471 | INFO     | unpredictability.models:fit:197 - Fitting ridge


Belongs: yes
Total samples: 3980


2023-09-01 11:58:40.403 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:41.749 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:42.720 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:43.753 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:44.785 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:45.779 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:46.695 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:47.578 | INFO     | unpredictability.models:fit:197 - Fitting ridge
2023-09-01 11:58:48.664 | INFO     | unpredictability.models:fit:197 - Fitting ridge


In [None]:
# Collect the per-fold records into a table and write it next to the other experiment outputs.
results_csv_path = experiment_results_dir / "results.csv"
results_df = pd.DataFrame(experiment_results)
results_df.to_csv(results_csv_path)