In [2]:
from pathlib import Path
import joblib

import pandas as pd
import numpy as np
from scipy.stats import spearmanr

from predictability.models import RITARegressor
from predictability.utils import sel_kfold
from predictability.constants import BINARY_RESIDUE_FEATURES, PROJECT_ROOT, DATA_ROOT

In [3]:
results_dir = Path(PROJECT_ROOT / "results/amylase/singles/rita")
results_dir.mkdir(exist_ok=True, parents=True)
seed = 42

In [4]:
data = pd.read_csv(DATA_ROOT / "amylase/singles.csv")
property = "stain_activity"

In [5]:
rita_model = RITARegressor()

In [6]:
# Uncomment to generate embeddings
embeddings = rita_model.embed(data)
sequence_embeddings = {sequence: embedding for sequence, embedding in zip(data["sequence"], embeddings)}
joblib.dump(sequence_embeddings, DATA_ROOT / "amylase/single_sequence_embeddings.joblib")

embeddings = joblib.load(DATA_ROOT / "amylase/single_sequence_embeddings.joblib")
embeddings = np.vstack(embeddings.values())
data["embedding_index"] = np.arange(len(data))

8500it [24:23:29, 10.33s/it]  
  embeddings = np.vstack(embeddings.values())


In [6]:
experiment_results = {
    "fold": [],
    "feature": [],
    "belongs": [],
    "spearman_val": [],
    "model": [],
}
prediction_dfs = []
for feature in BINARY_RESIDUE_FEATURES:
    for belongs in [0, 1]:
        df = data[data[feature]==belongs].reset_index()
        for i, (train_inds, test_inds) in enumerate(sel_kfold(df, position_col="residue_number", k=10)):
            split_ids = np.empty(len(df), dtype=object)
            split_ids[train_inds] = "train"
            split_ids[test_inds] = "valid"
            df["split"] = split_ids
            train_embeddings_indices = df[df["split"] == "train"]["embedding_index"].values
            fold_train_embeddings = embeddings[train_embeddings_indices]
            valid_embeddings_indices = df[df["split"] == "valid"]["embedding_index"].values
            fold_valid_embeddings = embeddings[valid_embeddings_indices]
            rita_model.fit(df[df["split"] == "train"], property, embeddings=fold_train_embeddings)
            predictions_val = rita_model.predict(df[df["split"] == "valid"], embeddings=fold_valid_embeddings)
            spearman_val = spearmanr(df[df["split"] == "valid"][property].values, predictions_val)[0]
            experiment_results["fold"].append(i)
            experiment_results["feature"].append(feature)
            experiment_results["belongs"].append(belongs)
            experiment_results["spearman_val"].append(spearman_val)
            experiment_results["model"].append("RITARegressor")

[32m2023-09-27 09:38:11.435[0m | [1mINFO    [0m | [36mpredictability.models[0m:[36mfit[0m:[36m182[0m - [1mFitting ridge[0m
[32m2023-09-27 09:38:11.853[0m | [1mINFO    [0m | [36mpredictability.models[0m:[36mfit[0m:[36m182[0m - [1mFitting ridge[0m
[32m2023-09-27 09:38:12.315[0m | [1mINFO    [0m | [36mpredictability.models[0m:[36mfit[0m:[36m182[0m - [1mFitting ridge[0m
[32m2023-09-27 09:38:12.764[0m | [1mINFO    [0m | [36mpredictability.models[0m:[36mfit[0m:[36m182[0m - [1mFitting ridge[0m
[32m2023-09-27 09:38:13.238[0m | [1mINFO    [0m | [36mpredictability.models[0m:[36mfit[0m:[36m182[0m - [1mFitting ridge[0m
[32m2023-09-27 09:38:13.691[0m | [1mINFO    [0m | [36mpredictability.models[0m:[36mfit[0m:[36m182[0m - [1mFitting ridge[0m
[32m2023-09-27 09:38:14.153[0m | [1mINFO    [0m | [36mpredictability.models[0m:[36mfit[0m:[36m182[0m - [1mFitting ridge[0m
[32m2023-09-27 09:38:14.617[0m | [1mINFO    [0m | 

In [9]:
results_df = pd.DataFrame(experiment_results)
results_df.to_csv(results_dir / "scores.csv")