In [1]:
from pathlib import Path
import joblib

import pandas as pd
import numpy as np
from scipy.stats import spearmanr

from sklearn.model_selection import KFold

from predictability.models import RITARegressor
from predictability.constants import PROJECT_ROOT, DATA_ROOT

In [2]:
results_dir = Path(PROJECT_ROOT / "results/AMY_BACSU/feature_utilization/potts")
results_dir.mkdir(exist_ok=True, parents=True)
seed = 42

In [None]:
data = pd.read_csv(DATA_ROOT / "AMY_BACSU/combinatorials.csv")
property = "stain_activity"

In [3]:
# Uncomment to generate embeddings
#embeddings = rita_model.embed(data)
#sequence_embeddings = {sequence: embedding for sequence, embedding in zip(data["sequence"], embeddings)}
#joblib.dump(sequence_embeddings, DATA_ROOT / "AMY_BACSU/combinatorial_sequence_embeddings.joblib")

embeddings = joblib.load(DATA_ROOT / "AMY_BACSU/combinatorial_sequence_embeddings.joblib")
embeddings = np.vstack(embeddings.values())
data["embedding_index"] = np.arange(len(data))

  embeddings = np.vstack(embeddings.values())


In [4]:
rita_model = RITARegressor()

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


# Eval on held out bins

In [9]:
experiment_results_extrapolate = {
    "n_val_samples": [],
    "spearman_train": [],
    "spearman_val": [],
    "eval": []
}
df = data

for bin in df["bin_label"].unique():
    df["split"] = df["bin_label"].map(lambda x: "valid" if x==bin else "train")
    train_embeddings_indices = df[df["split"] == "train"]["embeding_index"].values
    fold_train_embeddings = embeddings[train_embeddings_indices]
    valid_embeddings_indices = df[df["split"] == "valid"]["embeding_index"].values
    fold_valid_embeddings = embeddings[valid_embeddings_indices]
    rita_model.fit(df[df["split"] == "train"], property, embeddings=fold_train_embeddings)
    predictions_train = rita_model.predict(df[df["split"] == "train"], embeddings=fold_train_embeddings)
    predictions_val = rita_model.predict(df[df["split"] == "valid"], embeddings=fold_valid_embeddings)
    spearman_train = spearmanr(df[df["split"] == "train"][property].values, predictions_train)[0]
    spearman_val = spearmanr(df[df["split"] == "valid"][property].values, predictions_val)[0]
    experiment_results_extrapolate["n_val_samples"].append(len(df[df["split"] == "valid"]))
    experiment_results_extrapolate["spearman_train"].append(spearman_train)
    experiment_results_extrapolate["spearman_val"].append(spearman_val)
    experiment_results_extrapolate["eval"] = "extrapolate"

2023-06-13 15:37:33.564 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:34.173 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:34.877 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:35.628 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:36.437 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:37.147 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:37.872 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:38.695 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:39.466 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:40.393 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:41.153 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:41.913 | INFO     | unpredictability.models:fit:82 - Fittin

# Eval on randomly held out data

In [10]:
experiment_results_random = {
    "n_val_samples": [],
    "spearman_train": [],
    "spearman_val": [],
    "eval": []
}
df = data

kfold = KFold(n_splits=16, random_state=42, shuffle=True)
kfold.get_n_splits(df)
print(kfold)

for i, (train_index, val_index) in enumerate(kfold.split(df)):
    df.loc[train_index, "split"] = "train"
    df.loc[val_index, "split"] = "valid"
    train_embeddings_indices = df[df["split"] == "train"]["embeding_index"].values
    fold_train_embeddings = embeddings[train_embeddings_indices]
    valid_embeddings_indices = df[df["split"] == "valid"]["embeding_index"].values
    fold_valid_embeddings = embeddings[valid_embeddings_indices]
    rita_model.fit(df[df["split"] == "train"], property, embeddings=fold_train_embeddings)
    predictions_train = rita_model.predict(df[df["split"] == "train"], embeddings=fold_train_embeddings)
    predictions_val = rita_model.predict(df[df["split"] == "valid"], embeddings=fold_valid_embeddings)
    spearman_train = spearmanr(df[df["split"] == "train"][property].values, predictions_train)[0]
    spearman_val = spearmanr(df[df["split"] == "valid"][property].values, predictions_val)[0]
    experiment_results_random["n_val_samples"].append(len(df[df["split"] == "valid"]))
    experiment_results_random["spearman_train"].append(spearman_train)
    experiment_results_random["spearman_val"].append(spearman_val)
    experiment_results_random["eval"] = "random"

2023-06-13 15:37:46.500 | INFO     | unpredictability.models:fit:82 - Fitting ridge


KFold(n_splits=16, random_state=42, shuffle=True)


2023-06-13 15:37:47.232 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:47.970 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:48.734 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:49.550 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:50.285 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:51.082 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:51.831 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:52.593 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:53.313 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:54.039 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:54.804 | INFO     | unpredictability.models:fit:82 - Fitting ridge
2023-06-13 15:37:55.561 | INFO     | unpredictability.models:fit:82 - Fittin

In [11]:
results_df = pd.concat([pd.DataFrame(experiment_results_extrapolate), pd.DataFrame(experiment_results_random)])

In [12]:
results_df.to_csv(results_dir / "scores.csv")