In [1]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr

from predictability.models import PottsRegressor
from predictability.constants import BINARY_RESIDUE_FEATURES, PROJECT_ROOT, DATA_ROOT
from predictability.utils import update_environment_variables, sel_kfold

In [2]:
# Only run this cell when the Jupyter kernel has not picked up the shell's
# environment variables (EVs); otherwise it can be skipped.
# NOTE(review): presumably this loads the variables from the zsh configuration —
# confirm against predictability.utils.update_environment_variables.
update_environment_variables("zsh")

In [3]:
# Output directory for this experiment; created (with any missing parents)
# up front so later cells can write into it without further checks.
results_dir = PROJECT_ROOT.joinpath("results/erk2/singles/potts")
results_dir.mkdir(parents=True, exist_ok=True)

# Loading data

In [6]:
# Read the single-mutant table and discard every row that has a missing
# value in any column.
singles_csv = DATA_ROOT / "erk2/singles.csv"
singles_raw = pd.read_csv(singles_csv)
data = singles_raw.dropna()

In [9]:
# Build the regressor from the alignment file; the constructor is handed the
# path as a plain string rather than a Path object.
msa_file = DATA_ROOT / "erk2/msa.a2m"
potts_model = PottsRegressor(msa_path=str(msa_file))

2023-09-01 11:43:08.253 | INFO     | unpredictability.models:__init__:147 - Loading Potts model locally from: /Users/floris/work/phd/projects/protein-engineering-benchmark/notebooks/results/unpredictability/SEL/mk01_human/potts_regressor


In [10]:
# Evaluate the Potts regressor on residue subsets: for every binary residue
# feature the data is split into the rows where the flag is 0 and the rows
# where it is 1, and each subset is scored fold by fold with the Spearman
# correlation between measured and predicted values on the held-out rows.
target_col = "DMS_score"  # column being predicted (named so it does not shadow the builtin `property`)
n_folds = 10  # number of folds requested from sel_kfold

experiment_results_dir = results_dir
experiment_results_dir.mkdir(exist_ok=True, parents=True)

# Column-oriented record store: one entry is appended to every list per
# (feature, belongs, fold) combination.
experiment_results = {
    "fold": [],
    "feature": [],
    "belongs": [],
    "spearman_val": [],
    "model": []
}

for feature in BINARY_RESIDUE_FEATURES:
    for belongs in [0, 1]:
        # reset_index() gives the subset a fresh 0..n-1 index, so row labels and
        # row positions coincide when the fold indices are applied below.
        # `subset` is passed as a list, which every pandas version accepts.
        df = data[data[feature] == belongs].dropna(subset=[target_col]).reset_index()
        # Include `belongs` so the two passes over a feature can be told apart.
        print(f"{feature} (belongs={belongs})")
        # sel_kfold yields (train indices, test indices) pairs.
        # NOTE(review): presumably folds are formed over `residue_number` so a
        # position never appears on both sides of a split — confirm in
        # predictability.utils.sel_kfold.
        for i, (train_inds, test_inds) in enumerate(
            sel_kfold(df, position_col="residue_number", k=n_folds)
        ):
            print(f"Fold: {i}")
            # Label every row with its role in this fold; rows in neither index
            # set keep the default None and are used by neither side.
            split_ids = np.empty(len(df), dtype=object)
            split_ids[train_inds] = "train"
            split_ids[test_inds] = "valid"
            df["split"] = split_ids

            # Slice each side once and reuse it.
            train_df = df[df["split"] == "train"]
            valid_df = df[df["split"] == "valid"]

            potts_model.fit(train_df, target_col)
            predictions = potts_model.predict(valid_df)
            # spearmanr returns (correlation, p-value); only the correlation is kept.
            spearman = spearmanr(valid_df[target_col].values, predictions)[0]

            experiment_results["fold"].append(i)
            experiment_results["feature"].append(feature)
            experiment_results["belongs"].append(belongs)
            experiment_results["spearman_val"].append(spearman)
            experiment_results["model"].append("PottsRegressor")

is_buried
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
is_buried
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
is_connected
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
is_connected
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
is_close_to_as
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
is_close_to_as
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
is_secondary
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
is_secondary
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9


In [13]:
# Turn the per-fold records into a table and persist it next to the other
# outputs of this experiment.
results_df = pd.DataFrame.from_dict(experiment_results)
results_csv = experiment_results_dir / "results.csv"
results_df.to_csv(results_csv)