In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
from scipy.stats import spearmanr

from predictability.models import ResidueAgnosticRegressor
from predictability.utils import sel_kfold, read_fasta
from predictability.constants import BINARY_RESIDUE_FEATURES, PROJECT_ROOT, DATA_ROOT

In [2]:
# Output directory for this experiment; created up front (with any missing
# parents) so the final CSV write cannot fail on a missing folder.
results_dir = Path(PROJECT_ROOT, "results", "erk2", "singles", "rar")
results_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): `seed` is not referenced by any cell visible in this notebook;
# confirm whether the fold splitter or the model is supposed to receive it.
seed = 42

In [3]:
# Name of the target column to regress on.
# NOTE(review): `property` shadows the Python builtin of the same name; it is
# kept because the cross-validation cell reads this variable.
property = "DMS_score"

# Load the variant table and keep only rows where every binary residue feature
# and the target value are present.
required_columns = BINARY_RESIDUE_FEATURES + [property]
data = pd.read_csv(DATA_ROOT / "erk2/singles.csv").dropna(subset=required_columns)

# The first sequence in the reference FASTA is used as the reference sequence.
reference_records = read_fasta(DATA_ROOT / "erk2/reference.fasta")
reference = list(reference_records.values())[0]

In [4]:
# One regressor instance built from the reference sequence; the same object is
# re-fitted on every training split in the cross-validation loop.
model = ResidueAgnosticRegressor(
    wildtype_sequence=reference,
)

In [5]:
# Per-fold validation scores, stored column-wise (one list per output column)
# so the dict can be handed straight to a DataFrame constructor afterwards.
experiment_results = {
    column: []
    for column in ("fold", "feature", "belongs", "spearman_val", "model")
}
prediction_dfs = []

for feature in BINARY_RESIDUE_FEATURES:
    for belongs in (0, 1):
        # Keep only the rows whose binary feature flag equals `belongs`.
        # reset_index() gives the subset a fresh 0..n-1 index (the previous
        # index is retained as a column), matching the length-len(df) label
        # array that the fold indices are applied to below.
        df = data[data[feature] == belongs].reset_index()
        folds = sel_kfold(df, position_col="residue_number", k=10)

        for fold_index, (train_inds, test_inds) in enumerate(folds):
            # Tag every row with its role in this fold; rows in neither index
            # set keep the object array's initial None and are ignored.
            split_ids = np.empty(len(df), dtype=object)
            split_ids[train_inds] = "train"
            split_ids[test_inds] = "valid"
            df["split"] = split_ids

            # Fit on the training rows of this fold.
            train_rows = df[df["split"] == "train"]
            model.fit(train_rows, property)

            # Score on the held-out rows: Spearman correlation between the
            # measured target values and the model's predictions.
            valid_rows = df[df["split"] == "valid"]
            predictions_val = model.predict(valid_rows)
            correlation_result = spearmanr(valid_rows[property].values, predictions_val)
            spearman_val = correlation_result[0]

            # Append one value per output column for this fold.
            fold_record = {
                "fold": fold_index,
                "feature": feature,
                "belongs": belongs,
                "spearman_val": spearman_val,
                "model": "ResidueAgnosticRegressor",
            }
            for column, value in fold_record.items():
                experiment_results[column].append(value)

In [6]:
# Turn the column-wise score lists into a table (one row per evaluated fold)
# and persist it next to the other results of this experiment.
results_df = pd.DataFrame.from_dict(experiment_results)
scores_path = results_dir / "scores.csv"
results_df.to_csv(scores_path)