In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
from scipy.stats import spearmanr

from predictability.models import PottsRegressor
from predictability.utils import update_environment_variables, sel_kfold
from predictability.constants import BINARY_RESIDUE_FEATURES, PROJECT_ROOT, DATA_ROOT

In [2]:
# Only needed when Jupyter does not pick up environment variables (EVs) on its
# own; change `shell_name` to the shell you use.
shell_name = "zsh"
update_environment_variables(shell_name)

In [3]:
# Output directory for this experiment. It is created here together with any
# missing parents; an already existing directory is accepted, so the cell can
# be re-run.
results_dir = Path(PROJECT_ROOT) / "results/amylase/singles/potts"
results_dir.mkdir(parents=True, exist_ok=True)

seed = 42

In [4]:
# Build the Potts regressor from the alignment file `amylase/msa.a3m`.
# The path is passed to the constructor as a plain string, not a Path object.
msa_file = DATA_ROOT / "amylase/msa.a3m"
potts_model = PottsRegressor(msa_path=str(msa_file))

[32m2023-09-26 20:50:08.420[0m | [1mINFO    [0m | [36mpredictability.models[0m:[36m__init__[0m:[36m125[0m - [1mRunning Gremlin locally and saving emission parameters[0m


# ---------------------------------------------------------------------------------------------
#                                GREMLIN_CPP v1.0                                              
# ---------------------------------------------------------------------------------------------
#   -i           /Users/floris/work/phd/projects/mutation-predictability/data/amylase/msa.a3m
#   -o           /var/folders/d8/bxgj52rj3w934zctbjr8v2hh0000gn/T/tmp14t4uzky/ouputs
# ---------------------------------------------------------------------------------------------
#   -only_neff   0
#   -only_v      0
#   -gap_cutoff  1
#   -alphabet    protein
#   -eff_cutoff  0.8
#   -lambda      0.01
#   -mrf_o       /var/folders/d8/bxgj52rj3w934zctbjr8v2hh0000gn/T/tmp14t4uzky/output.mrf
# ---------------------------------------------------------------------------------------------
#   -min_type    lbfgs
#   -max_iter    100
# ---------------------------------------------------------------------------------

In [5]:
# Name of the target column to predict.
# NOTE(review): `property` shadows the Python built-in of the same name; the
# name is kept because the cross-validation cell below reads it.
property = "activity_dp3"

# Load `amylase/singles.csv` and keep only the rows that have a value for every
# binary residue feature and for the target column.
required_columns = BINARY_RESIDUE_FEATURES + [property]
singles_raw = pd.read_csv(DATA_ROOT / "amylase/singles.csv")
data = singles_raw.dropna(subset=required_columns)

In [6]:
# NOTE(review): this repeats the `seed = 42` assignment from cell In [3], and
# none of the cells shown in this notebook read `seed` afterwards. Confirm
# whether it should be passed to the splitting/model code or whether this cell
# can be dropped.
seed = 42

In [7]:
# Cross-validated evaluation on feature-defined subsets of `data`.
# For each binary residue feature and each of its two values (0 and 1), the
# matching rows are split by `sel_kfold` (k=10, keyed on "residue_number").
# Per fold, `potts_model.fit` is called on the training rows, predictions are
# made for the validation rows, and the Spearman correlation between measured
# and predicted target values is recorded.
result_columns = ("fold", "feature", "belongs", "spearman_val", "model")
experiment_results = {column: [] for column in result_columns}
# NOTE(review): nothing in this cell appends to `prediction_dfs`.
prediction_dfs = []

for feature in BINARY_RESIDUE_FEATURES:
    for belongs in (0, 1):
        # reset_index() gives the subset a fresh 0..n-1 index (the old index is
        # kept as a column), which the label assignment below relies on.
        df = data.loc[data[feature] == belongs].reset_index()
        folds = sel_kfold(df, position_col="residue_number", k=10)
        for fold_idx, (train_inds, test_inds) in enumerate(folds):
            # Label every row for this fold; rows that appear in neither index
            # set keep the initial None label.
            split_labels = np.full(len(df), None, dtype=object)
            split_labels[train_inds] = "train"
            split_labels[test_inds] = "valid"
            df["split"] = split_labels
            in_train = df["split"] == "train"
            in_valid = df["split"] == "valid"

            potts_model.fit(df[in_train], property)
            val_predictions = potts_model.predict(df[in_valid])
            val_targets = df.loc[in_valid, property].values
            # Element 0 of the spearmanr result is the correlation coefficient.
            spearman_val = spearmanr(val_targets, val_predictions)[0]

            fold_record = {
                "fold": fold_idx,
                "feature": feature,
                "belongs": belongs,
                "spearman_val": spearman_val,
                "model": "PottsRegressor",
            }
            for column, value in fold_record.items():
                experiment_results[column].append(value)

In [10]:
# Collect the per-fold scores into a table and save it as `scores.csv` inside
# the results directory.
scores_path = results_dir / "scores.csv"
results_df = pd.DataFrame(experiment_results)
results_df.to_csv(scores_path)