In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from sklearn.model_selection import KFold

from predictability.models import PartialLeastSquares
from predictability.constants import BINARY_RESIDUE_FEATURES, PROJECT_ROOT, DATA_ROOT

In [2]:
# Seed handed to the K-fold splitter (random_state) so shuffled folds repeat.
seed = 42

# Output folder for this experiment's scores; missing parent directories are
# created too, and an already-existing folder is not an error.
results_dir = Path(PROJECT_ROOT / "results/amylase/combinatorials/pls")
results_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Name of the column passed to the model as the prediction target.
# NOTE(review): `property` shadows the Python builtin of the same name; the
# name is kept because the cross-validation cell reads it.
property = "stain_activity"

# Load the amylase combinatorial dataset from the project's data directory.
combinatorials_csv = DATA_ROOT / "amylase/combinatorials.csv"
data = pd.read_csv(combinatorials_csv)

In [4]:
# Show the loaded frame as the cell output (pandas abbreviates the middle rows).
data

Unnamed: 0.1,Unnamed: 0,sequence,stain_activity,sampler,mutations,is_buried,is_connected,is_close_to_as,is_secondary
0,0,LTAPSIKSGVILHAWNWSFNTLKHNMKDIHDAGYTAIQTSPINQVK...,0.006936,lm,T010V-T103P-I209W,True,True,True,False
1,1,LTAPSIKSGTILHAWNWSFNTLKHNMKDIHDAGYTAIQTSPINQVK...,0.139217,lm,A098V,True,False,True,True
2,2,LTAPSIKSGTILHAWNWSFNTLKHNMKDIHDAGYTAIQTSPINQVK...,0.017767,random,R296H-P307Q-G326N,True,False,False,False
3,3,LTAPSIKSGTILHAWNWSFNTLKHNMKDIHDAGYTAIQTSPINQVK...,0.025117,lm,R133D-R164K,False,False,True,True
4,4,LTAPSIKSGTILHAWNWSFNTLKHNMKDIHDAGYTAIQTSPINQVK...,0.009967,lm,W130Y-L141W-G143S-Y252W,True,False,True,False
...,...,...,...,...,...,...,...,...,...
3701,3701,LTAPSIKSGTILHAWNWSFNTLKHNMKDIHDAGYTAIQTSPINQVK...,0.004417,lm,V099I-E182G-Q211N-Y252R,True,False,True,False
3702,3702,LTAPSIKSGTILHAWNWSFNTLKHNMKDIHDAGYTAIQTSPINQVK...,0.131417,random,P424S,False,False,False,False
3703,3703,LTAPSIKSGTQLHAWNWSFNTLKHNMKDIHDAGYTAIQTSPINQVK...,0.005050,random,I011Q-A178Q,True,True,True,True
3704,3704,LTAPSIKSGTILHAWNWSFNTLKHNMKDIHDAGYTAIQTSPINQVK...,0.029917,lm,F161V-H233Q,True,False,True,True


In [5]:
# Column-wise store of validation scores: every list gains one entry per
# (feature, belongs, fold) combination evaluated below.
experiment_results = {
    column: []
    for column in ("fold", "feature", "belongs", "spearman_val", "model")
}
# Initialised here, but nothing in this cell appends to it.
prediction_dfs = []

# Visit every binary residue feature twice: once for the rows where the flag
# equals 0 and once for the rows where it equals 1.
for feature, belongs in (
    (name, flag) for name in BINARY_RESIDUE_FEATURES for flag in (0, 1)
):
    # reset_index() renumbers the selected rows from 0 and keeps the previous
    # labels as an extra "index" column.
    subset = data.loc[data[feature] == belongs].reset_index()
    splitter = KFold(n_splits=10, shuffle=True, random_state=seed)
    positions = np.arange(len(subset))

    for fold, (train_pos, valid_pos) in enumerate(splitter.split(positions)):
        # Tag each row with its role in this fold; the tag travels with the
        # frames handed to the model as a "split" column.
        roles = np.empty(len(subset), dtype=object)
        roles[train_pos] = "train"
        roles[valid_pos] = "valid"
        subset["split"] = roles

        is_train = subset["split"] == "train"
        is_valid = subset["split"] == "valid"

        # Fit on the training rows, then predict the held-out rows.
        pls = PartialLeastSquares()
        pls.fit(subset[is_train], property)
        valid_predictions = pls.predict(subset[is_valid])

        # Rank correlation between measured and predicted values; the second
        # value returned by spearmanr is discarded.
        measured = subset[is_valid][property].values
        rho, _ = spearmanr(measured, valid_predictions)

        fold_record = {
            "fold": fold,
            "feature": feature,
            "belongs": belongs,
            "spearman_val": rho,
            "model": "PLS",
        }
        for column, value in fold_record.items():
            experiment_results[column].append(value)

In [6]:
# Turn the column-wise score lists into a table and write it next to the other
# outputs of this experiment. to_csv is called with its defaults, so the row
# index is written as the first, unnamed column.
scores_path = results_dir / "scores.csv"
results_df = pd.DataFrame(data=experiment_results)
results_df.to_csv(scores_path)