In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold

import utils.dev_config as dev_conf
import utils.preprocessing as prep

In [2]:
# Project directory layout is read from the developer-specific paths file.
dirs = dev_conf.get_dev_directories("../dev_paths.txt")

# Names of the unified datasets; one is selected by index elsewhere in the notebook.
unified_dsets = [
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data",
]
matrisome_list = f"{dirs.data_dir}/matrisome/matrisome_hs_masterlist.tsv"

# The generator is created unseeded here; the data-loading cell seeds it with
# `seed` immediately before it is used.
seed = 123
rand = np.random.RandomState()

# Integer coding of vital status, passed to the survival-data loader.
event_code = {"Alive": 0, "Dead": 1}

# Column groups of the survival table.
dep_cols = ["vital_status", "survival_time"]
covariate_cols = ["figo_stage", "age_at_diagnosis", "race", "ethnicity"]
cat_cols = ["race", "ethnicity", "figo_chr"]  # columns handed to pd.get_dummies

In [3]:
# Position in `unified_dsets` of the dataset analysed by the rest of the notebook
# (index 1 is "unified_uterine_data").
dset_idx = 1

In [4]:
# --- Tuned hyperparameters ---------------------------------------------------
# The number of PLS components is taken from the first row of the saved tuning results.
h_params_df = pd.read_csv(
    f"{dirs.analysis_dir}/model_opt/{unified_dsets[dset_idx]}_opt_plsr_h_params_neg_mean_squared_error.tsv",
    sep="\t",
)
n_comp = h_params_df["n_components"][0]

# --- Survival data -----------------------------------------------------------
survival_df = prep.load_survival_df(
    f"{dirs.data_dir}/{unified_dsets[dset_idx]}/survival_data.tsv", event_code
)
# Complete cases only, restricted to the sample ID, outcome and covariate columns.
complete_survival_df = survival_df[["sample_name"] + dep_cols + covariate_cols].dropna()
filtered_survival_df = (
    prep.decode_figo_stage(complete_survival_df, to="c")
    # Keep rows coded 1 -- presumably "Dead" per event_code; confirm against the loader.
    .query("vital_status == 1")
    # The column is constant after the filter, so it is removed.
    .drop(columns=["vital_status"])
    .pipe(pd.get_dummies, columns=cat_cols)
    .reset_index(drop=True)
)
# Column names must not contain spaces.
filtered_survival_df.columns = filtered_survival_df.columns.str.replace(' ', '_')

# --- Normalized matrisome counts ---------------------------------------------
norm_matrisome_counts_df = pd.read_csv(
    f"{dirs.data_dir}/{unified_dsets[dset_idx]}/norm_matrisome_counts.tsv", sep='\t'
)
# Restrict the count matrix to the samples that survived the filtering above.
kept_samples = list(filtered_survival_df.sample_name)
norm_filtered_matrisome_counts_t_df = prep.transpose_df(
    norm_matrisome_counts_df[["geneID"] + kept_samples], "geneID", "sample_name"
)

# --- Join survival data with counts, one row per sample ----------------------
joined_df = filtered_survival_df.merge(
    norm_filtered_matrisome_counts_t_df, on="sample_name"
).set_index("sample_name")

# Re-seed on every run of this cell so the shuffle is reproducible.
rand.seed(seed)
x_df, y_df = prep.shuffle_data(joined_df, rand)
x, y = np.array(x_df), np.array(y_df)

# Every column after the first one of the transposed count table.
gene_names = norm_filtered_matrisome_counts_t_df.columns[1:].tolist()


# Model evaluation

In [5]:
# Five consecutive folds without shuffling: shuffle=False and random_state=None
# are the KFold defaults, so only the number of splits is spelled out.
kf = KFold(n_splits=5)
kf.get_n_splits(x_df)

5

In [6]:
# Display the number of folds configured on the splitter.
kf.n_splits

5

In [7]:
# One score per fold: r_scores holds the fit on the training part,
# q_scores the fit on the held-out part.
r_scores = np.zeros(kf.n_splits, dtype=np.float64)
q_scores = np.zeros_like(r_scores)

for i, (train_idx, test_idx) in enumerate(kf.split(x_df)):
    x_train, y_train = x[train_idx], y[train_idx]
    x_test, y_test = x[test_idx], y[test_idx]

    # A fresh, unscaled PLS model per fold, using the tuned number of components.
    plsr_model = PLSRegression(scale=False, n_components=n_comp).fit(x_train, y_train)

    # Score the training data first, then the held-out data.
    r, q = plsr_model.score(x_train, y_train), plsr_model.score(x_test, y_test)
    r_scores[i], q_scores[i] = r, q

In [8]:
# Per-fold scores first (training, then held-out), followed by their means
# rounded to two decimals in the same order.
for fold_scores in (r_scores, q_scores):
    print(fold_scores)
for fold_scores in (r_scores, q_scores):
    print(np.round(np.mean(fold_scores), decimals=2))


[0.93701109 0.94001008 0.8998418  0.92516825 0.8662635 ]
[ -0.67872135  -0.61841861 -12.15723441  -0.5353383   -4.62223188]
0.91
-3.72
