In [1]:
from time import perf_counter

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from skopt.space import Real, Integer, Categorical

import utils.dev_config as dev_conf
import utils.preprocessing as prep

In [2]:
# Resolve the project directories from ../dev_paths.txt, then derive the
# matrisome master-list path and the per-dataset folder names found under
# dirs.data_dir.
dirs = dev_conf.get_dev_directories("../dev_paths.txt")
matrisome_list = "{}/matrisome/matrisome_hs_masterlist.tsv".format(dirs.data_dir)
unified_dsets = [
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data",
]

In [3]:
# Index into unified_dsets: selects which dataset folder the survival and
# count tables are read from in the cells below.
i = 0

In [4]:
# Load the matrisome master list from the path stored in matrisome_list.
matrisome_df = prep.load_matrisome_df(matrisome_list)

In [5]:
# Shared RandomState for the notebook. It is created unseeded here; the seed is
# set explicitly with rand.seed(123) right before the samples are shuffled.
rand = np.random.RandomState()

# Load and filter survival data

In [6]:
# Outcome columns and the clinical covariates kept alongside them.
dep_cols = ["vital_status", "survival_time"]
covariate_cols = ["figo_stage", "age_at_diagnosis", "race", "ethnicity"]
# Categorical columns that are one-hot encoded when the survival table is filtered.
cat_cols = ["race", "ethnicity", "figo_chr"]
# Numeric codes for the vital-status labels, handed to the survival loader.
event_code = {"Alive": 0, "Dead": 1}
survival_df = prep.load_survival_df(
    "{}/{}/survival_data.tsv".format(dirs.data_dir, unified_dsets[i]),
    event_code,
)

In [7]:
# Keep complete cases only, decode the FIGO stage, and restrict to samples with
# vital_status == 1 (the "Dead" code in event_code). vital_status is constant
# after that filter, so it is dropped before the categorical columns are
# one-hot encoded.
filtered_survival_df = pd.get_dummies(
    prep.decode_figo_stage(
        survival_df[["sample_name", *dep_cols, *covariate_cols]].dropna(), to="c"
    )
    .query("vital_status == 1")
    .drop(columns=["vital_status"]),
    columns=cat_cols,
).reset_index(drop=True)
# Swap spaces for underscores in the column names (the dummy columns take
# their names from the category labels).
filtered_survival_df.columns = filtered_survival_df.columns.str.replace(' ', '_')

print(filtered_survival_df.shape)
# Uncomment to preview the table:
# filtered_survival_df.head()

(66, 16)


# Load normalized matrisome count data

In [8]:
# Read the normalized matrisome counts for the selected dataset, keep only the
# geneID column plus the samples that survived filtering, and hand the result
# to the project transpose helper (the printed shape has one row per sample).
norm_matrisome_counts_df = pd.read_csv(
    "{}/{}/norm_matrisome_counts.tsv".format(dirs.data_dir, unified_dsets[i]),
    sep="\t",
)
norm_filtered_matrisome_counts_t_df = prep.transpose_df(
    norm_matrisome_counts_df[["geneID", *filtered_survival_df["sample_name"]]],
    "geneID",
    "sample_name",
)
print(norm_filtered_matrisome_counts_t_df.shape)
# Uncomment to preview the table:
# norm_filtered_matrisome_counts_t_df.head()

(66, 1009)


# Join survival and count data

In [9]:
# Attach the per-sample gene counts to the survival/covariate table and index
# the combined frame by sample name.
joined_df = filtered_survival_df.merge(
    norm_filtered_matrisome_counts_t_df, on="sample_name"
).set_index("sample_name")
print(joined_df.shape)
# Uncomment to preview the table:
# joined_df.head()

(66, 1023)


In [61]:
# Re-seed the shared random state so the row shuffle below is repeatable.
rand.seed(123)
shuffled_df = joined_df.sample(frac=1, random_state=rand)
# The first column is taken as the target and every remaining column as a
# feature. NOTE(review): presumably column 0 is survival_time -- confirm.
y_df = shuffled_df.iloc[:, [0]]
x_df = shuffled_df.iloc[:, 1:]

# Create models from saved optimizer results

In [62]:
# Rebuild the tuned SVR from the saved optimizer results. The final row of the
# table (loss_achieved) is a score rather than a hyperparameter, hence [:-1].
svr_h_param_df = pd.read_csv("opt_svr_h_params.tsv", sep="\t")
svr_kwargs = dict(
    zip(svr_h_param_df.param[:-1], svr_h_param_df.param_value[:-1].values)
)
# param_value mixes text (kernel) with numbers, so numeric settings are cast
# explicitly.
svr = SVR(
    kernel=svr_kwargs["kernel"],
    degree=int(svr_kwargs["degree"]),
    gamma=float(svr_kwargs["gamma"]),
    coef0=float(svr_kwargs["coef0"]),
    C=float(svr_kwargs["C"]),
    epsilon=float(svr_kwargs["epsilon"]),
)
# Standardize age and the count columns (every column of the transposed count
# table after the first); all other columns pass through unchanged.
svr_pipeline = make_pipeline(
    ColumnTransformer(
        [
            (
                "standard",
                StandardScaler(),
                ["age_at_diagnosis", *norm_filtered_matrisome_counts_t_df.columns[1:]],
            )
        ],
        remainder="passthrough",
    ),
    svr,
)
# The target is standardized as well before it reaches the pipeline.
svr_ttr = TransformedTargetRegressor(transformer=StandardScaler(), regressor=svr_pipeline)

In [68]:
# Show the saved SVR hyperparameter table (including the loss_achieved row).
svr_h_param_df

Unnamed: 0,param,param_value
0,kernel,rbf
1,C,0.728210947014468
2,epsilon,0.049953706678984154
3,gamma,0.001
4,degree,3
5,coef0,-0.5502675292350878
6,loss_achieved,495.5320350219823


# Cross-validated feature permutation importance

In [42]:
def _take_rows(data, idx):
    """Select rows of ``data`` by integer position (pandas objects or arrays)."""
    # On a DataFrame, plain ``data[idx]`` would look the integers up as column
    # labels, so pandas objects go through ``.iloc`` instead.
    return data.iloc[idx] if hasattr(data, "iloc") else data[idx]


def cv_permutation_importance(estimator, X, y, k, scoring="neg_mean_absolute_error",
                              n_repeats=5, random_state=None):
    """Compute permutation importances on the held-out part of each CV fold.

    For every one of the ``k`` folds the estimator is refit on the training
    rows and ``permutation_importance`` is evaluated on the test rows.

    Parameters
    ----------
    estimator : object
        Estimator with a ``fit(X, y)`` method. It is refit in place on each
        fold, so after the call it holds the fit from the last fold.
    X, y : pandas.DataFrame / pandas.Series or numpy.ndarray
        Features and target. Rows are selected by position.
    k : int
        Number of folds.
    scoring : str or callable, default="neg_mean_absolute_error"
        Scorer forwarded to ``permutation_importance``. The previous hard-coded
        ``"f1_weighted"`` is a classification metric and cannot score the
        continuous targets this notebook models.
    n_repeats : int, default=5
        Number of permutations per feature.
    random_state : int, RandomState instance or None, default=None
        Seed for both the fold shuffle and the permutations. ``None`` falls
        back to the notebook-level ``rand`` state.

    Returns
    -------
    list
        One ``permutation_importance`` result per fold, in fold order.
    """
    if random_state is None:
        random_state = rand
    # random_state only has an effect when shuffle=True; recent scikit-learn
    # releases raise a ValueError if it is passed without shuffling.
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    results = []
    for train_idx, test_idx in kf.split(X):
        # Train/retrain from scratch on this fold's training rows.
        estimator.fit(_take_rows(X, train_idx), _take_rows(y, train_idx))
        # Score the estimator that was just fit (the original referenced an
        # undefined ``pipeline`` name here).
        result = permutation_importance(
            estimator,
            _take_rows(X, test_idx),
            _take_rows(y, test_idx),
            scoring=scoring,
            n_repeats=n_repeats,
            n_jobs=-1,
            random_state=random_state,
        )
        results.append(result)
    return results