In [1]:
import pandas as pd
import numpy as np
from time import perf_counter
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from skopt.space import Real, Integer
from skopt import gp_minimize

import utils.dev_config as dev_conf
import utils.preprocessing as prep

In [2]:
# Resolve the project directories from the developer path file, then name the
# datasets and the matrisome master list that the cells below read from.
dirs = dev_conf.get_dev_directories("../dev_paths.txt")
unified_dsets = [
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data",
]
matrisome_list = f"{dirs.data_dir}/matrisome/matrisome_hs_masterlist.tsv"

In [3]:
# Index into `unified_dsets`: selects which dataset the cells below load.
i = 0

In [4]:
# Load the matrisome master list and the significant DESeq results for the
# selected dataset, then keep only the rows flagged as matrisome genes.
matrisome_df = prep.load_matrisome_df(matrisome_list)
sig_deg_df = pd.read_csv(
    f"{dirs.analysis_dir}/{unified_dsets[i]}_sig_DESeq_results_xref_matrisome.tsv",
    sep="\t",
)
matrisome_sig_deg_df = sig_deg_df.query("in_matrisome == True").reset_index(drop=True)

# Load and filter survival data

In [5]:
# Status-to-code mapping handed to the survival loader.
event_code = {
    "Alive": 0,
    "Dead": 1,
}
# Column groups: dependent variables, categorical covariates, and the full
# covariate list (numeric covariates followed by the categorical ones).
dep_cols = ["vital_status", "survival_time"]
cat_cols = ["race", "ethnicity"]
covariate_cols = ["age_at_diagnosis", "bmi", *cat_cols]
survival_df = prep.load_survival_df(
    f"{dirs.data_dir}/{unified_dsets[i]}/survival_data.tsv",
    event_code,
)

In [6]:
# Keep only samples with vital_status == 1, drop rows with any missing value,
# and one-hot encode the categorical covariates.
filtered_survival_df = pd.get_dummies(
    survival_df.loc[:, ["sample_name", *dep_cols, *covariate_cols]]
    .query("vital_status == 1")
    .dropna()
    .reset_index(drop=True),
    columns=cat_cols,
)
# Shape of the filtered frame and the fraction of the original samples retained.
print(filtered_survival_df.shape)
print(filtered_survival_df.shape[0] / survival_df.shape[0])
filtered_survival_df.head()

(48, 13)
0.18532818532818532


Unnamed: 0,sample_name,vital_status,survival_time,age_at_diagnosis,bmi,race_american indian or alaska native,race_asian,race_black or african american,race_not reported,race_white,ethnicity_hispanic or latino,ethnicity_not hispanic or latino,ethnicity_not reported
0,TCGA-C5-A2LZ-01A-11R-A213-07,1,3046,24059.0,31.992171,0,0,0,0,1,0,1,0
1,TCGA-VS-A9V1-01A-11R-A42T-07,1,157,17001.0,18.730489,0,0,0,0,1,0,0,1
2,TCGA-C5-A1BE-01B-11R-A13Y-07,1,2094,23727.0,34.232692,0,0,0,0,1,0,0,1
3,TCGA-C5-A8XH-01A-11R-A37O-07,1,1394,14444.0,22.582709,0,0,0,0,1,0,1,0
4,TCGA-DS-A7WF-01A-11R-A352-07,1,492,15319.0,24.609375,0,0,0,1,0,1,0,0


# Load normalized matrisome count data

In [7]:
norm_matrisome_counts_df = pd.read_csv(
    f"{dirs.data_dir}/{unified_dsets[i]}/norm_matrisome_counts.tsv",
    sep="\t",
)
# Reshape genes-by-samples into samples-by-genes, restricted to the samples
# that survived the survival-data filtering.
norm_matrisome_survival_counts_t_df = (
    norm_matrisome_counts_df.loc[:, ["geneID", *filtered_survival_df["sample_name"]]]
    .set_index("geneID")                       # gene IDs become the column labels after the transpose
    .T
    .rename_axis(None, axis=1)                 # clear the leftover "geneID" name on the columns axis
    .reset_index()                             # sample names move from the index into a column
    .rename(columns={"index": "sample_name"})
)

In [8]:
# Combine survival covariates with the per-sample counts; vital_status is
# dropped and sample_name becomes the index.
joined_df = (
    filtered_survival_df
    .merge(norm_matrisome_survival_counts_t_df, on="sample_name")
    .drop(columns="vital_status")
    .set_index("sample_name")
)

# Optimize an SVR

## First, remove columns where std(x) $\approx$ 0

In [9]:
# Keep only the columns whose standard deviation across samples exceeds 1e-6,
# i.e. drop (near-)constant columns. The mask is applied positionally.
filtered_joined_df = joined_df.loc[:, (joined_df.std(axis=0) > 1e-6).to_numpy()]

In [10]:
def objective(h_params, svr_kernel, X, y, random_state=None):
    """Cross-validated mean absolute error of a standardized SVR.

    Parameters
    ----------
    h_params : sequence
        Hyper-parameter values in search-space order: ``[C, epsilon]`` for
        ``"linear"``, ``[C, epsilon, gamma]`` for ``"rbf"`` and
        ``[C, epsilon, gamma, degree, coef0]`` for ``"poly"``.
    svr_kernel : str
        One of ``"linear"``, ``"poly"`` or ``"rbf"``.
    X, y : array-like
        Feature matrix and target vector forwarded to ``cross_val_score``.
    random_state : int or None, optional
        Seed for the shuffled 5-fold split. With ``None`` (the default) a new
        random split is drawn on every call.

    Returns
    -------
    float
        Mean absolute error averaged over the 5 folds (lower is better).

    Raises
    ------
    ValueError
        If ``svr_kernel`` is not one of the supported kernels.
    IndexError
        If ``h_params`` has fewer entries than the kernel requires.
    """
    # Positional layout of h_params for each supported kernel.
    param_layout = {
        "linear": ("C", "epsilon"),
        "poly": ("C", "epsilon", "gamma", "degree", "coef0"),
        "rbf": ("C", "epsilon", "gamma"),
    }
    if svr_kernel not in param_layout:
        raise ValueError(
            f"Unsupported SVR kernel {svr_kernel!r}; expected one of {sorted(param_layout)}"
        )

    svr_kwargs = {
        name: h_params[pos] for pos, name in enumerate(param_layout[svr_kernel])
    }
    model = SVR(kernel=svr_kernel, **svr_kwargs)

    # Score the pipeline, not the bare SVR: the scaler is then fit on the
    # training part of each fold and applied to the held-out part, so the
    # features are actually standardized before reaching the SVR.
    pipeline = make_pipeline(StandardScaler(), model)
    fold_scores = cross_val_score(
        pipeline,
        X,
        y,
        cv=KFold(n_splits=5, shuffle=True, random_state=random_state),
        n_jobs=-1,
        scoring="neg_mean_absolute_error",
    )
    # The scorer returns negated MAE; flip the sign so the optimizer minimizes MAE.
    return -np.mean(fold_scores)

In [11]:
# The target is the first column of the filtered frame (survival_time, given
# the column order of the frames merged earlier); every remaining column is a feature.
X = filtered_joined_df.iloc[:, 1:].to_numpy()
y = filtered_joined_df.iloc[:, 0].to_numpy()

In [12]:
# Search spaces, one per kernel. The order of the dimensions matters: it is the
# positional order in which values are read from `h_params` in `objective`.
space_lin = [
    Real(low=0.1, high=10.0, name="C"),
    Real(low=0.001, high=0.1, name="epsilon"),
]

space_poly = [
    Real(low=0.1, high=10.0, name="C"),
    Real(low=0.001, high=0.1, name="epsilon"),
    Real(low=0.001, high=0.1, name="gamma"),
    Integer(low=1, high=3, name="degree"),
    Real(low=-1, high=1, name="coef0"),
]

space_rbf = [
    Real(low=0.1, high=10.0, name="C"),
    Real(low=0.001, high=0.1, name="epsilon"),
    Real(low=0.001, high=0.1, name="gamma"),
]

In [13]:
# Minimize the cross-validated objective for the linear kernel over `space_lin`.
res_gp_linear = gp_minimize(
    func=lambda point: objective(point, "linear", X, y),
    dimensions=space_lin,
    verbose=True,
)

Iteration No: 1 started. Evaluating function at random point.


KeyboardInterrupt: 

In [None]:
# Report the best point found for the linear kernel (C, epsilon), then display
# the objective value stored on the result.
print(f"Best values:\n\tC: {res_gp_linear.x[0]}\n\tepsilon: {res_gp_linear.x[1]}")
res_gp_linear.fun

In [None]:
# Minimize the cross-validated objective for the polynomial kernel over `space_poly`.
res_gp_poly = gp_minimize(
    func=lambda point: objective(point, "poly", X, y),
    dimensions=space_poly,
    verbose=True,
)

In [None]:
# Report the best point found for the polynomial kernel (C, epsilon, gamma,
# degree, coef0), then display the objective value stored on the result.
print(f"Best values:\n\tC: {res_gp_poly.x[0]}\n\tepsilon: {res_gp_poly.x[1]}\n\tgamma: {res_gp_poly.x[2]}\n\tdegree: {res_gp_poly.x[3]}\n\tcoef0: {res_gp_poly.x[4]}")
res_gp_poly.fun

In [None]:
# Minimize the cross-validated objective for the RBF kernel over `space_rbf`.
res_gp_rbf = gp_minimize(
    func=lambda point: objective(point, "rbf", X, y),
    dimensions=space_rbf,
    verbose=True,
)

In [None]:
# Report the best point found for the RBF kernel (C, epsilon, gamma), then
# display the objective value stored on the result.
print(f"Best values:\n\tC: {res_gp_rbf.x[0]}\n\tepsilon: {res_gp_rbf.x[1]}\n\tgamma: {res_gp_rbf.x[2]}")
res_gp_rbf.fun

In [None]:
# One row per kernel with its best hyper-parameters and the objective value
# reached. The per-kernel lists have different lengths (2, 5 and 3 entries);
# the shorter rows are expected to be NaN-padded on the right, which lines up
# with the column order because every space starts with C, epsilon[, gamma].
svm_opt_results_df = (
    pd.DataFrame.from_dict(
        {"linear": res_gp_linear.x, "poly": res_gp_poly.x, "rbf": res_gp_rbf.x},
        orient="index",
        columns=["C", "epsilon", "gamma", "degree", "coef0"],
    )
    .assign(min=[res_gp_linear.fun, res_gp_poly.fun, res_gp_rbf.fun])
    .reset_index()                        # kernel names move from the index into a column
    .rename(columns={"index": "kernel"})
)

In [None]:
# Write the per-kernel optimization summary to the working directory, without the row index.
# NOTE(review): the file name does not include the dataset selected by `i`, so
# running the notebook for another dataset presumably overwrites this file — confirm.
svm_opt_results_df.to_csv("svr_opt_res.csv", index=False)