In [1]:
import pandas as pd
import numpy as np
from time import perf_counter
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from skopt.space import Real, Integer, Categorical
from skopt import gp_minimize

import utils.dev_config as dev_conf
import utils.preprocessing as prep
import utils.optimization as opt

In [2]:
# Resolve the project directories from the local dev-paths file.
dirs = dev_conf.get_dev_directories("../dev_paths.txt")

# Dataset folder names; later cells select one of them by index.
unified_dsets = [
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data",
]

# Location of the matrisome master list (TSV) under the data directory.
matrisome_list = f"{dirs.data_dir}/matrisome/matrisome_hs_masterlist.tsv"

In [3]:
# Index into `unified_dsets`: selects which dataset folder the loading cells read from.
i = 0

In [4]:
# Load the matrisome master list from the `matrisome_list` path via the project helper.
matrisome_df = prep.load_matrisome_df(matrisome_list)

In [5]:
# Fixed seed shared by every stochastic step in this notebook.
seed = 123
# Seed the generator explicitly: an unseeded RandomState draws from OS entropy,
# so anything consuming `rand` (e.g. the optimizer's random_state) would differ
# between kernel restarts.
rand = np.random.RandomState(seed)

# Load and filter survival data

In [6]:
# Label-to-code mapping handed to the survival-data loader.
event_code = {"Alive": 0, "Dead": 1}

# Column groups used when filtering and encoding the survival table.
dep_cols = ["figo_stage"]
cat_cols = ["race", "ethnicity"]
covariate_cols = ["age_at_diagnosis", *cat_cols]

# Read the survival table of the dataset selected by `i`.
survival_df = prep.load_survival_df(
    f"{dirs.data_dir}/{unified_dsets[i]}/survival_data.tsv",
    event_code,
)

In [7]:
# Keep the sample id, the dependent column(s) and the covariates; drop incomplete
# rows, decode the FIGO stage, one-hot encode the categorical covariates and move
# the id/target columns to the front.
filtered_survival_df = (
    survival_df[["sample_name", *dep_cols, *covariate_cols]]
    .dropna()
    .pipe(prep.decode_figo_stage, to="n")
    .pipe(pd.get_dummies, columns=cat_cols)
    .reset_index(drop=True)
    .pipe(prep.cols_to_front, ["sample_name", "figo_num"])
)
# One-hot column labels inherit category text; replace spaces so labels are identifier-like.
filtered_survival_df.columns = filtered_survival_df.columns.str.replace(' ', '_')
print(filtered_survival_df.shape)

(255, 12)


# Load normalized matrisome count data

In [8]:
# Normalized matrisome counts for the dataset selected by `i` (tab-separated).
norm_matrisome_counts_df = pd.read_csv(
    f"{dirs.data_dir}/{unified_dsets[i]}/norm_matrisome_counts.tsv",
    sep="\t",
)

# Restrict to the gene id column plus the samples kept in `filtered_survival_df`,
# then transpose through the project helper.
norm_filtered_matrisome_counts_t_df = prep.transpose_df(
    norm_matrisome_counts_df[["geneID", *filtered_survival_df.sample_name]],
    "geneID",
    "sample_name",
)
print(norm_filtered_matrisome_counts_t_df.shape)

(255, 1009)


# Join survival and count data

In [9]:
# Combine survival covariates with the count features on the shared sample id,
# then use that id as the row index.
joined_df = filtered_survival_df.merge(
    norm_filtered_matrisome_counts_t_df,
    on="sample_name",
).set_index("sample_name")

print(joined_df.shape)

(255, 1019)


# Optimize model

In [22]:
# Produce the (x_df, y_df) pair that is later passed as X and y to the objective.
# NOTE(review): presumably `seed` makes the shuffle deterministic — confirm in prep.shuffle_data.
x_df, y_df = prep.shuffle_data(joined_df, rand, seed=seed)

In [26]:
def objective(h_params, X, y, verbose=True):
    """Score one SVC hyper-parameter setting for the optimizer.

    Parameters
    ----------
    h_params : sequence
        Positional hyper-parameters, in this order:
        kernel, C, gamma, degree, coef0, class_weight.
    X, y
        Features and target forwarded to ``cross_val_score``. ``X`` must
        expose the column names selected for scaling below.
    verbose : bool, default True
        When true, print ``h_params`` before evaluating.

    Returns
    -------
    float
        Negated mean "f1_weighted" score over 5 folds, so that a minimizer
        maximizes the score.
    """
    if verbose:
        print(h_params)

    # Map positional entries of h_params onto the SVC keyword arguments.
    svc_arg_names = ("kernel", "C", "gamma", "degree", "coef0", "class_weight")
    classifier = SVC(**{arg: h_params[pos] for pos, arg in enumerate(svc_arg_names)})

    # Standardize all variables (except one-hots); the gene columns are taken
    # from the notebook-level count table, skipping its first column.
    scaled_cols = ["age_at_diagnosis"]
    scaled_cols.extend(norm_filtered_matrisome_counts_t_df.columns[1:])
    scaler_step = ColumnTransformer(
        [("standard", StandardScaler(), scaled_cols)],
        remainder="passthrough",
    )

    fold_scores = cross_val_score(
        make_pipeline(scaler_step, classifier),
        X,
        y,
        cv=KFold(n_splits=5),
        n_jobs=-1,
        scoring="f1_weighted",
    )
    return -np.mean(fold_scores)

In [24]:
# Search space for the optimizer. The order of the dimensions matters: the
# objective reads the sampled point positionally (index 0..5).
space = [
    Categorical(["linear", "rbf", "poly"], name="kernel"),
    Real(1e-1, 1e1, name="C"),
    Real(1e-3, 1e-1, name="gamma"),
    Integer(1, 3, name="degree"),
    Real(-1, 1, name="coef0"),  # trailing comma was missing here (SyntaxError)
    Categorical(["balanced", None], name="class_weights"),
]
# Budget scales with the number of kernel choices (first dimension of the space).
n_initial = 10 * (len(space[0].categories))
n_calls = 100 * (len(space[0].categories))

In [25]:
# Bayesian optimization of the SVC hyper-parameters over `space`.
res = gp_minimize(
    # Bind the training data; the optimizer only supplies the sampled point.
    lambda params: objective(params, x_df, y_df),
    space,
    n_initial_points=n_initial,
    n_calls=n_calls,
    random_state=rand,
    n_jobs=-1,
    verbose=True,
    # Hand the intermediate result to the project callback after each iteration.
    callback=lambda result: opt.save_callback(result, "opt_svc_h_params_TEST.tsv", sep="\t"),
)

Iteration No: 1 started. Evaluating function at random point.
['rbf', 9.775715921772743, 0.012891978002009141, 2, -0.41458075403961614]
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.2587
Function value obtained: -0.3691
Current minimum: -0.3691
Iteration No: 2 started. Evaluating function at random point.
['rbf', 5.610638904936243, 0.06571975323605078, 3, -0.3056630018784391]
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.2442
Function value obtained: -0.3691
Current minimum: -0.3691
Iteration No: 3 started. Evaluating function at random point.
['rbf', 9.962181971599627, 0.08635660955369447, 3, -0.40967858109762656]
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.2587
Function value obtained: -0.3691
Current minimum: -0.3691
Iteration No: 4 started. Evaluating function at random point.
['rbf', 8.892212203167828, 0.04041162284821849, 1, -0.3040493282827196]
Iteration No: 4 ended. Evaluation done at random point.
Time t



Iteration No: 57 ended. Search finished for the next optimal point.
Time taken: 0.9195
Function value obtained: -0.4560
Current minimum: -0.4560
Iteration No: 58 started. Searching for the next optimal point.
['rbf', 0.2299100314122572, 0.002353985621404827, 1, 0.5759367647849147]
Iteration No: 58 ended. Search finished for the next optimal point.
Time taken: 1.0877
Function value obtained: -0.3691
Current minimum: -0.4560
Iteration No: 59 started. Searching for the next optimal point.
['rbf', 10.0, 0.001, 1, 1.0]




Iteration No: 59 ended. Search finished for the next optimal point.
Time taken: 0.9253
Function value obtained: -0.4560
Current minimum: -0.4560
Iteration No: 60 started. Searching for the next optimal point.
['rbf', 9.84071380617674, 0.006147595959925592, 1, -0.02266311931532261]
Iteration No: 60 ended. Search finished for the next optimal point.
Time taken: 1.0123
Function value obtained: -0.3691
Current minimum: -0.4560
Iteration No: 61 started. Searching for the next optimal point.
['rbf', 10.0, 0.001, 1, 1.0]




Iteration No: 61 ended. Search finished for the next optimal point.
Time taken: 1.0724
Function value obtained: -0.4560
Current minimum: -0.4560
Iteration No: 62 started. Searching for the next optimal point.
['rbf', 0.12238434017810895, 0.013631521113591328, 3, 0.883136156849686]
Iteration No: 62 ended. Search finished for the next optimal point.
Time taken: 0.9030
Function value obtained: -0.3691
Current minimum: -0.4560
Iteration No: 63 started. Searching for the next optimal point.
['rbf', 9.955473567461334, 0.02689619114179878, 1, -0.3470782697467697]
Iteration No: 63 ended. Search finished for the next optimal point.
Time taken: 0.9785
Function value obtained: -0.3691
Current minimum: -0.4560
Iteration No: 64 started. Searching for the next optimal point.
['rbf', 10.0, 0.001, 1, -1.0]




Iteration No: 64 ended. Search finished for the next optimal point.
Time taken: 1.1523
Function value obtained: -0.4560
Current minimum: -0.4560
Iteration No: 65 started. Searching for the next optimal point.
['rbf', 10.0, 0.001, 1, 1.0]




Iteration No: 65 ended. Search finished for the next optimal point.
Time taken: 1.2133
Function value obtained: -0.4560
Current minimum: -0.4560
Iteration No: 66 started. Searching for the next optimal point.
['rbf', 10.0, 0.001, 1, -1.0]




Iteration No: 66 ended. Search finished for the next optimal point.
Time taken: 1.9350
Function value obtained: -0.4560
Current minimum: -0.4560
Iteration No: 67 started. Searching for the next optimal point.
['rbf', 0.2371498500759563, 0.015394882709605075, 1, 0.951747729959938]
Iteration No: 67 ended. Search finished for the next optimal point.
Time taken: 1.5772
Function value obtained: -0.3691
Current minimum: -0.4560
Iteration No: 68 started. Searching for the next optimal point.
['rbf', 10.0, 0.001, 1, 1.0]




KeyboardInterrupt: 