In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.metrics import f1_score
from skopt.space import Real, Integer, Categorical
from skopt import gp_minimize

import utils.dev_config as dev_conf
import utils.preprocessing as prep
import utils.optimization as opt

In [2]:
dirs = dev_conf.get_dev_directories("../dev_paths.txt")
unified_dsets = ["unified_cervical_data", "unified_uterine_data", "unified_uterine_endometrial_data"]
matrisome_list = f"{dirs.data_dir}/matrisome/matrisome_hs_masterlist.tsv"

In [3]:
dset_idx = 0

In [5]:
seed = 123
rand = np.random.RandomState()

# Load and filter survival data

In [6]:
event_code = {"Alive": 0, "Dead": 1}
covariate_cols = ["age_at_diagnosis", "race", "ethnicity"]
dep_cols = ["figo_stage"]
cat_cols = ["race", "ethnicity"]
survival_df = prep.load_survival_df(f"{dirs.data_dir}/{unified_dsets[dset_idx]}/survival_data.tsv", event_code)

In [7]:
filtered_survival_df = (
    prep.decode_figo_stage(survival_df[["sample_name"] + dep_cols + covariate_cols].dropna(), to="n")
        .pipe(pd.get_dummies, columns=cat_cols)
        .reset_index(drop = True)
        .pipe(prep.cols_to_front, ["sample_name", "figo_num"])
)
filtered_survival_df.columns = filtered_survival_df.columns.str.replace(' ', '_')
print(filtered_survival_df.shape)
# filtered_survival_df.head()

(255, 12)


# Load normalized matrisome count data

In [8]:
norm_matrisome_counts_df = pd.read_csv(f"{dirs.data_dir}/{unified_dsets[dset_idx]}/norm_matrisome_counts.tsv", sep='\t')
norm_filtered_matrisome_counts_t_df = prep.transpose_df(
    norm_matrisome_counts_df[["geneID"] + list(filtered_survival_df.sample_name)], "geneID", "sample_name"
)
print(norm_filtered_matrisome_counts_t_df.shape)
# norm_filtered_matrisome_counts_t_df.head()

(255, 1009)


# Join survival and count data

In [9]:
joined_df = (
    pd.merge(filtered_survival_df, norm_filtered_matrisome_counts_t_df, on="sample_name")
        .set_index("sample_name")
)
print(joined_df.shape)
# joined_df.head()

(255, 1019)


# Optimize model

In [10]:
rand.seed(seed)
x_df, y_df = prep.shuffle_data(joined_df, rand)

## Get baselines

In [11]:
rand.seed(seed)

label_value_counts_df = (
    pd.DataFrame(y_df.figo_num.value_counts()).reset_index()
        .rename(columns={"index": "label", "figo_num": "n"})
        .sort_values("n", ascending=False)
)

most_frequent_label = label_value_counts_df.label[0]
most_frequent_baseline = f1_score(y_df.values.squeeze(), np.repeat(most_frequent_label, y_df.shape[0]), average="weighted")

mc_baseline = opt.mc_classification_baseline(
    y=y_df.values.squeeze(),
    labels=label_value_counts_df.label.values,
    weights=label_value_counts_df.n.values / label_value_counts_df.n.values.sum(),
    metric=lambda y, yhat: f1_score(y, yhat, average="weighted"),
    n=1001
)

print(f"Most frequent baseline: {most_frequent_baseline}")
print(f"Monte Carlo baseline: {mc_baseline.mean()}")

Most frequent baseline: 0.3665158371040725
Monte Carlo baseline: 0.36708665511486


## SMBO

In [12]:
matrisome_genes = norm_filtered_matrisome_counts_t_df.columns[1:]

def objective(h_params, X, y, penalty_default, scoring_default, r, verbose=True):
    if verbose:
        print(h_params)

    model = make_pipeline(
        ColumnTransformer([
            ("standard", StandardScaler(), ["age_at_diagnosis"] + list(matrisome_genes))
        ], remainder="passthrough"),
        LogisticRegression(
            C=h_params[0],
            class_weight=h_params[1],
            solver=h_params[2],
            penalty=penalty_default,
            n_jobs=-1,
            random_state=r
        )
    )
    return -np.mean(cross_val_score(
        model,
        X,
        y,
        cv=KFold(n_splits=5),
        n_jobs=-1,
        scoring=scoring_default
    ))

In [13]:
l1_space = [
    Real(1e-1, 1e1, name="C"),
    Categorical(["balanced", None], name="class_weight"),
    Categorical(["liblinear", "saga"], name="solver")
]

l2_space = [
    Real(1e-1, 1e1, name="C"),
    Categorical(["balanced", None], name="class_weight"),
    Categorical(["newton-cg", "lbfgs", "sag", "saga"], name="solver")
]

no_penalty_space = [
    Real(1e-1, 1e1, name="C"),
    Categorical(["balanced", None], name="class_weight"),
    Categorical(["newton-cg", "lbfgs", "sag", "saga"], name="solver")
]

n_initial = 10 * len(l1_space)
n_calls = 50 * len(l1_space)

In [14]:
penalty_default = "l2"
scoring_default = "f1_weighted"
space = l2_space
callback_file = f"{unified_dsets[dset_idx]}_opt_lr_h_params_{penalty_default}_{scoring_default}.tsv"

try:
    os.remove(callback_file)
except OSError:
    pass

res = gp_minimize(
    lambda h_ps: objective(h_ps, x_df, y_df, penalty_default, scoring_default, rand),
    space,
    verbose=True,
    random_state=rand,
    n_initial_points=n_initial,
    n_calls=n_calls,
    n_jobs=-1,
    callback=lambda x: opt.save_callback(x, callback_file, n=5, sep="\t")
)

Iteration No: 1 started. Evaluating function at random point.
[7.15825767317995, 'balanced', 'sag']
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 1.2060
Function value obtained: -0.4428
Current minimum: -0.4428
Iteration No: 2 started. Evaluating function at random point.
[7.219588070532254, 'balanced', 'sag']
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.9918
Function value obtained: -0.4568
Current minimum: -0.4568
Iteration No: 3 started. Evaluating function at random point.
[4.1681512898192885, None, 'lbfgs']
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.8097
Function value obtained: -0.5045
Current minimum: -0.5045
Iteration No: 4 started. Evaluating function at random point.
[4.0700738109459165, None, 'newton-cg']
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.6575
Function value obtained: -0.5078
Current minimum: -0.5078
Iteration No: 5 started. Evaluating function at random point.
[2.52



Iteration No: 37 ended. Search finished for the next optimal point.
Time taken: 0.8778
Function value obtained: -0.5052
Current minimum: -0.5078
Iteration No: 38 started. Searching for the next optimal point.
[8.15878630063924, None, 'newton-cg']
Iteration No: 38 ended. Search finished for the next optimal point.
Time taken: 1.0303
Function value obtained: -0.5052
Current minimum: -0.5078
Iteration No: 39 started. Searching for the next optimal point.
[9.988683894845627, None, 'lbfgs']
Iteration No: 39 ended. Search finished for the next optimal point.
Time taken: 0.8075
Function value obtained: -0.5052
Current minimum: -0.5078
Iteration No: 40 started. Searching for the next optimal point.
[0.10635571040233853, 'balanced', 'lbfgs']
Iteration No: 40 ended. Search finished for the next optimal point.
Time taken: 0.8605
Function value obtained: -0.4820
Current minimum: -0.5078
Iteration No: 41 started. Searching for the next optimal point.
[0.12082068164413885, None, 'lbfgs']
Iteration N

In [15]:
penalty_default = "l1"
scoring_default = "f1_weighted"
space = l1_space
callback_file = f"{unified_dsets[dset_idx]}_opt_lr_h_params_{penalty_default}_{scoring_default}.tsv"

try:
    os.remove(callback_file)
except OSError:
    pass

res = gp_minimize(
    lambda h_ps: objective(h_ps, x_df, y_df, penalty_default, scoring_default, rand),
    space,
    verbose=True,
    random_state=rand,
    n_initial_points=n_initial,
    n_calls=n_calls,
    n_jobs=-1,
    callback=lambda x: opt.save_callback(x, callback_file, n=5, sep="\t")
)

Iteration No: 1 started. Evaluating function at random point.
[5.519518289447713, 'balanced', 'liblinear']
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.2266
Function value obtained: -0.4767
Current minimum: -0.4767
Iteration No: 2 started. Evaluating function at random point.
[5.636968801124902, 'balanced', 'liblinear']
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.2776
Function value obtained: -0.4794
Current minimum: -0.4794
Iteration No: 3 started. Evaluating function at random point.
[1.7512724589040005, 'balanced', 'saga']
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 1.1512
Function value obtained: -0.4541
Current minimum: -0.4794
Iteration No: 4 started. Evaluating function at random point.
[1.0372567105160115, 'balanced', 'saga']
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 1.2631
Function value obtained: -0.4692
Current minimum: -0.4794
Iteration No: 5 started. Evaluating function at 

In [16]:
penalty_default = "none"
scoring_default = "f1_weighted"
space = no_penalty_space
callback_file = f"{unified_dsets[dset_idx]}_opt_lr_h_params_{penalty_default}_{scoring_default}.tsv"

try:
    os.remove(callback_file)
except OSError:
    pass

res = gp_minimize(
    lambda h_ps: objective(h_ps, x_df, y_df, penalty_default, scoring_default, rand),
    space,
    verbose=True,
    random_state=rand,
    n_initial_points=n_initial,
    n_calls=n_calls,
    n_jobs=-1,
    callback=lambda x: opt.save_callback(x, callback_file, n=5, sep="\t")
)

Iteration No: 1 started. Evaluating function at random point.
[9.125841153053798, None, 'newton-cg']
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.2538
Function value obtained: -0.4773
Current minimum: -0.4773
Iteration No: 2 started. Evaluating function at random point.
[6.4561372254103935, 'balanced', 'sag']
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.6569
Function value obtained: -0.4456
Current minimum: -0.4773
Iteration No: 3 started. Evaluating function at random point.
[9.484185893687544, None, 'sag']
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.5662
Function value obtained: -0.4784
Current minimum: -0.4784
Iteration No: 4 started. Evaluating function at random point.
[6.281940580275031, 'balanced', 'sag']
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.5421
Function value obtained: -0.4566
Current minimum: -0.4784
Iteration No: 5 started. Evaluating function at random point.
[2.1221



Iteration No: 120 ended. Search finished for the next optimal point.
Time taken: 1.9229
Function value obtained: -0.4782
Current minimum: -0.4927
Iteration No: 121 started. Searching for the next optimal point.
[9.993488328070582, 'balanced', 'newton-cg']
Iteration No: 121 ended. Search finished for the next optimal point.
Time taken: 2.0357
Function value obtained: -0.4782
Current minimum: -0.4927
Iteration No: 122 started. Searching for the next optimal point.
[9.987766294195165, None, 'sag']
Iteration No: 122 ended. Search finished for the next optimal point.
Time taken: 2.2947
Function value obtained: -0.4697
Current minimum: -0.4927
Iteration No: 123 started. Searching for the next optimal point.
[9.999565742415841, None, 'newton-cg']
Iteration No: 123 ended. Search finished for the next optimal point.
Time taken: 2.1414
Function value obtained: -0.4773
Current minimum: -0.4927
Iteration No: 124 started. Searching for the next optimal point.
[0.10869228652387312, None, 'saga']
Ite



Iteration No: 129 ended. Search finished for the next optimal point.
Time taken: 2.0100
Function value obtained: -0.4773
Current minimum: -0.4927
Iteration No: 130 started. Searching for the next optimal point.
[0.10493108586639154, None, 'sag']
Iteration No: 130 ended. Search finished for the next optimal point.
Time taken: 2.9450
Function value obtained: -0.4787
Current minimum: -0.4927
Iteration No: 131 started. Searching for the next optimal point.
[9.992915339989985, None, 'newton-cg']
Iteration No: 131 ended. Search finished for the next optimal point.
Time taken: 2.0084
Function value obtained: -0.4773
Current minimum: -0.4927
Iteration No: 132 started. Searching for the next optimal point.
[9.992582277259599, None, 'sag']
Iteration No: 132 ended. Search finished for the next optimal point.
Time taken: 2.4801
Function value obtained: -0.4790
Current minimum: -0.4927
Iteration No: 133 started. Searching for the next optimal point.
[9.998538742765408, None, 'saga']
Iteration No: 1



Iteration No: 137 ended. Search finished for the next optimal point.
Time taken: 2.0305
Function value obtained: -0.4773
Current minimum: -0.4927
Iteration No: 138 started. Searching for the next optimal point.
[0.10352606587413607, None, 'saga']
Iteration No: 138 ended. Search finished for the next optimal point.
Time taken: 2.4453
Function value obtained: -0.4719
Current minimum: -0.4927
Iteration No: 139 started. Searching for the next optimal point.
[0.10250445886730777, 'balanced', 'newton-cg']
Iteration No: 139 ended. Search finished for the next optimal point.
Time taken: 2.1323
Function value obtained: -0.4782
Current minimum: -0.4927
Iteration No: 140 started. Searching for the next optimal point.
[0.1, None, 'newton-cg']




Iteration No: 140 ended. Search finished for the next optimal point.
Time taken: 2.1059
Function value obtained: -0.4773
Current minimum: -0.4927
Iteration No: 141 started. Searching for the next optimal point.
[0.1026249892735415, None, 'sag']
Iteration No: 141 ended. Search finished for the next optimal point.
Time taken: 2.5755
Function value obtained: -0.4771
Current minimum: -0.4927
Iteration No: 142 started. Searching for the next optimal point.
[0.1, None, 'newton-cg']




Iteration No: 142 ended. Search finished for the next optimal point.
Time taken: 2.1093
Function value obtained: -0.4773
Current minimum: -0.4927
Iteration No: 143 started. Searching for the next optimal point.
[9.99601543912335, 'balanced', 'newton-cg']
Iteration No: 143 ended. Search finished for the next optimal point.
Time taken: 2.2233
Function value obtained: -0.4782
Current minimum: -0.4927
Iteration No: 144 started. Searching for the next optimal point.
[9.999940799922385, None, 'newton-cg']
Iteration No: 144 ended. Search finished for the next optimal point.
Time taken: 2.6730
Function value obtained: -0.4773
Current minimum: -0.4927
Iteration No: 145 started. Searching for the next optimal point.
[9.986656786855585, None, 'saga']
Iteration No: 145 ended. Search finished for the next optimal point.
Time taken: 2.7938
Function value obtained: -0.4724
Current minimum: -0.4927
Iteration No: 146 started. Searching for the next optimal point.
[0.10299906535051306, 'balanced', 'saga



Iteration No: 147 ended. Search finished for the next optimal point.
Time taken: 2.8790
Function value obtained: -0.4773
Current minimum: -0.4927
Iteration No: 148 started. Searching for the next optimal point.
[0.11979481274727327, 'balanced', 'lbfgs']
Iteration No: 148 ended. Search finished for the next optimal point.
Time taken: 2.2972
Function value obtained: -0.4566
Current minimum: -0.4927
Iteration No: 149 started. Searching for the next optimal point.
[0.10099259028687159, None, 'lbfgs']
Iteration No: 149 ended. Search finished for the next optimal point.
Time taken: 2.3918
Function value obtained: -0.4608
Current minimum: -0.4927
Iteration No: 150 started. Searching for the next optimal point.
[0.1, None, 'newton-cg']




Iteration No: 150 ended. Search finished for the next optimal point.
Time taken: 2.2208
Function value obtained: -0.4773
Current minimum: -0.4927
