In [65]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import f1_score
from skopt.space import Real, Integer, Categorical
from skopt import gp_minimize

import utils.dev_config as dev_conf
import utils.preprocessing as prep
import utils.optimization as opt

In [2]:
dirs = dev_conf.get_dev_directories("../dev_paths.txt")
unified_dsets = ["unified_cervical_data", "unified_uterine_data", "unified_uterine_endometrial_data"]

In [70]:
dset_idx = 0

In [71]:
seed = 123
rand = np.random.RandomState()

In [72]:
condition_map = {"healthy": 0, "tumor": 1}

# Load normalized matrisome count data

In [73]:
norm_matrisome_counts_df = pd.read_csv(f"{dirs.data_dir}/{unified_dsets[dset_idx]}/norm_matrisome_counts.tsv", sep='\t')
counts_t_df = prep.transpose_df(
    norm_matrisome_counts_df, "geneID", "sample_name"
)
print(counts_t_df.shape)
# norm_filtered_matrisome_counts_t_df.head()

(272, 1009)


In [74]:
coldata_df = pd.read_csv(f"{dirs.data_dir}/{unified_dsets[dset_idx]}/coldata.tsv", sep="\t")

In [75]:
joined_df = (
    pd.merge(coldata_df, counts_t_df, on="sample_name")
        .set_index("sample_name")
        .drop("data_source", axis=1)
        .assign(condition = lambda df: df.condition.apply(lambda x: condition_map[x]))
)

In [76]:
joined_df.condition.value_counts()

1    259
0     13
Name: condition, dtype: int64

# Optimize model

In [77]:
rand.seed(seed)
x_df, y_df = prep.shuffle_data(joined_df, rand)

## Get baselines

In [87]:
rand.seed(seed)

label_value_counts_df = (
    pd.DataFrame(y_df.condition.value_counts()).reset_index()
        .rename(columns={"index": "label", "condition": "n"})
        .sort_values("n", ascending=False)
)

most_frequent_label = label_value_counts_df.label[0]
most_frequent_baseline = f1_score(y_df.values.squeeze(), np.repeat(most_frequent_label, y_df.shape[0]), average="macro")

mc_baseline = opt.mc_classification_baseline(
    y=y_df.values.squeeze(),
    labels=label_value_counts_df.label.values,
    weights=label_value_counts_df.n.values / label_value_counts_df.n.values.sum(),
    metric=lambda y, yhat: f1_score(y, yhat, average="macro"),
    n=1001
)

print(f"Most frequent baseline: {most_frequent_baseline}")
print(f"Monte Carlo baseline: {mc_baseline.mean()}")

Most frequent baseline: 0.487758945386064
Monte Carlo baseline: 0.4998257180356517


In [80]:
model = GradientBoostingClassifier()
# model = GaussianNB()
cv_res = cross_val_score(model, x_df.values, y_df.values.squeeze(), scoring="f1_macro")

In [81]:
cv_res

array([1.        , 1.        , 0.82857143, 0.89514563, 1.        ])

## SMBO

In [82]:
def objective(h_params, X, y, loss_default, scoring_default, r, verbose=True):
    if verbose:
        print(h_params)
    model = GradientBoostingClassifier(
        loss=loss_default,
        learning_rate=h_params[0],
        n_estimators=h_params[1],
        max_depth=h_params[2],
        max_features=h_params[3],
        min_samples_split=h_params[4],
        min_samples_leaf=h_params[5],
        random_state=r
    )
    return -np.mean(cross_val_score(
        model,
        X,
        y,
        cv=KFold(n_splits=5),
        n_jobs=-1,
        scoring=scoring_default
    ))

In [83]:
space = [
    Real(1e-3, 1e-1, name="learning_rate"),
    Integer(int(1e2), int(1e3), name="n_estimators"),
    Integer(2, 5, name="max_depth"),
    Categorical(["auto", "sqrt", "log2"], name="max_features"),
    Integer(int(2), int(6), name="min_samples_split"),
    Integer(int(1), int(3), name="min_samples_leaf")
]
n_initial = 10 * len(space)
n_calls = 50 * len(space)

In [85]:
loss_default = "deviance"
scoring_default = "f1_macro"
callback_file = f"{unified_dsets[dset_idx]}_opt_gbc_cancer_y_n_h_params_{scoring_default}.tsv"

try:
    os.remove(callback_file)
except OSError:
    pass

res = gp_minimize(
    lambda h_ps: objective(h_ps, x_df, y_df, loss_default, scoring_default, rand),
    space,
    verbose=True,
    random_state=rand,
    n_initial_points=n_initial,
    n_calls=n_calls,
    n_jobs=-1,
    callback=lambda x: opt.save_callback(x, callback_file, n = 5, sep="\t")
)

Iteration No: 1 started. Evaluating function at random point.
[0.04393641609527331, 488, 3, 'log2', 3, 2]
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 1.4356
Function value obtained: -0.9879
Current minimum: -0.9879
Iteration No: 2 started. Evaluating function at random point.
[0.08944552714859637, 950, 4, 'log2', 2, 2]
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 1.6716
Function value obtained: -0.9879
Current minimum: -0.9879
Iteration No: 3 started. Evaluating function at random point.
[0.04206779498340956, 880, 3, 'log2', 6, 2]
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 1.5022
Function value obtained: -0.9879
Current minimum: -0.9879
Iteration No: 4 started. Evaluating function at random point.
[0.06167655805053381, 209, 4, 'log2', 4, 2]
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.7266
Function value obtained: -0.9879
Current minimum: -0.9879
Iteration No: 5 started. Evaluating function

KeyboardInterrupt: 