### Bayesian Optimization vs Random Search
- Conclusion: Based on accuracy, the original code using random search performs better than Bayesian optimization.

In [None]:
# Keep TFDFWrapper the same as the one used in GridSearch

class TFDFWrapper(BaseEstimator, ClassifierMixin):
    """scikit-learn style wrapper around ``tfdf.keras.GradientBoostedTreesModel``.

    The wrapper lets scikit-learn compatible search tools tune a TF-DF
    gradient boosted trees model. Input frames are converted to TF datasets
    and passed through ``tokenize_names`` (defined outside this class) before
    they reach the model.

    Parameters
    ----------
    min_examples, categorical_algorithm, max_depth, shrinkage
        Forwarded unchanged to ``tfdf.keras.GradientBoostedTreesModel``.

    Attributes
    ----------
    model
        The fitted TF-DF model, or ``None`` before ``fit`` is called.
    classes_
        Sorted unique label values seen during ``fit``.
    """

    def __init__(self, min_examples=2, categorical_algorithm="CART", max_depth=6, shrinkage=0.1):
        # Hyperparameters are stored verbatim under their own names, which is
        # what BaseEstimator.get_params()/set_params() rely on.
        self.min_examples = min_examples
        self.categorical_algorithm = categorical_algorithm
        self.max_depth = max_depth
        self.shrinkage = shrinkage
        self.model = None

    def fit(self, X, y):
        """Train a fresh gradient boosted trees model on ``X`` / ``y``.

        ``X`` is a pandas DataFrame of features; ``y`` holds the matching
        labels in the same row order. Returns ``self``.
        """
        import numpy as np

        # Assign labels positionally: a pandas Series would otherwise be
        # aligned on its index, producing NaN/misaligned labels whenever the
        # index of ``y`` differs from the index of ``X``.
        labels = np.asarray(y)
        train_df = X.copy()
        train_df['Survived'] = labels

        # Fitted-classifier attribute expected by scikit-learn tooling.
        self.classes_ = np.unique(labels)

        train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label="Survived").map(tokenize_names)

        self.model = tfdf.keras.GradientBoostedTreesModel(
            min_examples=self.min_examples,
            categorical_algorithm=self.categorical_algorithm,
            max_depth=self.max_depth,
            shrinkage=self.shrinkage
        )
        self.model.fit(train_ds, verbose=0)
        return self

    def _raw_scores(self, X):
        """Return the underlying model's un-rounded output for ``X``."""
        if self.model is None:
            # NotFittedError subclasses AttributeError, so callers that caught
            # the previous "'NoneType' has no attribute" failure still work.
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "This TFDFWrapper instance is not fitted yet; call fit() first."
            )
        ds = tfdf.keras.pd_dataframe_to_tf_dataset(X).map(tokenize_names)
        return self.model.predict(ds)

    def predict(self, X):
        """Return hard predictions: the model output rounded, as a flat array."""
        return self._raw_scores(X).round().flatten()  # flattened for sklearn compatibility

    def predict_proba(self, X):
        """Return an ``(n_samples, 2)`` array ``[1 - score, score]``.

        NOTE(review): assumes a binary problem where the model emits a single
        score per row that is the probability of the positive class - confirm
        before using with other label sets.
        """
        import numpy as np

        positive = self._raw_scores(X).flatten()
        return np.column_stack([1.0 - positive, positive])

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against ``y``."""
        y_pred = self.predict(X)
        return accuracy_score(y, y_pred)

# Hold out 20% of the preprocessed training data for validation (fixed seed).
train_df, valid_df = train_test_split(
    preprocessed_train_df,
    random_state=42,
    test_size=0.2,
)

# Target and feature columns fed to the hyperparameter search.
y_train = train_df['Survived']
X_train = train_df[input_features]

In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from skopt import BayesSearchCV
import tensorflow as tf
import tensorflow_decision_forests as tfdf

# Hyperparameter ranges explored by the Bayesian optimizer.
param_space = dict(
    min_examples=(2, 10),                      # integer bounds
    categorical_algorithm=["CART", "RANDOM"],  # categorical choices
    max_depth=(3, 8),                          # integer bounds
    shrinkage=(0.02, 0.15),                    # float bounds
)

# Configure the search: 20 trials, each scored by 3-fold CV accuracy.
bayes_search = BayesSearchCV(
    estimator=TFDFWrapper(),
    search_spaces=param_space,
    scoring='accuracy',
    cv=3,
    n_iter=20,
    random_state=42,
    verbose=1,
    n_jobs=1,  # run sequentially, no parallel workers
)

# Run the optimization on the training split.
bayes_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated accuracy.
print(
    f"Best parameters: {bayes_search.best_params_}",
    f"Best accuracy: {bayes_search.best_score_}",
    sep="\n",
)

In [None]:
Best parameters: OrderedDict([('categorical_algorithm', 'RANDOM'), ('max_depth', 8), ('min_examples', 10), ('shrinkage', 0.021727829765960688)])
Best accuracy: 0.8244512995071446

In [None]:
# Evaluate the best estimator found by the search on the held-out validation split.
best_model = bayes_search.best_estimator_

valid_features = valid_df[input_features]
y_true = valid_df['Survived'].values

# TFDFWrapper.predict() rounds its output, so calling it for y_prob would give
# hard 0/1 labels instead of scores. Query the underlying TF-DF model once for
# the raw outputs and derive the hard predictions from them.
# NOTE(review): raw outputs are presumably P(Survived = 1) - confirm.
valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_features).map(tokenize_names)
y_prob = best_model.model.predict(valid_ds).flatten()
y_pred = y_prob.round()