#### Random Search vs Grid Search
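- Conclusion from the experiment results: random search performed better (accuracy 0.8741) than grid search (accuracy 0.8357). Caveat: the two numbers are not measured the same way. The random-search accuracy is the tuned model's self-evaluation, whereas the grid-search accuracy is the mean 3-fold cross-validation score, and the grid only covers a subset of the hyperparameters explored by the random search.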

In [None]:
# Random search over the hyperparameters of a gradient boosted trees model.
tuner = tfdf.tuner.RandomSearch(num_trials=1000)

# Hyperparameters declared directly on the tuner (no parent condition).
for _key, _candidates in (
    ("min_examples", [2, 5, 7, 10]),
    ("categorical_algorithm", ["CART", "RANDOM"]),
):
    tuner.choice(_key, _candidates)

# `max_depth` is declared on the space returned for growing_strategy=LOCAL.
local_search_space = tuner.choice("growing_strategy", ["LOCAL"])
local_search_space.choice("max_depth", [3, 4, 5, 6, 8])

# `max_num_nodes` is declared on the space returned for
# growing_strategy=BEST_FIRST_GLOBAL; `merge=True` is passed because
# "growing_strategy" has already been declared above.
global_search_space = tuner.choice(
    "growing_strategy", ["BEST_FIRST_GLOBAL"], merge=True
)
global_search_space.choice("max_num_nodes", [16, 32, 64, 128, 256])

# `use_hessian_gain` is currently not part of the search space.
for _key, _candidates in (
    ("shrinkage", [0.02, 0.05, 0.10, 0.15]),
    ("num_candidate_attributes_ratio", [0.2, 0.5, 0.9, 1.0]),
):
    tuner.choice(_key, _candidates)

# Split axis: axis-aligned splits, or sparse oblique splits with their own
# nested hyperparameters.
tuner.choice("split_axis", ["AXIS_ALIGNED"])
oblique_space = tuner.choice("split_axis", ["SPARSE_OBLIQUE"], merge=True)
oblique_space.choice(
    "sparse_oblique_normalization", ["NONE", "STANDARD_DEVIATION", "MIN_MAX"]
)
oblique_space.choice("sparse_oblique_weights", ["BINARY", "CONTINUOUS"])
oblique_space.choice("sparse_oblique_num_projections_exponent", [1.0, 1.5])

# Train with the tuner attached; the validation dataset is passed to `fit`.
tuned_model = tfdf.keras.GradientBoostedTreesModel(tuner=tuner)
tuned_model.fit(train_ds, validation_data=valid_ds, verbose=0)

# Report the evaluation exposed by the model inspector.
tuned_self_evaluation = tuned_model.make_inspector().evaluation()
print(f"Accuracy: {tuned_self_evaluation.accuracy} Loss:{tuned_self_evaluation.loss}")

Accuracy: 0.8741258978843689 
Loss:0.7747772932052612

In [None]:
import numpy as np
import pandas as pd
# Ground-truth labels of the validation frame and the tuned model's
# predictions on the validation dataset, rounded to the nearest integer.
y_true = valid_df['Survived'].values
y_pred = tuned_model.predict(valid_ds).round()

In [None]:
from sklearn.metrics import roc_curve, auc

# Unrounded model outputs on the validation dataset, flattened to 1-D so they
# can be used as ROC scores.
y_prob = tuned_model.predict(valid_ds).flatten()

# ROC operating points and the area under the resulting curve.
fpr, tpr, _ = roc_curve(y_true, y_prob)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve together with the chance-level diagonal.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()

### Grid Search


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_decision_forests as tfdf
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

class TFDFWrapper(BaseEstimator, ClassifierMixin):
    """scikit-learn style classifier backed by a TF-DF gradient boosted trees model.

    The constructor arguments are stored as-is and forwarded, under the same
    names, to ``tfdf.keras.GradientBoostedTreesModel`` every time ``fit`` is
    called, which lets tools such as ``GridSearchCV`` clone and re-parameterize
    the estimator.

    Parameters
    ----------
    min_examples, categorical_algorithm, max_depth, shrinkage
        Hyperparameters passed through to ``GradientBoostedTreesModel``.

    Attributes
    ----------
    model
        The fitted ``GradientBoostedTreesModel``; ``None`` until ``fit`` runs.
    classes_ : numpy.ndarray
        Sorted unique labels seen by ``fit``. scikit-learn expects fitted
        classifiers to expose this attribute (scorers may read it).
    """

    # Name of the label column added to the training frame before it is
    # converted to a TensorFlow dataset.
    _LABEL = "Survived"

    def __init__(self, min_examples=2, categorical_algorithm="CART", max_depth=6, shrinkage=0.1):
        self.min_examples = min_examples
        self.categorical_algorithm = categorical_algorithm
        self.max_depth = max_depth
        self.shrinkage = shrinkage
        self.model = None

    def _to_dataset(self, X, y=None):
        """Convert a feature frame (and optional labels) to a tokenized TF dataset."""
        if y is None:
            dataset = tfdf.keras.pd_dataframe_to_tf_dataset(X)
        else:
            frame = X.copy()  # never mutate the caller's frame
            # Assign labels by position (scikit-learn's X/y contract) rather
            # than by pandas index alignment, which could silently insert NaN
            # when the two indexes differ.
            frame[self._LABEL] = np.asarray(y)
            dataset = tfdf.keras.pd_dataframe_to_tf_dataset(frame, label=self._LABEL)
        return dataset.map(tokenize_names)

    def fit(self, X, y):
        """Train a fresh gradient boosted trees model on ``X`` / ``y``.

        Returns ``self`` so calls can be chained, as scikit-learn expects.
        """
        # Fitted classifiers must expose the sorted set of training labels.
        self.classes_ = np.unique(y)

        self.model = tfdf.keras.GradientBoostedTreesModel(
            min_examples=self.min_examples,
            categorical_algorithm=self.categorical_algorithm,
            max_depth=self.max_depth,
            shrinkage=self.shrinkage
        )
        self.model.fit(self._to_dataset(X, y), verbose=0)
        return self

    def predict_proba(self, X):
        """Return the model's unrounded outputs, one column per class.

        When the underlying model emits a single column, it is expanded to two
        columns ``[1 - p, p]``.
        """
        raw = self.model.predict(self._to_dataset(X))
        if raw.ndim == 2 and raw.shape[1] == 1:
            # NOTE(review): assumes the single column is the positive-class
            # probability of a binary task - confirm against the TF-DF docs.
            positive = raw[:, 0]
            return np.column_stack([1.0 - positive, positive])
        return raw

    def predict(self, X):
        """Return the model outputs rounded to the nearest integer, as a flat array."""
        predictions = self.model.predict(self._to_dataset(X))
        return predictions.round().flatten()  # flat 1-D output for sklearn metrics

    def score(self, X, y, sample_weight=None):
        """Return the accuracy of ``predict(X)`` against ``y``.

        ``sample_weight`` is optional and mirrors ``ClassifierMixin.score``.
        """
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)

# Hold out 20% of the preprocessed training data, with a fixed seed.
train_df, valid_df = train_test_split(
    preprocessed_train_df,
    test_size=0.2,
    random_state=42,
)

# Labels and feature columns handed to the grid search.
y_train = train_df['Survived']
X_train = train_df[input_features]

# Candidate values per hyperparameter; the grid search tries every combination.
param_grid = dict(
    min_examples=[2, 5, 7, 10],
    categorical_algorithm=["CART", "RANDOM"],
    max_depth=[3, 4, 5, 6, 8],
    shrinkage=[0.02, 0.05, 0.10, 0.15],
)

# Exhaustive search scored by accuracy with 3-fold cross-validation.
grid_search = GridSearchCV(
    estimator=TFDFWrapper(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validated score.
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best accuracy: {grid_search.best_score_}")


In [None]:
Best parameters: {'categorical_algorithm': 'RANDOM', 'max_depth': 3, 'min_examples': 7, 'shrinkage': 0.02}
Best accuracy: 0.8357030575943457

In [None]:
import numpy as np
import pandas as pd

# Estimator refitted by GridSearchCV with the best parameter combination.
best_model = grid_search.best_estimator_

# Validation labels, and the best estimator's predictions on the validation
# features, rounded to the nearest integer.
y_true = valid_df['Survived'].values
y_pred = best_model.predict(valid_df[input_features]).round()

# Quick side-by-side look at the first ten labels and predictions.
print(f"True values: {y_true[:10]}")
print(f"Predictions: {y_pred[:10]}")

In [None]:
# `best_model.predict` returns rounded predictions, so using it here would
# yield hard 0/1 values rather than scores (a ROC curve built from them
# collapses to a single operating point). Query the underlying TF-DF model
# (`best_model.model`) for its unrounded outputs instead, converting the
# features the same way `TFDFWrapper.predict` does.
valid_features_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    valid_df[input_features]
).map(tokenize_names)
y_prob = best_model.model.predict(valid_features_ds).flatten()