# 📚 **Installation**

In [1]:
import bz2
import time
import mlflow
import joblib
import optuna
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import optuna.visualization as ov
from bokeh.io import export_svgs
from bokeh.layouts import row, gridplot
from bokeh.plotting import figure, show
from bokeh.palettes import viridis, cividis
from bokeh.models import ColumnDataSource, Range1d
from sklearn.inspection import PartialDependenceDisplay, permutation_importance
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyRegressor
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict, cross_val_score
from textwrap import wrap
from matplotlib.cm import get_cmap
from sklearn.impute import SimpleImputer
from optuna.exceptions import TrialPruned

import ipywidgets

from sklearn.preprocessing import (StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler,
                                   OneHotEncoder, OrdinalEncoder, PolynomialFeatures, 
                                   QuantileTransformer,  PowerTransformer)
import mlflow.sklearn

from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, QuantileTransformer, PowerTransformer, OneHotEncoder, OrdinalEncoder

from optuna.integration import MLflowCallback
from sklearn.experimental import enable_hist_gradient_boosting
from category_encoders import TargetEncoder, BinaryEncoder, HashingEncoder, HelmertEncoder

from tqdm.notebook import tqdm
from tqdm import tqdm



# ⚠️ **Warnings**

In [2]:
import logging
import warnings
logging.basicConfig(level=logging.INFO)

logging.disable(logging.WARNING)
optuna.logging.disable_default_handler()
pd.options.mode.chained_assignment = None 
optuna.logging.set_verbosity(optuna.logging.ERROR)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

# 📂 **Data**

In [3]:
# Create a DataFrame from CSV data
result_df = pd.read_csv("FE_golf.csv")

# 🍽️ **Pre-Processing**

In [4]:
# Assume result_df is already defined and preprocessed up to this point
result_df = result_df[result_df['from_location_scorer'] == 'Green']
result_df = result_df.drop(columns=['from_location_scorer', 'from_location_laser'])

# Feature columns and target column
feature_cols = ["ISS", "ISC", 'slope', 'elevation',
                'putting_dist_from_center_bins','putting_distance_to_pin_bins','putting_dist_from_edge_bins', 'CSI', 'round', 'first_putt', 'par_value',
                'distance_to_pin', 'dist_from_edge', 'pin_minus_edge', 'SSI', 'hole_completeness', 
                'dist_from_center']
categorical_cols = ['round', 'par_value', 'slope', 
                    'elevation', 'putting_dist_from_center_bins','putting_dist_from_edge_bins','putting_distance_to_pin_bins', 'first_putt']
numerical_cols = ["ISS", "ISC", "distance_to_pin", "dist_from_edge", "pin_minus_edge", 
                  "SSI", "hole_completeness", "dist_from_center"]

# 🔪 **Split**

In [5]:
X = result_df[feature_cols]
y = result_df['strokes_to_hole_out']

# Extract the columns for stratification
stratify_cols = ['putting_distance_to_pin_bins','putting_dist_from_center_bins','putting_dist_from_edge_bins']
stratify_data = result_df[stratify_cols]

# Split the data, using 'stratify_data' for stratification
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=stratify_data)

# 🔄 **MLFlow**

Navigate to the terminal and create an MLFlow instance:

``` console

mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./mlruns --host 0.0.0.0 --port 8080

```

In [6]:
mlflow.set_tracking_uri("http://localhost:8080")

experiment_name = "Final Sunday FunDay Base GradientBoostingRegressor"

mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='file:///C:/Users/decla/mlruns/8', creation_time=1716758231508, experiment_id='8', last_update_time=1716758231508, lifecycle_stage='active', name='Final Sunday FunDay Base GradientBoostingRegressor', tags={}>

# 🧰 **Modularize**

In [7]:
def get_scaler(trial):
    scaler_type = trial.suggest_categorical('scaler', ['standard', 'minmax', 'maxabs', 'robust', 'quantile', 'power'])
    if scaler_type == 'standard':
        return StandardScaler()
    elif scaler_type == 'minmax':
        return MinMaxScaler()
    elif scaler_type == 'maxabs':
        return MaxAbsScaler()
    elif scaler_type == 'robust':
        return RobustScaler()
    elif scaler_type == 'quantile':
        return QuantileTransformer(output_distribution='normal', n_quantiles=trial.suggest_int('n_quantiles', 10, 1000))
    elif scaler_type == 'power':
        return PowerTransformer()
    else:
        raise ValueError("Unknown scaler type")


def get_encoder(trial):
    encoder_type = trial.suggest_categorical('encoder', ['onehot', 'ordinal', 'target', 'binary', 'hashing', 'helmert'])
    if encoder_type == 'onehot':
        return OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)
    elif encoder_type == 'ordinal':
        return OrdinalEncoder()
    elif encoder_type == 'target':
        return TargetEncoder()
    elif encoder_type == 'binary':
        return BinaryEncoder()
    elif encoder_type == 'hashing':
        return HashingEncoder()
    elif encoder_type == 'helmert':
        return HelmertEncoder()
    else:
        raise ValueError("Unknown encoder type")

def get_imputer(strategy, fill_value=None):
    if strategy == 'constant' and fill_value is not None:
        return SimpleImputer(strategy=strategy, fill_value=fill_value)
    else:
        return SimpleImputer(strategy=strategy)

# 🚂 **Train**

In [8]:
# Define the objective function with MLflow logging
def objective(trial):
    try:
        with mlflow.start_run(nested=True):

            # Suggest scaler type
            scaler = get_scaler(trial)
            # Suggest encoder type
            encoder =  get_encoder(trial)

            numerical_imputer_strategy = trial.suggest_categorical(
                'numerical_imputer_strategy', ['mean', 'median', 'most_frequent', 'constant']
            )
            categorical_imputer_strategy = trial.suggest_categorical(
                'categorical_imputer_strategy', ['most_frequent', 'constant']
            )
            
            numerical_imputer = get_imputer(numerical_imputer_strategy, fill_value=-1)
            categorical_imputer = get_imputer(categorical_imputer_strategy, fill_value='missing')
            
            # Define the ColumnTransformer with the chosen scaler and imputer for numerical columns
            preprocessor = ColumnTransformer(
                transformers=[
                    ('num', Pipeline(steps=[
                        ('imputer', numerical_imputer),
                        ('scaler', scaler)
                    ]), numerical_cols),
                    ('cat', Pipeline(steps=[
                        ('imputer', categorical_imputer),
                        ('encoder', encoder)
                    ]), categorical_cols)
                ],
                remainder='drop'  # Drop columns not specified in transformers
            )

            # Feature selection (optional)
            feature_selector_type = trial.suggest_categorical('feature_selector', ['none', 'kbest', 'model'])
            if feature_selector_type == 'kbest':
                from sklearn.feature_selection import SelectKBest, f_regression
                feature_selector = SelectKBest(score_func=f_regression, k=trial.suggest_int('k', 5, 20))
                preprocessor.transformers.append(('feature_selector', feature_selector, []))  # Add empty list for columns
            elif feature_selector_type == 'model':
                from sklearn.feature_selection import SelectFromModel
                feature_selector = SelectFromModel(estimator=GradientBoostingRegressor(n_estimators=50))
                preprocessor.transformers.append(('feature_selector', feature_selector, []))  # Add empty list for columns

            # Polynomial features (optional)
            poly_degree = trial.suggest_int('poly_degree', 1, 3)
            if poly_degree > 1:
                preprocessor.transformers.append(('poly', PolynomialFeatures(degree=poly_degree, include_bias=False), []))  # Add empty list for columns

            # Suggest hyperparameters for the model
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 150, 600),
                "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.05),
                "max_depth": trial.suggest_int("max_depth", 3, 7),
                "subsample": trial.suggest_uniform("subsample", 0.4, 0.8),
                "min_samples_split": trial.suggest_int("min_samples_split", 4, 8),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 3, 7),
                "max_features": trial.suggest_uniform("max_features", 0.4, 0.99),
            }
            
            # Create the pipeline
            model = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('regressor', GradientBoostingRegressor(**params, random_state=42))
            ])

            # Use StratifiedKFold with 'putting_distance_to_pin_bins' for nested cross-validation
            outer_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            outer_scores = []

            for outer_fold, (outer_train_idx, outer_val_idx) in enumerate(outer_kf.split(X, X['putting_distance_to_pin_bins'])):
                # Add a time.sleep() call to slow down the execution
                # time.sleep(0.1)  # Adjust the sleep time as needed for tqdm!
                X_outer_train, X_outer_val = X.iloc[outer_train_idx], X.iloc[outer_val_idx]
                y_outer_train, y_outer_val = y.iloc[outer_train_idx], y.iloc[outer_val_idx]
                
                # Use StratifiedKFold with 'putting_distance_to_pin_bins' for inner cross-validation
                inner_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
                val_scores = []
                
                for inner_fold, (inner_train_idx, inner_val_idx) in enumerate(inner_kf.split(X_outer_train, X_outer_train['putting_distance_to_pin_bins'])):
                    X_inner_train, X_inner_val = X_outer_train.iloc[inner_train_idx], X_outer_train.iloc[inner_val_idx]
                    y_inner_train, y_inner_val = y_outer_train.iloc[inner_train_idx], y_outer_train.iloc[inner_val_idx]
                    
                    model.fit(X_inner_train, y_inner_train)
                    preds = model.predict(X_inner_val)
                    val_score = mean_absolute_error(y_inner_val, preds)
                    # val_score = mean_squared_error(y_inner_val, preds)
                    val_scores.append(val_score)
                    
                    # Report the intermediate value
                    trial.report(val_score, inner_fold)
                    
                    # Check for pruning
                    if trial.should_prune():
                        mlflow.log_metric('pruned', 1)
                        raise TrialPruned()

                mean_inner_val_score = np.mean(val_scores)
                outer_scores.append(mean_inner_val_score)
                
                # Log the outer fold score
                mlflow.log_metric(f'outer_fold_{outer_fold}_score', mean_inner_val_score)

            mean_outer_score = np.mean(outer_scores)
            
            # Log parameters and metrics to MLflow
            params_to_log = trial.params
            for param_name, param_value in params_to_log.items():
                try:
                    mlflow.log_param(param_name, param_value)
                except mlflow.exceptions.MlflowException:
                    pass  # Ignore if the parameter is already logged

            mlflow.log_metric('mean_outer_score', mean_outer_score)

            # Log trained model
            with bz2.BZ2File('model.joblib.bz2', 'wb', compresslevel=9) as f:
                joblib.dump(model, f)
            mlflow.log_artifact("model.joblib.bz2")
            
            # Return mean_outer_score
            return mean_outer_score
    
    except TrialPruned:
        mlflow.log_metric('pruned', 1)
        raise
    except Exception as e:
        # Log the error if needed
        mlflow.log_metric('failed', 1)
        mlflow.log_param('error_message', str(e))
        print(f"Trial failed with exception: {e}")
        return None

# Create and run the Optuna study with pruning
pruner = optuna.pruners.MedianPruner()
sampler = optuna.samplers.CmaEsSampler()

study = optuna.create_study(direction='minimize', 
                            sampler=sampler,
                            pruner=pruner)

trials_todo = 50

with tqdm(total=trials_todo) as pbar:
    def update_progress(study, trial):
        pbar.update(1)

    study.optimize(objective, n_trials=trials_todo, callbacks=[update_progress])

# Print the best hyperparameters and scaler
print('Best trial:')
best_trial = study.best_trial

# Log best trial information
mlflow.log_metric('best_val_score', best_trial.value)
for key, value in best_trial.params.items():
    try:
        mlflow.log_param(key, value)
    except mlflow.exceptions.MlflowException:
        pass  # Ignore if the parameter is already logged

print('  Value: {:.4f}'.format(best_trial.value))
print('  Params: ') 
for key, value in best_trial.params.items():
    print('    {}: {}'.format(key, value))


100%|██████████| 50/50 [30:59<00:00, 37.19s/it]   


Best trial:
  Value: 0.0365
  Params: 
    scaler: standard
    encoder: binary
    numerical_imputer_strategy: most_frequent
    categorical_imputer_strategy: constant
    feature_selector: none
    poly_degree: 2
    n_estimators: 425
    learning_rate: 0.019636965052313247
    max_depth: 4
    subsample: 0.5670754430253833
    min_samples_split: 6
    min_samples_leaf: 3
    max_features: 0.7578529102412059


# 🏁 **Results**

In [9]:
ov.plot_optimization_history(study)

In [10]:
ov.plot_param_importances(study)

In [13]:
ov.plot_contour(study, params=["encoder", "n_estimators"])