In [None]:
import sys
sys.path.append('../')

import optuna
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

from src.preprocessing import preprocess_data, make_pipeline
from src.train import build_model

TRAIN_SIZE = 0.8

def objective(trial):
    """Optuna objective: fit the pipeline with sampled settings and return R2.

    The training data (module-level ``X_train`` / ``y_train``) is split once
    more into a fitting fold and a validation fold. The validation fold is
    used both as the XGBoost evaluation set and for the returned score.

    Assumes the last pipeline step is an XGBoost scikit-learn style estimator
    registered under the name ``xgb`` (the ``xgb__`` routing below relies on
    that) -- TODO confirm against ``build_model`` / ``make_pipeline``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The R2 score of the fitted pipeline on the validation fold.
    """
    # Keys follow the scikit-learn wrapper naming (n_estimators, reg_lambda,
    # reg_alpha). The names given to the trial ('eta', 'lambda', 'alpha') are
    # kept as before so study.best_params exposes the same keys.
    params = {
        'objective': 'reg:squarederror',
        # 'r2' is not a built-in XGBoost metric; RMSE on a fixed validation
        # fold ranks models the same way R2 does.
        'eval_metric': 'rmse',
        'learning_rate': trial.suggest_float('eta', 0.01, 0.3),
        # Large cap on boosting rounds; early stopping picks the actual count.
        'n_estimators': 100000,
        # NOTE(review): overrides any value configured inside build_model().
        'early_stopping_rounds': 100,
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 10.0),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 10.0),
        'reg_lambda': trial.suggest_float('lambda', 0.1, 10.0),
        'reg_alpha': trial.suggest_float('alpha', 0.0, 10.0),
    }

    Xtrain, Xval, Ytrain, Yval = train_test_split(
        X_train, y_train, train_size=TRAIN_SIZE, random_state=0
    )

    pipeline = make_pipeline(build_model())
    print(pipeline.named_steps)

    # Hyperparameters are estimator settings, not fit arguments: Pipeline.fit
    # only accepts 'step__param' keywords, so they are applied via set_params.
    pipeline.set_params(
        **{f'xgb__{name}': value for name, value in params.items()}
    )

    # The final estimator only ever sees transformed features, so the
    # evaluation set must go through the same preprocessing steps first.
    val_pipeline = pipeline[:-1]
    val_pipeline.fit(Xtrain, Ytrain)
    x_val = val_pipeline.transform(Xval)

    # Fit on the train fold; the transformed validation fold drives early stopping
    pipeline.fit(
        Xtrain,
        Ytrain,
        xgb__eval_set=[(x_val, Yval)],
        xgb__verbose=False,
    )

    # Score on the validation fold (the pipeline transforms Xval itself)
    y_pred = pipeline.predict(Xval)
    return r2_score(Yval, y_pred)
    

# Load the frame returned by preprocess_data() and detach the target column.
X = preprocess_data()
y = X.pop('PolyPwr')

# Hold back a test partition; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    train_size=TRAIN_SIZE,
)

# Quick sanity check: partition shapes and the feature column names.
print(*(part.shape for part in (X_train, X_test)))
print(X_train.columns)


# # Define the objective function for Optuna (earlier xgb.train-based variant, superseded by objective() above; kept for reference)
# def objective(trial):
#     # Define the search space for hyperparameters
#     param = {
#         'objective': 'reg:squarederror',
#         'eval_metric': 'rmse',
#         'eta': trial.suggest_float('eta', 0.01, 0.3),
#         'num_boost_round': 100000, # Fix the boosting round and use early stopping
#         'max_depth': trial.suggest_int('max_depth', 3, 10),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#         'gamma': trial.suggest_float('gamma', 0.0, 10.0),
#         'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 10.0),
#         'lambda': trial.suggest_float('lambda', 0.1, 10.0),
#         'alpha': trial.suggest_float('alpha', 0.0, 10.0),
#     }
    
#     # Split the data into further training and validation sets (three sets are preferable)
#     train_data, valid_data, train_target, valid_target = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    
#     # Convert the data into DMatrix format
#     dtrain = xgb.DMatrix(train_data, label=train_target)
#     dvalid = xgb.DMatrix(valid_data, label=valid_target)
    
#     # Define the pruning callback for early stopping
#     pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'validation-rmse')
    
#     # Train the model with early stopping
#     model = xgb.train(param, dtrain, evals=[(dvalid, 'validation')], early_stopping_rounds=100, callbacks=[pruning_callback])
    
#     # Make predictions on the validation set
#     dtest = xgb.DMatrix(valid_data)
#     y_pred = model.predict(dtest)
    
#     # Calculate the root mean squared error on the validation set
#     rmse = mean_squared_error(valid_target, y_pred, squared=False)
    
#     return rmse

# Run the hyperparameter search; objective() returns R2, so the study maximizes.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)  # n_trials bounds how many configurations are tried

# Report the best hyperparameters and the best R2 found by the study
best_params, best_r2 = study.best_params, study.best_value
print("Best Hyperparameters: ", best_params)
print("Best R2: ", best_r2)
