In [29]:
import utils
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
import joblib
from skrub import TableVectorizer
import optuna
from sklearn.model_selection import cross_val_score

In [30]:
# Load the modelling inputs through the project's `utils` helper.
# Presumably X / y are the training features / target and X_test the
# held-out features — the helper is not shown here, TODO confirm.
# NOTE(review): this call emits pandas chained-assignment warnings (see the
# cell output); the `df["col"][rows] = value` pattern they point at lives in
# `utils` and should become `df.loc[rows, "col"] = value` there.
X, y, X_test = utils.get_and_process_data()

  (non_nan_values - value).abs().argmin()
  (non_nan_values - value).abs().argmin()
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["log_bike_count"][
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/inde

In [45]:
def create_pipeline_TV(df, model=None, params=None):
    """Build an unfitted ``TableVectorizer`` -> regressor pipeline.

    Parameters
    ----------
    df : object exposing ``.columns``
        Only ``df.columns`` is read; every column listed there is routed
        through the ``TableVectorizer``.
    model : estimator, optional
        Final step of the pipeline. When ``None``, an ``XGBRegressor`` is
        built from the tuned defaults below, updated with ``params``.
    params : dict, optional
        Overrides merged on top of the tuned defaults (for example
        ``study.best_params``). Keys that are not overridden, such as
        ``random_state``, keep their default value. Ignored when ``model``
        is supplied.

    Returns
    -------
    Pipeline
        Two steps: ``"preprocessor"`` (the ``ColumnTransformer``) and
        ``"model"``.
    """
    # Apply TableVectorizer to all columns of ``df``.
    preprocessor = ColumnTransformer(
        transformers=[("vectorizer", TableVectorizer(), df.columns)]
    )

    # Use the provided model or initialize a default XGBRegressor.
    if model is None:
        # Best params XGBoost from optuna; only built when actually needed.
        xgb_params = {
            "n_estimators": 190,
            "max_depth": 6,
            "learning_rate": 0.11981406065938047,
            "subsample": 0.6544353679306312,
            "colsample_bytree": 0.6106475983764972,
            "random_state": 42,
        }
        if params:
            # Caller-supplied values win over the tuned defaults.
            xgb_params.update(params)
        model = XGBRegressor(**xgb_params)

    return Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

In [27]:
# Optuna objective: cross-validated RMSE of an XGBoost pipeline (lower is better)
def objective(trial, X, y):
    """Score a single Optuna trial.

    Samples XGBoost hyperparameters from ``trial``, wraps the resulting
    regressor with ``create_pipeline_TV`` and returns the square root of the
    mean cross-validated MSE (``cv=5``).
    """
    # Search space; the suggest calls run in the order written here.
    sampled = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
    }

    # Fixed seed so trials differ only by their sampled hyperparameters.
    regressor = XGBRegressor(random_state=42, **sampled)
    candidate = create_pipeline_TV(df=X, model=regressor)

    # The scorer yields negated MSE per fold, hence the sign flip below.
    neg_mse_per_fold = cross_val_score(
        candidate, X, y, cv=5, scoring="neg_mean_squared_error"
    )

    mean_mse = -neg_mse_per_fold.mean()
    return mean_mse ** 0.5

# Run the Optuna study
def run_optuna(X, y, n_trials=50, timeout=None, seed=None):
    """Run a hyperparameter search that minimizes ``objective`` (RMSE).

    Parameters
    ----------
    X, y
        Passed unchanged to ``objective`` on every trial.
    n_trials : int, default 50
        Maximum number of trials to run.
    timeout : float, optional
        Time budget in seconds forwarded to ``study.optimize``. Setting it
        lets a slow search stop on its own and still return the study,
        instead of being interrupted by hand before the study is returned.
    seed : int, optional
        When given, the study uses a ``TPESampler`` seeded with this value
        so the sequence of sampled hyperparameters is repeatable. When
        ``None``, Optuna's default sampler is used, as before.

    Returns
    -------
    optuna.study.Study
        The study after optimization.
    """
    # Only override the sampler when a seed is requested, so the default
    # call behaves exactly like the previous hard-coded version.
    sampler = optuna.samplers.TPESampler(seed=seed) if seed is not None else None
    study = optuna.create_study(direction="minimize", sampler=sampler)  # Minimize RMSE
    study.optimize(
        lambda trial: objective(trial, X, y),
        n_trials=n_trials,
        timeout=timeout,
    )
    return study

In [28]:
# Launch the hyperparameter search on the full training data.
# NOTE(review): the logged trials took roughly three minutes each and the
# recorded run ended in a KeyboardInterrupt, in which case `study` is never
# bound and the cells that read it will fail on a fresh kernel.
study = run_optuna(X, y)

[I 2024-12-11 15:26:54,052] A new study created in memory with name: no-name-f10783f7-c8c2-4705-91fa-87e59c9520d7
[I 2024-12-11 15:29:35,755] Trial 0 finished with value: 1.0468615261982743 and parameters: {'n_estimators': 55, 'max_depth': 8, 'learning_rate': 0.02478694950047872, 'subsample': 0.9483075594644363, 'colsample_bytree': 0.9328490240627512}. Best is trial 0 with value: 1.0468615261982743.
[I 2024-12-11 15:33:06,138] Trial 1 finished with value: 0.9064045632955889 and parameters: {'n_estimators': 280, 'max_depth': 4, 'learning_rate': 0.06253951751330068, 'subsample': 0.6925060935043361, 'colsample_bytree': 0.7483899713599684}. Best is trial 1 with value: 0.9064045632955889.
[I 2024-12-11 15:36:12,121] Trial 2 finished with value: 0.8929402019598959 and parameters: {'n_estimators': 224, 'max_depth': 5, 'learning_rate': 0.09576925928683748, 'subsample': 0.9337786083560946, 'colsample_bytree': 0.733628654492804}. Best is trial 2 with value: 0.8929402019598959.
[I 2024-12-11 15:3

KeyboardInterrupt: 

In [None]:
# Best parameters and score found by the study. The objective returns an
# RMSE and the study minimizes it, so a lower "Best Score" is better.
# NOTE(review): requires `study` from the search cell; the recorded search
# was interrupted, so confirm `study` exists before running this.
print("Best Parameters:", study.best_params)
print("Best Score:", study.best_value)

In [None]:
# Tuned XGBoost hyperparameters kept for reference.
# NOTE(review): these values duplicate the dict hard-coded inside
# `create_pipeline_TV`, except that this copy has no "random_state" entry,
# and no visible cell reads this module-level name — confirm whether it is
# still needed or should become the single source of truth.
best_params_XGBoost = {
    "n_estimators": 190,
    "max_depth": 6,
    "learning_rate": 0.11981406065938047,
    "subsample": 0.6544353679306312,
    "colsample_bytree": 0.6106475983764972,
}

In [46]:
# Build the final pipeline; no model is passed, so create_pipeline_TV falls
# back to its default XGBRegressor with the tuned hyperparameters.
pipeline = create_pipeline_TV(X)

In [47]:
# Fit the preprocessing step and the regressor on the training data.
pipeline.fit(X, y)

In [48]:
# Hand the fitted pipeline and the test features to the project's helper.
# Presumably it predicts on X_test and builds the submission — the helper is
# not shown here, TODO confirm (including whether it refits the pipeline).
submission = utils.test_fit_and_submission(X_test, pipeline)