In [1]:
import utils
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from scipy.sparse import hstack
import pandas as pd
from skrub import TableReport, TableVectorizer
from jours_feries_france import JoursFeries
from vacances_scolaires_france import SchoolHolidayDates
import numpy as np
import joblib

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import optuna
import joblib

def objective(trial, X, y):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 30),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
    }

    # Define the preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            (
                "vectorizer",
                TableVectorizer(),
                list(X.columns),  # Use feature columns dynamically
            )
        ]
    )

    # Define the model and pipeline
    model = RandomForestRegressor(**params, random_state=42)
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

    # Perform cross-validation
    scores = cross_val_score(pipeline, X, y, cv=3, scoring="neg_mean_squared_error", n_jobs=-1)
    return -scores.mean()

def create_pipeline_TV_with_optuna(X, y, n_trials=50):
    # Initialize Optuna study
    study = optuna.create_study(direction="minimize")
    study.optimize(lambda trial: objective(trial, X, y), n_trials=n_trials)

    # Get the best parameters
    best_params = study.best_params
    joblib.dump(best_params, "random_forest_best_params.pkl")  # Save best parameters

    # Create the final pipeline with the best parameters
    preprocessor = ColumnTransformer(
        transformers=[
            (
                "vectorizer",
                TableVectorizer(),
                list(X.columns),  # Use feature columns dynamically
            )
        ]
    )
    model = RandomForestRegressor(**best_params, random_state=42)
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

    return pipeline


In [3]:
X, y, X_test = utils.get_and_process_data()

  (non_nan_values - value).abs().argmin()
  (non_nan_values - value).abs().argmin()
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["log_bike_count"][
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/inde

In [4]:
pipeline = create_pipeline_TV_with_optuna(X,y)

[I 2024-12-11 16:27:46,666] A new study created in memory with name: no-name-75ea7eb3-266a-4150-b0dc-924d0baee376
[I 2024-12-11 16:36:19,764] Trial 0 finished with value: 0.9951861063458726 and parameters: {'n_estimators': 386, 'max_depth': 26, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.9951861063458726.
[I 2024-12-11 16:37:22,058] Trial 1 finished with value: 1.8079099358469743 and parameters: {'n_estimators': 357, 'max_depth': 4, 'min_samples_split': 13, 'min_samples_leaf': 7, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 0 with value: 0.9951861063458726.
[I 2024-12-11 16:39:18,846] Trial 2 finished with value: 1.3338209076415353 and parameters: {'n_estimators': 250, 'max_depth': 14, 'min_samples_split': 13, 'min_samples_leaf': 4, 'max_features': 'log2', 'bootstrap': False}. Best is trial 0 with value: 0.9951861063458726.
[I 2024-12-11 16:40:05,228] Trial 3 finished with value: 2.05865232785829

KeyboardInterrupt: 

In [23]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import VotingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline

# Define GPU-enabled regressors
xgb_regressor = XGBRegressor(tree_method='gpu_hist', gpu_id=0, random_state=42)
lgbm_regressor = LGBMRegressor(device='gpu', gpu_device_id=0, random_state=42)

# Define CPU-based Random Forest
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Columns to exclude from scaling
exclude_columns = ["is_weekend", "is_school_holiday", "road_work", "confinement", "couvre_feu"]
numerical_features = [
    col for col in X.columns if col not in exclude_columns + ["counter_name"]
]

# Preprocessing: One-Hot Encode "counter_name" and scale numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),  # Scale numerical columns
        ('cat', OneHotEncoder(), ['counter_name'])      # One-hot encode "counter_name"
    ],
    remainder='passthrough'  # Keep other columns unchanged
)

# Define the Voting Regressor
voting_regressor = VotingRegressor([
    ('xgb', xgb_regressor),
    ('lgbm', lgbm_regressor),
    ('rf', rf_regressor)
])

# Complete Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocessing step
    ('voting_regressor', voting_regressor)  # Voting regressor
])

# # Fit the pipeline
pipeline.fit(X, y)

# # Make predictions
# predictions = pipeline.predict(X_test)




AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [None]:
data = pd.read_parquet("data/train.parquet")
data = data.sort_values(["date", "counter_name"])
data_test = pd.read_parquet("data/final_test.parquet")
data_test = data_test.sort_values(["date", "counter_name"])


In [None]:
external_conditions = pd.read_csv('data/external_data.csv')
external_conditions['date'] = pd.to_datetime(external_conditions['date'])

In [None]:
# Step 1: Sort the `external_conditions` DataFrame by the `date` column
external_conditions = external_conditions.sort_values(by='date')

# Drop columns with more than 40% NaN values
threshold = len(external_conditions) * 0.4
external_conditions = external_conditions.dropna(thresh=threshold, axis=1)

# Step 2: Remove duplicate entries based on the `date` column
external_conditions = external_conditions.drop_duplicates(subset='date')

# Step 3: Convert the 'date' column to datetime
external_conditions['date'] = pd.to_datetime(external_conditions['date'])

# Step 4: Create a complete date range from the minimum to the maximum date in the DataFrame
date_range = pd.date_range(start=external_conditions['date'].min(), end=external_conditions['date'].max(), freq='H')

# Step 5: Create a DataFrame from the date_range
date_range_df = pd.DataFrame(date_range, columns=['date'])

# Step 6: Merge the date_range DataFrame with the external_conditions DataFrame on the 'date' column
full_external_conditions = pd.merge(date_range_df, external_conditions, on='date', how='left')

# Fonction qui fait ce qu'on voulait faire avec ffill et bfill mais a la place prends la valeur la plus proche
def fill_closest_value_all_columns(df):
    """Fill NaN values with the closest value for all numeric columns in the DataFrame."""
    filled_df = df.copy()
    
    for column in filled_df.columns:
        if filled_df[column].dtype.kind in 'biufc':  # Numeric columns
            non_nan_values = filled_df[column].dropna()
            
            def find_closest(value):
                if pd.isna(value):
                    closest_value = non_nan_values.iloc[(non_nan_values - value).abs().argmin()]
                    return closest_value
                return value
            
            filled_df[column] = filled_df[column].apply(find_closest)
    
    return filled_df

# Apply the function to the DataFrame
filled_external_conditions = fill_closest_value_all_columns(full_external_conditions)

In [None]:
# Merge the DataFrames
merged_conditions = pd.merge(data, filled_external_conditions, on='date', how='left')

merged_conditions = utils._column_rename(merged_conditions)


merged_conditions_test = pd.merge(data_test, filled_external_conditions, on='date', how='left')

merged_conditions_test = utils._column_rename(merged_conditions_test)

In [None]:
# Ensure "date" is in datetime format
merged_conditions["date"] = pd.to_datetime(merged_conditions["date"], errors="coerce")

# Drop rows with invalid datetime entries
df = merged_conditions.dropna(subset=["date"])

# Extract date and time features
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["weekday"] = df["date"].dt.dayofweek
df["day"] = df["date"].dt.day
df["hour"] = df["date"].dt.hour
df["is_weekend"] = (df["weekday"] >= 5).astype(int)

# Handle school and public holidays
unique_dates = df["date"].dt.date.unique()
d = SchoolHolidayDates()
f = JoursFeries()

try:
    dict_school_holidays = {date: d.is_holiday_for_zone(date, "C") for date in unique_dates}
    df["is_school_holiday"] = df["date"].dt.date.map(dict_school_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with school holidays mapping: {e}")
    df["is_school_holiday"] = 0

try:
    dict_public_holidays = {date: f.is_bank_holiday(date, zone="Métropole") for date in unique_dates}
    df["is_public_holiday"] = df["date"].dt.date.map(dict_public_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with public holidays mapping: {e}")
    df["is_public_holiday"] = 0

# Ensure "date" is in datetime format
merged_conditions_test["date"] = pd.to_datetime(merged_conditions_test["date"], errors="coerce")

# Drop rows with invalid datetime entries
df_test = merged_conditions_test.dropna(subset=["date"])

# Extract date and time features
df_test["year"] = df_test["date"].dt.year
df_test["month"] = df_test["date"].dt.month
df_test["weekday"] = df_test["date"].dt.dayofweek
df_test["day"] = df_test["date"].dt.day
df_test["hour"] = df_test["date"].dt.hour
df_test["is_weekend"] = (df_test["weekday"] >= 5).astype(int)

# Handle school and public holidays
unique_dates = df_test["date"].dt.date.unique()
d = SchoolHolidayDates()
s = JoursFeries()

try:
    dict_school_holidays = {date: d.is_holiday_for_zone(date, "C") for date in unique_dates}
    df_test["is_school_holiday"] = df_test["date"].dt.date.map(dict_school_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with school holidays mapping: {e}")
    df_test["is_school_holiday"] = 0

try:
    dict_public_holidays = {date: f.is_bank_holiday(date, zone="Métropole") for date in unique_dates}
    df_test["is_public_holiday"] = df_test["date"].dt.date.map(dict_public_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with public holidays mapping: {e}")
    df_test["is_public_holiday"] = 0

In [None]:
start_date_Monpar = "2021-01-25"
end_date_Monpar = "2021-02-23"
start_date_Clichy_NO_SE = "2021-04-09"
end_date_Clichy = "2021-07-20"
start_date_Clichy_SE_NO = "2021-03-23"
start_date_Pompidou = "2021-03-13"
end_date_Pompidou = "2021-04-01"

df["road_work_Monpar_O_E"] = np.where(
    (data["date"] >= start_date_Monpar)
    & (data["date"] <= end_date_Monpar)
    & (data["counter_name"] == "152 boulevard du Montparnasse O-E"),
    1,
    0,
)
df["road_work_Monpar_E_O"] = np.where(
    (df["date"] >= start_date_Monpar)
    & (df["date"] <= end_date_Monpar)
    & (df["counter_name"] == "152 boulevard du Montparnasse E-O"),
    1,
    0,
)
df["road_work_Clichy_NO_SE"] = np.where(
    (df["date"] >= start_date_Clichy_NO_SE)
    & (df["date"] <= end_date_Clichy)
    & (df["counter_name"] == "20 Avenue de Clichy NO-SE"),
    1,
    0,
)
df["road_work_Clichy_SE_NO"] = np.where(
    (df["date"] >= start_date_Clichy_SE_NO)
    & (df["date"] <= end_date_Clichy)
    & (df["counter_name"] == "20 Avenue de Clichy SE-NO"),
    1,
    0,
)
df["road_work_Pompidou_NE_SO"] = np.where(
    (df["date"] >= start_date_Pompidou)
    & (df["date"] <= end_date_Pompidou)
    & (df["counter_name"] == "Voie Georges Pompidou NE-SO"),
    1,
    0,
)
df["road_work_Pompidou_SO_NE"] = np.where(
    (df["date"] >= start_date_Pompidou)
    & (df["date"] <= end_date_Pompidou)
    & (df["counter_name"] == "Voie Georges Pompidou SO-NE"),
    1,
    0,
)

df["road_work"] = (
    df["road_work_Monpar_E_O"]
    + df["road_work_Monpar_O_E"]
    + df["road_work_Clichy_NO_SE"]
    + df["road_work_Clichy_SE_NO"]
    + df["road_work_Pompidou_NE_SO"]
    + df["road_work_Pompidou_SO_NE"]
)
df.drop(
    [
        "road_work_Monpar_E_O",
        "road_work_Monpar_O_E",
        "road_work_Clichy_NO_SE",
        "road_work_Clichy_SE_NO",
        "road_work_Pompidou_NE_SO",
        "road_work_Pompidou_SO_NE",
    ],
    axis=1,
    inplace=True,
)

df["log_bike_count"][
    (df["date"] >= start_date_Monpar)
    & (df["date"] <= end_date_Monpar)
    & (df["counter_name"] == "152 boulevard du Montparnasse E-O")
] = 0
df["log_bike_count"][
    (df["date"] >= start_date_Monpar)
    & (df["date"] <= end_date_Monpar)
    & (df["counter_name"] == "152 boulevard du Montparnasse O-E")
] = 0
df["log_bike_count"][
    (df["date"] >= start_date_Clichy_NO_SE)
    & (df["date"] <= end_date_Clichy)
    & (df["counter_name"] == "20 Avenue de Clichy NO-SE")
] = 0
df["log_bike_count"][
    (df["date"] >= start_date_Clichy_SE_NO)
    & (df["date"] <= end_date_Clichy)
    & (df["counter_name"] == "20 Avenue de Clichy SE-NO")
] = 0
df["log_bike_count"][
    (df["date"] >= start_date_Pompidou)
    & (df["date"] <= end_date_Pompidou)
    & (df["counter_name"] == "Voie Georges Pompidou NE-SO")
] = 0
df["log_bike_count"][
    (df["date"] >= start_date_Pompidou)
    & (df["date"] <= end_date_Pompidou)
    & (df["counter_name"] == "Voie Georges Pompidou SO-NE")
] = 0

df_test['road_work'] = 0

In [None]:
# Drop the columns from the df and df_test dataframes using the columns_to_drop list from the utils file
columns_to_drop = utils.columns_to_drop
df = df.drop(columns=columns_to_drop)
df_test = df_test.drop(columns=columns_to_drop)

In [None]:
TableReport(df)

In [None]:
TableReport(df_test)

## Test using flaml and the GPU

In [None]:
import pandas as pd
from flaml import AutoML
from skrub import TableVectorizer


# Preprocess the dataset
X = df.drop(columns=['log_bike_count', 'bike_count'])
y = df['log_bike_count']

# Split the data into training and validation sets based on the last 10% of dates
validation_split_index = int(len(df) * 0.9)
X_train, X_val = X.iloc[:validation_split_index], X.iloc[validation_split_index:]
y_train, y_val = y.iloc[:validation_split_index], y.iloc[validation_split_index:]

# Initialize the TableVectorizer
vectorizer = TableVectorizer()

# Fit and transform the training data
X_train_transformed = vectorizer.fit_transform(X_train)
X_val_transformed = vectorizer.transform(X_val)



In [None]:
import optuna
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from ngboost import NGBRegressor
from h2o.automl import H2OAutoML
import h2o

# Initialize H2O
h2o.init()

# Dictionary to store the best parameters for each model
best_params = {}

# Define the objective function for Optuna
def objective(trial):
    # Model selection
    model_name = trial.suggest_categorical("model", ["RandomForest", "NGBoost", "H2OAutoML"])
    
    if model_name == "RandomForest":
        n_estimators = trial.suggest_int("n_estimators", 50, 500)
        max_depth = trial.suggest_int("max_depth", 2, 32)
        min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
        min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
        model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42,
        )
        model.fit(X_train_transformed, y_train)
        y_pred = model.predict(X_val_transformed)
    
    elif model_name == "NGBoost":
        learning_rate = trial.suggest_loguniform("learning_rate", 1e-4, 1e-1)
        n_estimators = trial.suggest_int("n_estimators", 50, 500)
        model = NGBRegressor(
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            random_state=42,
        )
        model.fit(X_train_transformed, y_train)
        y_pred = model.predict(X_val_transformed)
    
    elif model_name == "H2OAutoML":
        # Convert datasets to H2O frames
        train = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
        val = h2o.H2OFrame(pd.concat([X_val, y_val], axis=1))
        
        # Specify predictors and response column
        predictors = X_train.columns.tolist()
        response = "log_bike_count"  # Update with your target column name
        
        # Run H2O AutoML
        automl = H2OAutoML(max_models=10, seed=42, nfolds=3)
        automl.train(x=predictors, y=response, training_frame=train)
        
        # Predict on validation set
        y_pred = automl.leader.predict(val).as_data_frame()["predict"].values

    # Compute the Mean Squared Error (MSE)
    mse = mean_squared_error(y_val, y_pred)
    return mse

# Run Optuna optimization
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

# Get the best trial and parameters
best_trial = study.best_trial
best_model_params = study.best_params
print("Best Trial:", best_trial)
print("Best Model Parameters:", best_model_params)

# Save the best model
model_name = best_model_params["model"]
if model_name == "H2OAutoML":
    # Save H2O AutoML model
    automl.leader.save_mojo(f"best_{model_name}.mojo")
    print(f"Best H2O AutoML model saved as 'best_{model_name}.mojo'")
else:
    # Save sklearn or NGBoost models
    joblib.dump(best_model, f"best_{model_name}.joblib")
    print(f"Best model saved as 'best_{model_name}.joblib'")

# Shut down H2O
h2o.shutdown(prompt=False)


In [24]:
from skrub import TableVectorizer
from xgboost import XGBRegressor
import optuna
from sklearn.model_selection import train_test_split

# Preprocess the dataset using TableVectorizer
X = df.drop(columns=['log_bike_count', 'bike_count'])
y = df['log_bike_count']

# Split the data into training and validation sets based on the last 10% of dates
validation_split_index = int(len(df) * 0.9)
X_train, X_val = X.iloc[:validation_split_index], X.iloc[validation_split_index:]
y_train, y_val = y.iloc[:validation_split_index], y.iloc[validation_split_index:]

# Initialize the TableVectorizer
vectorizer = TableVectorizer()

# Fit and transform the training data
X_train_transformed = vectorizer.fit_transform(X_train)
X_val_transformed = vectorizer.transform(X_val)


  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'df' is not defined

## Tune XGBoost using Optuna hyperparameter

In [None]:
import joblib
from xgboost import XGBRegressor
import optuna

# Define the objective function for Optuna
def objective(trial):
    param = {
        'objective': 'reg:squarederror',
        'n_estimators': trial.suggest_int('n_estimators', 200, 600),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9),
        'random_state': 42,
        'tree_method': 'gpu_hist',  # Enable GPU support
        'predictor': 'gpu_predictor'
    }
    model = XGBRegressor(**param)
    model.fit(X_train_transformed, y_train)
    return model.score(X_val_transformed, y_val)  # Maximizing validation R² score

# Create a study and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Get the best parameters
best_params = study.best_params

# Add GPU-specific parameter to the best parameters
best_params['tree_method'] = 'gpu_hist'  # Ensure GPU is used for the final model

# Save the best parameters to a file using joblib
joblib.dump(best_params, 'xg_boost_best_params.pkl')

# Train the final model with the best parameters
X_transformed = vectorizer.transform(X)  # Transform the entire dataset
final_model = XGBRegressor(**best_params)
final_model.fit(X_transformed, y)


## Tune LGBM using Optuna 

In [4]:
from skrub import TableVectorizer
# Split the data into training and validation sets based on the last 10% of dates
validation_split_index = int(len(X) * 0.9)
X_train, X_val = X.iloc[:validation_split_index], X.iloc[validation_split_index:]
y_train, y_val = y.iloc[:validation_split_index], y.iloc[validation_split_index:]

# Initialize the TableVectorizer
vectorizer = TableVectorizer()

# Fit and transform the training data
X_train_transformed = vectorizer.fit_transform(X_train)
X_val_transformed = vectorizer.transform(X_val)



In [29]:
import joblib
from lightgbm import LGBMRegressor, early_stopping
import optuna
import pandas as pd

# Sanitize feature names
def sanitize_feature_names(df):
    """Sanitize feature names to remove special JSON characters."""
    df.columns = df.columns.str.replace(r"[^\w\d_]", "_", regex=True)
    return df

# Sanitize training and validation feature names
X_train_transformed = sanitize_feature_names(pd.DataFrame(X_train_transformed))
X_val_transformed = sanitize_feature_names(pd.DataFrame(X_val_transformed))

# Define the objective function for Optuna
def objective(trial):
    param = {
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'n_estimators': trial.suggest_int('n_estimators', 200, 600),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', -1, 20),  # -1 means no limit
        'num_leaves': trial.suggest_int('num_leaves', 31, 256),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'random_state': 42,
        'device': 'gpu',  # Enable GPU support
        'gpu_device_id': 0  # Use the first GPU device
    }
    model = LGBMRegressor(**param)
    model.fit(
        X_train_transformed, y_train,
        eval_set=[(X_val_transformed, y_val)],
        eval_metric='rmse',
        callbacks=[early_stopping(stopping_rounds=20)]
    )
    return model.best_score_['valid_0']['rmse']  # Minimize RMSE

# Create a study and optimize the objective function
study = optuna.create_study(direction='minimize')  # Minimizing RMSE
study.optimize(objective, n_trials=50)

# Get the best parameters
best_params = study.best_params

# Add GPU-specific parameter to the best parameters
best_params['device'] = 'gpu'  # Ensure GPU is used for the final model

# Save the best parameters to a file using joblib
joblib.dump(best_params, 'lightgbm_best_params.pkl')

# Train the final model with the best parameters
X_transformed = sanitize_feature_names(pd.DataFrame(vectorizer.transform(X)))  # Transform the entire dataset
final_model = LGBMRegressor(**best_params)
final_model.fit(X_transformed, y)


[I 2024-12-10 16:09:28,238] A new study created in memory with name: no-name-0b64e342-7b3c-4600-a431-959cf57a0b8c


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.014667 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:09:45,820] Trial 0 finished with value: 0.45359378700184444 and parameters: {'n_estimators': 369, 'learning_rate': 0.2208340478880343, 'max_depth': 18, 'num_leaves': 139, 'subsample': 0.830511140506035, 'colsample_bytree': 0.6800110619273798}. Best is trial 0 with value: 0.45359378700184444.


Early stopping, best iteration is:
[70]	valid_0's rmse: 0.453594	valid_0's l2: 0.205747
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.014680 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:09:54,929] Trial 1 finished with value: 0.45073287904445947 and parameters: {'n_estimators': 376, 'learning_rate': 0.1969223885789928, 'max_depth': 5, 'num_leaves': 34, 'subsample': 0.9325185665746492, 'colsample_bytree': 0.5992427103446301}. Best is trial 1 with value: 0.45073287904445947.


Early stopping, best iteration is:
[343]	valid_0's rmse: 0.450733	valid_0's l2: 0.20316
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015524 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[205]	valid_0's rmse: 0.473439	valid_0's l2: 0.224144


[I 2024-12-10 16:10:00,289] Trial 2 finished with value: 0.4734386430047756 and parameters: {'n_estimators': 214, 'learning_rate': 0.21481057279535115, 'max_depth': 5, 'num_leaves': 148, 'subsample': 0.6852512328919058, 'colsample_bytree': 0.6116459796541494}. Best is trial 1 with value: 0.45073287904445947.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.027496 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:10:12,392] Trial 3 finished with value: 0.4485896393291048 and parameters: {'n_estimators': 345, 'learning_rate': 0.2345565390509233, 'max_depth': 11, 'num_leaves': 52, 'subsample': 0.8961533839432017, 'colsample_bytree': 0.766492558237776}. Best is trial 3 with value: 0.4485896393291048.


Early stopping, best iteration is:
[213]	valid_0's rmse: 0.44859	valid_0's l2: 0.201233
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.027326 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[555]	valid_0's rmse: 0.530451	valid_0's l2: 0.281378


[I 2024-12-10 16:10:17,940] Trial 4 finished with value: 0.5304508262219161 and parameters: {'n_estimators': 555, 'learning_rate': 0.0740805504758051, 'max_depth': 3, 'num_leaves': 244, 'subsample': 0.5890430453214333, 'colsample_bytree': 0.7242162740148423}. Best is trial 3 with value: 0.4485896393291048.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.014423 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:10:33,292] Trial 5 finished with value: 0.43727828665253726 and parameters: {'n_estimators': 395, 'learning_rate': 0.11489415373265645, 'max_depth': 7, 'num_leaves': 86, 'subsample': 0.8105438481419864, 'colsample_bytree': 0.5487191830281595}. Best is trial 5 with value: 0.43727828665253726.


Early stopping, best iteration is:
[217]	valid_0's rmse: 0.437278	valid_0's l2: 0.191212
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.018899 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:10:59,562] Trial 6 finished with value: 0.44027618571501426 and parameters: {'n_estimators': 541, 'learning_rate': 0.1463195292308084, 'max_depth': 19, 'num_leaves': 235, 'subsample': 0.530519293388396, 'colsample_bytree': 0.5706035494958999}. Best is trial 5 with value: 0.43727828665253726.


Early stopping, best iteration is:
[98]	valid_0's rmse: 0.440276	valid_0's l2: 0.193843
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.026871 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:11:11,235] Trial 7 finished with value: 0.44601335156404004 and parameters: {'n_estimators': 386, 'learning_rate': 0.19988505118934313, 'max_depth': 20, 'num_leaves': 137, 'subsample': 0.8635221354348859, 'colsample_bytree': 0.8931212495330091}. Best is trial 5 with value: 0.43727828665253726.


Early stopping, best iteration is:
[72]	valid_0's rmse: 0.446013	valid_0's l2: 0.198928
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.030705 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:11:38,041] Trial 8 finished with value: 0.428708307372199 and parameters: {'n_estimators': 400, 'learning_rate': 0.136947819905667, 'max_depth': 20, 'num_leaves': 222, 'subsample': 0.636464661519703, 'colsample_bytree': 0.7169816105569158}. Best is trial 8 with value: 0.428708307372199.


Early stopping, best iteration is:
[114]	valid_0's rmse: 0.428708	valid_0's l2: 0.183791
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.026303 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:12:08,992] Trial 9 finished with value: 0.44087934848456606 and parameters: {'n_estimators': 387, 'learning_rate': 0.05926537079532547, 'max_depth': 19, 'num_leaves': 84, 'subsample': 0.5492607309783288, 'colsample_bytree': 0.7409238546802083}. Best is trial 8 with value: 0.428708307372199.


Did not meet early stopping. Best iteration is:
[387]	valid_0's rmse: 0.440879	valid_0's l2: 0.194375
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015854 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:13:28,186] Trial 10 finished with value: 0.4311229848811421 and parameters: {'n_estimators': 477, 'learning_rate': 0.024140138698065555, 'max_depth': 13, 'num_leaves': 204, 'subsample': 0.7003982187598712, 'colsample_bytree': 0.9949959258499431}. Best is trial 8 with value: 0.428708307372199.


Did not meet early stopping. Best iteration is:
[469]	valid_0's rmse: 0.431123	valid_0's l2: 0.185867
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.045994 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:13:44,768] Trial 11 finished with value: 0.4361486524899983 and parameters: {'n_estimators': 483, 'learning_rate': 0.29969910446283865, 'max_depth': 13, 'num_leaves': 203, 'subsample': 0.687899374670739, 'colsample_bytree': 0.9965717915641733}. Best is trial 8 with value: 0.428708307372199.


Early stopping, best iteration is:
[69]	valid_0's rmse: 0.436149	valid_0's l2: 0.190226
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015811 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:15:03,746] Trial 12 finished with value: 0.44417602745840973 and parameters: {'n_estimators': 476, 'learning_rate': 0.010631650691572209, 'max_depth': -1, 'num_leaves': 190, 'subsample': 0.6472705722160343, 'colsample_bytree': 0.8340983084503006}. Best is trial 8 with value: 0.428708307372199.


Did not meet early stopping. Best iteration is:
[476]	valid_0's rmse: 0.444176	valid_0's l2: 0.197292
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015330 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:15:54,864] Trial 13 finished with value: 0.4396744776577704 and parameters: {'n_estimators': 304, 'learning_rate': 0.020294355454947025, 'max_depth': 15, 'num_leaves': 191, 'subsample': 0.7504209654285144, 'colsample_bytree': 0.97745585412235}. Best is trial 8 with value: 0.428708307372199.


Did not meet early stopping. Best iteration is:
[304]	valid_0's rmse: 0.439674	valid_0's l2: 0.193314
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.014621 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:16:10,795] Trial 14 finished with value: 0.43735924479347654 and parameters: {'n_estimators': 449, 'learning_rate': 0.09838248902911002, 'max_depth': 16, 'num_leaves': 254, 'subsample': 0.7599605343148963, 'colsample_bytree': 0.8527471455962793}. Best is trial 8 with value: 0.428708307372199.


Early stopping, best iteration is:
[49]	valid_0's rmse: 0.437359	valid_0's l2: 0.191283
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015209 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:16:28,795] Trial 15 finished with value: 0.4461094852777703 and parameters: {'n_estimators': 300, 'learning_rate': 0.1543893714996548, 'max_depth': 10, 'num_leaves': 217, 'subsample': 0.6162739817276087, 'colsample_bytree': 0.6624420943129763}. Best is trial 8 with value: 0.428708307372199.


Early stopping, best iteration is:
[83]	valid_0's rmse: 0.446109	valid_0's l2: 0.199014
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015890 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:17:26,881] Trial 16 finished with value: 0.43139491622385645 and parameters: {'n_estimators': 596, 'learning_rate': 0.048963002913494216, 'max_depth': 14, 'num_leaves': 162, 'subsample': 0.6947583821762007, 'colsample_bytree': 0.909096048968972}. Best is trial 8 with value: 0.428708307372199.


Early stopping, best iteration is:
[403]	valid_0's rmse: 0.431395	valid_0's l2: 0.186102
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015088 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:17:29,064] Trial 17 finished with value: 0.45210781242549064 and parameters: {'n_estimators': 441, 'learning_rate': 0.2758998988429598, 'max_depth': 12, 'num_leaves': 165, 'subsample': 0.9829265326447192, 'colsample_bytree': 0.7835782351532596}. Best is trial 8 with value: 0.428708307372199.


Early stopping, best iteration is:
[52]	valid_0's rmse: 0.452108	valid_0's l2: 0.204401
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.014707 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:17:32,632] Trial 18 finished with value: 0.43436539228373383 and parameters: {'n_estimators': 494, 'learning_rate': 0.09784934483222275, 'max_depth': 17, 'num_leaves': 220, 'subsample': 0.5789239162301076, 'colsample_bytree': 0.5072609174328567}. Best is trial 8 with value: 0.428708307372199.


Early stopping, best iteration is:
[103]	valid_0's rmse: 0.434365	valid_0's l2: 0.188673
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.028811 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:17:36,008] Trial 19 finished with value: 0.4360122424200332 and parameters: {'n_estimators': 527, 'learning_rate': 0.1321460070294395, 'max_depth': 9, 'num_leaves': 112, 'subsample': 0.643105489913072, 'colsample_bytree': 0.9437827736629967}. Best is trial 8 with value: 0.428708307372199.


Early stopping, best iteration is:
[184]	valid_0's rmse: 0.436012	valid_0's l2: 0.190107
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.014293 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:17:38,462] Trial 20 finished with value: 0.4462078430462362 and parameters: {'n_estimators': 434, 'learning_rate': 0.1738737272404475, 'max_depth': 16, 'num_leaves': 177, 'subsample': 0.7253894778700837, 'colsample_bytree': 0.8212575083727616}. Best is trial 8 with value: 0.428708307372199.


Early stopping, best iteration is:
[58]	valid_0's rmse: 0.446208	valid_0's l2: 0.199101
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.013359 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:17:44,884] Trial 21 finished with value: 0.42894227324963385 and parameters: {'n_estimators': 600, 'learning_rate': 0.05020091209554661, 'max_depth': 13, 'num_leaves': 170, 'subsample': 0.6821896578642082, 'colsample_bytree': 0.9194499141825921}. Best is trial 8 with value: 0.428708307372199.


Early stopping, best iteration is:
[261]	valid_0's rmse: 0.428942	valid_0's l2: 0.183991
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.014665 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[494]	valid_0's rmse: 0.434518	valid_0's l2: 0.188806


[I 2024-12-10 16:17:55,788] Trial 22 finished with value: 0.4345184658826408 and parameters: {'n_estimators': 573, 'learning_rate': 0.03715045384568465, 'max_depth': 9, 'num_leaves': 214, 'subsample': 0.5027657729074598, 'colsample_bytree': 0.9355182734014026}. Best is trial 8 with value: 0.428708307372199.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015100 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:18:05,201] Trial 23 finished with value: 0.4356871824719582 and parameters: {'n_estimators': 505, 'learning_rate': 0.06435650589902286, 'max_depth': 13, 'num_leaves': 232, 'subsample': 0.7722361097137489, 'colsample_bytree': 0.8753870030100093}. Best is trial 8 with value: 0.428708307372199.


Early stopping, best iteration is:
[342]	valid_0's rmse: 0.435687	valid_0's l2: 0.189823
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.013781 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:18:10,079] Trial 24 finished with value: 0.43190784560878337 and parameters: {'n_estimators': 333, 'learning_rate': 0.07957788992719265, 'max_depth': 15, 'num_leaves': 189, 'subsample': 0.6301085319917695, 'colsample_bytree': 0.9542962588275462}. Best is trial 8 with value: 0.428708307372199.


Early stopping, best iteration is:
[176]	valid_0's rmse: 0.431908	valid_0's l2: 0.186544
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.013366 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[426]	valid_0's rmse: 0.455995	valid_0's l2: 0.207931


[I 2024-12-10 16:18:16,054] Trial 25 finished with value: 0.4559945440239825 and parameters: {'n_estimators': 426, 'learning_rate': 0.03165814385143289, 'max_depth': 7, 'num_leaves': 203, 'subsample': 0.7135397100813408, 'colsample_bytree': 0.69383419275515}. Best is trial 8 with value: 0.428708307372199.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.014182 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:18:19,352] Trial 26 finished with value: 0.43709482928184307 and parameters: {'n_estimators': 593, 'learning_rate': 0.10306948367529832, 'max_depth': 11, 'num_leaves': 120, 'subsample': 0.6672627407436593, 'colsample_bytree': 0.7940676721592248}. Best is trial 8 with value: 0.428708307372199.


Early stopping, best iteration is:
[152]	valid_0's rmse: 0.437095	valid_0's l2: 0.191052
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015256 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:18:24,128] Trial 27 finished with value: 0.4363374125295098 and parameters: {'n_estimators': 228, 'learning_rate': 0.08495589263750544, 'max_depth': 17, 'num_leaves': 167, 'subsample': 0.7959982011756617, 'colsample_bytree': 0.924833896607552}. Best is trial 8 with value: 0.428708307372199.


Early stopping, best iteration is:
[180]	valid_0's rmse: 0.436337	valid_0's l2: 0.19039
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015255 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:18:29,768] Trial 28 finished with value: 0.4340103933826887 and parameters: {'n_estimators': 522, 'learning_rate': 0.12689713471415598, 'max_depth': 14, 'num_leaves': 228, 'subsample': 0.595445724092812, 'colsample_bytree': 0.9809776214877952}. Best is trial 8 with value: 0.428708307372199.


Early stopping, best iteration is:
[192]	valid_0's rmse: 0.43401	valid_0's l2: 0.188365
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.080131 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:18:38,144] Trial 29 finished with value: 0.43505555985007066 and parameters: {'n_estimators': 460, 'learning_rate': 0.042143602595087926, 'max_depth': 18, 'num_leaves': 140, 'subsample': 0.7211212452711645, 'colsample_bytree': 0.6600705595490783}. Best is trial 8 with value: 0.428708307372199.


Early stopping, best iteration is:
[301]	valid_0's rmse: 0.435056	valid_0's l2: 0.189273
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015783 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:18:42,870] Trial 30 finished with value: 0.44527905401678597 and parameters: {'n_estimators': 254, 'learning_rate': 0.1748958191369569, 'max_depth': 20, 'num_leaves': 255, 'subsample': 0.6540868120484442, 'colsample_bytree': 0.7074157555621103}. Best is trial 8 with value: 0.428708307372199.


Early stopping, best iteration is:
[135]	valid_0's rmse: 0.445279	valid_0's l2: 0.198273
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015789 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:18:50,559] Trial 31 finished with value: 0.430529192373979 and parameters: {'n_estimators': 584, 'learning_rate': 0.052250283712731244, 'max_depth': 12, 'num_leaves': 153, 'subsample': 0.697103688934882, 'colsample_bytree': 0.9008392908241647}. Best is trial 8 with value: 0.428708307372199.


Early stopping, best iteration is:
[394]	valid_0's rmse: 0.430529	valid_0's l2: 0.185355
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.014737 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:19:02,619] Trial 32 finished with value: 0.4330836895163855 and parameters: {'n_estimators': 560, 'learning_rate': 0.02436369221738237, 'max_depth': 12, 'num_leaves': 177, 'subsample': 0.7368869384248554, 'colsample_bytree': 0.8749763920765843}. Best is trial 8 with value: 0.428708307372199.


Did not meet early stopping. Best iteration is:
[559]	valid_0's rmse: 0.433084	valid_0's l2: 0.187561
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.025691 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[411]	valid_0's rmse: 0.461468	valid_0's l2: 0.212953


[I 2024-12-10 16:19:08,138] Trial 33 finished with value: 0.4614681908724781 and parameters: {'n_estimators': 413, 'learning_rate': 0.0522999116102424, 'max_depth': 7, 'num_leaves': 153, 'subsample': 0.6746359521804649, 'colsample_bytree': 0.9684340776738607}. Best is trial 8 with value: 0.428708307372199.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.014831 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:19:13,721] Trial 34 finished with value: 0.4279859221494695 and parameters: {'n_estimators': 356, 'learning_rate': 0.06096880786868278, 'max_depth': 10, 'num_leaves': 203, 'subsample': 0.8389519986730779, 'colsample_bytree': 0.6293816639977194}. Best is trial 34 with value: 0.4279859221494695.


Early stopping, best iteration is:
[209]	valid_0's rmse: 0.427986	valid_0's l2: 0.183172
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015477 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[350]	valid_0's rmse: 0.477576	valid_0's l2: 0.228079


[I 2024-12-10 16:19:16,721] Trial 35 finished with value: 0.4775759191318391 and parameters: {'n_estimators': 356, 'learning_rate': 0.07124197002277603, 'max_depth': 5, 'num_leaves': 127, 'subsample': 0.851908174872778, 'colsample_bytree': 0.6553725203482947}. Best is trial 34 with value: 0.4279859221494695.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015077 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:19:21,712] Trial 36 finished with value: 0.432882304921706 and parameters: {'n_estimators': 325, 'learning_rate': 0.08934926420050518, 'max_depth': 10, 'num_leaves': 104, 'subsample': 0.9001144580766013, 'colsample_bytree': 0.6400430746282325}. Best is trial 34 with value: 0.4279859221494695.


Early stopping, best iteration is:
[304]	valid_0's rmse: 0.432882	valid_0's l2: 0.187387
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015472 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[162]	valid_0's rmse: 0.438117	valid_0's l2: 0.191947


[I 2024-12-10 16:19:25,515] Trial 37 finished with value: 0.43811711179887713 and parameters: {'n_estimators': 372, 'learning_rate': 0.11518585314592117, 'max_depth': 8, 'num_leaves': 181, 'subsample': 0.7927392023767832, 'colsample_bytree': 0.628925162413305}. Best is trial 34 with value: 0.4279859221494695.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015389 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:19:27,404] Trial 38 finished with value: 0.5068070258656927 and parameters: {'n_estimators': 271, 'learning_rate': 0.2491524246937203, 'max_depth': 3, 'num_leaves': 149, 'subsample': 0.8242778652564189, 'colsample_bytree': 0.5727440325233164}. Best is trial 34 with value: 0.4279859221494695.


Did not meet early stopping. Best iteration is:
[270]	valid_0's rmse: 0.506807	valid_0's l2: 0.256853
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.014636 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:19:29,857] Trial 39 finished with value: 0.5007148482285305 and parameters: {'n_estimators': 407, 'learning_rate': 0.17659119374683455, 'max_depth': 3, 'num_leaves': 49, 'subsample': 0.9229778668324684, 'colsample_bytree': 0.7230155958841847}. Best is trial 34 with value: 0.4279859221494695.


Did not meet early stopping. Best iteration is:
[407]	valid_0's rmse: 0.500715	valid_0's l2: 0.250715
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015613 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:19:34,760] Trial 40 finished with value: 0.43187807134035544 and parameters: {'n_estimators': 365, 'learning_rate': 0.06264039611013436, 'max_depth': 11, 'num_leaves': 238, 'subsample': 0.6058227583129017, 'colsample_bytree': 0.7502585708546834}. Best is trial 34 with value: 0.4279859221494695.


Early stopping, best iteration is:
[145]	valid_0's rmse: 0.431878	valid_0's l2: 0.186519
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015145 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:19:48,487] Trial 41 finished with value: 0.4405872973376226 and parameters: {'n_estimators': 570, 'learning_rate': 0.011422616201196369, 'max_depth': 12, 'num_leaves': 204, 'subsample': 0.7074334884611093, 'colsample_bytree': 0.6067426717698351}. Best is trial 34 with value: 0.4279859221494695.


Did not meet early stopping. Best iteration is:
[570]	valid_0's rmse: 0.440587	valid_0's l2: 0.194117
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.014585 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:19:57,271] Trial 42 finished with value: 0.42976505373326435 and parameters: {'n_estimators': 542, 'learning_rate': 0.04264290282031737, 'max_depth': 13, 'num_leaves': 208, 'subsample': 0.5718384863858343, 'colsample_bytree': 0.9065989343448012}. Best is trial 34 with value: 0.4279859221494695.


Early stopping, best iteration is:
[337]	valid_0's rmse: 0.429765	valid_0's l2: 0.184698
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015293 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:20:04,999] Trial 43 finished with value: 0.4326800678240113 and parameters: {'n_estimators': 543, 'learning_rate': 0.042192953123003765, 'max_depth': 14, 'num_leaves': 158, 'subsample': 0.5609943589911243, 'colsample_bytree': 0.8960234701038211}. Best is trial 34 with value: 0.4279859221494695.


Early stopping, best iteration is:
[341]	valid_0's rmse: 0.43268	valid_0's l2: 0.187212
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.014638 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:20:10,962] Trial 44 finished with value: 0.43061130469844106 and parameters: {'n_estimators': 581, 'learning_rate': 0.07417848798921751, 'max_depth': 10, 'num_leaves': 199, 'subsample': 0.5331039050666724, 'colsample_bytree': 0.8391841418200008}. Best is trial 34 with value: 0.4279859221494695.


Early stopping, best iteration is:
[250]	valid_0's rmse: 0.430611	valid_0's l2: 0.185426
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015686 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:20:18,236] Trial 45 finished with value: 0.4304928490221279 and parameters: {'n_estimators': 548, 'learning_rate': 0.055053505457930724, 'max_depth': -1, 'num_leaves': 220, 'subsample': 0.5750510175136737, 'colsample_bytree': 0.811252377096204}. Best is trial 34 with value: 0.4279859221494695.


Early stopping, best iteration is:
[226]	valid_0's rmse: 0.430493	valid_0's l2: 0.185324
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015302 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds


[I 2024-12-10 16:20:22,160] Trial 46 finished with value: 0.4390401744257412 and parameters: {'n_estimators': 543, 'learning_rate': 0.21330269863049364, 'max_depth': 0, 'num_leaves': 223, 'subsample': 0.5727495147840057, 'colsample_bytree': 0.7764688084627359}. Best is trial 34 with value: 0.4279859221494695.


Early stopping, best iteration is:
[110]	valid_0's rmse: 0.43904	valid_0's l2: 0.192756
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.015167 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[343]	valid_0's rmse: 0.466113	valid_0's l2: 0.217261


[I 2024-12-10 16:20:25,201] Trial 47 finished with value: 0.46611257742530077 and parameters: {'n_estimators': 520, 'learning_rate': 0.14106953894747815, 'max_depth': 5, 'num_leaves': 240, 'subsample': 0.6197580182149561, 'colsample_bytree': 0.8146865515249178}. Best is trial 34 with value: 0.4279859221494695.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.014861 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[546]	valid_0's rmse: 0.668088	valid_0's l2: 0.446341


[I 2024-12-10 16:20:27,378] Trial 48 finished with value: 0.6680875310675016 and parameters: {'n_estimators': 556, 'learning_rate': 0.11553543106124825, 'max_depth': 1, 'num_leaves': 211, 'subsample': 0.5113693292289697, 'colsample_bytree': 0.8754417413432651}. Best is trial 34 with value: 0.4279859221494695.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2828
[LightGBM] [Info] Number of data points in the train set: 447144, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (18.76 MB) transferred to GPU in 0.014302 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.042398
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[385]	valid_0's rmse: 0.494993	valid_0's l2: 0.245018


[I 2024-12-10 16:20:31,475] Trial 49 finished with value: 0.4949930214792919 and parameters: {'n_estimators': 385, 'learning_rate': 0.029275960993832596, 'max_depth': 6, 'num_leaves': 194, 'subsample': 0.5457311139005518, 'colsample_bytree': 0.7592413542395129}. Best is trial 34 with value: 0.4279859221494695.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2830
[LightGBM] [Info] Number of data points in the train set: 496827, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (20.85 MB) transferred to GPU in 0.016656 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.075693




## Tune SVR with Optuna

In [None]:
import joblib
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error
import optuna
import pandas as pd
import numpy as np

# Sanitize feature names
def sanitize_feature_names(df):
    """Sanitize feature names to remove special JSON characters."""
    df.columns = df.columns.str.replace(r"[^\w\d_]", "_", regex=True)
    return df

# Sanitize training and validation feature names
X_train_transformed = sanitize_feature_names(pd.DataFrame(X_train_transformed))
X_val_transformed = sanitize_feature_names(pd.DataFrame(X_val_transformed))

# Standardize the features for SVR
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_transformed)
X_val_scaled = scaler.transform(X_val_transformed)

# Define the objective function for Optuna
def objective(trial):
    param = {
        'C': trial.suggest_float('C', 0.1, 100.0, log=True),
        'epsilon': trial.suggest_float('epsilon', 0.01, 1.0, log=True),
        'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
        'degree': trial.suggest_int('degree', 2, 5) if trial.params.get('kernel') == 'poly' else 3
    }
    model = SVR(**param)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_val_scaled)
    rmse = root_mean_squared_error(y_val, y_pred, squared=False)
    return rmse  # Minimize RMSE

# Create a study and optimize the objective function
study = optuna.create_study(direction='minimize')  # Minimizing RMSE
study.optimize(objective, n_trials=50)

# Get the best parameters
best_params = study.best_params

# Save the best parameters to a file using joblib
joblib.dump(best_params, 'svr_best_params.pkl')

# Train the final model with the best parameters
X_transformed = sanitize_feature_names(pd.DataFrame(vectorizer.transform(X)))  # Transform the entire dataset
X_scaled = scaler.transform(X_transformed)  # Scale the entire dataset
final_model = SVR(**best_params)
final_model.fit(X_scaled, y)


[I 2024-12-11 00:25:35,932] A new study created in memory with name: no-name-6f93f83b-d585-4e92-81a9-f7f746e32e0e


In [None]:
! git add svr_best_params.pkl
! git commit -m "SVR best parameters"
! git push origin main

In [7]:
import joblib
from lightgbm import LGBMRegressor
from skrub import TableVectorizer

# Function to sanitize feature names
def sanitize_feature_names(df):
    """Sanitize feature names to remove special JSON characters."""
    df.columns = df.columns.str.replace(r"[^\w\d_]", "_", regex=True)
    return df

# Load the best parameters from the pickle file
best_params = joblib.load('lightgbm_best_params.pkl')

# Initialize the TableVectorizer
vectorizer = TableVectorizer()

# Fit and transform the data
X_transformed = vectorizer.fit_transform(X)

# Sanitize feature names in the transformed data
X_transformed = sanitize_feature_names(X_transformed)

# Train the final model with the best parameters
final_model = LGBMRegressor(**best_params)
final_model.fit(X_transformed, y)

# Print model parameters
print("Trained model parameters:")
print(final_model.get_params())


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2723
[LightGBM] [Info] Number of data points in the train set: 496827, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A5000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (20.85 MB) transferred to GPU in 0.028990 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 3.075693
Trained model parameters:
{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.6293816639977194, 'importance_type': 'split', 'learning_rate': 0.06096880786868278, 'max_depth': 10, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 356, 'n_jobs': None, 'num_leaves': 203, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 0.8389

In [8]:
# Transform the test data using the same vectorizer instance
X_test_transformed = vectorizer.transform(X_test)

# Make predictions
y_pred = final_model.predict(X_test_transformed)

print("Predictions:", y_pred)




Predictions: [0.41292906 1.53518471 2.14262788 ... 5.41643063 4.85195188 4.04859703]


In [9]:
df_submission = pd.DataFrame(y_pred, columns=["log_bike_count"])
df_submission.index = X_test.index
df_submission.index.name = "Id"
df_submission.to_csv("test_pipeline_lgbm.csv", index=True)

In [None]:
test_data = pd.read_parquet('data/final_test.parquet')
# Merge the DataFrames
merged_conditions = pd.merge(test_data, filled_external_conditions, on='date', how='left')

merged_conditions = utils._column_rename(merged_conditions)

# Ensure "date" is in datetime format
merged_conditions["date"] = pd.to_datetime(merged_conditions["date"], errors="coerce")

# Drop rows with invalid datetime entries
df_test = merged_conditions.dropna(subset=["date"])

# Extract date and time features
df_test["year"] = df_test["date"].dt.year
df_test["month"] = df_test["date"].dt.month
df_test["weekday"] = df_test["date"].dt.dayofweek
df_test["day"] = df_test["date"].dt.day
df_test["hour"] = df_test["date"].dt.hour
df_test["is_weekend"] = (df_test["weekday"] >= 5).astype(int)

# Handle school and public holidays
unique_dates_test = df_test["date"].dt.date.unique()

try:
    dict_school_holidays_test = {date: d.is_holiday_for_zone(date, "C") for date in unique_dates_test}
    df_test["is_school_holiday"] = df_test["date"].dt.date.map(dict_school_holidays_test).fillna(0).astype(int)
except Exception as e:
    print(f"Error with school holidays mapping: {e}")
    df_test["is_school_holiday"] = 0

try:
    dict_public_holidays_test = {date: f.is_bank_holiday(date, zone="Métropole") for date in unique_dates_test}
    df_test["is_public_holiday"] = df_test["date"].dt.date.map(dict_public_holidays_test).fillna(0).astype(int)
except Exception as e:
    print(f"Error with public holidays mapping: {e}")
    df_test["is_public_holiday"] = 0

# Predict using the pipeline
y_pred_test = pipeline.predict(df_test)

In [None]:
df_submission = pd.DataFrame(y_pred_test, columns=["log_bike_count"])
df_submission.index.name = "Id"
df_submission
df_submission.to_csv("/Users/felix/Documents/X/Cours Python/Kaggle/submission/test_pipeline.csv", index=True)

## En dessous c'est des tests d'avant ca ne fait pas tourner ce qui marche actuellement

In [None]:
# Add the new category to categorical columns
for col in df.select_dtypes(include=['category']).columns:
	df[col] = df[col].cat.add_categories([0])

# Fill NaN values with 0
df = df.fillna(0)

In [None]:
y_train = df['log_bike_count'].values
X_train = df.drop(['log_bike_count', "bike_count"], axis=1)

date_cols = ["year", "month", "weekday", "day", "hour", "is_weekend", "is_school_holiday", "is_public_holiday"]
categorical_cols = ["counter_name"]
numerical_cols = [
    'latitude', 'longitude', 'Sea Level Pressure (hPa)', 'Pressure Tendency (hPa/3h)',
    'Pressure Tendency Code', 'Wind Direction (°)', 'Wind Speed (m/s)', 'Air Temperature (°C)',
    'Dew Point Temperature (°C)', 'Relative Humidity (%)', 'Visibility (m)', 'Present Weather Code',
    'Past Weather Code 1', 'Past Weather Code 2', 'Total Cloud Cover (oktas)', 'Cloud Base Height (m)',
    'Lowest Cloud Base Height (m)', 'Low Cloud Type', 'Station Level Pressure (hPa)', '24h Pressure Tendency (hPa)',
    '10min Max Wind Gust (m/s)', 'Max Wind Gust (m/s)', 'Measurement Period Duration', 'Ground State',
    'Snow Height (cm)', 'New Snow Depth (cm)', 'New Snowfall Duration (hours)', 'Rainfall (1h, mm)',
    'Rainfall (3h, mm)', 'Rainfall (6h, mm)', 'Rainfall (12h, mm)', 'Rainfall (24h, mm)',
    'Layer 1 Cloud Cover (oktas)', 'Layer 1 Cloud Type', 'Layer 1 Cloud Base Height (m)'
]


# 1. Apply column transformations
# One-hot encode date columns
date_encoder = OneHotEncoder(handle_unknown="ignore")
date_encoded = date_encoder.fit_transform(X_train[date_cols])

# One-hot encode categorical columns
cat_encoder = OneHotEncoder(handle_unknown="ignore")
cat_encoded = cat_encoder.fit_transform(X_train[categorical_cols])

# Standard scale numerical columns
num_scaler = StandardScaler()
num_scaled = num_scaler.fit_transform(X_train[numerical_cols])

X_transformed = hstack([date_encoded, cat_encoded, num_scaled]).toarray()

# 2. Train the model
model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
model.fit(X_transformed, y_train)



In [None]:
X_test = utils.get_test_data()

In [None]:
# Merge the DataFrames
merged_conditions = pd.merge(X_test, external_conditions, on='date', how='left')

merged_conditions = utils._column_rename(merged_conditions)
# Ensure "date" is in datetime format
merged_conditions["date"] = pd.to_datetime(merged_conditions["date"], errors="coerce")

# Drop rows with invalid datetime entries
df = merged_conditions.dropna(subset=["date"])

# Extract date and time features
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["weekday"] = df["date"].dt.dayofweek
df["day"] = df["date"].dt.day
df["hour"] = df["date"].dt.hour
df["is_weekend"] = (df["weekday"] >= 5).astype(int)

# Handle school and public holidays
unique_dates = df["date"].dt.date.unique()
d = SchoolHolidayDates()
f = JoursFeries()

try:
    dict_school_holidays = {date: d.is_holiday_for_zone(date, "C") for date in unique_dates}
    df["is_school_holiday"] = df["date"].dt.date.map(dict_school_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with school holidays mapping: {e}")
    df["is_school_holiday"] = 0

try:
    dict_public_holidays = {date: f.is_bank_holiday(date, zone="Métropole") for date in unique_dates}
    df["is_public_holiday"] = df["date"].dt.date.map(dict_public_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with public holidays mapping: {e}")
    df["is_public_holiday"] = 0

In [None]:
# Process the test data with the same transformations as the training data
# 1. Apply column transformations
# One-hot encode date columns
date_encoded_test = date_encoder.transform(df[date_cols])

# One-hot encode categorical columns
cat_encoded_test = cat_encoder.transform(df[categorical_cols])

# Standard scale numerical columns
num_scaled_test = num_scaler.transform(df[numerical_cols])

# Combine all transformed features
X_test_transformed_numeric = hstack([date_encoded_test, cat_encoded_test, num_scaled_test]).toarray()

# 2. Make predictions
y_pred = model.predict(X_test_transformed_numeric)


In [None]:
df_submission = pd.DataFrame(y_pred, columns=["log_bike_count"])
df_submission.index.name = "Id"

In [None]:
df_submission.to_csv("/Users/felix/Documents/X/Cours Python/Kaggle/submission/test_pipeline.csv", index=True)