In [8]:
import utils
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from scipy.sparse import hstack
import pandas as pd
from skrub import TableReport
from jours_feries_france import JoursFeries
from vacances_scolaires_france import SchoolHolidayDates
import numpy as np

In [2]:
data = pd.read_parquet("data/train.parquet")
# Sort by date first, so that time based cross-validation would produce correct results
data = data.sort_values(["date", "counter_name"])

data_test = pd.read_parquet("data/final_test.parquet")
# Sort by date first, so that time based cross-validation would produce correct results
data_test = data_test.sort_values(["date", "counter_name"])


In [3]:
external_conditions = pd.read_csv('data/external_data.csv')
external_conditions['date'] = pd.to_datetime(external_conditions['date'])

In [4]:
# Drop columns with more than 40% NaN values
threshold = len(external_conditions) * 0.4
external_conditions = external_conditions.dropna(thresh=threshold, axis=1)

In [5]:
# Drop columns with more than 40% NaN values
threshold = len(external_conditions) * 0.4
external_conditions = external_conditions.dropna(thresh=threshold, axis=1)

# Step 1: Sort the `external_conditions` DataFrame by the `date` column
external_conditions = external_conditions.sort_values(by='date')

# Drop columns with more than 40% NaN values
threshold = len(external_conditions) * 0.4
external_conditions = external_conditions.dropna(thresh=threshold, axis=1)

# Step 2: Remove duplicate entries based on the `date` column
external_conditions = external_conditions.drop_duplicates(subset='date')

# Step 3: Convert the 'date' column to datetime
external_conditions['date'] = pd.to_datetime(external_conditions['date'])

# Step 4: Create a complete date range from the minimum to the maximum date in the DataFrame
date_range = pd.date_range(start=external_conditions['date'].min(), end=external_conditions['date'].max(), freq='H')

# Step 5: Create a DataFrame from the date_range
date_range_df = pd.DataFrame(date_range, columns=['date'])

# Step 6: Merge the date_range DataFrame with the external_conditions DataFrame on the 'date' column
full_external_conditions = pd.merge(date_range_df, external_conditions, on='date', how='left')

# Fonction qui fait ce qu'on voulait faire avec ffill et bfill mais a la place prends la valeur la plus proche
def fill_closest_value_all_columns(df):
    """Fill NaN values with the closest value for all numeric columns in the DataFrame."""
    filled_df = df.copy()
    
    for column in filled_df.columns:
        if filled_df[column].dtype.kind in 'biufc':  # Numeric columns
            non_nan_values = filled_df[column].dropna()
            
            def find_closest(value):
                if pd.isna(value):
                    closest_value = non_nan_values.iloc[(non_nan_values - value).abs().argmin()]
                    return closest_value
                return value
            
            filled_df[column] = filled_df[column].apply(find_closest)
    
    return filled_df

# Apply the function to the DataFrame
filled_external_conditions = fill_closest_value_all_columns(full_external_conditions)

  date_range = pd.date_range(start=external_conditions['date'].min(), end=external_conditions['date'].max(), freq='H')
  closest_value = non_nan_values.iloc[(non_nan_values - value).abs().argmin()]


In [6]:
# Merge the DataFrames
merged_conditions = pd.merge(data, filled_external_conditions, on='date', how='left')

merged_conditions = utils._column_rename(merged_conditions)


merged_conditions_test = pd.merge(data_test, filled_external_conditions, on='date', how='left')

merged_conditions_test = utils._column_rename(merged_conditions_test)

In [7]:
# Ensure "date" is in datetime format
merged_conditions["date"] = pd.to_datetime(merged_conditions["date"], errors="coerce")

# Drop rows with invalid datetime entries
df = merged_conditions.dropna(subset=["date"])

# Extract date and time features
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["weekday"] = df["date"].dt.dayofweek
df["day"] = df["date"].dt.day
df["hour"] = df["date"].dt.hour
df["is_weekend"] = (df["weekday"] >= 5).astype(int)

# Handle school and public holidays
unique_dates = df["date"].dt.date.unique()
d = SchoolHolidayDates()
f = JoursFeries()

try:
    dict_school_holidays = {date: d.is_holiday_for_zone(date, "C") for date in unique_dates}
    df["is_school_holiday"] = df["date"].dt.date.map(dict_school_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with school holidays mapping: {e}")
    df["is_school_holiday"] = 0

try:
    dict_public_holidays = {date: f.is_bank_holiday(date, zone="Métropole") for date in unique_dates}
    df["is_public_holiday"] = df["date"].dt.date.map(dict_public_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with public holidays mapping: {e}")
    df["is_public_holiday"] = 0

# Ensure "date" is in datetime format
merged_conditions_test["date"] = pd.to_datetime(merged_conditions_test["date"], errors="coerce")

# Drop rows with invalid datetime entries
df_test = merged_conditions_test.dropna(subset=["date"])

# Extract date and time features
df_test["year"] = df_test["date"].dt.year
df_test["month"] = df_test["date"].dt.month
df_test["weekday"] = df_test["date"].dt.dayofweek
df_test["day"] = df_test["date"].dt.day
df_test["hour"] = df_test["date"].dt.hour
df_test["is_weekend"] = (df_test["weekday"] >= 5).astype(int)

# Handle school and public holidays
unique_dates = df_test["date"].dt.date.unique()
d = SchoolHolidayDates()
f = JoursFeries()

try:
    dict_school_holidays = {date: d.is_holiday_for_zone(date, "C") for date in unique_dates}
    df_test["is_school_holiday"] = df_test["date"].dt.date.map(dict_school_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with school holidays mapping: {e}")
    df_test["is_school_holiday"] = 0

try:
    dict_public_holidays = {date: f.is_bank_holiday(date, zone="Métropole") for date in unique_dates}
    df_test["is_public_holiday"] = df_test["date"].dt.date.map(dict_public_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with public holidays mapping: {e}")
    df_test["is_public_holiday"] = 0

In [9]:
start_date_Monpar = "2021-01-25"
end_date_Monpar = "2021-02-23"
start_date_Clichy_NO_SE = "2021-04-09"
end_date_Clichy = "2021-07-20"
start_date_Clichy_SE_NO = "2021-03-23"
start_date_Pompidou = "2021-03-13"
end_date_Pompidou = "2021-04-01"

df["road_work_Monpar_O_E"] = np.where(
    (data["date"] >= start_date_Monpar)
    & (data["date"] <= end_date_Monpar)
    & (data["counter_name"] == "152 boulevard du Montparnasse O-E"),
    1,
    0,
)
df["road_work_Monpar_E_O"] = np.where(
    (df["date"] >= start_date_Monpar)
    & (df["date"] <= end_date_Monpar)
    & (df["counter_name"] == "152 boulevard du Montparnasse E-O"),
    1,
    0,
)
df["road_work_Clichy_NO_SE"] = np.where(
    (df["date"] >= start_date_Clichy_NO_SE)
    & (df["date"] <= end_date_Clichy)
    & (df["counter_name"] == "20 Avenue de Clichy NO-SE"),
    1,
    0,
)
df["road_work_Clichy_SE_NO"] = np.where(
    (df["date"] >= start_date_Clichy_SE_NO)
    & (df["date"] <= end_date_Clichy)
    & (df["counter_name"] == "20 Avenue de Clichy SE-NO"),
    1,
    0,
)
df["road_work_Pompidou_NE_SO"] = np.where(
    (df["date"] >= start_date_Pompidou)
    & (df["date"] <= end_date_Pompidou)
    & (df["counter_name"] == "Voie Georges Pompidou NE-SO"),
    1,
    0,
)
df["road_work_Pompidou_SO_NE"] = np.where(
    (df["date"] >= start_date_Pompidou)
    & (df["date"] <= end_date_Pompidou)
    & (df["counter_name"] == "Voie Georges Pompidou SO-NE"),
    1,
    0,
)

df["road_work"] = (
    df["road_work_Monpar_E_O"]
    + df["road_work_Monpar_O_E"]
    + df["road_work_Clichy_NO_SE"]
    + df["road_work_Clichy_SE_NO"]
    + df["road_work_Pompidou_NE_SO"]
    + df["road_work_Pompidou_SO_NE"]
)
df.drop(
    [
        "road_work_Monpar_E_O",
        "road_work_Monpar_O_E",
        "road_work_Clichy_NO_SE",
        "road_work_Clichy_SE_NO",
        "road_work_Pompidou_NE_SO",
        "road_work_Pompidou_SO_NE",
    ],
    axis=1,
    inplace=True,
)

df["log_bike_count"][
    (df["date"] >= start_date_Monpar)
    & (df["date"] <= end_date_Monpar)
    & (df["counter_name"] == "152 boulevard du Montparnasse E-O")
] = 0
df["log_bike_count"][
    (df["date"] >= start_date_Monpar)
    & (df["date"] <= end_date_Monpar)
    & (df["counter_name"] == "152 boulevard du Montparnasse O-E")
] = 0
df["log_bike_count"][
    (df["date"] >= start_date_Clichy_NO_SE)
    & (df["date"] <= end_date_Clichy)
    & (df["counter_name"] == "20 Avenue de Clichy NO-SE")
] = 0
df["log_bike_count"][
    (df["date"] >= start_date_Clichy_SE_NO)
    & (df["date"] <= end_date_Clichy)
    & (df["counter_name"] == "20 Avenue de Clichy SE-NO")
] = 0
df["log_bike_count"][
    (df["date"] >= start_date_Pompidou)
    & (df["date"] <= end_date_Pompidou)
    & (df["counter_name"] == "Voie Georges Pompidou NE-SO")
] = 0
df["log_bike_count"][
    (df["date"] >= start_date_Pompidou)
    & (df["date"] <= end_date_Pompidou)
    & (df["counter_name"] == "Voie Georges Pompidou SO-NE")
] = 0

df_test['road_work'] = 0

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["log_bike_count"][
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["log_bike_count"][
You are setting val

In [10]:
# Drop the columns from the df and df_test dataframes using the columns_to_drop list from the utils file
columns_to_drop = utils.columns_to_drop
df = df.drop(columns=columns_to_drop)
df_test = df_test.drop(columns=columns_to_drop)

In [11]:
TableReport(df)

Processing column  23 / 23


Unnamed: 0_level_0,counter_name,bike_count,latitude,longitude,log_bike_count,Pressure_Tendency_(hPa/3h),Wind_Speed_(m/s),Air_Temperature_(°C),Relative_Humidity_(%),Visibility_(m),Total_Cloud_Cover_(oktas),Snow_Height_(cm),"Rainfall_(3h,_mm)","Rainfall_(12h,_mm)",year,month,weekday,day,hour,is_weekend,is_school_holiday,is_public_holiday,road_work
Unnamed: 0_level_1,counter_name,bike_count,latitude,longitude,log_bike_count,Pressure_Tendency_(hPa/3h),Wind_Speed_(m/s),Air_Temperature_(°C),Relative_Humidity_(%),Visibility_(m),Total_Cloud_Cover_(oktas),Snow_Height_(cm),"Rainfall_(3h,_mm)","Rainfall_(12h,_mm)",year,month,weekday,day,hour,is_weekend,is_school_holiday,is_public_holiday,road_work
0.0,152 boulevard du Montparnasse E-O,4.0,48.840801,2.333233,1.6094379124341005,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2020.0,9.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
1.0,152 boulevard du Montparnasse O-E,3.0,48.840801,2.333233,1.3862943611198906,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2020.0,9.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
2.0,18 quai de l'Hôtel de Ville NO-SE,0.0,48.85372,2.35702,0.0,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2020.0,9.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
3.0,18 quai de l'Hôtel de Ville SE-NO,1.0,48.85372,2.35702,0.6931471805599453,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2020.0,9.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
4.0,20 Avenue de Clichy NO-SE,7.0,48.88529,2.32666,2.079441541679836,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2020.0,9.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
,,,,,,,,,,,,,,,,,,,,,,,
496822.0,Totem 85 quai d'Austerlitz SE-NO,42.0,48.84201,2.36729,3.7612001156935615,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2021.0,9.0,3.0,9.0,23.0,0.0,0.0,0.0,0.0
496823.0,Totem Cours la Reine E-O,22.0,48.86462,2.31444,3.1354942159291497,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2021.0,9.0,3.0,9.0,23.0,0.0,0.0,0.0,0.0
496824.0,Totem Cours la Reine O-E,32.0,48.86462,2.31444,3.49650756146648,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2021.0,9.0,3.0,9.0,23.0,0.0,0.0,0.0,0.0
496825.0,Voie Georges Pompidou NE-SO,9.0,48.8484,2.27586,2.302585092994046,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2021.0,9.0,3.0,9.0,23.0,0.0,0.0,0.0,0.0

Column,Column name,dtype,Null values,Unique values,Mean,Std,Min,Median,Max
0,counter_name,CategoricalDtype,0 (0.0%),56 (< 0.1%),,,,,
1,bike_count,Float64DType,0 (0.0%),998 (0.2%),60.2,87.6,0.0,29.0,1300.0
2,latitude,Float64DType,0 (0.0%),30 (< 0.1%),48.9,0.0186,48.8,48.9,48.9
3,longitude,Float64DType,0 (0.0%),30 (< 0.1%),2.35,0.038,2.27,2.35,2.41
4,log_bike_count,Float64DType,0 (0.0%),998 (0.2%),3.08,1.66,0.0,3.4,7.17
5,Pressure_Tendency_(hPa/3h),Float64DType,0 (0.0%),97 (< 0.1%),-40.1,75.2,-750.0,-60.0,620.0
6,Wind_Speed_(m/s),Float64DType,0 (0.0%),106 (< 0.1%),4.31,1.22,0.0,4.6,12.7
7,Air_Temperature_(°C),Float64DType,0 (0.0%),345 (< 0.1%),286.0,4.08,268.0,286.0,307.0
8,Relative_Humidity_(%),Float64DType,0 (0.0%),77 (< 0.1%),77.5,10.2,24.0,79.0,100.0
9,Visibility_(m),Float64DType,0 (0.0%),1137 (0.2%),11300.0,8540.0,120.0,7000.0,60000.0

Column 1,Column 2,Cramér's V
weekday,is_weekend,1.0
year,month,0.834
month,is_school_holiday,0.677
bike_count,log_bike_count,0.622
latitude,longitude,0.553
"Rainfall_(3h,_mm)","Rainfall_(12h,_mm)",0.492
counter_name,latitude,0.41
counter_name,longitude,0.398
Air_Temperature_(°C),Relative_Humidity_(%),0.376
Relative_Humidity_(%),Visibility_(m),0.332


In [12]:
TableReport(df_test)

Processing column  21 / 21


Unnamed: 0_level_0,counter_name,latitude,longitude,Pressure_Tendency_(hPa/3h),Wind_Speed_(m/s),Air_Temperature_(°C),Relative_Humidity_(%),Visibility_(m),Total_Cloud_Cover_(oktas),Snow_Height_(cm),"Rainfall_(3h,_mm)","Rainfall_(12h,_mm)",year,month,weekday,day,hour,is_weekend,is_school_holiday,is_public_holiday,road_work
Unnamed: 0_level_1,counter_name,latitude,longitude,Pressure_Tendency_(hPa/3h),Wind_Speed_(m/s),Air_Temperature_(°C),Relative_Humidity_(%),Visibility_(m),Total_Cloud_Cover_(oktas),Snow_Height_(cm),"Rainfall_(3h,_mm)","Rainfall_(12h,_mm)",year,month,weekday,day,hour,is_weekend,is_school_holiday,is_public_holiday,road_work
0.0,152 boulevard du Montparnasse E-O,48.840801,2.333233,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2021.0,9.0,4.0,10.0,1.0,0.0,0.0,0.0,0.0
1.0,152 boulevard du Montparnasse O-E,48.840801,2.333233,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2021.0,9.0,4.0,10.0,1.0,0.0,0.0,0.0,0.0
2.0,18 quai de l'Hôtel de Ville NO-SE,48.85372,2.35702,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2021.0,9.0,4.0,10.0,1.0,0.0,0.0,0.0,0.0
3.0,18 quai de l'Hôtel de Ville SE-NO,48.85372,2.35702,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2021.0,9.0,4.0,10.0,1.0,0.0,0.0,0.0,0.0
4.0,20 Avenue de Clichy NO-SE,48.88529,2.32666,-60.0,4.6,286.25,79.0,7000.0,90.0,0.0,0.6,0.6,2021.0,9.0,4.0,10.0,1.0,0.0,0.0,0.0,0.0
,,,,,,,,,,,,,,,,,,,,,
51435.0,Totem 85 quai d'Austerlitz SE-NO,48.84201,2.36729,0.0,3.6,288.35,95.0,10000.0,100.0,0.0,1.8,1.8,2021.0,10.0,0.0,18.0,21.0,0.0,0.0,0.0,0.0
51436.0,Totem Cours la Reine E-O,48.86462,2.31444,0.0,3.6,288.35,95.0,10000.0,100.0,0.0,1.8,1.8,2021.0,10.0,0.0,18.0,21.0,0.0,0.0,0.0,0.0
51437.0,Totem Cours la Reine O-E,48.86462,2.31444,0.0,3.6,288.35,95.0,10000.0,100.0,0.0,1.8,1.8,2021.0,10.0,0.0,18.0,21.0,0.0,0.0,0.0,0.0
51438.0,Voie Georges Pompidou NE-SO,48.8484,2.27586,0.0,3.6,288.35,95.0,10000.0,100.0,0.0,1.8,1.8,2021.0,10.0,0.0,18.0,21.0,0.0,0.0,0.0,0.0

Column,Column name,dtype,Null values,Unique values,Mean,Std,Min,Median,Max
0,counter_name,CategoricalDtype,0 (0.0%),56 (0.1%),,,,,
1,latitude,Float64DType,0 (0.0%),30 (< 0.1%),48.9,0.0186,48.8,48.9,48.9
2,longitude,Float64DType,0 (0.0%),30 (< 0.1%),2.34,0.0383,2.27,2.35,2.41
3,Pressure_Tendency_(hPa/3h),Float64DType,0 (0.0%),57 (0.1%),-39.0,74.7,-410.0,-60.0,680.0
4,Wind_Speed_(m/s),Float64DType,0 (0.0%),67 (0.1%),4.01,1.28,0.0,4.6,9.8
5,Air_Temperature_(°C),Float64DType,0 (0.0%),150 (0.3%),287.0,2.64,277.0,286.0,299.0
6,Relative_Humidity_(%),Float64DType,0 (0.0%),56 (0.1%),78.6,8.72,40.0,79.0,99.0
7,Visibility_(m),Float64DType,0 (0.0%),152 (0.3%),11600.0,8900.0,200.0,7000.0,57500.0
8,Total_Cloud_Cover_(oktas),Float64DType,0 (0.0%),9 (< 0.1%),80.1,24.2,0.0,90.0,100.0
9,Snow_Height_(cm),Float64DType,0 (0.0%),1 (< 0.1%),0.0,0.0,,,

Column 1,Column 2,Cramér's V
weekday,is_weekend,1.0
month,day,0.701
latitude,longitude,0.564
"Rainfall_(3h,_mm)","Rainfall_(12h,_mm)",0.487
day,is_weekend,0.475
counter_name,latitude,0.447
Wind_Speed_(m/s),"Rainfall_(3h,_mm)",0.442
Pressure_Tendency_(hPa/3h),"Rainfall_(12h,_mm)",0.426
counter_name,longitude,0.41
Wind_Speed_(m/s),Air_Temperature_(°C),0.403


## Test using flaml and the GPU

In [None]:
import pandas as pd
from flaml import AutoML
from skrub import TableVectorizer


# Preprocess the dataset
X = df.drop(columns=['log_bike_count', 'bike_count'])
y = df['log_bike_count']

# Split the data into training and validation sets based on the last 10% of dates
validation_split_index = int(len(df) * 0.9)
X_train, X_val = X.iloc[:validation_split_index], X.iloc[validation_split_index:]
y_train, y_val = y.iloc[:validation_split_index], y.iloc[validation_split_index:]

# Initialize the TableVectorizer
vectorizer = TableVectorizer()

# Fit and transform the training data
X_train_transformed = vectorizer.fit_transform(X_train)
X_val_transformed = vectorizer.transform(X_val)



In [None]:
import optuna
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from ngboost import NGBRegressor
from h2o.automl import H2OAutoML
import h2o

# Initialize H2O
h2o.init()

# Dictionary to store the best parameters for each model
best_params = {}

# Define the objective function for Optuna
def objective(trial):
    # Model selection
    model_name = trial.suggest_categorical("model", ["RandomForest", "NGBoost", "H2OAutoML"])
    
    if model_name == "RandomForest":
        n_estimators = trial.suggest_int("n_estimators", 50, 500)
        max_depth = trial.suggest_int("max_depth", 2, 32)
        min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
        min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
        model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42,
        )
        model.fit(X_train_transformed, y_train)
        y_pred = model.predict(X_val_transformed)
    
    elif model_name == "NGBoost":
        learning_rate = trial.suggest_loguniform("learning_rate", 1e-4, 1e-1)
        n_estimators = trial.suggest_int("n_estimators", 50, 500)
        model = NGBRegressor(
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            random_state=42,
        )
        model.fit(X_train_transformed, y_train)
        y_pred = model.predict(X_val_transformed)
    
    elif model_name == "H2OAutoML":
        # Convert datasets to H2O frames
        train = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
        val = h2o.H2OFrame(pd.concat([X_val, y_val], axis=1))
        
        # Specify predictors and response column
        predictors = X_train.columns.tolist()
        response = "log_bike_count"  # Update with your target column name
        
        # Run H2O AutoML
        automl = H2OAutoML(max_models=10, seed=42, nfolds=3)
        automl.train(x=predictors, y=response, training_frame=train)
        
        # Predict on validation set
        y_pred = automl.leader.predict(val).as_data_frame()["predict"].values

    # Compute the Mean Squared Error (MSE)
    mse = mean_squared_error(y_val, y_pred)
    return mse

# Run Optuna optimization
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

# Get the best trial and parameters
best_trial = study.best_trial
best_model_params = study.best_params
print("Best Trial:", best_trial)
print("Best Model Parameters:", best_model_params)

# Save the best model
model_name = best_model_params["model"]
if model_name == "H2OAutoML":
    # Save H2O AutoML model
    automl.leader.save_mojo(f"best_{model_name}.mojo")
    print(f"Best H2O AutoML model saved as 'best_{model_name}.mojo'")
else:
    # Save sklearn or NGBoost models
    joblib.dump(best_model, f"best_{model_name}.joblib")
    print(f"Best model saved as 'best_{model_name}.joblib'")

# Shut down H2O
h2o.shutdown(prompt=False)


## Tune XGBoost using Optuna hyperparameter

In [None]:
from skrub import TableVectorizer
from xgboost import XGBRegressor
import optuna
from sklearn.model_selection import train_test_split

# Preprocess the dataset using TableVectorizer
X = df.drop(columns=['log_bike_count', 'bike_count'])
y = df['log_bike_count']

# Split the data into training and validation sets based on the last 10% of dates
validation_split_index = int(len(df) * 0.9)
X_train, X_val = X.iloc[:validation_split_index], X.iloc[validation_split_index:]
y_train, y_val = y.iloc[:validation_split_index], y.iloc[validation_split_index:]

# Initialize the TableVectorizer
vectorizer = TableVectorizer()

# Fit and transform the training data
X_train_transformed = vectorizer.fit_transform(X_train)
X_val_transformed = vectorizer.transform(X_val)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import joblib
from xgboost import XGBRegressor
import optuna

# Define the objective function for Optuna
def objective(trial):
    param = {
        'objective': 'reg:squarederror',
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9),
        'random_state': 42,
        'tree_method': 'gpu_hist',  # Enable GPU support
        'predictor': 'gpu_predictor'
    }
    model = XGBRegressor(**param)
    model.fit(X_train_transformed, y_train)
    return model.score(X_val_transformed, y_val)  # Maximizing validation R² score

# Create a study and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Get the best parameters
best_params = study.best_params

# Add GPU-specific parameter to the best parameters
best_params['tree_method'] = 'gpu_hist'  # Ensure GPU is used for the final model

# Save the best parameters to a file using joblib
joblib.dump(best_params, 'xg_boost_best_params')

# Train the final model with the best parameters
X_transformed = vectorizer.transform(X)  # Transform the entire dataset
final_model = XGBRegressor(**best_params)
final_model.fit(X_transformed, y)


In [None]:
import joblib
from xgboost import XGBRegressor
from skrub import TableVectorizer

# Load the best parameters from the pickle file
best_params = joblib.load('xg_boost_best_params.pkl')

# Check if GPU support is available
try:
    import xgboost
    if 'gpu_hist' in best_params.get('tree_method', '') and not xgboost.Booster().attr('gpu_id'):
        print("Warning: XGBoost is not compiled with GPU support. Falling back to CPU.")
        best_params.pop('tree_method', None)  # Remove GPU-specific parameters
        best_params.pop('predictor', None)
except Exception as e:
    print(f"Error while checking GPU support: {e}")

# Initialize the TableVectorizer
vectorizer = TableVectorizer()

# Fit and transform the data
X_transformed = vectorizer.fit_transform(X)

# Train the final model with the best parameters
final_model = XGBRegressor(**best_params)
final_model.fit(X_transformed, y)

# Print model parameters
print("Trained model parameters:")
print(final_model.get_params())


In [None]:
# Transform the test data using the same vectorizer instance
X_test_transformed = vectorizer.transform(df_test)

# Make predictions
y_pred = final_model.predict(X_test_transformed)

print("Predictions:", y_pred)


In [None]:
df_submission = pd.DataFrame(y_pred, columns=["log_bike_count"])
df_submission.index = data_test.index
df_submission.index.name = "Id"
df_submission.to_csv("/Users/felix/Documents/X/Cours Python/Kaggle/submission/test_pipeline.csv", index=True)

In [None]:
test_data = pd.read_parquet('data/final_test.parquet')
# Merge the DataFrames
merged_conditions = pd.merge(test_data, filled_external_conditions, on='date', how='left')

merged_conditions = utils._column_rename(merged_conditions)

# Ensure "date" is in datetime format
merged_conditions["date"] = pd.to_datetime(merged_conditions["date"], errors="coerce")

# Drop rows with invalid datetime entries
df_test = merged_conditions.dropna(subset=["date"])

# Extract date and time features
df_test["year"] = df_test["date"].dt.year
df_test["month"] = df_test["date"].dt.month
df_test["weekday"] = df_test["date"].dt.dayofweek
df_test["day"] = df_test["date"].dt.day
df_test["hour"] = df_test["date"].dt.hour
df_test["is_weekend"] = (df_test["weekday"] >= 5).astype(int)

# Handle school and public holidays
unique_dates_test = df_test["date"].dt.date.unique()

try:
    dict_school_holidays_test = {date: d.is_holiday_for_zone(date, "C") for date in unique_dates_test}
    df_test["is_school_holiday"] = df_test["date"].dt.date.map(dict_school_holidays_test).fillna(0).astype(int)
except Exception as e:
    print(f"Error with school holidays mapping: {e}")
    df_test["is_school_holiday"] = 0

try:
    dict_public_holidays_test = {date: f.is_bank_holiday(date, zone="Métropole") for date in unique_dates_test}
    df_test["is_public_holiday"] = df_test["date"].dt.date.map(dict_public_holidays_test).fillna(0).astype(int)
except Exception as e:
    print(f"Error with public holidays mapping: {e}")
    df_test["is_public_holiday"] = 0

# Predict using the pipeline
y_pred_test = pipeline.predict(df_test)

In [None]:
df_submission = pd.DataFrame(y_pred_test, columns=["log_bike_count"])
df_submission.index.name = "Id"
df_submission
df_submission.to_csv("/Users/felix/Documents/X/Cours Python/Kaggle/submission/test_pipeline.csv", index=True)

## En dessous c'est des tests d'avant ca ne fait pas tourner ce qui marche actuellement

In [None]:
# Add the new category to categorical columns
for col in df.select_dtypes(include=['category']).columns:
	df[col] = df[col].cat.add_categories([0])

# Fill NaN values with 0
df = df.fillna(0)

In [None]:
y_train = df['log_bike_count'].values
X_train = df.drop(['log_bike_count', "bike_count"], axis=1)

date_cols = ["year", "month", "weekday", "day", "hour", "is_weekend", "is_school_holiday", "is_public_holiday"]
categorical_cols = ["counter_name"]
numerical_cols = [
    'latitude', 'longitude', 'Sea Level Pressure (hPa)', 'Pressure Tendency (hPa/3h)',
    'Pressure Tendency Code', 'Wind Direction (°)', 'Wind Speed (m/s)', 'Air Temperature (°C)',
    'Dew Point Temperature (°C)', 'Relative Humidity (%)', 'Visibility (m)', 'Present Weather Code',
    'Past Weather Code 1', 'Past Weather Code 2', 'Total Cloud Cover (oktas)', 'Cloud Base Height (m)',
    'Lowest Cloud Base Height (m)', 'Low Cloud Type', 'Station Level Pressure (hPa)', '24h Pressure Tendency (hPa)',
    '10min Max Wind Gust (m/s)', 'Max Wind Gust (m/s)', 'Measurement Period Duration', 'Ground State',
    'Snow Height (cm)', 'New Snow Depth (cm)', 'New Snowfall Duration (hours)', 'Rainfall (1h, mm)',
    'Rainfall (3h, mm)', 'Rainfall (6h, mm)', 'Rainfall (12h, mm)', 'Rainfall (24h, mm)',
    'Layer 1 Cloud Cover (oktas)', 'Layer 1 Cloud Type', 'Layer 1 Cloud Base Height (m)'
]


# 1. Apply column transformations
# One-hot encode date columns
date_encoder = OneHotEncoder(handle_unknown="ignore")
date_encoded = date_encoder.fit_transform(X_train[date_cols])

# One-hot encode categorical columns
cat_encoder = OneHotEncoder(handle_unknown="ignore")
cat_encoded = cat_encoder.fit_transform(X_train[categorical_cols])

# Standard scale numerical columns
num_scaler = StandardScaler()
num_scaled = num_scaler.fit_transform(X_train[numerical_cols])

X_transformed = hstack([date_encoded, cat_encoded, num_scaled]).toarray()

# 2. Train the model
model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
model.fit(X_transformed, y_train)



In [None]:
X_test = utils.get_test_data()

In [None]:
# Merge the DataFrames
merged_conditions = pd.merge(X_test, external_conditions, on='date', how='left')

merged_conditions = utils._column_rename(merged_conditions)
# Ensure "date" is in datetime format
merged_conditions["date"] = pd.to_datetime(merged_conditions["date"], errors="coerce")

# Drop rows with invalid datetime entries
df = merged_conditions.dropna(subset=["date"])

# Extract date and time features
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["weekday"] = df["date"].dt.dayofweek
df["day"] = df["date"].dt.day
df["hour"] = df["date"].dt.hour
df["is_weekend"] = (df["weekday"] >= 5).astype(int)

# Handle school and public holidays
unique_dates = df["date"].dt.date.unique()
d = SchoolHolidayDates()
f = JoursFeries()

try:
    dict_school_holidays = {date: d.is_holiday_for_zone(date, "C") for date in unique_dates}
    df["is_school_holiday"] = df["date"].dt.date.map(dict_school_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with school holidays mapping: {e}")
    df["is_school_holiday"] = 0

try:
    dict_public_holidays = {date: f.is_bank_holiday(date, zone="Métropole") for date in unique_dates}
    df["is_public_holiday"] = df["date"].dt.date.map(dict_public_holidays).fillna(0).astype(int)
except Exception as e:
    print(f"Error with public holidays mapping: {e}")
    df["is_public_holiday"] = 0

In [None]:
# Process the test data with the same transformations as the training data
# 1. Apply column transformations
# One-hot encode date columns
date_encoded_test = date_encoder.transform(df[date_cols])

# One-hot encode categorical columns
cat_encoded_test = cat_encoder.transform(df[categorical_cols])

# Standard scale numerical columns
num_scaled_test = num_scaler.transform(df[numerical_cols])

# Combine all transformed features
X_test_transformed_numeric = hstack([date_encoded_test, cat_encoded_test, num_scaled_test]).toarray()

# 2. Make predictions
y_pred = model.predict(X_test_transformed_numeric)


In [None]:
df_submission = pd.DataFrame(y_pred, columns=["log_bike_count"])
df_submission.index.name = "Id"

In [None]:
df_submission.to_csv("/Users/felix/Documents/X/Cours Python/Kaggle/submission/test_pipeline.csv", index=True)