In [4]:
import pandas as pd
import numpy as np
import holidays
from pathlib import Path
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from flaml import AutoML
from xgboost import XGBRegressor
from skrub import TableVectorizer, DatetimeEncoder
import optuna
import importlib
import bike_count as bc
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error

In [None]:
def xgb_vectorized_for_optuna(trial=None):

    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3) if trial else 0.1
    max_depth = trial.suggest_int("max_depth", 3, 15) if trial else 10
    n_estimators = trial.suggest_int("n_estimators", 50, 200) if trial else 100

    regressor = XGBRegressor(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42,
        tree_method='hist',
        enable_categorical=True,
    )

    pipe = make_pipeline(regressor)
    
    return pipe

In [5]:
importlib.reload(bc)

<module 'bike_count' from 'c:\\Users\\diane\\bike_count_ldmh\\bike_count.py'>

In [9]:
X, y = bc.get_model_data()

date = bc.date_encoder
table = bc.table_vectorizer
merg = bc.merge

X = merg.fit_transform(X)
X = table.fit_transform(X)

X_train, X_val, y_train, y_val = bc.train_test_temporal(X, y)

X_train = date.fit_transform(X_train)
X_val = date.fit_transform(X_val)

KeyError: 'date'

In [7]:
def objective(trial):
    param = {
        "objective": "reg:squarederror",  # ou 'binary:logistic' pour un problème de classification binaire
        "eval_metric": "rmse",  # Pour un problème de régression, ou 'logloss' pour classification
        "max_depth": trial.suggest_int("max_depth", 8, 15),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "alpha": trial.suggest_float(
            "alpha", 1e-4, 1e-1, log=True
        ),  # L2 regularization term
        "lambda": trial.suggest_float("lambda", 1e-4, 1e-1, log=True),
        "n_estimators": trial.suggest_float(
            "n_estimators", 50, 150
        ),  # L1 regularization term
    }

    model = xgb.XGBRegressor(**param)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    y_pred = model.predict(X_val)
    rmse = root_mean_squared_error(
        y_val, y_pred
    )  # ou accuracy_score pour classification

    return rmse

In [8]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=200)

print("Meilleurs paramètres : ", study.best_params)
print("Meilleure métrique (RMSE) : ", study.best_value)

[I 2024-12-11 17:47:32,618] A new study created in memory with name: no-name-0df5886e-ffdb-446a-b8c4-b9cf0b6ab1b0
[W 2024-12-11 17:47:35,686] Trial 0 failed with parameters: {'max_depth': 13, 'min_child_weight': 2, 'subsample': 0.992601613350111, 'colsample_bytree': 0.9138727331980381, 'alpha': 0.0016369844657114457, 'lambda': 0.00014658034383620126, 'n_estimators': 111.63797643786754} because of the following error: ValueError('feature_names mismatch: [\'counter_id: 100057445, 104057445, 103057445\', \'counter_id: 100056329, 103056329, 104056329\', \'counter_id: 102007049, 101007049, 100007049\', \'counter_id: 100047542, 104047542, 103047542\', \'counter_id: 100044493, sc, 100049407\', \'counter_id: 104036719, 100036719, 103036719\', \'counter_id: 100056330, 103056330, 104056330\', \'counter_id: 102060178, 101060178, 100060178\', \'counter_id: 100057329, 104057329, 103057329\', \'counter_id: 100056046, 100056047, sc\', \'counter_id: 353277235, 353277233, 100063175\', \'counter_id: 100

ValueError: feature_names mismatch: ['counter_id: 100057445, 104057445, 103057445', 'counter_id: 100056329, 103056329, 104056329', 'counter_id: 102007049, 101007049, 100007049', 'counter_id: 100047542, 104047542, 103047542', 'counter_id: 100044493, sc, 100049407', 'counter_id: 104036719, 100036719, 103036719', 'counter_id: 100056330, 103056330, 104056330', 'counter_id: 102060178, 101060178, 100060178', 'counter_id: 100057329, 104057329, 103057329', 'counter_id: 100056046, 100056047, sc', 'counter_id: 353277235, 353277233, 100063175', 'counter_id: 100050876, 104050876, 103050876', 'counter_id: 100057380, 104057380, 103057380', 'counter_id: 100047547, 103047547, 103047546', 'counter_id: 100056226, 104056226, 103056226', 'counter_id: 109042374, 110042374, 100042374', 'counter_id: 100056331, 103056331, 104056331', 'counter_id: 104036718, 100036718, 103036718', 'counter_id: 105056336, 106056336, 100056336', 'counter_id: 104056332, 104056330, 104056335', 'counter_id: 100056327, 103056327, 104056327', 'counter_id: 100047545, 104047545, 103047545', 'counter_id: 353255859, 353255860, 100049407', 'counter_id: 300014702, 353245971, 353245972', 'counter_id: 100056335, 103056335, 104056335', 'counter_id: 104047546, 100047546, 104047547', 'counter_id: 100056223, sc, 100056226', 'counter_id: 100056334, 103056334, 104056334', 'counter_id: 100047548, 103047548, 104047548', 'counter_id: 103056330, 103056332, 103056226', 'counter_name: austerlitz, 85, totem', 'counter_name: concorde, la, de', 'counter_name: orsay, quai, 67', 'counter_name: charenton, porte, la', 'counter_name: grenelle, 36, ne', 'counter_name: sèvres, vaugirard, 254', 'counter_name: montparnasse, 152, du', 'counter_name: 70, face, au', 'counter_name: aubervilliers, 104, face', 'counter_name: pompidou, georges, voie', 'counter_name: mauriac, françois, 39', 'counter_name: rivoli, 64, totem', 'counter_name: invalides, des, de', 'counter_name: sébastopol, 73, de', 'counter_name: marne, 48, face', 'counter_name: bartet, julia, rue', 'counter_name: hôtel, ville, 18', 'counter_name: clichy, avenue, 20', 'counter_name: boulevard, diderot, voltaire', 'counter_name: cours, reine, totem', 'counter_name: tournelle, 27, la', 'counter_name: bercy, pont, so', 'counter_name: charles, gaulle, au', 'counter_name: bagnolet, porte, la', 'counter_name: turbigo, rue, 38', 'counter_name: issy, face, 40', 'counter_name: oise, 25, 254', 'counter_name: so, ne, 67', 'counter_name: no, se, 67', 'counter_name: françois, mauriac, 39', 'site_id', 'site_name_152 boulevard du Montparnasse', "site_name_18 quai de l'Hôtel de Ville", 'site_name_20 Avenue de Clichy', 'site_name_254 rue de Vaugirard', 'site_name_27 quai de la Tournelle', 'site_name_28 boulevard Diderot', 'site_name_36 quai de Grenelle', 'site_name_38 rue Turbigo', 'site_name_39 quai François Mauriac', 'site_name_6 rue Julia Bartet', 'site_name_67 boulevard Voltaire SE-NO', 'site_name_90 Rue De Sèvres', "site_name_Face 104 rue d'Aubervilliers", "site_name_Face au 25 quai de l'Oise", 'site_name_Face au 4 avenue de la porte de Bagnolet', "site_name_Face au 40 quai D'Issy", 'site_name_Face au 48 quai de la marne', 'site_name_Face au 70 quai de Bercy', 'site_name_Face au 8 avenue de la porte de Charenton', 'site_name_Pont Charles De Gaulle', 'site_name_Pont de Bercy', 'site_name_Pont de la Concorde S-N', 'site_name_Pont des Invalides N-S', 'site_name_Pont des Invalides S-N', "site_name_Quai d'Orsay", 'site_name_Totem 64 Rue de Rivoli', 'site_name_Totem 73 boulevard de Sébastopol', "site_name_Totem 85 quai d'Austerlitz", 'site_name_Totem Cours la Reine', 'site_name_Voie Georges Pompidou', 'counter_installation_date_year', 'counter_installation_date_month', 'coordinates_48.82636,2.30303', 'coordinates_48.829523,2.38699', 'coordinates_48.830331,2.400551', 'coordinates_48.83421,2.26542', 'coordinates_48.83436,2.377', 'coordinates_48.83848,2.37587', 'coordinates_48.83977,2.30198', 'coordinates_48.840801,2.333233', 'coordinates_48.84201,2.36729', 'coordinates_48.84223,2.36811', 'coordinates_48.846028,2.375429', 'coordinates_48.84638,2.31529', 'coordinates_48.8484,2.27586', 'coordinates_48.85013,2.35423', 'coordinates_48.85209,2.28508', 'coordinates_48.85372,2.35702', 'coordinates_48.85735,2.35211', 'coordinates_48.86149,2.37376', 'coordinates_48.86282,2.31061', 'coordinates_48.86284,2.310345', 'coordinates_48.86288,2.31179', 'coordinates_48.86377,2.35096', 'coordinates_48.86378,2.32003', 'coordinates_48.86461,2.40969', 'coordinates_48.86462,2.31444', 'coordinates_48.86502,2.35387', 'coordinates_48.88529,2.32666', 'coordinates_48.890457,2.368852', 'coordinates_48.89141,2.38482', 'coordinates_48.89172,2.38531', 'counter_technical_id_Y2H15027244', 'counter_technical_id_Y2H17021629', 'counter_technical_id_Y2H18034809', 'counter_technical_id_Y2H18086318', 'counter_technical_id_Y2H18086321', 'counter_technical_id_Y2H18086323', 'counter_technical_id_Y2H18086324', 'counter_technical_id_Y2H19027732', 'counter_technical_id_Y2H19070356', 'counter_technical_id_Y2H19070357', 'counter_technical_id_Y2H19070365', 'counter_technical_id_Y2H19070370', 'counter_technical_id_Y2H19070372', 'counter_technical_id_Y2H19070373', 'counter_technical_id_Y2H19070375', 'counter_technical_id_Y2H19070376', 'counter_technical_id_Y2H19070377', 'counter_technical_id_Y2H19070378', 'counter_technical_id_Y2H19070380', 'counter_technical_id_Y2H19070382', 'counter_technical_id_Y2H19070383', 'counter_technical_id_Y2H20022165', 'counter_technical_id_Y2H20052705', 'counter_technical_id_Y2H20073268', 'counter_technical_id_Y2H20114504', 'counter_technical_id_Y2H21025335', 'counter_technical_id_YTH19037970', 'counter_technical_id_YTH19111508', 'counter_technical_id_YTH19111509', 'counter_technical_id_YTH19111510', 'latitude', 'longitude', 'pres', 'tend', 'rr24', 'rr12', 'rr3', 'month_day', 'week_day', 'year', 'month', 'hour', 'is_holiday', 'covid_state', 'is_school_holiday'] ['counter_id: 103047547, 103047542, 103047545', 'counter_id: 104056330, 104056332, 104056335', 'counter_id: 100056223, sc, 100056226', 'counter_id: 104036719, 100036719, 104036718', 'counter_id: 100056329, 103056329, 104056329', 'counter_id: 100057380, 104057380, 103057380', 'counter_id: 100050876, 104050876, 103050876', 'counter_id: 104047546, 100047546, 103047546', 'counter_id: 102007049, 101007049, 100007049', 'counter_id: 102060178, 101060178, 100060178', 'counter_id: 353277235, 353277233, 100063175', 'counter_id: 353255859, 353255860, 100049407', 'counter_id: 353245972, 353245971, 300014702', 'counter_id: 100057329, 104057329, 103057329', 'counter_id: 105056336, 106056336, 100056336', 'counter_id: 100056335, 103056335, 104056335', 'counter_id: 100056226, 103056226, 104056226', 'counter_id: 100056327, 103056327, 104056327', 'counter_id: 100057445, 104057445, 103057445', 'counter_id: 109042374, 110042374, 100042374', 'counter_id: 104047547, 100047547, 103047547', 'counter_id: 100056334, 103056334, 104056334', 'counter_id: 104047545, 100047545, 103047545', 'counter_id: 100056331, 103056331, 104056331', 'counter_id: 100056046, 100056047, sc', 'counter_id: 100036718, 103036718, 103036719', 'counter_id: 104047548, 100047548, 103047548', 'counter_id: 100044493, sc, 100049407', 'counter_id: 100056332, 103056332, 103056330', 'counter_id: 104047542, 100047542, 103047542', 'counter_name: marne, 48, face', 'counter_name: bercy, pont, so', 'counter_name: charenton, porte, la', 'counter_name: tournelle, 27, la', 'counter_name: montparnasse, 152, du', 'counter_name: sébastopol, 73, de', 'counter_name: sèvres, 90, rue', 'counter_name: rivoli, 64, totem', 'counter_name: austerlitz, 85, no', 'counter_name: françois, mauriac, 39', 'counter_name: pompidou, georges, voie', 'counter_name: aubervilliers, 104, face', 'counter_name: orsay, quai, 73', 'counter_name: bagnolet, porte, la', 'counter_name: clichy, avenue, 20', 'counter_name: cours, reine, totem', 'counter_name: 70, face, au', 'counter_name: diderot, boulevard, 28', 'counter_name: invalides, des, de', 'counter_name: charles, gaulle, au', 'counter_name: bartet, julia, rue', 'counter_name: hôtel, ville, 18', 'counter_name: issy, 40, face', 'counter_name: vaugirard, 254, 25', 'counter_name: oise, 25, face', 'counter_name: grenelle, 36, ne', 'counter_name: turbigo, 38, rue', 'counter_name: concorde, pont, la', 'counter_name: voltaire, 67, no', 'counter_name: charenton, porte, la (29)', 'site_id', 'site_name_152 boulevard du Montparnasse', "site_name_18 quai de l'Hôtel de Ville", 'site_name_20 Avenue de Clichy', 'site_name_254 rue de Vaugirard', 'site_name_27 quai de la Tournelle', 'site_name_28 boulevard Diderot', 'site_name_36 quai de Grenelle', 'site_name_38 rue Turbigo', 'site_name_39 quai François Mauriac', 'site_name_6 rue Julia Bartet', 'site_name_67 boulevard Voltaire SE-NO', 'site_name_90 Rue De Sèvres', "site_name_Face 104 rue d'Aubervilliers", "site_name_Face au 25 quai de l'Oise", 'site_name_Face au 4 avenue de la porte de Bagnolet', "site_name_Face au 40 quai D'Issy", 'site_name_Face au 48 quai de la marne', 'site_name_Face au 70 quai de Bercy', 'site_name_Face au 8 avenue de la porte de Charenton', 'site_name_Pont Charles De Gaulle', 'site_name_Pont de Bercy', 'site_name_Pont de la Concorde S-N', 'site_name_Pont des Invalides N-S', 'site_name_Pont des Invalides S-N', "site_name_Quai d'Orsay", 'site_name_Totem 64 Rue de Rivoli', 'site_name_Totem 73 boulevard de Sébastopol', "site_name_Totem 85 quai d'Austerlitz", 'site_name_Totem Cours la Reine', 'site_name_Voie Georges Pompidou', 'counter_installation_date_year', 'counter_installation_date_month', 'coordinates_48.82636,2.30303', 'coordinates_48.829523,2.38699', 'coordinates_48.830331,2.400551', 'coordinates_48.83421,2.26542', 'coordinates_48.83436,2.377', 'coordinates_48.83848,2.37587', 'coordinates_48.83977,2.30198', 'coordinates_48.840801,2.333233', 'coordinates_48.84201,2.36729', 'coordinates_48.84223,2.36811', 'coordinates_48.846028,2.375429', 'coordinates_48.84638,2.31529', 'coordinates_48.8484,2.27586', 'coordinates_48.85013,2.35423', 'coordinates_48.85209,2.28508', 'coordinates_48.85372,2.35702', 'coordinates_48.85735,2.35211', 'coordinates_48.86149,2.37376', 'coordinates_48.86282,2.31061', 'coordinates_48.86284,2.310345', 'coordinates_48.86288,2.31179', 'coordinates_48.86377,2.35096', 'coordinates_48.86378,2.32003', 'coordinates_48.86461,2.40969', 'coordinates_48.86462,2.31444', 'coordinates_48.86502,2.35387', 'coordinates_48.88529,2.32666', 'coordinates_48.890457,2.368852', 'coordinates_48.89141,2.38482', 'coordinates_48.89172,2.38531', 'counter_technical_id_Y2H15027244', 'counter_technical_id_Y2H17021629', 'counter_technical_id_Y2H18034809', 'counter_technical_id_Y2H18086318', 'counter_technical_id_Y2H18086321', 'counter_technical_id_Y2H18086323', 'counter_technical_id_Y2H18086324', 'counter_technical_id_Y2H19027732', 'counter_technical_id_Y2H19070356', 'counter_technical_id_Y2H19070357', 'counter_technical_id_Y2H19070365', 'counter_technical_id_Y2H19070370', 'counter_technical_id_Y2H19070372', 'counter_technical_id_Y2H19070373', 'counter_technical_id_Y2H19070375', 'counter_technical_id_Y2H19070376', 'counter_technical_id_Y2H19070377', 'counter_technical_id_Y2H19070378', 'counter_technical_id_Y2H19070380', 'counter_technical_id_Y2H19070382', 'counter_technical_id_Y2H19070383', 'counter_technical_id_Y2H20022165', 'counter_technical_id_Y2H20052705', 'counter_technical_id_Y2H20073268', 'counter_technical_id_Y2H20114504', 'counter_technical_id_Y2H21025335', 'counter_technical_id_YTH19037970', 'counter_technical_id_YTH19111508', 'counter_technical_id_YTH19111509', 'counter_technical_id_YTH19111510', 'latitude', 'longitude', 'pres', 'tend', 'rr24', 'rr12', 'rr3', 'month_day', 'week_day', 'year', 'month', 'hour', 'is_holiday', 'covid_state', 'is_school_holiday']
expected counter_id: 300014702, 353245971, 353245972, counter_name: oise, 25, 254, counter_id: 104036718, 100036718, 103036718, counter_id: 100047545, 104047545, 103047545, counter_id: 100047542, 104047542, 103047542, counter_name: so, ne, 67, counter_id: 100047548, 103047548, 104047548, counter_name: mauriac, françois, 39, counter_name: orsay, quai, 67, counter_id: 104047546, 100047546, 104047547, counter_name: boulevard, diderot, voltaire, counter_id: 103056330, 103056332, 103056226, counter_id: 100056226, 104056226, 103056226, counter_id: 104036719, 100036719, 103036719, counter_id: 100047547, 103047547, 103047546, counter_name: concorde, la, de, counter_name: sèvres, vaugirard, 254, counter_name: issy, face, 40, counter_id: 100056330, 103056330, 104056330, counter_name: no, se, 67, counter_id: 104056332, 104056330, 104056335, counter_name: austerlitz, 85, totem, counter_name: turbigo, rue, 38 in input data
training data did not have the following fields: counter_name: sèvres, 90, rue, counter_id: 104036719, 100036719, 104036718, counter_id: 100056226, 103056226, 104056226, counter_name: oise, 25, face, counter_name: vaugirard, 254, 25, counter_id: 104056330, 104056332, 104056335, counter_id: 100036718, 103036718, 103036719, counter_name: concorde, pont, la, counter_name: diderot, boulevard, 28, counter_id: 104047546, 100047546, 103047546, counter_id: 103047547, 103047542, 103047545, counter_name: orsay, quai, 73, counter_name: voltaire, 67, no, counter_id: 104047542, 100047542, 103047542, counter_id: 104047547, 100047547, 103047547, counter_id: 104047545, 100047545, 103047545, counter_name: charenton, porte, la (29), counter_id: 100056332, 103056332, 103056330, counter_name: issy, 40, face, counter_id: 104047548, 100047548, 103047548, counter_name: austerlitz, 85, no, counter_name: turbigo, 38, rue, counter_id: 353245972, 353245971, 300014702

In [63]:
X, y = bc.get_model_data()
pipe = bc.xgb_vectorized_no_date_encoding()

pipe.fit(X, y)

In [64]:
test_data = pd.read_parquet("data/final_test.parquet")
test_pred = pipe.predict(test_data)

test_df = pd.DataFrame({"Id": range(len(test_pred)), "log_bike_count": test_pred})
test_df.to_csv("opti_predictions.csv", index=False)

In [5]:
study = optuna.create_study(direction="minimize")
study.optimize(bc.objective, n_trials=50)

print("Meilleurs paramètres : ", study.best_params)
print("Meilleure métrique (RMSE) : ", study.best_value)

[I 2024-12-11 12:16:49,541] A new study created in memory with name: no-name-e10a81f1-b967-433e-852b-f4b242f23465
[I 2024-12-11 12:18:19,616] Trial 0 finished with value: 0.49843714390187704 and parameters: {'learning_rate': 0.1796755184532604, 'max_depth': 8, 'n_estimators': 111}. Best is trial 0 with value: 0.49843714390187704.
[I 2024-12-11 12:19:30,324] Trial 1 finished with value: 0.690716987124988 and parameters: {'learning_rate': 0.01835105263985338, 'max_depth': 6, 'n_estimators': 122}. Best is trial 0 with value: 0.49843714390187704.
[I 2024-12-11 12:20:45,988] Trial 2 finished with value: 0.44309938739833055 and parameters: {'learning_rate': 0.05830528906289511, 'max_depth': 14, 'n_estimators': 63}. Best is trial 2 with value: 0.44309938739833055.
[I 2024-12-11 12:22:52,973] Trial 3 finished with value: 0.5182766825295525 and parameters: {'learning_rate': 0.1718320626492706, 'max_depth': 4, 'n_estimators': 148}. Best is trial 2 with value: 0.44309938739833055.
[I 2024-12-11 1

Meilleurs paramètres :  {'learning_rate': 0.16878974156327872, 'max_depth': 11, 'n_estimators': 139}
Meilleure métrique (RMSE) :  0.42235674625423486


### Predictions

In [24]:
importlib.reload(bc)

<module 'bike_count' from 'c:\\Users\\diane\\bike_count_ldmh\\bike_count.py'>

In [25]:
X, y = bc.get_model_data()

pipe = bc.xgb_vectorized_no_date_encoding()
pipe.fit(X, y)

test_data = pd.read_parquet("data/final_test.parquet")
test_pred = pipe.predict(test_data)

test_df = pd.DataFrame({"Id": range(len(test_pred)), "log_bike_count": test_pred})
test_df.to_csv("optimized_predictions.csv", index=False)

KeyboardInterrupt: 

In [10]:
importlib.reload(bc)

X, y = bc.get_model_data()
pipe = bc.rf_vectorized_no_date_encoding()

pipe.fit(X, y)

In [11]:
test_data = pd.read_parquet("data/final_test.parquet")
test_pred = pipe.predict(test_data)

test_df = pd.DataFrame({"Id": range(len(test_pred)), "log_bike_count": test_pred})
test_df.to_csv("optimized_predictions.csv", index=False)