In [49]:
import pandas as pd
import numpy as np
import holidays
from pathlib import Path
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from flaml import AutoML
from xgboost import XGBRegressor
from skrub import TableVectorizer, DatetimeEncoder
import optuna
import importlib
import bike_count as bc
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error




In [62]:
importlib.reload(bc)

<module 'bike_count' from 'c:\\Users\\diane\\bike_count_ldmh\\bike_count.py'>

In [46]:
X, y = bc.get_model_data()
X_train, X_val, y_train, y_val = bc.train_test_temporal(X, y)

date = bc.date_encoder
table = bc.table_vectorizer
merg = bc.merge
X_train = merg.fit_transform(X_train)
X_train = date.fit_transform(X_train)
X_train = table.fit_transform(X_train)

X_val = merg.fit_transform(X_val)
X_val = date.fit_transform(X_val)
X_val = table.fit_transform(X_val)


In [52]:
def objective(trial):
    param = {
        'objective': 'reg:squarederror',  # ou 'binary:logistic' pour un problème de classification binaire
        'eval_metric': 'rmse',  # Pour un problème de régression, ou 'logloss' pour classification
        'max_depth': trial.suggest_int('max_depth', 8, 15),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'alpha': trial.suggest_float('alpha', 1e-4, 1e-1, log=True),  # L2 regularization term
        'lambda': trial.suggest_float('lambda', 1e-4, 1e-1, log=True)  # L1 regularization term
    }
    
    model = xgb.XGBRegressor(**param)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    y_pred = model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)  # ou accuracy_score pour classification

    return rmse

In [53]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

print("Meilleurs paramètres : ", study.best_params)
print("Meilleure métrique (RMSE) : ", study.best_value)

[I 2024-12-11 14:40:12,456] A new study created in memory with name: no-name-6ed17aa9-ebca-42c7-a080-a01a61f77dec
[I 2024-12-11 14:40:26,283] Trial 0 finished with value: 0.5073129696342121 and parameters: {'max_depth': 13, 'min_child_weight': 8, 'subsample': 0.5814628087384928, 'colsample_bytree': 0.5163024939871281, 'alpha': 0.03820672568871395, 'lambda': 0.0008184053339948642}. Best is trial 0 with value: 0.5073129696342121.
[I 2024-12-11 14:40:36,805] Trial 1 finished with value: 0.4701998540573632 and parameters: {'max_depth': 10, 'min_child_weight': 3, 'subsample': 0.9661587312086384, 'colsample_bytree': 0.5577834022158903, 'alpha': 0.00011450047923153245, 'lambda': 0.015537963341439335}. Best is trial 1 with value: 0.4701998540573632.
[I 2024-12-11 14:40:46,458] Trial 2 finished with value: 0.4430363427378354 and parameters: {'max_depth': 10, 'min_child_weight': 6, 'subsample': 0.8444312146122992, 'colsample_bytree': 0.7599124475202146, 'alpha': 0.009405790076855412, 'lambda': 0

Meilleurs paramètres :  {'max_depth': 10, 'min_child_weight': 7, 'subsample': 0.8972852751497171, 'colsample_bytree': 0.7366839097750602, 'alpha': 0.002644395912568715, 'lambda': 0.00025636265208962237}
Meilleure métrique (RMSE) :  0.4221676694191395


In [63]:
X, y = bc.get_model_data()
pipe = bc.xgb_vectorized_no_date_encoding()

pipe.fit(X, y)

In [64]:
test_data = pd.read_parquet("data/final_test.parquet")
test_pred = pipe.predict(test_data)

test_df = pd.DataFrame({"Id": range(len(test_pred)), "log_bike_count": test_pred})
test_df.to_csv("opti_predictions.csv", index=False)

In [5]:
study = optuna.create_study(direction="minimize")
study.optimize(bc.objective, n_trials=50)

print("Meilleurs paramètres : ", study.best_params)
print("Meilleure métrique (RMSE) : ", study.best_value)


[I 2024-12-11 12:16:49,541] A new study created in memory with name: no-name-e10a81f1-b967-433e-852b-f4b242f23465
[I 2024-12-11 12:18:19,616] Trial 0 finished with value: 0.49843714390187704 and parameters: {'learning_rate': 0.1796755184532604, 'max_depth': 8, 'n_estimators': 111}. Best is trial 0 with value: 0.49843714390187704.
[I 2024-12-11 12:19:30,324] Trial 1 finished with value: 0.690716987124988 and parameters: {'learning_rate': 0.01835105263985338, 'max_depth': 6, 'n_estimators': 122}. Best is trial 0 with value: 0.49843714390187704.
[I 2024-12-11 12:20:45,988] Trial 2 finished with value: 0.44309938739833055 and parameters: {'learning_rate': 0.05830528906289511, 'max_depth': 14, 'n_estimators': 63}. Best is trial 2 with value: 0.44309938739833055.
[I 2024-12-11 12:22:52,973] Trial 3 finished with value: 0.5182766825295525 and parameters: {'learning_rate': 0.1718320626492706, 'max_depth': 4, 'n_estimators': 148}. Best is trial 2 with value: 0.44309938739833055.
[I 2024-12-11 1

Meilleurs paramètres :  {'learning_rate': 0.16878974156327872, 'max_depth': 11, 'n_estimators': 139}
Meilleure métrique (RMSE) :  0.42235674625423486


### Predictions

In [24]:
importlib.reload(bc)

<module 'bike_count' from 'c:\\Users\\diane\\bike_count_ldmh\\bike_count.py'>

In [25]:
X, y = bc.get_model_data()

pipe = bc.xgb_vectorized_no_date_encoding()
pipe.fit(X, y)

test_data = pd.read_parquet("data/final_test.parquet")
test_pred = pipe.predict(test_data)

test_df = pd.DataFrame({"Id": range(len(test_pred)), "log_bike_count": test_pred})
test_df.to_csv("optimized_predictions.csv", index=False)


KeyboardInterrupt: 