## **Modello Decision Tree**

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

## Importazione dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
data = pd.read_csv('/content/drive/MyDrive/Datasets/cleaned_data.csv')


In [4]:
data.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,hour,day_of_week,month,year,is_weekend,lag_1,lag_24,lag_168,traffic_volume
0,none,6.2,0.0,0.0,90.0,clouds,overcast clouds,9.0,1.0,10.0,2012.0,False,5765.0,4832.0,5545.0,5599.0
1,none,8.22,0.0,0.0,90.0,clouds,overcast clouds,10.0,1.0,10.0,2012.0,False,5599.0,4395.0,4516.0,4534.0
2,none,9.96,0.0,0.0,90.0,clouds,overcast clouds,11.0,1.0,10.0,2012.0,False,4534.0,4411.0,4767.0,4691.0
3,none,10.93,0.0,0.0,75.0,clouds,broken clouds,12.0,1.0,10.0,2012.0,False,4691.0,4648.0,5026.0,5067.0
4,none,11.99,0.0,0.0,75.0,clouds,broken clouds,13.0,1.0,10.0,2012.0,False,5067.0,4602.0,4918.0,5030.0


## **Train Test Split**

La dimensione del test set è fissata ad un anno e rappresenta il periodo 01/10/2017-30/09/2018

In [5]:
X = data.drop('traffic_volume',axis=1)
Y = data['traffic_volume']

from sklearn.model_selection import train_test_split

# Numero di righe per train - In modo da isolare l'ultimo anno intero (01/10/2017-30/09/2018) come test set
n_train = 26064

# Train set
xtrain = X.iloc[:n_train]   # se X è DataFrame
ytrain = Y.iloc[:n_train]   # se Y è Series/DataFrame

# Test set
xtest = X.iloc[n_train:]    # dalla riga n_train fino alla fine
ytest = Y.iloc[n_train:]

#xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.25,random_state=0, shuffle=False)

print(xtrain.shape, ytrain.shape)
print(xtest.shape, ytest.shape)



(26064, 15) (26064,)
(8664, 15) (8664,)


## **Modello - Decision Tree**

### Preprocessing

In [6]:
# Importazioni di Scikit-learn
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler


# --- 1. Definizione delle Feature ---

# Queste colonne passeranno inalterate (sono già numeriche)
numeric_features = [
    'temp', 'rain_1h', 'snow_1h', 'clouds_all', 'lag_1', 'lag_24', 'lag_168',
]
# Queste colonne saranno codificate in numeri
categorical_features = [
    'hour', 'month', 'day_of_week', 'holiday', 'weather_main','weather_description', 'is_weekend'
]


# --- 2. Definizione del Preprocessore (ColumnTransformer) ---

# OrdinalEncoder sostituisce il LabelEncoder, ma è molto più flessibile e sicuro.
ordinal_encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1                   # Assegna il valore -1 alle categorie sconosciute (es. 'none')
)

# StandardScaler per la standardizzazione delle feature numeriche (media 0, dev. standard 1)
standard_scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        # Applica l'OrdinalEncoder alle feature categoriche
        ('ord_enc', ordinal_encoder, categorical_features),
        ('scaler', standard_scaler, numeric_features)
    ],
    # 'passthrough' mantiene tutte le altre colonne (le numeriche) inalterate
    remainder='drop'
)

Addestramento del modello senza alcun iperparametro e verifica delle metriche


In [7]:
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
import numpy as np



# ---  Creazione pipeline con il modello ---
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor())
])

# ---  Addestramento sul training set ---
pipe.fit(xtrain, ytrain)

# ---  Predizione sul test set ---
pred = pipe.predict(xtest)


### **Metriche di valutazione**

In [8]:
import sklearn.metrics as metrics
print('R Squared : ',metrics.r2_score(ytest,pred))
print('Mean Absolute Error : ',metrics.mean_absolute_error(ytest,pred))
print('Mean Squared Error : ',metrics.mean_squared_error(ytest,pred))
print('Root Mean Squared Error : ',np.sqrt(metrics.mean_squared_error(ytest,pred)))

R Squared :  0.9666359992118891
Mean Absolute Error :  223.78404893813482
Mean Squared Error :  129551.49734533703
Root Mean Squared Error :  359.9326288978773


Il modello Decision Tree senza alcun iperparametro fornisce un punteggio R² del 68%.  
Proviamo ora a trovare il miglior iperparametro per il modello utilizzando la Grid Search.


### **Grid Search per la ricerca degli Iperparametri**

In [9]:
from sklearn.model_selection import GridSearchCV

parameter = {
    'model__max_depth': np.arange(2, 15),
    'model__min_samples_leaf': [1, 5, 10, 20]
}


from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

GS = GridSearchCV(pipe, parameter, cv=tscv, scoring='neg_mean_absolute_error')
GS.fit(X, Y)


In [10]:
GS.best_params_

{'model__max_depth': np.int64(13), 'model__min_samples_leaf': 10}

Dalla Grid Search abbiamo ottenuto che la **massima profondità ottimale** è 9.  
Settiamo l'iperparametro e verifichiamo l’accuratezza del modello.


In [None]:
pipe.set_params(model__max_depth=13, model__min_samples_leaf=10)

pipe.fit(xtrain, ytrain)

pred = pipe.predict(xtest)

In [None]:
print('R Squared : ',metrics.r2_score(ytest,pred))
print('Mean Absolute Error : ',metrics.mean_absolute_error(ytest,pred))
print('Mean Squared Error : ',metrics.mean_squared_error(ytest,pred))
print('Root Mean Squared Error : ',np.sqrt(metrics.mean_squared_error(ytest,pred)))
print('Bias Error')
print('Actual value :',np.mean(ytest))
print('Predicted value :',np.mean(pred))
print('Variance Error')
print('Actual value :',np.var(ytest,ddof=1))
print('Predicted value :',np.var(pred,ddof=1))

R Squared :  0.977614534567297
Mean Absolute Error :  187.10605078399567
Mean Squared Error :  86922.1465374257
Root Mean Squared Error :  294.82562055802697
Bias Error
Actual value : 3337.3080563250232
Predicted value : 3341.351133878535
Variance Error
Actual value : 3883420.718780745
Predicted value : 3837867.10808853


### **Conclusioni**

L’R² ≈ 0.977 indica che il modello spiega gran parte della variabilità del traffico e inoltre il MAE ≈ 187 e l’RMSE ≈ 294 risultano piuttosto positivi.

Il bias è contenuto (media predetta molto vicina a quella reale), mentre la varianza predetta leggermente inferiore a quella osservata suggerisce una tendenza a smussare i picchi, tipica degli alberi non profondi.

In sintesi, il Decision Tree è interpretabile e stabile, ma meno accurato sui valori estremi rispetto ad alternative più performanti.

### Esportazione Pipeline

In [14]:
import pickle

# Salva pipeline completa
with open("/content/drive/MyDrive/Models/dt_pipeline.pkl", "wb") as f:
    pickle.dump(pipe, f)

print("Pipeline DecisionTreeRegressor salvata correttamente!")



Pipeline DecisionTreeRegressor salvata correttamente!


## **Modello- Random Forest**


Addestramento del modello senza alcun iperparametro e verifica delle metriche.

In [None]:
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
import numpy as np



# --- 4. Creazione pipeline con il modello ---
pipe = Pipeline([
    ('preprocessor', preprocessor), # è lo stesso del Decision Tree
    ('model', RandomForestRegressor())
])

# --- 5. Addestramento sul training set ---
pipe.fit(xtrain, ytrain)

# --- 6. Predizione sul test set ---
pred = pipe.predict(xtest)


### **Metriche di valutazione**

In [None]:
print('R Squared : ',metrics.r2_score(ytest,pred))
print('Mean Absolute Error : ',metrics.mean_absolute_error(ytest,pred))
print('Mean Squared Error : ',metrics.mean_squared_error(ytest,pred))
print('Root Mean Squared Error : ',np.sqrt(metrics.mean_squared_error(ytest,pred)))

R Squared :  0.9833587738214484
Mean Absolute Error :  161.36362534626042
Mean Squared Error :  64617.42351540858
Root Mean Squared Error :  254.19957418416064


Il modello **Random Forest** prima del tuning degli iperparametri fornisce un punteggio R² del 98%, il MAE è 156 e il RMSE è 246.  
Proviamo ora a trovare i migliori iperparametri per il modello utilizzando la **Grid Search**.


### **Grid Search per la ricerca degli Iperparametri**

In [None]:
from sklearn.model_selection import GridSearchCV
parameter = {'model__max_depth': [50, 100, 200], 'model__n_estimators': [None, 5, 10, 20]}

from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

GS = GridSearchCV(pipe, parameter, cv=tscv, scoring='neg_mean_absolute_error', n_jobs=-1) # Minimizziamo MAE
GS.fit(X, Y)

15 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1382, in 

In [None]:
GS.best_params_

Dalla Grid Search abbiamo ottenuto che la **massima profondità ottimale** è 200  e il numero di **stimatori (estimators)** è 20.  
Implementiamoli e verifichiamo le metriche del modello.


In [None]:
pipe.set_params(model__max_depth=200, model__n_estimators=20)

pipe.fit(xtrain, ytrain)

pred = pipe.predict(xtest)

In [None]:
print('R Squared : ',metrics.r2_score(ytest,pred))
print('Mean Absolute Error : ',metrics.mean_absolute_error(ytest,pred))
print('Mean Squared Error : ',metrics.mean_squared_error(ytest,pred))
print('Root Mean Squared Error : ',np.sqrt(metrics.mean_squared_error(ytest,pred)))
print('Bias Error')
print('Actual value :',np.mean(ytest))
print('Predicted value :',np.mean(pred))
print('Variance Error')
print('Actual value :',np.var(ytest,ddof=1))
print('Predicted value :',np.var(pred,ddof=1))

R Squared :  0.984243655230206
Mean Absolute Error :  156.30506001846723
Mean Squared Error :  61181.45334487536
Root Mean Squared Error :  247.34884949171555
Bias Error
Actual value : 3337.3080563250232
Predicted value : 3344.830011542013
Variance Error
Actual value : 3883420.718780745
Predicted value : 3816495.8444908694


## Addestramento su tutto il dataset

In [12]:
pipe.fit(X, Y)

### Esportazione Pipeline

In [13]:
import pickle

# Salva pipeline completa
with open("/content/drive/MyDrive/Models/rf_pipeline.pkl", "wb") as f:
    pickle.dump(pipe, f)

print("Pipeline DecisionTreeRegressor salvata correttamente!")


Pipeline DecisionTreeRegressor salvata correttamente!


### Conclusioni

Il **Random Forest** ha fornito un punteggio R² migliore rispetto al Decision Tree.  
Tuttavia, sia l'errore di bias che l'errore di varianza sono leggermente aumentati.  

DecisionTree
- R Squared :  0.977614534567297
- Mean Absolute Error :  187.10605078399567
- Mean Squared Error :  86922.1465374257
- Root Mean Squared Error :  294.82562055802697
- Bias Error
- Actual value : 3337.3080563250232
- Predicted value : 3341.351133878535
- Variance Error
- Actual value : 3883420.718780745
- Predicted value : 3837867.10808853

RandomForest
- R Squared :  0.984243655230206
- Mean Absolute Error :  156.30506001846723
- Mean Squared Error :  61181.45334487536
- Root Mean Squared Error :  247.34884949171555
- Bias Error
- Actual value : 3337.3080563250232
- Predicted value : 3344.830011542013
- Variance Error
- Actual value : 3883420.718780745
- Predicted value : 3816495.8444908694

## Modello Baseline "ora precedente"

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# === 1. Preparazione dataset ===
df = pd.read_csv('/content/drive/MyDrive/Datasets/cleaned_data_with_datetime.csv')
df['date_time'] = pd.to_datetime(df['date_time'])
df = df.set_index('date_time')

# Introduzione dei gap
gap_hours = df.index.to_series().diff().dt.total_seconds() / 3600
df.loc[gap_hours > 1, 'traffic_volume'] = np.nan

# === 2. Divisione train/test percentuale ===
X = df.drop(columns=['traffic_volume'])
Y = df['traffic_volume']

xtrain, xtest, ytrain, ytest = train_test_split(
    X, Y, test_size=0.25, shuffle=False
)

# === 3. Baseline "ora precedente" ===
y_pred = ytest.shift(1)
time_diff = ytest.index.to_series().diff().dt.total_seconds() / 3600
mask = (time_diff == 1) & y_pred.notna()  # esclude righe con gap

y_true_aligned = ytest[mask]
y_pred_aligned = y_pred[mask]

# === 4. Metriche ===
mae_baseline = mean_absolute_error(y_true_aligned, y_pred_aligned)
rmse_baseline = np.sqrt(mean_squared_error(y_true_aligned, y_pred_aligned))

print(f"MAE baseline ora precedente: {mae_baseline}")
print(f"RMSE baseline ora precedente: {rmse_baseline}")


MAE baseline ora precedente: 590.8036816963765
RMSE baseline ora precedente: 817.8006040791096
