Questo notebook ha il solo scopo di illustrare i passi di pre-processing che vengono applicati nel notebook successivo.

## Requirements

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler,RobustScaler

## Importing the Cleaned Dataset

In [2]:
# Mount Google Drive at /content/drive so the dataset file stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the cleaned dataset CSV from the mounted Drive folder into a DataFrame.
data = pd.read_csv('/content/drive/MyDrive/Datasets/cleaned_data.csv')

In [None]:
# Preview the first rows of the loaded dataset.
data.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,hour,day_of_week,month,year,is_weekend,lag_1,lag_24,lag_168,traffic_volume
0,none,6.2,0.0,0.0,90.0,clouds,overcast clouds,9.0,1.0,10.0,2012.0,False,5765.0,4832.0,5545.0,5599.0
1,none,8.22,0.0,0.0,90.0,clouds,overcast clouds,10.0,1.0,10.0,2012.0,False,5599.0,4395.0,4516.0,4534.0
2,none,9.96,0.0,0.0,90.0,clouds,overcast clouds,11.0,1.0,10.0,2012.0,False,4534.0,4411.0,4767.0,4691.0
3,none,10.93,0.0,0.0,75.0,clouds,broken clouds,12.0,1.0,10.0,2012.0,False,4691.0,4648.0,5026.0,5067.0
4,none,11.99,0.0,0.0,75.0,clouds,broken clouds,13.0,1.0,10.0,2012.0,False,5067.0,4602.0,4918.0,5030.0


In [None]:
# Preview the last rows of the loaded dataset.
data.tail()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,hour,day_of_week,month,year,is_weekend,lag_1,lag_24,lag_168,traffic_volume
34723,,10.3,0.0,0.0,75.0,Clouds,broken clouds,19.0,6.0,9.0,2018.0,True,3947.0,3818.0,3516.0,3543.0
34724,,9.61,0.0,0.0,90.0,Clouds,overcast clouds,20.0,6.0,9.0,2018.0,True,3543.0,3338.0,2846.0,2781.0
34725,,9.58,0.0,0.0,90.0,Thunderstorm,proximity thunderstorm,21.0,6.0,9.0,2018.0,True,2781.0,2950.0,2346.0,2159.0
34726,,8.93,0.0,0.0,90.0,Clouds,overcast clouds,22.0,6.0,9.0,2018.0,True,2159.0,2607.0,1635.0,1450.0
34727,,8.97,0.0,0.0,90.0,Clouds,overcast clouds,23.0,6.0,9.0,2018.0,True,1450.0,3856.0,934.0,954.0


# Pre Processing per Linear Regressor


Per i modelli di Regressione Lineare, abbiamo dovuto applicare due trasformazioni essenziali al dataset per garantire la validità statistica e le prestazioni:

1. One-Hot Encoding (OHE): Usato per le variabili nominali per evitare di imporre relazioni ordinali false (es. Gennaio < Marzo), trattando ogni categoria come una feature binaria indipendente.

2. Standardizzazione (Z-score Scaling): Applicata a tutte le features numeriche continue (lag_features e altre) per garantire che i coefficienti del modello non siano distorti dalle diverse scale delle variabili.

## Conversione degli attributi categorici

Per un modello di regressione lineare, le variabili categoriche devono essere trasformate in numeriche: l’One-Hot Encoding crea una variabile binaria per ciascuna categoria, evitando di introdurre ordini fittizi; scartando inoltre la prima categoria di ogni variabile (`drop='first'`) si evita la multicollinearità perfetta tra le colonne generate (la cosiddetta *dummy variable trap*). La standardizzazione delle variabili numeriche rende tutte le feature comparabili e stabilizza il calcolo dei coefficienti, migliorando interpretabilità e stabilità del modello. Insieme, OHE e standardizzazione permettono alla regressione lineare di gestire correttamente dati misti (numerici e categorici).

In [4]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Work on an independent copy so the loaded `data` frame is left untouched.
df_linreg = data.copy()

# Nominal columns that get replaced by one-hot indicator columns.
nominal_cols_ohe = [
    'hour', 'month', 'day_of_week', 'holiday',
    'weather_main', 'weather_description', 'is_weekend',
]

# Learn the categories and encode in a single pass. The first category of each
# column is dropped and the result is returned as a dense array.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
encoded_array = encoder.fit_transform(df_linreg[nominal_cols_ohe])

# Wrap the encoded array in a DataFrame that shares the source index and carries
# the encoder-generated column names.
encoded_df = pd.DataFrame(
    encoded_array,
    index=df_linreg.index,
    columns=encoder.get_feature_names_out(nominal_cols_ohe),
)

# Remaining (non-nominal) columns first, followed by the indicator columns.
df_linreg_encoded = pd.concat(
    [df_linreg.drop(columns=nominal_cols_ohe), encoded_df],
    axis=1,
)

print(f"Dimensioni dopo OHE: Righe: {df_linreg_encoded.shape[0]}, Colonne: {df_linreg_encoded.shape[1]}")

print(df_linreg_encoded.head())


Dimensioni dopo OHE: Righe: 34728, Colonne: 103
    temp  rain_1h  snow_1h  clouds_all    year   lag_1  lag_24  lag_168  \
0   6.20      0.0      0.0        90.0  2012.0  5765.0  4832.0   5545.0   
1   8.22      0.0      0.0        90.0  2012.0  5599.0  4395.0   4516.0   
2   9.96      0.0      0.0        90.0  2012.0  4534.0  4411.0   4767.0   
3  10.93      0.0      0.0        75.0  2012.0  4691.0  4648.0   5026.0   
4  11.99      0.0      0.0        75.0  2012.0  5067.0  4602.0   4918.0   

   traffic_volume  hour_1.0  ...  weather_description_smoke  \
0          5599.0       0.0  ...                        0.0   
1          4534.0       0.0  ...                        0.0   
2          4691.0       0.0  ...                        0.0   
3          5067.0       0.0  ...                        0.0   
4          5030.0       0.0  ...                        0.0   

   weather_description_snow  weather_description_squalls  \
0                       0.0                          0.0   
1 

## Normalizzazione Z-Score

In [6]:
# Z-score standardization of the continuous features of the linear-regression frame.
scaler = StandardScaler()
target_col = 'traffic_volume'

numeric_features_to_scale = ['temp', 'rain_1h','clouds_all' ,'snow_1h', 'year', 'lag_1', 'lag_24', 'lag_168']

# Fit on the untouched raw columns of `df_linreg` instead of on
# `df_linreg_encoded` itself. The scaled values overwrite the encoded frame, so
# fitting on that same frame would, on a second execution of this cell, refit
# the scaler on already-standardized data and silently replace its learned
# mean/scale with ~0/~1. Reading from the raw copy keeps the cell idempotent.
# NOTE(review): the scaler is fit on the whole dataset; if a train/test split
# happens downstream, fit it on the training portion only to avoid leakage.
df_linreg_encoded[numeric_features_to_scale] = scaler.fit_transform(
    df_linreg[numeric_features_to_scale]
)
print(f"Features numeriche scalate: {numeric_features_to_scale}")
print(f"Esempio Standardizzazione (colonna '{numeric_features_to_scale[0]}'): Media={df_linreg_encoded[numeric_features_to_scale[0]].mean():.2f}, Dev. Std={df_linreg_encoded[numeric_features_to_scale[0]].std():.2f}")

# Move the target column to the last position of the frame.
cols = [col for col in df_linreg_encoded.columns if col != target_col] + [target_col]
df_linreg_encoded = df_linreg_encoded[cols]

print(df_linreg_encoded)

Features numeriche scalate: ['temp', 'rain_1h', 'clouds_all', 'snow_1h', 'year', 'lag_1', 'lag_24', 'lag_168']
Esempio Standardizzazione (colonna 'temp'): Media=-0.00, Dev. Std=1.00
           temp   rain_1h  snow_1h  clouds_all      year     lag_1    lag_24  \
0     -0.183945 -0.096972 -0.01356    1.201313 -1.931739  1.264609  0.789552   
1     -0.030680 -0.096972 -0.01356    1.201313 -1.931739  1.180522  0.568989   
2      0.101340 -0.096972 -0.01356    1.201313 -1.931739  0.641045  0.577065   
3      0.174938 -0.096972 -0.01356    0.812231 -1.931739  0.720574  0.696684   
4      0.255364 -0.096972 -0.01356    0.812231 -1.931739  0.911037  0.673467   
...         ...       ...      ...         ...       ...       ...       ...   
34723  0.127138 -0.096972 -0.01356    0.812231  1.230157  0.343700  0.277766   
34724  0.074785 -0.096972 -0.01356    1.201313  1.230157  0.139054  0.035500   
34725  0.072508 -0.096972 -0.01356    1.201313  1.230157 -0.246938 -0.160332   
34726  0.023190 -0

# Pre Processing per modelli ad albero

Per i modelli basati su alberi, abbiamo utilizzato l'Ordinal Encoder per le variabili nominali (hour, month, ecc.) e la Z-Score Standardization per quelle numeriche. Questa scelta è dettata da:

- Efficienza: L'Ordinal Encoder è più efficiente in termini di memoria e velocità di calcolo rispetto all'OHE.

Utilizzo dell'ordinal encoder. Assegna un valore numerico a ciascuna categoria. Mostriamo prima i valori dei diversi attributi nominali

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

# Independent copy of the loaded dataset for the tree-model preprocessing.
df_tree = data.copy()

# Column groups: continuous inputs and nominal inputs.
numeric_features = ['temp', 'rain_1h', 'snow_1h', 'clouds_all', 'lag_1', 'lag_24', 'lag_168']
categorical_features = [
    'hour', 'month', 'day_of_week', 'holiday',
    'weather_main', 'weather_description', 'is_weekend',
]

# Target vector (Y) and feature matrix (X = everything except the target).
Y = df_tree['traffic_volume']
X = df_tree.drop(columns='traffic_volume')

print("--- DataFrame Iniziale (X) ---")
print(X)
print(40 * "-")


# -----------------------------------------------------------------
# STEP 1: ordinal encoding of the categorical columns
# -----------------------------------------------------------------

# Categories not seen during fit are mapped to -1 at transform time.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Learn the categories of each column and convert them to numeric codes.
X_encoded_array = encoder.fit_transform(X[categorical_features])

# Temporary frame holding the codes, aligned with X on index and column names.
X_encoded_df = pd.DataFrame(
    X_encoded_array,
    index=X.index,
    columns=categorical_features,
)

# Copy of X in which each categorical column is replaced by its encoded version.
X_step1 = X.assign(**{name: X_encoded_df[name] for name in categorical_features})

print("--- Risultato Dopo Ordinal Encoding (Step 1) ---")
print("Le colonne categoriche sono ora numeri (es. 0.0, 1.0, 2.0)")
print(X_step1)
print(40 * "-")




--- DataFrame Iniziale (X) ---
      holiday   temp  rain_1h  snow_1h  clouds_all  weather_main  \
0        none   6.20      0.0      0.0        90.0        clouds   
1        none   8.22      0.0      0.0        90.0        clouds   
2        none   9.96      0.0      0.0        90.0        clouds   
3        none  10.93      0.0      0.0        75.0        clouds   
4        none  11.99      0.0      0.0        75.0        clouds   
...       ...    ...      ...      ...         ...           ...   
34723    none  10.30      0.0      0.0        75.0        clouds   
34724    none   9.61      0.0      0.0        90.0        clouds   
34725    none   9.58      0.0      0.0        90.0  thunderstorm   
34726    none   8.93      0.0      0.0        90.0        clouds   
34727    none   8.97      0.0      0.0        90.0        clouds   

          weather_description  hour  day_of_week  month    year  is_weekend  \
0             overcast clouds   9.0          1.0   10.0  2012.0       Fal

In [12]:
# Preview the first rows of the feature frame after ordinal encoding.
X_step1.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,hour,day_of_week,month,year,is_weekend,lag_1,lag_24,lag_168
0,7.0,6.2,0.0,0.0,90.0,1.0,16.0,9.0,1.0,9.0,2012.0,0.0,5765.0,4832.0,5545.0
1,7.0,8.22,0.0,0.0,90.0,1.0,16.0,10.0,1.0,9.0,2012.0,0.0,5599.0,4395.0,4516.0
2,7.0,9.96,0.0,0.0,90.0,1.0,16.0,11.0,1.0,9.0,2012.0,0.0,4534.0,4411.0,4767.0
3,7.0,10.93,0.0,0.0,75.0,1.0,0.0,12.0,1.0,9.0,2012.0,0.0,4691.0,4648.0,5026.0
4,7.0,11.99,0.0,0.0,75.0,1.0,0.0,13.0,1.0,9.0,2012.0,0.0,5067.0,4602.0,4918.0
