In [None]:
# Imports

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

In [2]:
# Load the raw traffic-volume CSV into `df`; every later cell works from this frame.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/raw/traffic_volume_2021_plus.csv")

In [3]:
# Feature Engineering
#
# Builds every engineered column in one chained expression and rebinds `df`
# to the result, instead of mutating the loaded frame with `inplace=True`.
# Re-running this cell is safe: `rename` ignores keys that are no longer
# present, and each assigned column is recomputed from the same inputs.
df = (
    df
    # pd.to_datetime assembles a timestamp from columns named
    # year/month/day/hour/minute, so map the raw abbreviations onto them.
    .rename(columns={'yr': 'year', 'm': 'month', 'd': 'day', 'hh': 'hour', 'mm': 'minute'})
    # `assign` evaluates keyword arguments in order, so later entries may
    # read columns created by earlier ones.
    .assign(
        timestamp=lambda d: pd.to_datetime(d[['year', 'month', 'day', 'hour', 'minute']]),
        # Derived time features
        hour=lambda d: d['timestamp'].dt.hour,
        dayofweek=lambda d: d['timestamp'].dt.dayofweek,  # Monday=0 ... Sunday=6
        is_weekend=lambda d: d['dayofweek'].isin([5, 6]).astype(int),  # Sat/Sun
        # Hours 7-9 and 16-18; the flag is set on every day of the week,
        # weekends included.
        is_rush_hour=lambda d: d['hour'].isin([7, 8, 9, 16, 17, 18]).astype(int),
        # Log-transform target variable: log(1 + x), inverted later with expm1.
        log_vol=lambda d: np.log1p(d['vol']),
    )
)

In [4]:
# Define Features

# Column groups: `categorical` columns are one-hot encoded below; `numerical`
# columns reach the model unchanged.
categorical = ['boro', 'direction', 'segmentid']
numerical = ['hour', 'minute', 'dayofweek', 'is_weekend', 'is_rush_hour']

# Feature matrix (numerical columns first, then categorical) and the
# log-scaled target.
X = df[[*numerical, *categorical]]
y = df['log_vol']

# Preprocessing pipeline: one-hot encode the categorical columns and forward
# everything else in X (the numerical columns) as-is. With
# handle_unknown='ignore', a category not seen during fit is encoded as
# all zeros instead of raising.
preprocessor = ColumnTransformer(
    [('cat', OneHotEncoder(handle_unknown='ignore'), categorical)],
    remainder='passthrough',
)

# Train-test split: 80% train / 20% test, with a fixed seed for repeatability.
# NOTE(review): this is a random shuffle of rows, not a split by time --
# confirm that is intended for time-indexed traffic counts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:

# Model pipeline: the column preprocessor feeding a random-forest regressor.
model = make_pipeline(
    preprocessor,
    RandomForestRegressor(
        n_estimators=100,
        max_depth=15,
        random_state=42,  # fixed seed so the fitted forest is repeatable
        n_jobs=-1,        # use all available cores
    ),
)

# Train & Predict: fit on the training split, then predict the held-out split.
# Predictions are on the log1p scale because the model is trained on `log_vol`.
y_pred_log = model.fit(X_train, y_train).predict(X_test)


In [6]:
# Reverse log-transform: expm1 undoes the log1p applied to the target, so both
# the held-out truth and the predictions are back on the original volume scale.
y_true, y_pred = np.expm1(y_test), np.expm1(y_pred_log)

# Evaluation: mean absolute error on the original (un-logged) scale.
mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
print(f"Improved MAE: {mae:.2f}")

Improved MAE: 65.86
