In [82]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Loaded the engineered features from last file 
df = pd.read_csv('engineered_features.csv', index_col='timestamp', parse_dates=True)

# Prepare features and target
features = ['hour', 'day_of_week', 'is_weekend', 'rolling_mean_24h', 'lag_24h']

X = df[features]

y = df['average']


In [83]:
# Time series cross-validation
tscv = TimeSeriesSplit(n_splits=5)

In [84]:
# Initialize Ridge regression model with regularization 
# This was implemented due to my model showing overfitting results

model = Ridge(alpha=1.0)


In [85]:
#Train and evaluate model

# Initialize empty lists to store performance metrics for each fold
mae_scores = []
rmse_scores = []
r2_scores = []


# Iterate through the time series cross-validation splits

for train_index, test_index in tscv.split(X):

    # Split the data into training and testing sets for this fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test)

    # Calculate and store the Mean Absolute Error for this fold
    mae_scores.append(mean_absolute_error(y_test, y_pred))

    # Calculate and store the Root Mean Squared Error for this fold
    rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))

    # Calculate and store the R-squared score for this fold
    r2_scores.append(r2_score(y_test, y_pred))

In [86]:
# Print cross-validation results
print("Cross-validation results:")
print(f"MAE: {np.mean(mae_scores)} (+/- {np.std(mae_scores) * 2})")
print(f"RMSE: {np.mean(rmse_scores)} (+/- {np.std(rmse_scores) * 2})")
print(f"R2: {np.mean(r2_scores)} (+/- {np.std(r2_scores) * 2})")

Cross-validation results:
MAE: 0.46690541935135793 (+/- 0.09606575611546647)
RMSE: 0.5457191845473653 (+/- 0.1009501194415682)
R2: 0.7043633939940721 (+/- 0.11178730733353273)


In [87]:
# Train final model on all data
model.fit(X, y)


In [88]:
# Save the model
joblib.dump(model, 'traffic_flow_model.joblib')

['traffic_flow_model.joblib']

In [89]:
print("Model training completed. Model saved as 'traffic_flow_model.joblib'")


Model training completed. Model saved as 'traffic_flow_model.joblib'


In [90]:
# Feature importance

feature_importance = pd.DataFrame({'feature': features, 'importance': np.abs(model.coef_)})
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("\nFeature Importance - ")

print(feature_importance)


Feature Importance - 
            feature  importance
3  rolling_mean_24h    0.987929
4           lag_24h    0.693397
0              hour    0.010394
1       day_of_week    0.000000
2        is_weekend    0.000000
