# Univariate Random Forest Regressor Flow Predictions

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.api import VAR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.multioutput import MultiOutputRegressor

## Select Features and Train Model

In [3]:
# Read the preprocessed training table (semicolon-separated) and keep only
# the rows that have no missing value in any column.
# Optional experiments kept from the original notebook:
#   df = df[df['PORTAL'] == 'E4S 58,140']        # restrict to one portal
#   df = df.sample(frac=0.05, random_state=42)   # train on a 5 % sample
df = pd.read_csv("/content/preprocessingV5.csv", sep=";").dropna()

# Index the table by timestamp: the value of 'Date' plus 'Interval_1'
# interpreted as a number of minutes.
timestamps = pd.to_datetime(df['Date']) + pd.to_timedelta(df['Interval_1'], unit='m')
df = df.set_index(timestamps.rename('Datetime'))

# Input columns handed to the regressor.
features = [
    'FLOW_SUM',
    'SPEED_WEIGHTED_AVG',
    'FLOW_PREV_5_SUM',
    'FLOW_PREV_15_SUM',
    'FLOW_PREV_60_SUM',
    'SPEED_PREV_5_AVG',
    'SPEED_PREV_15_AVG',
    'SPEED_PREV_60_AVG',
    'IS_WEEKDAY',
    'Interval_30',
]

# Working frame = inputs followed by both look-ahead columns, again without
# any incomplete row.
features_df = df[features + ['FLOW_NEXT_15_SUM', 'SPEED_NEXT_15_AVG']].dropna()

# Univariate setup: only the flow look-ahead column is used as the target.
# (Multi-output alternative: y = features_df[['FLOW_NEXT_15_SUM', 'SPEED_NEXT_15_AVG']])
X = features_df[features]
y = features_df['FLOW_NEXT_15_SUM']

# Shuffled 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Alternatives tried in the original notebook:
#   model = MultiOutputRegressor(RandomForestRegressor(n_estimators=300, random_state=42))
#   model = LinearRegression()
model = RandomForestRegressor(n_estimators=300, random_state=42)

model.fit(X_train, y_train)

In [None]:
# Persistence helpers from joblib.
from joblib import dump, load

# Uncomment to save the current `model` under this file name.
#dump(model, 'best_univar_RFR_flow.joblib')
# Rebind `model` to the estimator stored on disk; from here on the notebook
# uses the loaded object rather than the one fitted in the training cell.
# NOTE(review): joblib.load unpickles arbitrary Python objects — only load
# files that come from a trusted source.
model = load('best_univar_RFR_flow.joblib')

## Make predictions

In [5]:
# Predict the target for the held-out rows and wrap the result in a
# single-column frame that shares the index of `y_test`.
# (Multi-output alternative:
#  pd.DataFrame(predictions, columns=['FLOW_NEXT_15_SUM', 'SPEED_NEXT_15_AVG'], index=y_test.index))
predictions = model.predict(X_test)
predictions_df = pd.DataFrame(
    {'FLOW_NEXT_15_SUM': predictions},
    index=y_test.index,
)
predictions_df.head()

Unnamed: 0_level_0,FLOW_NEXT_15_SUM
Datetime,Unnamed: 1_level_1
2021-06-21 08:47:00,889.37
2021-06-10 06:30:00,902.14
2021-08-23 08:51:00,671.483333
2021-09-14 06:49:00,993.286667
2021-10-30 06:41:00,209.506667


# Evaluation Results

## Evaluation on Test Split

In [None]:
# Evaluate the model on the held-out test split.
#
# The target selected in the training cell is FLOW_NEXT_15_SUM, so every
# metric below is a *flow* metric and is named and labelled as such. (The
# previous version of this cell stored them in `*_speed` variables and printed
# them under a SPEED_NEXT_15_AVG heading; the old multi-output variant that
# was parked here inside a string literal has been removed.)
#
# Both sides are converted to plain NumPy arrays so that the comparison is
# strictly positional. Subtracting a DataFrame from a Series makes pandas
# match the Series index against the DataFrame *columns*, which yields an
# all-NaN result instead of the element-wise error.
y_true = np.asarray(y_test, dtype=float)
y_pred = predictions_df['FLOW_NEXT_15_SUM'].to_numpy(dtype=float)

rmse_flow = np.sqrt(mean_squared_error(y_true, y_pred))
mae_flow = mean_absolute_error(y_true, y_pred)
r2_flow = r2_score(y_true, y_pred)
explained_variance_flow = explained_variance_score(y_true, y_pred)

# MAPE is undefined where the actual value is 0, so it is computed only over
# rows with a non-zero actual value; it is NaN if there are no such rows.
nonzero = y_true != 0
if nonzero.any():
    mape_flow = np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100
else:
    mape_flow = float('nan')

print('FLOW_NEXT_15_SUM Evaluation Metrics:')
print(f'RMSE: {rmse_flow:.3f}')
print(f'MAE: {mae_flow:.3f}')
print(f'R²: {r2_flow:.3f}')
print(f'MAPE: {mape_flow:.3f}%')
print(f'Explained Variance: {explained_variance_flow:.3f}')

## Make Predictions on Full Final Evaluation Dataset

In [None]:
# Load the full final-evaluation table (semicolon-separated) and discard every
# row with a missing value in any column.
future_df = pd.read_csv("/content/evalpreproV3.csv", sep=';')
future_df.dropna(inplace=True)

# Optional: restrict the evaluation to a single portal.
#future_df = future_df[future_df['PORTAL'] == 'E4S 58,140']

# Same timestamp index as the training data: 'Date' plus 'Interval_1' minutes.
future_df['Datetime'] = pd.to_datetime(future_df['Date']) + pd.to_timedelta(future_df['Interval_1'], unit='m')
future_df.set_index('Datetime', inplace=True)

# Must list the same columns, in the same order, as the training cell.
features = [
    'FLOW_SUM',
    'SPEED_WEIGHTED_AVG',
    'FLOW_PREV_5_SUM',
    'FLOW_PREV_15_SUM',
    'FLOW_PREV_60_SUM',
    'SPEED_PREV_5_AVG',
    'SPEED_PREV_15_AVG',
    'SPEED_PREV_60_AVG',
    'IS_WEEKDAY',
    'Interval_30'
]

# `future_df` has no missing values left after the dropna above, so the
# feature frame needs no further row filtering. Not filtering it again also
# guarantees that features, predictions and targets stay row-for-row aligned.
X_future = future_df[features].copy()

future_predictions = model.predict(X_future)

# The model is univariate, so the predictions form a single FLOW_NEXT_15_SUM
# column. The frame is defined here because the evaluation cell that follows
# refers to `future_predictions_df`.
future_predictions_df = pd.DataFrame(
    future_predictions,
    columns=['FLOW_NEXT_15_SUM'],
    index=X_future.index,
)

# Actual target values, in the same row order as `X_future`.
# (Multi-output alternative: future_df[['FLOW_NEXT_15_SUM', 'SPEED_NEXT_15_AVG']])
y_future_actual = future_df['FLOW_NEXT_15_SUM']

In [None]:
# Sanity check: report the dimensions of the feature matrix and of the
# actual-target object so that mismatched row counts are easy to spot.
for _label, _obj in (("X_future", X_future), ("y_future_actual", y_future_actual)):
    print(f"{_label} shape: {_obj.shape}")

## Evaluation on Full Final Evaluation Dataset

In [None]:
# Evaluate the flow model on the full final-evaluation dataset.
#
# The model is univariate (target: FLOW_NEXT_15_SUM), so only flow metrics are
# reported. The SPEED_NEXT_15_AVG section that used to follow could never run
# because no speed predictions exist, and has been removed.
#
# `y_future_actual` is a Series in the full-evaluation cell and a DataFrame in
# the peak-evaluation cell; accept either so this cell does not depend on
# which of the two was run last.
if isinstance(y_future_actual, pd.DataFrame):
    y_true = y_future_actual['FLOW_NEXT_15_SUM'].to_numpy(dtype=float)
else:
    y_true = np.asarray(y_future_actual, dtype=float)

# Use the raw prediction array so the comparison is strictly positional.
y_pred = np.asarray(future_predictions, dtype=float)

rmse_flow = np.sqrt(mean_squared_error(y_true, y_pred))
mae_flow = mean_absolute_error(y_true, y_pred)
r2_flow = r2_score(y_true, y_pred)
explained_variance_flow = explained_variance_score(y_true, y_pred)

# MAPE is not reported here (it was already disabled in the original cell).
# NOTE(review): presumably because rows with an actual value of 0 make it
# undefined — confirm before re-enabling.
print('FLOW_NEXT_15_SUM Evaluation Metrics:')
print(f'RMSE: {rmse_flow:.3f}')
print(f'MAE: {mae_flow:.3f}')
print(f'R²: {r2_flow:.3f}')
print(f'Explained Variance: {explained_variance_flow:.3f}')

## Make Predictions on Peak Final Evaluation Dataset

In [None]:
# Load the peak-hours final-evaluation table (semicolon-separated) and discard
# every row with a missing value in any column.
future_df = pd.read_csv("/content/peakevalpreproV2.csv", sep=';')
future_df.dropna(inplace=True)

# Optional: restrict the evaluation to a single portal.
#future_df = future_df[future_df['PORTAL'] == 'E4S 58,140']

# Same timestamp index as the training data: 'Date' plus 'Interval_1' minutes.
future_df['Datetime'] = pd.to_datetime(future_df['Date']) + pd.to_timedelta(future_df['Interval_1'], unit='m')
future_df.set_index('Datetime', inplace=True)

# Must list the same columns, in the same order, as the training cell.
features = [
    'FLOW_SUM',
    'SPEED_WEIGHTED_AVG',
    'FLOW_PREV_5_SUM',
    'FLOW_PREV_15_SUM',
    'FLOW_PREV_60_SUM',
    'SPEED_PREV_5_AVG',
    'SPEED_PREV_15_AVG',
    'SPEED_PREV_60_AVG',
    'IS_WEEKDAY',
    'Interval_30'
]

# `future_df` has no missing values left after the dropna above, so the
# feature frame needs no further row filtering. Not filtering it again also
# guarantees that features, predictions and targets stay row-for-row aligned.
X_future = future_df[features].copy()

# Predict with the `model` object that is current in the notebook (the one
# loaded from disk earlier); no grid search is involved in this notebook.
future_predictions = model.predict(X_future)

# The model is univariate, so the predictions form one FLOW_NEXT_15_SUM
# column. Labelling the 1-D output with two column names raises a shape
# error, which is why only one is given here.
future_predictions_df = pd.DataFrame(
    future_predictions,
    columns=['FLOW_NEXT_15_SUM'],
    index=X_future.index,
)

# Actual look-ahead values, in the same row order as `X_future`. Both columns
# are kept for reference; only FLOW_NEXT_15_SUM has matching predictions.
y_future_actual = future_df[['FLOW_NEXT_15_SUM', 'SPEED_NEXT_15_AVG']]


In [None]:
# Sanity check: report the dimensions of the feature matrix and of the
# actual-target object so that mismatched row counts are easy to spot.
for _label, _obj in (("X_future", X_future), ("y_future_actual", y_future_actual)):
    print(f"{_label} shape: {_obj.shape}")

## Evaluation on Peak Final Evaluation Dataset

In [None]:
# Evaluate the flow model on the peak-hours final-evaluation dataset.
#
# The model is univariate (target: FLOW_NEXT_15_SUM), so only flow metrics are
# reported. The SPEED_NEXT_15_AVG section that used to follow could never run
# because no speed predictions exist, and has been removed.
#
# `y_future_actual` is a DataFrame in the peak-evaluation cell and a Series in
# the full-evaluation cell; accept either so this cell does not depend on
# which of the two was run last.
if isinstance(y_future_actual, pd.DataFrame):
    y_true = y_future_actual['FLOW_NEXT_15_SUM'].to_numpy(dtype=float)
else:
    y_true = np.asarray(y_future_actual, dtype=float)

# Use the raw prediction array so the comparison is strictly positional.
y_pred = np.asarray(future_predictions, dtype=float)

rmse_flow = np.sqrt(mean_squared_error(y_true, y_pred))
mae_flow = mean_absolute_error(y_true, y_pred)
r2_flow = r2_score(y_true, y_pred)
explained_variance_flow = explained_variance_score(y_true, y_pred)

# MAPE is not reported here (it was already disabled in the original cell).
# NOTE(review): presumably because rows with an actual value of 0 make it
# undefined — confirm before re-enabling.
print('FLOW_NEXT_15_SUM Evaluation Metrics:')
print(f'RMSE: {rmse_flow:.3f}')
print(f'MAE: {mae_flow:.3f}')
print(f'R²: {r2_flow:.3f}')
print(f'Explained Variance: {explained_variance_flow:.3f}')

## Feature Importance

In [None]:
# Display the fitted model's `feature_importances_` attribute as the cell output.
# NOTE(review): presumably the values follow the column order of the training
# features — confirm against the scikit-learn documentation.
model.feature_importances_

## Save Model

In [None]:
# Persistence helpers from joblib.
from joblib import dump, load

# Uncomment to write the current `model` to disk.
#dump(model, 'best_univar_RFR_flow.joblib')

# Uncomment to read a stored model back into `loaded_model`.
# NOTE(review): this file name differs from the one used by the dump call
# above — confirm which file is intended.
#loaded_model = load('random_forest_model.joblib')