## 1. Setup and Imports

In [None]:
import sys
sys.path.append('UrbanEV-main/code')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings

warnings.filterwarnings('ignore')
sns.set_style('whitegrid')

print("Libraries imported successfully!")

## 2. Load Data and Configuration

In [None]:
# Configuration parameters
class Args:
    """Plain attribute container for the settings used by this notebook run."""

    def __init__(self):
        self.feat = 'occ'  # Feature type: occupancy
        self.pred_len = 3  # Prediction horizon in time steps (3 steps ahead; printed as hours below)
        self.seq_len = 12  # Look-back window (12 hours)
        self.add_feat = 'None'  # No additional features (note: the string 'None', not the None object)
        self.pred_type = 'region'  # Predict all regions/stations
        self.fold = 6  # Cross-validation fold: number of leading months kept when splitting
        self.total_fold = 6  # Total number of folds (months)
        self.seed = 42  # NOTE(review): not read anywhere in the visible cells - confirm it is still needed

args = Args()

# Train-validation-test split ratios
TRAIN_RATIO, VAL_RATIO, TEST_RATIO = 0.8, 0.1, 0.1

print(f"Configuration:")
print(f"  Feature: {args.feat}")
print(f"  Prediction horizon: {args.pred_len} hour(s)")
print(f"  Look-back window: {args.seq_len} hours")
print(f"  Prediction type: {args.pred_type}")
print(f"  Train/Val/Test ratio: {TRAIN_RATIO}/{VAL_RATIO}/{TEST_RATIO}")

In [None]:
# Load station information
inf = pd.read_csv('UrbanEV-main/data/inf.csv', header=0, index_col=None)
print(f"Total stations: {len(inf)}")
print(f"\nStation information:")
print(inf.head())

In [None]:
# Load occupancy data
# The first CSV column becomes the row index and is parsed into timestamps;
# every remaining column is treated as one station/zone series.
occ_df = pd.read_csv('UrbanEV-main/data/occupancy.csv', header=0, index_col=0)
time = pd.to_datetime(occ_df.index)

# Normalize occupancy by number of charging piles
# TAZID is cast to str so it can be matched against the CSV column labels.
# A column with no matching TAZID raises KeyError in the loop below.
# NOTE(review): a charge_count of 0 would presumably yield inf/NaN rather than an error - confirm the table has none.
charge_count_dict = dict(zip(inf['TAZID'].astype(str), inf['charge_count']))
for col in occ_df.columns:
    charge_count = charge_count_dict[col]
    occ_df[col] = occ_df[col] / charge_count

# Plain ndarray copy of the normalized table: rows are time steps, columns are stations.
feat = np.array(occ_df)

print(f"Data shape: {feat.shape}")
print(f"Date range: {time.min()} to {time.max()}")
print(f"Number of stations: {feat.shape[1]}")
print(f"Number of time steps: {feat.shape[0]}")

## 3. Split Data (Train/Validation/Test)

In [None]:
# Split data based on fold
# month_list holds the distinct calendar months in order of first appearance;
# fold_time is the number of rows whose month is among the first `args.fold` of them.
month_list = list(time.month.unique())
fold_time = time.month.isin(month_list[0:args.fold]).sum()

# Contiguous, non-overlapping row ranges over the first fold_time rows:
# train takes the leading TRAIN_RATIO share, validation the next VAL_RATIO share,
# and test whatever remains up to fold_time (TEST_RATIO itself is not used here).
# NOTE(review): slicing by position assumes the rows are in chronological order
# and that the selected months are the leading rows - confirm against the CSV.
train_end = int(fold_time * TRAIN_RATIO)
valid_start = train_end
valid_end = int(valid_start + fold_time * VAL_RATIO)
test_start = valid_end
test_end = int(fold_time)

train_feat = feat[:train_end]
valid_feat = feat[valid_start:valid_end]
test_feat = feat[test_start:test_end]

print(f"Data split:")
print(f"  Training: {train_feat.shape} (indices 0 to {train_end})")
print(f"  Validation: {valid_feat.shape} (indices {valid_start} to {valid_end})")
print(f"  Test: {test_feat.shape} (indices {test_start} to {test_end})")
print(f"\nTime ranges:")
print(f"  Training: {time[0]} to {time[train_end-1]}")
print(f"  Validation: {time[valid_start]} to {time[valid_end-1]}")
print(f"  Test: {time[test_start]} to {time[test_end-1]}")

## 4. Initialize and Run Lo Model

In [None]:
# Lo model implementation
class Lo:
    """Last-observation (persistence) baseline.

    The forecast for a time step is the value observed ``pred_len`` steps
    earlier in the same column.
    """

    def __init__(self, pred_len):
        # Lag, in time steps, between an observation and the step it predicts.
        self.pred_len = pred_len

    def predict(self, train_valid_occ, test_occ):
        """
        Forecast every row of ``test_occ`` with the value seen ``pred_len`` rows earlier.

        The first ``pred_len`` forecasts have no predecessor inside ``test_occ``
        and are taken from the last ``pred_len`` rows of ``train_valid_occ``;
        all later forecasts are ``test_occ`` shifted down by ``pred_len`` rows.

        Args:
            train_valid_occ: 2-D array-like of history that immediately
                precedes ``test_occ``; needs at least ``pred_len`` rows and at
                least as many columns as ``test_occ``.
            test_occ: 2-D array-like of observations, one row per time step
                and one column per station.

        Returns:
            A new float ndarray with the same shape as ``test_occ``.

        Raises:
            ValueError: if ``pred_len`` is negative.
            IndexError: if ``train_valid_occ`` is too small to supply the
                leading forecasts.
        """
        test_occ = np.asarray(test_occ)
        train_valid_occ = np.asarray(train_valid_occ)
        time_len, node = test_occ.shape
        preds = np.zeros((time_len, node))
        if preds.size == 0:
            # Nothing to forecast; mirrors the loop-based version, which never
            # touched the history in this case.
            return preds

        lag = self.pred_len
        if lag < 0:
            raise ValueError("pred_len must be non-negative")

        # Leading rows: their source observation lies before the test window.
        head = min(lag, time_len)
        if head > 0:
            hist_len, hist_node = train_valid_occ.shape
            if hist_len < lag or hist_node < node:
                raise IndexError(
                    "train_valid_occ needs at least pred_len rows and as many "
                    "columns as test_occ"
                )
            # Explicit start offset instead of a negative slice, so that the
            # window never wraps around to the beginning of the history.
            start = hist_len - lag
            preds[:head] = train_valid_occ[start:start + head, :node]

        # Remaining rows: the test window shifted down by `lag` steps.
        if time_len > lag:
            preds[lag:] = test_occ[:time_len - lag]

        return preds

# Initialize model
lo_model = Lo(pred_len=args.pred_len)
print(f"Lo model initialized with prediction horizon: {args.pred_len}")

In [None]:
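# Prepare data for Lo model
# The history handed to the model is train + validation plus the first
# (seq_len + pred_len) rows of the test split; only the remaining test rows are scored.
# NOTE(review): presumably this offset keeps the scored window identical to that of
# models that need a seq_len look-back before their first prediction - confirm.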
train_valid_feat = np.vstack((train_feat, valid_feat, test_feat[:args.seq_len + args.pred_len, :]))
test_feat_adjusted = test_feat[args.pred_len + args.seq_len:, :]

print(f"Train+Valid data shape: {train_valid_feat.shape}")
print(f"Test data shape (adjusted): {test_feat_adjusted.shape}")

In [None]:
# Make predictions
predictions = lo_model.predict(train_valid_feat, test_feat_adjusted)

print(f"Predictions shape: {predictions.shape}")
print(f"Predictions range: [{predictions.min():.4f}, {predictions.max():.4f}]")
print(f"Actual values range: [{test_feat_adjusted.min():.4f}, {test_feat_adjusted.max():.4f}]")

## 5. Evaluate Model Performance

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Calculate metrics
def calculate_metrics(y_true, y_pred):
    """Calculate regression metrics between targets and predictions.

    MSE, RMSE and MAE are computed on the inputs as given. MAPE and RAE are
    computed on guarded copies: wherever the target is ``<= eps`` both the
    target and the matching prediction are replaced by ``abs(value) + eps``,
    which keeps the MAPE denominator away from zero.

    Args:
        y_true: array-like of observed values.
        y_pred: array-like of predicted values, same shape as ``y_true``.

    Returns:
        dict with the keys 'MSE', 'RMSE', 'MAE', 'MAPE' and 'RAE'.
    """
    eps = 2e-2

    # Float copies: the inputs are never mutated, and integer inputs do not
    # truncate the `+ eps` shift.
    y_true_mape = np.array(y_true, dtype=float)
    y_pred_mape = np.array(y_pred, dtype=float)

    # Handle near-zero values for MAPE.
    # The mask is taken once, before anything is shifted. Recomputing it after
    # the targets were shifted would select only targets that were exactly 0,
    # leaving predictions unshifted for every other near-zero target.
    near_zero = y_true_mape <= eps
    y_true_mape[near_zero] = np.abs(y_true_mape[near_zero]) + eps
    y_pred_mape[near_zero] = np.abs(y_pred_mape[near_zero]) + eps

    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true_mape, y_pred_mape)
    # Relative absolute error: total error relative to always predicting the mean.
    rae = np.sum(np.abs(y_pred_mape - y_true_mape)) / np.sum(np.abs(np.mean(y_true_mape) - y_true_mape))

    return {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'MAPE': mape,
        'RAE': rae
    }

# Calculate overall metrics
metrics = calculate_metrics(test_feat_adjusted, predictions)

print("=" * 60)
print("LO MODEL PERFORMANCE - ALL STATIONS")
print("=" * 60)
for metric_name, metric_value in metrics.items():
    print(f"{metric_name:10s}: {metric_value:.6f}")
print("=" * 60)

## 6. Visualize Results - Overall Statistics

In [None]:
# Plot actual vs predicted scatter for all stations
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Scatter plot
axes[0].scatter(test_feat_adjusted.flatten(), predictions.flatten(), 
               alpha=0.3, s=1, c='steelblue')
axes[0].plot([test_feat_adjusted.min(), test_feat_adjusted.max()], 
            [test_feat_adjusted.min(), test_feat_adjusted.max()], 
            'r--', linewidth=2, label='Perfect Prediction')
axes[0].set_xlabel('Actual Occupancy (Normalized)')
axes[0].set_ylabel('Predicted Occupancy (Normalized)')
axes[0].set_title('Actual vs Predicted - All Stations')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Residuals histogram
residuals = test_feat_adjusted.flatten() - predictions.flatten()
axes[1].hist(residuals, bins=50, edgecolor='black', alpha=0.7, color='coral')
axes[1].axvline(x=0, color='red', linestyle='--', linewidth=2)
axes[1].set_xlabel('Residuals (Actual - Predicted)')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Distribution of Residuals')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Mean residual: {residuals.mean():.6f}")
print(f"Std residual: {residuals.std():.6f}")

## 7. Visualize Results - Time Series for Selected Stations

In [None]:
# Select a few representative stations to visualize.
# The candidate indices are fixed; any that fall outside the loaded data are
# dropped so a smaller station table cannot raise IndexError further down.
candidate_stations = [0, 50, 100, 150, 200, 250]  # Indices of stations to plot
selected_stations = [s for s in candidate_stations if s < test_feat_adjusted.shape[1]]
station_ids = occ_df.columns[selected_stations]

# Time indices for test data (same offset as the rows kept in test_feat_adjusted)
test_time = time[test_start + args.pred_len + args.seq_len:test_end]

fig, axes = plt.subplots(3, 2, figsize=(16, 12))
axes = axes.flatten()

for idx, (station_idx, station_id) in enumerate(zip(selected_stations, station_ids)):
    ax = axes[idx]
    
    # Plot actual vs predicted
    ax.plot(test_time, test_feat_adjusted[:, station_idx], 
           label='Actual', linewidth=1.5, alpha=0.8, color='steelblue')
    ax.plot(test_time, predictions[:, station_idx], 
           label='Predicted', linewidth=1.5, alpha=0.8, color='coral', linestyle='--')
    
    # Calculate station-specific metrics
    station_mae = mean_absolute_error(test_feat_adjusted[:, station_idx], predictions[:, station_idx])
    
    ax.set_xlabel('Time')
    ax.set_ylabel('Occupancy (Normalized)')
    ax.set_title(f'Station {station_id} - MAE: {station_mae:.4f}')
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis='x', rotation=45)

# Hide any panel that received no station so the grid shows no empty axes.
for unused_ax in axes[len(selected_stations):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 8. Per-Station Performance Analysis

In [None]:
# Build one record per station column: its ID plus MAE and RMSE over the
# scored test window.
station_metrics = [
    {
        'Station_ID': occ_df.columns[col],
        'MAE': mean_absolute_error(test_feat_adjusted[:, col], predictions[:, col]),
        'RMSE': np.sqrt(mean_squared_error(test_feat_adjusted[:, col], predictions[:, col])),
    }
    for col in range(feat.shape[1])
]

# One row per station, columns in the key order used above.
metrics_df = pd.DataFrame(station_metrics)

print("Per-Station Performance Summary:")
print(metrics_df.describe())
print(f"\nTop 10 Best Performing Stations (Lowest MAE):")
print(metrics_df.nsmallest(10, 'MAE'))
print(f"\nTop 10 Worst Performing Stations (Highest MAE):")
print(metrics_df.nlargest(10, 'MAE'))

In [None]:
# Visualize distribution of station-level performance
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# MAE distribution
axes[0].hist(metrics_df['MAE'], bins=30, edgecolor='black', alpha=0.7, color='steelblue')
axes[0].axvline(x=metrics_df['MAE'].mean(), color='red', linestyle='--', 
               linewidth=2, label=f'Mean: {metrics_df["MAE"].mean():.4f}')
axes[0].set_xlabel('MAE')
axes[0].set_ylabel('Number of Stations')
axes[0].set_title('Distribution of MAE Across All Stations')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# RMSE distribution
axes[1].hist(metrics_df['RMSE'], bins=30, edgecolor='black', alpha=0.7, color='coral')
axes[1].axvline(x=metrics_df['RMSE'].mean(), color='red', linestyle='--', 
               linewidth=2, label=f'Mean: {metrics_df["RMSE"].mean():.4f}')
axes[1].set_xlabel('RMSE')
axes[1].set_ylabel('Number of Stations')
axes[1].set_title('Distribution of RMSE Across All Stations')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Summary

In [None]:
print("=" * 70)
print("LO MODEL - FINAL SUMMARY")
print("=" * 70)
print(f"\nModel Configuration:")
print(f"  Prediction Horizon: {args.pred_len} hour(s)")
print(f"  Look-back Window: {args.seq_len} hours")
print(f"  Number of Stations: {feat.shape[1]}")
print(f"  Test Set Size: {test_feat_adjusted.shape[0]} time steps")

print(f"\nOverall Performance (All Stations):")
for metric_name, metric_value in metrics.items():
    print(f"  {metric_name:10s}: {metric_value:.6f}")

print(f"\nPer-Station Performance Statistics:")
print(f"  MAE - Mean: {metrics_df['MAE'].mean():.6f}, Std: {metrics_df['MAE'].std():.6f}")
print(f"  MAE - Min:  {metrics_df['MAE'].min():.6f}, Max: {metrics_df['MAE'].max():.6f}")
print(f"  RMSE - Mean: {metrics_df['RMSE'].mean():.6f}, Std: {metrics_df['RMSE'].std():.6f}")
print(f"  RMSE - Min:  {metrics_df['RMSE'].min():.6f}, Max: {metrics_df['RMSE'].max():.6f}")

print("\n" + "=" * 70)
print("The Lo model serves as a simple baseline that uses the most recent")
print("observation as the prediction. Despite its simplicity, it often performs")
print("well for short-term forecasting in stable time series.")
print("=" * 70)