# Occupancy Data Transformation
This notebook transforms individual parking-session records into zone-level occupancy data, computed on a 15-minute grid, for use in occupancy prediction.
## The Shift in Approach
**Previous approach (notebooks 10-11):** Predict individual session durations
- Target: `duration_hours` per parking session
**New approach (this notebook):** Predict hourly occupancy rates
- Target: `occupancy_count` or `occupancy_ratio` per zone per time interval (this notebook builds 15-minute intervals)
- More predictable: Aggregate patterns smooth out individual variance
- Directly useful: "CUE Garage will be 85% full at 10am Monday"


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# Suppress all warnings for the rest of the notebook so cell output stays readable
warnings.filterwarnings(action='ignore')
# Plot defaults: seaborn "whitegrid" theme and a wide default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

## Load AMP Data
Load the base preprocessed AMP parking session data (without enrichment).
Calendar and weather features will be added to the occupancy data later.

In [None]:
# Read the CLEANED AMP session export (excludes B St, JumpTest, non-existent lots).
# This is the cleaned version produced by notebook 14; both session timestamps
# are parsed as datetimes at load time.
amp = pd.read_csv(
    '../../data/processed/amp_preprocessed_clean.csv',
    parse_dates=['Start_Date', 'End_Date'],
)

# Quick overview of what was loaded: volume, time span, and zone coverage
session_starts = amp['Start_Date']
session_zones = amp['Zone']
print(f"Total parking sessions: {len(amp):,}")
print(f"Date range: {session_starts.min()} to {session_starts.max()}")
print(f"Unique zones: {session_zones.nunique()}")
print(f"\nZones: {sorted(session_zones.unique())}")

## Create 15-Minute Time Grid
Generate all possible (Date, Hour, Minute) combinations for the date range.
Each day has 96 intervals (24 hours × 4 intervals per hour).

In [None]:
# The grid spans every calendar day between the first and last session start
start_date = amp['Start_Date'].min().date()
end_date = amp['Start_Date'].max().date()
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# Within a day: 24 hours x 4 quarter-hour marks = 96 intervals
hours = range(24)
minutes = [0, 15, 30, 45]

# Cartesian product of (date, hour, minute), flattened into ordinary columns
grid_index = pd.MultiIndex.from_product(
    [date_range, hours, minutes],
    names=['date', 'hour', 'minute'],
)
time_grid = grid_index.to_frame(index=False)

# Timestamp of each interval start = midnight of the date + minutes elapsed in the day
minutes_into_day = time_grid['hour'] * 60 + time_grid['minute']
time_grid['datetime'] = pd.to_datetime(time_grid['date']) + pd.to_timedelta(minutes_into_day, unit='m')

intervals_per_day = len(hours) * len(minutes)
print(f"Time grid created: {len(time_grid):,} 15-minute intervals")
print(f"Intervals per day: {intervals_per_day}")
print(f"Date range: {time_grid['date'].min()} to {time_grid['date'].max()}")
print("\nFirst few rows:")
print(time_grid.head(10))

## Calculate Occupancy for Each 15-Minute Interval
For each zone and 15-minute interval, count how many parking sessions were active during that interval.
A session is "active" during an interval if:
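- Start_Date < interval_end AND End_Date > interval_start (strict inequalities, so a session that ends exactly at an interval's start is not counted in that interval)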

In [None]:
# Every zone that appears in the session data gets its own copy of the time grid
zones = amp['Zone'].unique()
zone_total = len(zones)
print(f"Processing occupancy for {zone_total} zones...")
print("This will take 20-40 minutes for 15-minute intervals...")
print(f"Processing {zone_total * len(date_range) * 24 * 4:,} zone-interval combinations")

# Cartesian product of (zone, date, hour, minute) as a flat frame
zone_interval_grid = pd.MultiIndex.from_product(
    [zones, date_range, hours, minutes],
    names=['Zone', 'date', 'hour', 'minute'],
).to_frame(index=False)

# Each row covers [interval_start, interval_start + 15 minutes)
elapsed_minutes = zone_interval_grid['hour'] * 60 + zone_interval_grid['minute']
zone_interval_grid['interval_start'] = (
    pd.to_datetime(zone_interval_grid['date']) + pd.to_timedelta(elapsed_minutes, unit='m')
)
zone_interval_grid['interval_end'] = zone_interval_grid['interval_start'] + pd.Timedelta(minutes=15)

print(f"Zone-interval grid created: {len(zone_interval_grid):,} combinations")
print("\nCalculating occupancy counts for 15-minute intervals...")
print("Sample intervals:")
preview_columns = ['Zone', 'interval_start', 'interval_end']
print(zone_interval_grid[preview_columns].head(8))

In [None]:
# Calculate occupancy with a genuinely vectorized approach.
# For each zone-interval, count sessions that overlap that 15-minute interval.
def count_active_sessions(session_starts, session_ends, interval_starts, interval_ends):
    """Count, for every interval, the sessions that overlap it.

    A session overlaps an interval when
    ``session_start < interval_end and session_end > interval_start``.
    For sessions with ``start <= end`` this count equals

        (# sessions that started before the interval ends)
      - (# sessions that had already ended when the interval starts)

    so two sorted arrays plus one binary search per interval replace a
    full scan of the sessions for every interval.

    Parameters
    ----------
    session_starts, session_ends : array-like of datetimes
        Paired session boundaries. Must contain no missing values and
        satisfy ``start <= end`` pairwise (the caller filters for this).
    interval_starts, interval_ends : array-like of datetimes
        Paired interval boundaries.

    Returns
    -------
    numpy.ndarray
        Integer overlap count per interval, in the order of the intervals.
    """
    sorted_starts = np.sort(np.asarray(session_starts, dtype='datetime64[ns]'))
    sorted_ends = np.sort(np.asarray(session_ends, dtype='datetime64[ns]'))
    window_starts = np.asarray(interval_starts, dtype='datetime64[ns]')
    window_ends = np.asarray(interval_ends, dtype='datetime64[ns]')
    # side='left'  -> number of starts strictly before the interval end
    started_before_end = np.searchsorted(sorted_starts, window_ends, side='left')
    # side='right' -> number of ends at or before the interval start
    ended_by_start = np.searchsorted(sorted_ends, window_starts, side='right')
    return started_before_end - ended_by_start


occupancy_columns = ['Zone', 'date', 'hour', 'minute', 'datetime', 'occupancy_count']
zone_frames = []
zone_count = 0
for zone in zones:
    zone_count += 1
    print(f"Processing zone: {zone}: {round(zone_count/len(zones),2)} ({zone_count/len(zones)*100:.1f}%)")
    # Sessions for this zone. Rows with a missing timestamp can never satisfy the
    # overlap test, and rows whose End_Date precedes Start_Date are malformed;
    # both are excluded so the counting identity in count_active_sessions holds.
    zone_sessions = amp.loc[amp['Zone'] == zone, ['Start_Date', 'End_Date']].dropna()
    zone_sessions = zone_sessions[zone_sessions['End_Date'] >= zone_sessions['Start_Date']]
    # Intervals for this zone
    zone_grid = zone_interval_grid[zone_interval_grid['Zone'] == zone]
    zone_occupancy = zone_grid[['Zone', 'date', 'hour', 'minute']].copy()
    zone_occupancy['datetime'] = zone_grid['interval_start']
    zone_occupancy['occupancy_count'] = count_active_sessions(
        zone_sessions['Start_Date'],
        zone_sessions['End_Date'],
        zone_grid['interval_start'],
        zone_grid['interval_end'],
    )
    zone_frames.append(zone_occupancy[occupancy_columns])

# Stack the per-zone results into one frame with a fresh 0..n-1 index
if zone_frames:
    occupancy_df = pd.concat(zone_frames, ignore_index=True)
else:
    occupancy_df = pd.DataFrame(columns=occupancy_columns)
print(f"\nOccupancy data created: {len(occupancy_df):,} zone-interval records (15-min granularity)")
print(f"Date range: {occupancy_df['date'].min()} to {occupancy_df['date'].max()}")
print(f"Sample data:")
print(occupancy_df.head(20))

## Add Temporal Features

In [None]:
# Derive calendar fields from each interval's start timestamp
interval_stamp = occupancy_df['datetime'].dt
for feature_name, stamp_attribute in (
    ('year', 'year'),
    ('month', 'month'),
    ('day', 'day'),
    ('day_of_week', 'dayofweek'),
):
    occupancy_df[feature_name] = getattr(interval_stamp, stamp_attribute)
occupancy_df['day_name'] = interval_stamp.day_name()
# dayofweek is 0=Monday .. 6=Sunday, so 5 and 6 are the weekend
occupancy_df['is_weekend'] = occupancy_df['day_of_week'].ge(5).astype(int)
# Expose the quarter-hour mark (0, 15, 30, 45) under a feature-style name
occupancy_df['minute_interval'] = occupancy_df['minute']
# Time of day categories
def categorize_time_of_day(hour):
    """Return a coarse time-of-day label for an hour value.

    Hours in [0, 6) are 'Late Night', [6, 12) 'Morning', [12, 18)
    'Afternoon', and [18, 22) 'Evening'. Any other value (22, 23, or
    anything outside 0-21) is labelled 'Night'.
    """
    labelled_ranges = (
        (0, 6, 'Late Night'),
        (6, 12, 'Morning'),
        (12, 18, 'Afternoon'),
        (18, 22, 'Evening'),
    )
    for first_hour, end_hour, label in labelled_ranges:
        if first_hour <= hour < end_hour:
            return label
    return 'Night'
# Label every interval with its time-of-day bucket, then preview the result
occupancy_df['time_of_day'] = occupancy_df['hour'].map(categorize_time_of_day)
print("Temporal features added (including 15-minute intervals)")
print("\nSample data:")
sample_columns = ['Zone', 'datetime', 'hour', 'minute', 'occupancy_count']
print(occupancy_df[sample_columns].head(10))

## Merge Calendar Events
Add game days, finals weeks, and breaks.

In [None]:
# Read the football schedule and the academic calendar
games = pd.read_csv('../../data/football_games.csv')
calendar = pd.read_csv('../../data/academic_calendar.csv')

# Convert every date column involved in the comparisons to datetime
games['Date'] = pd.to_datetime(games['Date'])
for boundary_column in ('Start_Date', 'End_Date'):
    calendar[boundary_column] = pd.to_datetime(calendar[boundary_column])
occupancy_df['date'] = pd.to_datetime(occupancy_df['date'])

# Game days: flag intervals whose calendar date matches a game date
game_dates = games['Date'].dt.date.unique()
interval_days = occupancy_df['date'].dt.date
occupancy_df['is_game_day'] = interval_days.isin(game_dates).astype(int)

# Academic-calendar events: one 0/1 column per event type, set to 1 for every
# interval whose date falls inside any period of that type (bounds inclusive)
calendar_event_types = [
    'Dead_Week',
    'Finals_Week',
    'Spring_Break',
    'Thanksgiving_Break',
    'Winter_Break',
    'University_Holiday',
]
for event_type in calendar_event_types:
    flag_column = f'is_{event_type.lower()}'
    event_periods = calendar[calendar['Event_Type'] == event_type]
    inside_event = np.zeros(len(occupancy_df), dtype=bool)
    for period_start, period_end in zip(event_periods['Start_Date'], event_periods['End_Date']):
        inside_event |= (
            (occupancy_df['date'] >= period_start) & (occupancy_df['date'] <= period_end)
        ).to_numpy()
    occupancy_df[flag_column] = inside_event.astype('int64')

# Any of the three break types counts as "a break"
break_flags = ['is_spring_break', 'is_thanksgiving_break', 'is_winter_break']
occupancy_df['is_any_break'] = occupancy_df[break_flags].any(axis=1).astype(int)

print("Calendar events merged")
print(f"\nGame days: {occupancy_df['is_game_day'].sum():,}")
print(f"Dead weeks: {occupancy_df['is_dead_week'].sum():,}")
print(f"Finals weeks: {occupancy_df['is_finals_week'].sum():,}")
print(f"Break periods: {occupancy_df['is_any_break'].sum():,}")

## Merge Weather Data

In [None]:
# Read daily weather and reduce its timestamp to a plain calendar date
weather = pd.read_csv('../../data/weather_pullman_2020_2025.csv')
weather['date'] = pd.to_datetime(weather['date']).dt.date

# Left-join weather onto occupancy by calendar date. A temporary key column
# holds the occupancy date as a date object; because both frames carry a
# 'date' column the merge suffixes them (_x = occupancy, _y = weather), so the
# weather copy and the temporary key are dropped and the original name restored.
occupancy_df = (
    occupancy_df
    .assign(date_for_merge=occupancy_df['date'].dt.date)
    .merge(weather, how='left', left_on='date_for_merge', right_on='date')
    .drop(columns=['date_for_merge', 'date_y'])
    .rename(columns={'date_x': 'date'})
)

weather_feature_columns = [col for col in weather.columns if col != 'date']
print("Weather data merged")
print(f"\nWeather columns added: {weather_feature_columns}")

## Merge Camera Classification
Add fixed camera features from notebook 16.

In [None]:
# Load camera classification and attach zone-level camera features.
# The four feature columns are dropped from occupancy_df before merging so the
# cell can be re-run (or run after another cell that already added them)
# without pandas creating suffixed *_x / *_y duplicates.
camera_feature_columns = [
    'entrance_camera_score',
    'bulk_patrol_score',
    'avg_scans_per_active_hour',
    'has_fixed_camera',
]
try:
    camera_features = pd.read_csv('../../data/processed/fixed_camera_classification.csv')
    print(f"Camera classification loaded: {len(camera_features)} lots")
    # Convert Lot_number to string for merging
    camera_features['Lot_number'] = camera_features['Lot_number'].astype(str)
    # Create binary has_fixed_camera feature
    camera_features['has_fixed_camera'] = (camera_features['camera_classification_v2'] == 'FIXED_CAMERA_HIGH').astype(int)
    # Get lot-to-zone mapping
    lot_mapping = pd.read_csv('../../data/lot_mapping_enhanced.csv')
    lot_mapping['Lot_number'] = lot_mapping['Lot_number'].astype(str)
    # Merge camera with lot mapping to get zones
    camera_with_zones = camera_features.merge(
        lot_mapping[['Lot_number', 'Zone_Name']],
        on='Lot_number',
        how='left'
    )
    # Rename Zone_Name to Zone for consistency
    camera_with_zones = camera_with_zones.rename(columns={'Zone_Name': 'Zone'})
    # Aggregate to zone level (max of the scores and the camera flag, mean scan rate)
    zone_camera_features = camera_with_zones.groupby('Zone').agg({
        'entrance_camera_score': 'max',
        'bulk_patrol_score': 'max',
        'avg_scans_per_active_hour': 'mean',
        'has_fixed_camera': 'max'
    }).reset_index()
    # Merge with occupancy data, discarding any stale copies of the columns first
    occupancy_df = (
        occupancy_df
        .drop(columns=camera_feature_columns, errors='ignore')
        .merge(zone_camera_features, on='Zone', how='left')
    )
    # Fill NaN for zones without camera data
    occupancy_df['entrance_camera_score'] = occupancy_df['entrance_camera_score'].fillna(0)
    occupancy_df['bulk_patrol_score'] = occupancy_df['bulk_patrol_score'].fillna(0)
    occupancy_df['avg_scans_per_active_hour'] = occupancy_df['avg_scans_per_active_hour'].fillna(0)
    occupancy_df['has_fixed_camera'] = occupancy_df['has_fixed_camera'].fillna(0).astype(int)
    print(f"\n Camera features merged")
    print(f"  Zones with fixed cameras: {(occupancy_df.groupby('Zone')['has_fixed_camera'].first() == 1).sum()}")
except FileNotFoundError:
    # Either input file missing: fall back to all-zero camera features
    print("⚠️ Camera classification file not found - skipping camera features")
    occupancy_df['entrance_camera_score'] = 0
    occupancy_df['bulk_patrol_score'] = 0
    occupancy_df['avg_scans_per_active_hour'] = 0
    occupancy_df['has_fixed_camera'] = 0

## Load Ground Truth Capacity & Calculate Availability Metrics
Use actual max capacity from Transportation Services instead of percentile estimates.

In [None]:
# Read the ground-truth capacity table from Transportation Services
banner = "=" * 70
print(banner)
print("LOADING GROUND TRUTH CAPACITY")
print(banner)
zone_capacity = pd.read_csv('../../data/zone_capacity.csv')
print(f"\nCapacity data loaded for {len(zone_capacity)} zones")
print("\nTop 10 zones by capacity:")
print(zone_capacity.nlargest(10, 'Max_Capacity'))

# Attach capacity through a Zone -> Max_Capacity lookup (lighter than a merge)
capacity_dict = {
    zone_name: max_capacity
    for zone_name, max_capacity in zip(zone_capacity['Zone'], zone_capacity['Max_Capacity'])
}
occupancy_df['Max_Capacity'] = occupancy_df['Zone'].map(capacity_dict)

# Zones that ended up without a capacity value get an estimate instead
capacity_missing = occupancy_df['Max_Capacity'].isna()
zones_without_capacity = occupancy_df.loc[capacity_missing, 'Zone'].unique()
missing_total = len(zones_without_capacity)
if missing_total > 0:
    print(f"\n⚠️ WARNING: {missing_total} zones have no capacity data:")
    for zone in zones_without_capacity[:5]:
        print(f"  - {zone}")
    if missing_total > 5:
        print(f"  ... and {missing_total - 5} more")
    # Fallback estimate: 95th percentile of the zone's observed counts, floored at 20
    print("\n  Using 95th percentile as fallback estimate for zones without capacity")
    for zone in zones_without_capacity:
        in_zone = occupancy_df['Zone'] == zone
        estimated_capacity = occupancy_df.loc[in_zone, 'occupancy_count'].quantile(0.95)
        occupancy_df.loc[in_zone, 'Max_Capacity'] = max(estimated_capacity, 20)

# Occupancy ratio against capacity; a capacity of 0 is treated as 1 to avoid dividing by zero
safe_capacity = occupancy_df['Max_Capacity'].replace(0, 1)
occupancy_df['occupancy_ratio'] = occupancy_df['occupancy_count'].div(safe_capacity)
# Free spaces, never negative even when the count exceeds capacity
free_spaces = occupancy_df['Max_Capacity'] - occupancy_df['occupancy_count']
occupancy_df['available_spaces'] = free_spaces.clip(lower=0)
# Fullness flags at 85% and 95%, plus a 0..1 availability score
occupancy_df['is_near_full'] = occupancy_df['occupancy_ratio'].ge(0.85).astype(int)
occupancy_df['is_very_full'] = occupancy_df['occupancy_ratio'].ge(0.95).astype(int)
occupancy_df['availability_score'] = occupancy_df['occupancy_ratio'].rsub(1).clip(lower=0, upper=1)
# Categorize availability
def categorize_availability(ratio):
    """Translate an occupancy ratio into an availability label.

    The label is chosen by the highest threshold the ratio reaches:
    >= 0.95 'VERY_LOW', >= 0.80 'LOW', >= 0.60 'MODERATE', >= 0.40 'HIGH'.
    Anything that reaches none of them (including NaN, which fails every
    comparison) is 'VERY_HIGH'.
    """
    fullness_floors = (
        (0.95, 'VERY_LOW'),   # 95%+ full - very hard to find spot
        (0.80, 'LOW'),        # 80-95% full - hard to find spot
        (0.60, 'MODERATE'),   # 60-80% full - moderate chance
        (0.40, 'HIGH'),       # 40-60% full - good chance
    )
    for floor, label in fullness_floors:
        if ratio >= floor:
            return label
    return 'VERY_HIGH'  # <40% full - very easy
# Label every interval, then report capacity totals and the label distribution
occupancy_df['availability_category'] = occupancy_df['occupancy_ratio'].map(categorize_availability)
print("\n Availability metrics calculated using GROUND TRUTH CAPACITY")
print("\nCapacity statistics:")
# Capacity is constant within a zone, so the first value per zone represents it
capacity_per_zone = occupancy_df.groupby('Zone')['Max_Capacity'].first()
print(f"  Average zone capacity: {capacity_per_zone.mean():.0f} spaces")
print(f"  Total parking capacity: {capacity_per_zone.sum():.0f} spaces")
print("\nAvailability distribution:")
category_counts = occupancy_df['availability_category'].value_counts()
print(category_counts.sort_index())

In [None]:
# Header for the summary section, followed by the total row count
rule = "=" * 70
print("\n".join([rule, "OCCUPANCY SUMMARY STATISTICS", rule]))
observation_total = len(occupancy_df)
print(f"\nTotal zone-interval observations: {observation_total:,} (15-minute granularity)")

## Merge Camera Classification
Add fixed camera features from notebook 16.

In [None]:
# Load camera classification and attach zone-level camera features.
# This cell repeats an earlier merge, so occupancy_df may already contain the
# four camera columns. They are dropped before merging; otherwise pandas would
# write suffixed *_x / *_y columns and the plain-named features would be lost.
camera_feature_columns = [
    'entrance_camera_score',
    'bulk_patrol_score',
    'avg_scans_per_active_hour',
    'has_fixed_camera',
]
try:
    camera_features = pd.read_csv('../../data/processed/fixed_camera_classification.csv')
    print(f"Camera classification loaded: {len(camera_features)} lots")
    # Convert Lot_number to string for merging
    camera_features['Lot_number'] = camera_features['Lot_number'].astype(str)
    # Create binary has_fixed_camera feature from classification
    camera_features['has_fixed_camera'] = (camera_features['camera_classification_v2'] == 'FIXED_CAMERA_HIGH').astype(int)
    # Get lot-to-zone mapping from lot_mapping_enhanced.csv (single source of truth)
    lot_mapping = pd.read_csv('../../data/lot_mapping_enhanced.csv')
    lot_mapping['Lot_number'] = lot_mapping['Lot_number'].astype(str)
    # Merge camera with lot mapping to get zones
    camera_with_zones = camera_features.merge(
        lot_mapping[['Lot_number', 'Zone_Name']],
        on='Lot_number',
        how='left'
    )
    # Rename Zone_Name to Zone for consistency
    camera_with_zones = camera_with_zones.rename(columns={'Zone_Name': 'Zone'})
    # Drop rows where Zone is NaN (lots not in mapping)
    camera_with_zones = camera_with_zones.dropna(subset=['Zone'])
    print(f"  Lots matched to zones: {len(camera_with_zones)}")
    # Aggregate to zone level (max of the scores and the camera flag, mean scan rate)
    zone_camera_features = camera_with_zones.groupby('Zone').agg({
        'entrance_camera_score': 'max',
        'bulk_patrol_score': 'max',
        'avg_scans_per_active_hour': 'mean',
        'has_fixed_camera': 'max'
    }).reset_index()
    print(f"  Unique zones with camera data: {len(zone_camera_features)}")
    # Merge with occupancy data, discarding any stale copies of the columns first
    occupancy_df = (
        occupancy_df
        .drop(columns=camera_feature_columns, errors='ignore')
        .merge(zone_camera_features, on='Zone', how='left')
    )
    # Fill NaN for zones without camera data
    for col in ['entrance_camera_score', 'bulk_patrol_score', 'avg_scans_per_active_hour']:
        occupancy_df[col] = occupancy_df[col].fillna(0)
    occupancy_df['has_fixed_camera'] = occupancy_df['has_fixed_camera'].fillna(0).astype(int)
    print(f"\n Camera features merged")
    print(f"  Zones with fixed cameras: {(occupancy_df.groupby('Zone')['has_fixed_camera'].first() == 1).sum()}")
except FileNotFoundError:
    # Either input file missing: fall back to all-zero camera features
    print("⚠️ Camera classification file not found - skipping camera features")
    occupancy_df['entrance_camera_score'] = 0
    occupancy_df['bulk_patrol_score'] = 0
    occupancy_df['avg_scans_per_active_hour'] = 0
    occupancy_df['has_fixed_camera'] = 0

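## Occupancy Summary Statistics & Patterns
Capacity comes from the ground-truth capacity file loaded earlier (a 95th-percentile estimate is used only as a fallback for zones without capacity data), so no percentile baselines are computed here. This section summarizes the occupancy data (busiest zones, peak hours, near-full frequency) and plots the main patterns.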

In [None]:
# Printed overview of the occupancy table: totals, distribution, and rankings
def _mean_by_group_descending(group_column, value_column):
    """Mean of value_column per group_column in occupancy_df, largest first."""
    return (
        occupancy_df
        .groupby(group_column)[value_column]
        .mean()
        .sort_values(ascending=False)
    )


divider = "=" * 70
print(divider)
print("OCCUPANCY SUMMARY STATISTICS")
print(divider)
print(f"\nTotal zone-hour observations: {len(occupancy_df):,}")
print("\nOccupancy count distribution:")
print(occupancy_df['occupancy_count'].describe())

print("\n\nBusiest zones (average occupancy):")
zone_avg = _mean_by_group_descending('Zone', 'occupancy_count')
print(zone_avg.head(10))

print("\n\nPeak hours (average occupancy):")
hour_avg = _mean_by_group_descending('hour', 'occupancy_count')
print(hour_avg.head(10))

print("\n\nNear-full frequency by zone (% of time zone is 85%+ full):")
near_full_freq = _mean_by_group_descending('Zone', 'is_near_full')
print(near_full_freq.head(10).mul(100).round(2))

In [None]:
# Visualize occupancy patterns in a 2x2 panel and save the figure to disk
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
# 1. Average occupancy by hour
# hour_avg is ordered by occupancy (largest first) for the printed ranking;
# re-sort by hour so the x-axis runs chronologically
hour_avg.sort_index().plot(kind='bar', ax=axes[0,0], color='steelblue')
axes[0,0].set_title('Average Occupancy by Hour of Day', fontsize=14, fontweight='bold')
axes[0,0].set_xlabel('Hour')
axes[0,0].set_ylabel('Average Cars Parked')
axes[0,0].grid(axis='y', alpha=0.3)
# 2. Average occupancy by day of week
# reindex guarantees seven values (Mon..Sun); a weekday absent from the data
# becomes NaN instead of breaking the 7-position bar call
day_avg = occupancy_df.groupby('day_of_week')['occupancy_count'].mean().reindex(range(7))
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
axes[0,1].bar(range(7), day_avg, color='coral')
axes[0,1].set_xticks(range(7))
axes[0,1].set_xticklabels(day_names)
axes[0,1].set_title('Average Occupancy by Day of Week', fontsize=14, fontweight='bold')
axes[0,1].set_ylabel('Average Cars Parked')
axes[0,1].grid(axis='y', alpha=0.3)
# 3. Top zones by occupancy
zone_avg.head(10).plot(kind='barh', ax=axes[1,0], color='teal')
axes[1,0].set_title('Top 10 Zones by Average Occupancy', fontsize=14, fontweight='bold')
axes[1,0].set_xlabel('Average Cars Parked')
axes[1,0].grid(axis='x', alpha=0.3)
# 4. Game day vs regular day
game_comparison = occupancy_df.groupby(['is_game_day', 'hour'])['occupancy_count'].mean().unstack(0)
game_comparison.plot(ax=axes[1,1], marker='o')
axes[1,1].set_title('Game Day vs Regular Day Occupancy', fontsize=14, fontweight='bold')
axes[1,1].set_xlabel('Hour of Day')
axes[1,1].set_ylabel('Average Cars Parked')
# Build legend labels from the columns actually present, so the labels stay
# correct even if the data contains only game days or only regular days
game_day_labels = {0: 'Regular Day', 1: 'Game Day'}
axes[1,1].legend([game_day_labels.get(flag, str(flag)) for flag in game_comparison.columns])
axes[1,1].grid(alpha=0.3)
plt.tight_layout()
plt.savefig('../../data/processed/occupancy_patterns.png', dpi=300, bbox_inches='tight')
plt.show()
print("Occupancy patterns visualization saved to: data/processed/occupancy_patterns.png")

## Create Train/Validation/Test Splits
Time-based splits, updated so that 2024 data is included in training (the cutoffs are set in the cell below):
- Training: start of data through 2024-08-31
- Validation: 2024-09-01 to 2025-05-31
- Test: 2025-06-01 onward

In [None]:
# Define split dates - UPDATED to include 2024 data in training.
# Both cutoffs are the LAST calendar day (inclusive) of their split.
train_end = pd.Timestamp('2024-08-31')
val_end = pd.Timestamp('2025-05-31')
# Compare on the calendar day, not the raw timestamp: the cutoffs are midnight
# timestamps, so comparing 15-minute datetimes directly would push 00:15-23:45
# of each cutoff day into the following split.
interval_day = occupancy_df['datetime'].dt.normalize()
in_train = interval_day <= train_end
in_val = (interval_day > train_end) & (interval_day <= val_end)
in_test = interval_day > val_end
# Create splits
occupancy_train = occupancy_df[in_train].copy()
occupancy_val = occupancy_df[in_val].copy()
occupancy_test = occupancy_df[in_test].copy()
print("="*70)
print("DATA SPLITS (UPDATED WITH 2024 IN TRAINING)")
print("="*70)
# Same three-line report for each split
for split_label, split_frame in (
    ('Training', occupancy_train),
    ('Validation', occupancy_val),
    ('Test', occupancy_test),
):
    print(f"\n{split_label} set:")
    print(f"  Records: {len(split_frame):,}")
    print(f"  Date range: {split_frame['date'].min()} to {split_frame['date'].max()}")
    print(f"  Avg occupancy: {split_frame['occupancy_count'].mean():.2f}")
# Show year distribution
print(f"\nYear distribution in training set:")
train_years = occupancy_train.groupby(occupancy_train['datetime'].dt.year).size()
for year, count in train_years.items():
    print(f"  {year}: {count:,} records ({count/len(occupancy_train)*100:.1f}%)")

## Save Processed Data

In [None]:
# Write the full table first, then each split, all as CSV without the index
processed_dir = '../../data/processed'
occupancy_df.to_csv(f'{processed_dir}/occupancy_full.csv', index=False)
print(f"Full occupancy data saved: {len(occupancy_df):,} records")

split_outputs = [
    ('Train', 'occupancy_train.csv', occupancy_train),
    ('Validation', 'occupancy_val.csv', occupancy_val),
    ('Test', 'occupancy_test.csv', occupancy_test),
]
for _, file_name, split_frame in split_outputs:
    split_frame.to_csv(f'{processed_dir}/{file_name}', index=False)

# Report the row counts once every file has been written
for position, (split_label, _, split_frame) in enumerate(split_outputs):
    leading_blank = "\n" if position == 0 else ""
    print(f"{leading_blank}{split_label} set saved: {len(split_frame):,} records")