# Lot-Level Occupancy Data Transformation (AMP Session-Based)

This notebook creates **lot-level occupancy estimates** using AMP parking session data.

**Why AMP Session-Based?**
- AMP session data covers 59 specific parking zones/lots
- Directly measures actual parking sessions (start/end times)
- More accurate occupancy counts than LPR scans
- Complements the LPR-based approach (notebook 04), which covers 185 lots

**Approach:**
1. Load AMP parking session data
2. Count active sessions per hour for each specific zone
3. Add temporal, calendar, and weather features
4. Map capacity data to specific zones
5. Create train/val/test splits

**Note:** This notebook produces lot-level data alongside the zone-level aggregations. The 59 specific zones map to the 28 aggregated zones used elsewhere in the project.

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns

# Framed title for the notebook run, followed by one blank line.
print(
    "=" * 80,
    "LOT-LEVEL OCCUPANCY TRANSFORMATION (AMP SESSION-BASED)",
    "=" * 80,
    "",
    sep="\n",
)

## Step 1: Load AMP Session Data

In [None]:
# Read the preprocessed AMP sessions; both timestamp columns are parsed as
# datetimes so they can be compared against hourly intervals later.
print("Loading AMP session data...")
amp = pd.read_csv(
    '../../data/processed/amp_preprocessed_clean.csv',
    parse_dates=['Start_Date', 'End_Date'],
)
session_starts = amp['Start_Date']
print(f"Total parking sessions: {len(amp):,}")
print(f"Date range: {session_starts.min()} to {session_starts.max()}")

# 'Zone' holds the SPECIFIC lot name of each session (59 unique values per
# the original note on this cell).
zone_labels = amp['Zone']
print(f"\nSpecific zones in raw data: {zone_labels.nunique()}")
print("\nTop 10 specific zones:")
print(zone_labels.value_counts().head(10))

## Step 2: Create Hourly Time Grid

In [None]:
print("Creating hourly time grid...")

# The grid spans the calendar day of the earliest session start through the
# calendar day of the latest session start.
session_starts = amp['Start_Date']
start_date = session_starts.min().date()
end_date = session_starts.max().date()

# Cross every day in that span with the 24 hours of a day.
date_range = pd.date_range(start=start_date, end=end_date, freq='D')
hours = range(24)
grid_index = pd.MultiIndex.from_product([date_range, hours], names=['date', 'hour'])
time_grid = grid_index.to_frame(index=False)

# Timestamp marking the start of each hourly interval.
hour_offsets = pd.to_timedelta(time_grid['hour'], unit='h')
time_grid['datetime'] = pd.to_datetime(time_grid['date']) + hour_offsets

print(f"Time grid created: {len(time_grid):,} hourly intervals")
print(f"Intervals per day: {len(hours)}")

## Step 3: Create Zone-Hour Grid

In [None]:
# Distinct values of the AMP 'Zone' column; each one is treated as a lot.
specific_zones = amp['Zone'].unique()
print(f"{len(specific_zones)} specific zones (lot-level) will be processed")

# One row per (zone, day, hour) combination.
print("\nCreating zone-hour grid...")
grid_levels = [specific_zones, date_range, hours]
zone_hour_grid = pd.MultiIndex.from_product(
    grid_levels, names=['Zone', 'date', 'hour']
).to_frame(index=False)

# Each row covers the hour [interval_start, interval_start + 1h).
interval_open = (
    pd.to_datetime(zone_hour_grid['date'])
    + pd.to_timedelta(zone_hour_grid['hour'], unit='h')
)
zone_hour_grid['interval_start'] = interval_open
zone_hour_grid['interval_end'] = interval_open + pd.Timedelta(hours=1)

print(f"Zone-hour grid created: {len(zone_hour_grid):,} combinations")

## Step 4: Calculate Occupancy

In [None]:
print("Calculating lot-level occupancy...")

# A session is counted as active in an hourly interval when
#     Start_Date < interval_end  and  End_Date > interval_start.
# Rows missing either timestamp can never satisfy that test (comparisons with
# NaT are False), so they are dropped once up front.
timed_sessions = amp.dropna(subset=['Start_Date', 'End_Date'])

occupancy_frames = []

for zone_count, zone in enumerate(specific_zones, start=1):
    if zone_count % 10 == 0:
        print(f"Processing zone {zone_count}/{len(specific_zones)}: {zone}")

    # Sessions and hourly intervals belonging to this specific zone
    zone_sessions = timed_sessions[timed_sessions['Zone'] == zone]
    zone_grid = zone_hour_grid[zone_hour_grid['Zone'] == zone]

    grid_start = zone_grid['interval_start'].to_numpy(dtype='datetime64[ns]')
    grid_end = zone_grid['interval_end'].to_numpy(dtype='datetime64[ns]')
    session_start = zone_sessions['Start_Date'].to_numpy(dtype='datetime64[ns]')
    session_end = zone_sessions['End_Date'].to_numpy(dtype='datetime64[ns]')

    # For sessions with End_Date >= Start_Date, every session that has ended
    # by interval_start also started before interval_end, so
    #     active = #(Start_Date < interval_end) - #(End_Date <= interval_start).
    # Both terms are positions in the sorted endpoints, which replaces the
    # per-row filtering of all sessions with two binary searches.
    ordered = session_end >= session_start
    began = np.searchsorted(np.sort(session_start[ordered]), grid_end, side='left')
    finished = np.searchsorted(np.sort(session_end[ordered]), grid_start, side='right')
    active_counts = began - finished

    # Sessions whose end precedes their start break the identity above, so
    # they go through the direct overlap test to keep the counts unchanged.
    for bad_start, bad_end in zip(session_start[~ordered], session_end[~ordered]):
        active_counts += (bad_start < grid_end) & (bad_end > grid_start)

    occupancy_frames.append(pd.DataFrame({
        'Zone': zone,
        'date': zone_grid['date'].to_numpy(),
        'hour': zone_grid['hour'].to_numpy(),
        'datetime': grid_start,
        'occupancy_count': active_counts,
    }))

if occupancy_frames:
    occupancy_df = pd.concat(occupancy_frames, ignore_index=True)
else:
    # No zones to process: keep the expected columns so later cells still run.
    occupancy_df = pd.DataFrame(
        columns=['Zone', 'date', 'hour', 'datetime', 'occupancy_count']
    )

print(f"\nLot-level occupancy data created: {len(occupancy_df):,} zone-hour records")
print(f"Unique specific zones: {occupancy_df['Zone'].nunique()}")

## Step 5: Add Temporal Features

In [None]:
print("Adding temporal features...")

# Calendar components derived from the start timestamp of each hourly row.
stamp_parts = occupancy_df['datetime'].dt
occupancy_df['year'] = stamp_parts.year
occupancy_df['month'] = stamp_parts.month
occupancy_df['day'] = stamp_parts.day
occupancy_df['day_of_week'] = stamp_parts.dayofweek

# pandas numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
occupancy_df['is_weekend'] = occupancy_df['day_of_week'].ge(5).astype(int)

def categorize_time_of_day(hour):
    """Return the time-of-day label for an hour value.

    Bands are half-open: [0, 6) 'Late Night', [6, 12) 'Morning',
    [12, 18) 'Afternoon', [18, 22) 'Evening'. Any other value, including
    22 and 23, is labelled 'Night'.
    """
    bands = (
        (0, 6, 'Late Night'),
        (6, 12, 'Morning'),
        (12, 18, 'Afternoon'),
        (18, 22, 'Evening'),
    )
    for lower, upper, label in bands:
        if lower <= hour < upper:
            return label
    return 'Night'

# Label every row with the time-of-day band of its hour.
hour_of_day = occupancy_df['hour']
occupancy_df['time_of_day'] = hour_of_day.map(categorize_time_of_day)
print("Temporal features added")

## Step 6: Merge Calendar Events

In [None]:
print("Merging calendar events...")
games = pd.read_csv('../../data/football_games.csv')
calendar = pd.read_csv('../../data/academic_calendar.csv')

# Parse the dates; calendar period bounds are normalized to midnight.
games['Date'] = pd.to_datetime(games['Date'])
for bound in ('Start_Date', 'End_Date'):
    calendar[bound] = pd.to_datetime(calendar[bound]).dt.normalize()

# Game days: flag rows whose date matches any normalized game date.
occupancy_df['date'] = pd.to_datetime(occupancy_df['date'])
game_dates = games['Date'].dt.normalize()
occupancy_df['is_game_day'] = occupancy_df['date'].isin(game_dates).astype(int)

# Academic calendar: one 0/1 column per event type, set to 1 for every row
# whose date falls inside any period of that type (both bounds inclusive).
for event_type in ['Dead_Week', 'Finals_Week', 'Spring_Break', 'Thanksgiving_Break', 'Winter_Break']:
    event_periods = calendar[calendar['Event_Type'] == event_type]
    flag_column = f'is_{event_type.lower()}'
    occupancy_df[flag_column] = 0

    for first_day, last_day in zip(event_periods['Start_Date'], event_periods['End_Date']):
        mask = occupancy_df['date'].between(first_day, last_day)
        occupancy_df.loc[mask, flag_column] = 1

# A row is in "any break" when at least one of the three break flags is set.
break_columns = ['is_spring_break', 'is_thanksgiving_break', 'is_winter_break']
occupancy_df['is_any_break'] = occupancy_df[break_columns].any(axis=1).astype(int)

print("Calendar events merged")

## Step 7: Merge Weather Data

In [None]:
print("Merging weather data...")
weather = pd.read_csv('../../data/weather_pullman_2020_2025.csv')
# Normalize to midnight so the weather dates line up with the 'date' column.
weather['date'] = pd.to_datetime(weather['date']).dt.normalize()

# Left join keeps every occupancy row, including dates with no weather record.
occupancy_df = pd.merge(occupancy_df, weather, how='left', on='date')
print("Weather data merged")

## Step 8: Load and Map Capacity Data

In [None]:
print("Loading capacity data...")
zone_capacity = pd.read_csv('../../data/zone_capacity.csv')
lot_mapping = pd.read_csv('../../data/lot_mapping_enhanced_with_coords.csv')

# Capacity per zone name, built once instead of re-filtering zone_capacity
# for every alias. The first row wins when a zone name is repeated.
first_capacity = {}
for zone_label, max_cap in zip(zone_capacity['Zone'], zone_capacity['Max_Capacity']):
    if pd.notna(zone_label):
        first_capacity.setdefault(zone_label, max_cap)

# Build capacity dict for specific zones
capacity_dict = {}

# Map from alternative descriptions: each '|'-separated alias of a lot takes
# the capacity of that lot's Zone_Name, when zone_capacity lists that name.
if 'alternative_location_description' in lot_mapping.columns:
    alias_rows = zip(
        lot_mapping['Zone_Name'],
        lot_mapping['alternative_location_description'],
    )
    for zone_name, alt_desc in alias_rows:
        if pd.isna(alt_desc) or not alt_desc:
            continue
        if pd.isna(zone_name) or zone_name not in first_capacity:
            continue
        for name in str(alt_desc).split('|'):
            name = name.strip()
            # A blank alias (e.g. from a trailing '|') must not become a key.
            if name:
                capacity_dict[name] = first_capacity[zone_name]

# Also add direct mappings; these override an alias with the same name.
capacity_dict.update(zip(zone_capacity['Zone'], zone_capacity['Max_Capacity']))

# Map capacities (zones with no entry are left as NaN)
occupancy_df['Max_Capacity'] = occupancy_df['Zone'].map(capacity_dict)

# Zones with no mapped capacity get an estimate: the 95th percentile of their
# observed occupancy counts, with a floor of 10.
capacity_missing = occupancy_df['Max_Capacity'].isna()
zones_without_cap = occupancy_df.loc[capacity_missing, 'Zone'].unique()
if len(zones_without_cap) > 0:
    print(f"Estimating capacity for {len(zones_without_cap)} zones...")
    for zone in zones_without_cap:
        in_zone = occupancy_df['Zone'] == zone
        zone_data = occupancy_df.loc[in_zone, 'occupancy_count']
        estimated_cap = max(zone_data.quantile(0.95), 10)
        occupancy_df.loc[in_zone, 'Max_Capacity'] = estimated_cap

# Availability metrics. A capacity of 0 is treated as 1 in the ratio only, to
# avoid dividing by zero; available_spaces uses the capacity as stored.
ratio_denominator = occupancy_df['Max_Capacity'].replace(0, 1)
occupancy_df['occupancy_ratio'] = occupancy_df['occupancy_count'] / ratio_denominator
spaces_left = occupancy_df['Max_Capacity'] - occupancy_df['occupancy_count']
occupancy_df['available_spaces'] = spaces_left.clip(lower=0)
occupancy_df['is_near_full'] = occupancy_df['occupancy_ratio'].ge(0.85).astype(int)
occupancy_df['is_very_full'] = occupancy_df['occupancy_ratio'].ge(0.95).astype(int)

print("Capacity data mapped")

## Step 9: Create Train/Validation/Test Splits

In [None]:
print("Creating train/validation/test splits...")

# Last calendar day (inclusive) of the training and validation periods.
train_end = pd.Timestamp('2024-08-31')
val_end = pd.Timestamp('2025-05-31')

# Both timestamps are midnight, while 'datetime' carries an hour of day.
# Comparing `datetime <= train_end` would keep only hour 00 of the boundary
# day and push hours 01-23 into the next split. Cutting at the following
# midnight keeps each boundary day whole.
train_cutoff = train_end + pd.Timedelta(days=1)
val_cutoff = val_end + pd.Timedelta(days=1)

hour_stamp = occupancy_df['datetime']
occupancy_train = occupancy_df[hour_stamp < train_cutoff].copy()
occupancy_val = occupancy_df[(hour_stamp >= train_cutoff) & (hour_stamp < val_cutoff)].copy()
occupancy_test = occupancy_df[hour_stamp >= val_cutoff].copy()

print(f"Training: {len(occupancy_train):,} records ({occupancy_train['date'].min()} to {occupancy_train['date'].max()})")
print(f"Validation: {len(occupancy_val):,} records ({occupancy_val['date'].min()} to {occupancy_val['date'].max()})")
print(f"Test: {len(occupancy_test):,} records ({occupancy_test['date'].min()} to {occupancy_test['date'].max()})")

## Step 10: Save Data

In [None]:
print("Saving lot-level occupancy data (AMP-based)...")

# (report label, frame, destination CSV) for the full data and each split.
outputs = [
    ("Full lot-level data saved", occupancy_df, '../../data/processed/occupancy_lot_level_amp_full.csv'),
    ("Train set saved", occupancy_train, '../../data/processed/occupancy_lot_level_amp_train.csv'),
    ("Validation set saved", occupancy_val, '../../data/processed/occupancy_lot_level_amp_val.csv'),
    ("Test set saved", occupancy_test, '../../data/processed/occupancy_lot_level_amp_test.csv'),
]

# Write every file first, then report, so nothing is printed if a write fails.
for _, frame, csv_path in outputs:
    frame.to_csv(csv_path, index=False)

for label, frame, _ in outputs:
    print(f"{label}: {len(frame):,} records")

## Summary Statistics

In [None]:
# Headline figures for the lot-level dataset.
rule = "=" * 80
print(rule)
print("LOT-LEVEL DATA SUMMARY (AMP SESSION-BASED)")
print(rule)

lot_dates = occupancy_df['date']
print(f"Specific zones (lots): {occupancy_df['Zone'].nunique()}")
print(f"Average occupancy: {occupancy_df['occupancy_count'].mean():.1f} cars")
print(f"Date range: {lot_dates.min()} to {lot_dates.max()}")

# Rank zones by their mean hourly occupancy count, highest first.
print("\nTop 10 busiest specific zones:")
mean_by_zone = occupancy_df.groupby('Zone')['occupancy_count'].mean()
print(mean_by_zone.sort_values(ascending=False).head(10))

print("\n" + rule)
print("DONE!")
print(rule)
print("\nNext steps:")
next_steps = (
    "1. Train lot-level model using this data",
    "2. Compare with LPR-based lot-level predictions (notebook 04)",
    "3. Update API to support both AMP and LPR-based predictions",
)
for step in next_steps:
    print(step)