In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import os

import importlib
import regression_utils
importlib.reload(regression_utils)

from regression_utils import train_model

In [None]:
# Load the raw counter records and parse the timestamp column as
# timezone-aware UTC. format='mixed' lets pandas infer the format for each
# element individually instead of requiring one format for the whole column.
df_all = pd.read_csv("df_all.csv")
df_all['iso_timestamp'] = pd.to_datetime(df_all['iso_timestamp'], utc=True, format='mixed')

# Coerce the measurement columns to numeric dtypes; values that cannot be
# parsed become NaN (errors='coerce') instead of raising.
numeric_cols = [
    "channels_in", "channels_out", "channels_unknown", "channels_all",
    "site_temperature", "site_rain_accumulation", "site_snow_accumulation"]
df_all[numeric_cols] = df_all[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Per-record total over the three channel columns, treating missing values as 0.
df_all["count"] = df_all[["channels_out", "channels_in", "channels_unknown"]].fillna(0).sum(axis=1)

# Distinct stations per city, computed in a single grouped pass rather than
# re-filtering the whole frame once per city. groupby skips NaN city names,
# so a missing domain_name can no longer break the sorted listing below.
stations_per_city = df_all.groupby('domain_name')['counter_site'].nunique().sort_index()

print(f"Loaded {len(df_all):,} total records")
print(f"Cities: {df_all['domain_name'].nunique()}")
print("\nAvailable cities:")
for city, n_stations in stations_per_city.items():
    print(f"  {city:40} ({int(n_stations):2} stations)")

In [None]:
# Read the feature data and, in the same chained expression, replace the
# timestamp column with timezone-aware UTC datetimes.
df_features = pd.read_csv('df_features.csv').assign(
    iso_timestamp=lambda frame: pd.to_datetime(frame['iso_timestamp'], utc=True)
)

print(f"Loaded {len(df_features):,} feature records")

In [None]:
# Cities whose stations will be processed; edit this list to change the scope.
cities_to_process = [
    "Stadt Freiburg", "Stadt Heidelberg", "Landeshauptstadt Stuttgart",
    "Stadt Mannheim", "Stadt Reutlingen", "Stadt Konstanz",
    "Stadt Tübingen", "Stadt Ludwigsburg", "Ravensburg Tws Gmbh & Co. Kg",
]

# Hourly counts strictly above this value are treated as outliers.
OUTLIER_THRESHOLD = 500

# Echo the configuration so the run is self-describing.
print(
    f"Will process {len(cities_to_process)} cities",
    f"Outlier threshold: >{OUTLIER_THRESHOLD} hourly counts",
    sep="\n",
)

In [None]:
# Function to train model with optional outlier removal
def train_with_outlier_option(df_station, station_name, df_city, remove_outliers=False,
                              threshold=500, min_records=1000):
    """
    Train a model for one station, optionally dropping outlier rows first.

    Parameters
    ----------
    df_station : DataFrame
        Rows of the target station; must contain 'channels_all' and
        'iso_timestamp'.
    station_name : str
        Station identifier, passed through to train_model and
        add_station_features.
    df_city : DataFrame
        Rows of all stations in the city; must contain 'iso_timestamp',
        'counter_site' and 'channels_all'.
    remove_outliers : bool
        If True, rows with channels_all > threshold are dropped from BOTH
        df_station and df_city before training. Rows with NaN channels_all
        are dropped too, since NaN fails the <= comparison.
    threshold : numeric
        Largest channels_all value that is kept when remove_outliers is True.
    min_records : int
        Minimum number of station rows (after optional outlier removal)
        required to attempt training. Defaults to the previously hard-coded
        value of 1000.

    Returns
    -------
    dict or None
        The results object returned by train_model, with an added
        'n_train_samples' entry. None if there are too few rows, if
        train_model returns no results, or if the replicated filtering
        leaves no training samples.

    Notes
    -----
    The inputs are not modified (filtered frames are copies), but the results
    object returned by train_model is mutated in place.
    """
    if remove_outliers:
        # Filter out outliers from station data
        df_station = df_station[df_station['channels_all'] <= threshold].copy()
        # Also filter city data to maintain consistency
        df_city = df_city[df_city['channels_all'] <= threshold].copy()

    if len(df_station) < min_records:
        return None

    # Only the metrics are used here; the fitted model and the feature list
    # returned alongside them are intentionally discarded.
    results, _model, _features = train_model(df_station, station_name, df_city)

    if results is None:
        return None

    # train_model does not report its training-set size, so it is
    # reconstructed below by repeating what is understood to be the same
    # filtering and 80/20 split.
    # NOTE(review): this mirrors train_model's internals, which are not
    # visible from this notebook - confirm against regression_utils.
    from regression_utils import add_station_features

    # Keep only timestamps at which every station in the city has a value:
    # dropna() on the wide timestamp x station table removes any timestamp
    # with at least one missing station.
    df_pivot_all = df_city.pivot_table(
        index='iso_timestamp',
        columns='counter_site',
        values='channels_all'
    )
    valid_timestamps = df_pivot_all.dropna().index
    df_station_filtered = df_station[df_station['iso_timestamp'].isin(valid_timestamps)].copy()

    # Check if we have any data after filtering
    if len(df_station_filtered) == 0:
        return None

    df_model = add_station_features(df_station_filtered, station_name, df_city)

    # Check if we have enough data for training
    if len(df_model) == 0:
        return None

    # First 80% of the rows count as training samples.
    split_idx = int(len(df_model) * 0.8)

    # Skip if no training samples
    if split_idx == 0:
        return None

    results['n_train_samples'] = split_idx

    return results

In [None]:
################################################
# PROCESS ALL CITIES                           #
################################################

# For every configured city and each of its stations, train one model on all
# data and one with outliers removed, then record both sets of test metrics.
# Result: all_city_results maps city name -> list of per-station dicts. Cities
# with no feature rows get no entry at all.
all_city_results = {}

for city_name in cities_to_process:
    print(f"\n{'='*80}")
    print(f"PROCESSING: {city_name}")
    print(f"{'='*80}")

    # Get city data
    df_city = df_features[df_features['domain_name'] == city_name].copy()

    if len(df_city) == 0:
        print(f"⊗ No data found for {city_name}")
        continue

    # Ignore rows without a station id: a NaN entry cannot be sliced for the
    # progress line below and could never match the equality filter anyway.
    stations = df_city['counter_site'].dropna().unique()
    print(f"\nStations: {len(stations)}")
    print(f"Records: {len(df_city):,}")

    city_results = []

    for station_idx, station in enumerate(stations, 1):
        print(f"\n[{station_idx}/{len(stations)}] {station[:50]}")

        # Get station data: rows with a known count, in chronological order
        df_station = df_city[df_city['counter_site'] == station].copy()
        df_station = df_station.dropna(subset=['channels_all'])
        df_station = df_station.sort_values('iso_timestamp')

        # Same minimum as train_with_outlier_option applies by default;
        # checked here so the skip reason can be reported.
        if len(df_station) < 1000:
            print(f"    ⊗ Skipped - only {len(df_station)} records")
            continue

        # Train WITHOUT outlier removal
        print("    Training WITH outliers...")
        results_with = train_with_outlier_option(df_station, station, df_city, remove_outliers=False)

        if results_with is None:
            print("    ⊗ Failed to train model with outliers")
            continue

        # Train WITH outlier removal. The message reports the configured
        # threshold so it stays correct when OUTLIER_THRESHOLD is changed.
        print(f"    Training WITHOUT outliers (threshold >{OUTLIER_THRESHOLD})...")
        results_without = train_with_outlier_option(df_station, station, df_city,
                                                      remove_outliers=True, threshold=OUTLIER_THRESHOLD)

        if results_without is None:
            print("    ⊗ Failed to train model without outliers")
            continue

        # Combine results; a station is recorded only if both runs succeeded
        combined = {
            'city': city_name,
            'station': station,
            'n_features': results_with['n_features'],
            'n_train_samples': results_with['n_train_samples'],
            'n_train_samples_no_outliers': results_without['n_train_samples'],
            # With outliers
            'test_r2_with': results_with['test_r2'],
            'test_rmse_with': results_with['test_rmse'],
            'test_mae_with': results_with['test_mae'],
            # Without outliers
            'test_r2_without': results_without['test_r2'],
            'test_rmse_without': results_without['test_rmse'],
            'test_mae_without': results_without['test_mae'],
            # Difference (positive means the outlier-free run scored higher)
            'delta_r2': results_without['test_r2'] - results_with['test_r2']
        }

        city_results.append(combined)

        print(f"    WITH outliers:    R²={results_with['test_r2']:.4f} | RMSE={results_with['test_rmse']:.2f} | n={results_with['n_train_samples']}")
        print(f"    WITHOUT outliers: R²={results_without['test_r2']:.4f} | RMSE={results_without['test_rmse']:.2f} | n={results_without['n_train_samples']}")
        print(f"    ΔR² = {combined['delta_r2']:+.4f}")

    all_city_results[city_name] = city_results

    # Print city summary (mean ± std across the city's successful stations;
    # std is NaN when only one station succeeded)
    if len(city_results) > 0:
        df_city_results = pd.DataFrame(city_results)
        print(f"\n{'-'*80}")
        print(f"SUMMARY - {city_name}")
        print(f"{'-'*80}")
        print(f"Successfully trained models for {len(city_results)} stations")
        print(f"\nAverage R² WITH outliers:    {df_city_results['test_r2_with'].mean():.4f} ± {df_city_results['test_r2_with'].std():.4f}")
        print(f"Average R² WITHOUT outliers: {df_city_results['test_r2_without'].mean():.4f} ± {df_city_results['test_r2_without'].std():.4f}")
        print(f"Average ΔR²:                 {df_city_results['delta_r2'].mean():+.4f} ± {df_city_results['delta_r2'].std():.4f}")

print(f"\n\n{'='*80}")
print("ALL CITIES PROCESSED")
print(f"{'='*80}")

In [None]:
################################################
# DISPLAY RESULTS PER CITY                     #
################################################

# Print one pipe-delimited table per city (one row per station), followed by
# mean ± std summaries of the R², ΔR² and RMSE columns.
for city_name, city_results in all_city_results.items():
    # Cities where no station produced a result have nothing to display.
    if not city_results:
        continue

    df_city_results = pd.DataFrame(city_results)
    banner = '=' * 120

    print(f"\n{banner}")
    print(f"{city_name}")
    print(f"{banner}")
    print("\n| Station | R² (with) | RMSE (with) | MAE (with) | R² (no outl) | RMSE (no outl) | MAE (no outl) | ΔR² | Features | n_train |")
    print("|---------|-----------|-------------|------------|--------------|----------------|---------------|-----|----------|---------|")

    # Station names are truncated/padded to 30 characters; numeric cells use
    # fixed widths so the columns line up.
    for rec in df_city_results.itertuples(index=False):
        print(f"| {rec.station[:30]:30} | "
              f"{rec.test_r2_with:9.4f} | "
              f"{rec.test_rmse_with:11.2f} | "
              f"{rec.test_mae_with:10.2f} | "
              f"{rec.test_r2_without:12.4f} | "
              f"{rec.test_rmse_without:14.2f} | "
              f"{rec.test_mae_without:13.2f} | "
              f"{rec.delta_r2:+7.4f} | "
              f"{rec.n_features:8} | "
              f"{rec.n_train_samples:7} |")

    # Columns that feed the summary lines.
    r2_with = df_city_results['test_r2_with']
    r2_without = df_city_results['test_r2_without']
    delta_r2 = df_city_results['delta_r2']
    rmse_with = df_city_results['test_rmse_with']
    rmse_without = df_city_results['test_rmse_without']

    print("\nSummary Statistics:")
    print(f"  Avg R² (with outliers):    {r2_with.mean():.4f} ± {r2_with.std():.4f}")
    print(f"  Avg R² (no outliers):      {r2_without.mean():.4f} ± {r2_without.std():.4f}")
    print(f"  Avg ΔR²:                   {delta_r2.mean():+.4f} ± {delta_r2.std():.4f}")
    print(f"  Avg RMSE (with outliers):  {rmse_with.mean():.2f} ± {rmse_with.std():.2f}")
    print(f"  Avg RMSE (no outliers):    {rmse_without.mean():.2f} ± {rmse_without.std():.2f}")