# Historic Data Integration (2022-2024)

This notebook consolidates 36 months of historic transport data (January 2022 - December 2024) into a unified dataset.

## Objectives
1. Load 36 monthly files from 2022-2024
2. Standardize column names across years
3. Apply filtering rules (exclude Lager and B&T pickup orders)
4. Apply feature engineering (Betriebszentralen, Sparten, temporal features)
5. Save consolidated dataset for time series forecasting

## Data Coverage
- **2022**: 1,700,564 records (12 months)
- **2023**: 1,685,937 records (12 months)
- **2024**: 1,684,729 records (12 months)
- **Total**: 5,071,230 records before filtering

In [1]:
import pandas as pd
import numpy as np
import os
import sys
from pathlib import Path
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Add utils to path
sys.path.append(str(Path.cwd().parent))
from utils.traveco_utils import TravecomDataCleaner, TravecomFeatureEngine

print("✓ Libraries imported successfully")

✓ Libraries imported successfully


In [2]:
# Initialize config once for all sections
from pathlib import Path
from utils.traveco_utils import ConfigLoader

config_path = Path('../config/config.yaml')
config = ConfigLoader(config_path=str(config_path))
print(f"✓ Config loaded from: {config_path.resolve()}")

✓ Config loaded from: /Users/kk/dev/customer_traveco/config/config.yaml


In [3]:
# Quick reload
import pickle
from pathlib import Path
import pandas as pd

raw_pickle = Path('../data/processed/historic_orders_2022_2024_RAW.pkl')
df_historic = pd.read_pickle(raw_pickle)
print(f"✓ Loaded {len(df_historic):,} records!")

✓ Loaded 4,935,622 records!


## Section 1: Multi-Year Data Loader

Load all 36 monthly files from 2022-2024 and consolidate into a single DataFrame.

In [4]:
def load_historic_data(years=[2022, 2023, 2024], base_path='../data/raw'):
    """
    Load and consolidate multi-year historic data.
    
    Parameters:
    -----------
    years : list
        Years to load (default: [2022, 2023, 2024])
    base_path : str
        Base path to raw data folders
    
    Returns:
    --------
    pd.DataFrame
        Consolidated dataframe with all historic data
    """
    all_data = []
    file_count = 0
    
    for year in years:
        year_path = Path(base_path) / str(year)
        print(f"\n{'='*80}")
        print(f"Loading {year} data...")
        print(f"{'='*80}")
        
        # Get all order analysis files
        files = sorted([f for f in os.listdir(year_path) 
                       if 'QS Auftragsanalyse' in f and f.endswith('.xlsx')])
        
        print(f"Found {len(files)} monthly files for {year}")
        
        year_data = []
        for f in tqdm(files, desc=f"{year}"):
            filepath = year_path / f
            
            try:
                # Load Excel file
                df = pd.read_excel(filepath)
                
                # Add source tracking
                df['source_file'] = f
                df['source_year'] = year
                
                year_data.append(df)
                file_count += 1
                
            except Exception as e:
                print(f"\n  ❌ ERROR loading {f}: {e}")
                continue
        
        # Concatenate year data
        if year_data:
            year_df = pd.concat(year_data, ignore_index=True)
            print(f"  ✓ {year}: {len(year_df):,} records loaded")
            all_data.append(year_df)
    
    # Consolidate all years
    print(f"\n{'='*80}")
    print(f"Consolidating all years...")
    print(f"{'='*80}")
    
    df_all = pd.concat(all_data, ignore_index=True)
    
    print(f"\n✓ Total files loaded: {file_count}")
    print(f"✓ Total records: {len(df_all):,}")
    print(f"✓ Columns: {len(df_all.columns)}")
    print(f"✓ Memory usage: {df_all.memory_usage(deep=True).sum() / 1024**3:.2f} GB")
    
    return df_all

In [5]:
# Quick load: Skip Excel loading if processed file exists
processed_path = Path('../data/processed/historic_orders_2022_2024.parquet')

if processed_path.exists():
    print("Found existing processed data. Loading from Parquet...")
    df_historic = pd.read_parquet(processed_path)
    print(f"✓ Loaded {len(df_historic):,} records in seconds!")
    print(f"✓ Memory: {df_historic.memory_usage(deep=True).sum() / 1024**3:.2f} GB")
    
    # Skip to Section 6 (validation) or Section 8 (preview)
    SKIP_LOADING = True
else:
    print("Processed data not found. Will load from raw Excel files...")
    SKIP_LOADING = False


if not SKIP_LOADING:
    # Load all historic data (30 minutes)
    df_historic = load_historic_data()

print(f"\nDataset shape: {df_historic.shape}")
print(f"Date range: {df_historic['Datum.Tour'].min()} to {df_historic['Datum.Tour'].max()}")

Found existing processed data. Loading from Parquet...
✓ Loaded 4,937,096 records in seconds!
✓ Memory: 29.59 GB

Dataset shape: (4937096, 121)
Date range: 2022-01-01 00:00:00 to 2024-12-31 00:00:00


## Section 2: Column Name Standardization

Handle minor schema differences between years.

In [6]:
def standardize_columns(df):
    """
    Standardize column names to match 2025 schema.
    
    Changes:
    - RKdNr. → RKdNr (remove trailing dot)
    - Drop 'B&T Lt' (only in 2022-2024, not in 2025)
    """
    df_clean = df.copy()
    
    # Rename columns
    rename_map = {}
    
    if 'RKdNr.' in df_clean.columns:
        rename_map['RKdNr.'] = 'RKdNr'
    
    if rename_map:
        df_clean = df_clean.rename(columns=rename_map)
        print(f"✓ Renamed columns: {list(rename_map.keys())}")
    
    # Drop columns not in 2025
    drop_cols = [col for col in ['B&T Lt'] if col in df_clean.columns]
    if drop_cols:
        df_clean = df_clean.drop(columns=drop_cols)
        print(f"✓ Dropped columns: {drop_cols}")
    
    print(f"\nFinal column count: {len(df_clean.columns)}")
    
    return df_clean

# Apply standardization
df_historic = standardize_columns(df_historic)


Final column count: 121


In [7]:
# EMERGENCY SAVE using Pickle (handles mixed types)
print("\n" + "="*80)
print("SAVING RAW LOADED DATA (using Pickle)...")
print("="*80)

output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

# Use Pickle - it handles mixed types without issues
raw_pickle = output_dir / 'historic_orders_2022_2024_RAW.pkl'
print(f"Saving to {raw_pickle}...")
df_historic.to_pickle(raw_pickle)
file_size = raw_pickle.stat().st_size / 1024**2
print(f"✓ Saved Pickle: {file_size:.1f} MB")

print(f"\n{'='*80}")
print("RAW DATA SAVED!")
print(f"Reload with: df_historic = pd.read_pickle('{raw_pickle}')")
print("="*80)


SAVING RAW LOADED DATA (using Pickle)...
Saving to ../data/processed/historic_orders_2022_2024_RAW.pkl...
✓ Saved Pickle: 3374.9 MB

RAW DATA SAVED!
Reload with: df_historic = pd.read_pickle('../data/processed/historic_orders_2022_2024_RAW.pkl')


## Section 3: Data Validation

Verify data quality before filtering.

In [8]:
print("Data Quality Checks:")
print("="*80)

# Check date column
print(f"\n1. Date Range:")
print(f"   Min: {df_historic['Datum.Tour'].min()}")
print(f"   Max: {df_historic['Datum.Tour'].max()}")
print(f"   Missing: {df_historic['Datum.Tour'].isna().sum():,}")

# Check key columns
key_cols = ['Nummer.Auftraggeber', 'Lieferart 2.0', 'System_id.Auftrag', 
            'Nummer.Spedition', 'Distanz_BE.Auftrag']

print(f"\n2. Key Column Completeness:")
for col in key_cols:
    if col in df_historic.columns:
        missing = df_historic[col].isna().sum()
        pct = (missing / len(df_historic)) * 100
        print(f"   {col}: {missing:,} missing ({pct:.2f}%)")
    else:
        print(f"   ❌ {col}: NOT FOUND")

# Monthly distribution
print(f"\n3. Monthly Distribution:")
df_historic['year_month'] = pd.to_datetime(df_historic['Datum.Tour']).dt.to_period('M')
monthly_counts = df_historic['year_month'].value_counts().sort_index()
print(f"   Total months: {len(monthly_counts)}")
print(f"   Avg records/month: {monthly_counts.mean():,.0f}")
print(f"   Min: {monthly_counts.min():,} ({monthly_counts.idxmin()})")
print(f"   Max: {monthly_counts.max():,} ({monthly_counts.idxmax()})")

Data Quality Checks:

1. Date Range:
   Min: 2022-01-01 00:00:00
   Max: 2024-12-31 00:00:00
   Missing: 0

2. Key Column Completeness:
   Nummer.Auftraggeber: 6 missing (0.00%)
   Lieferart 2.0: 0 missing (0.00%)
   System_id.Auftrag: 0 missing (0.00%)
   Nummer.Spedition: 0 missing (0.00%)
   Distanz_BE.Auftrag: 0 missing (0.00%)

3. Monthly Distribution:
   Total months: 36
   Avg records/month: 137,142
   Min: 126,066 (2023-04)
   Max: 154,679 (2022-03)


## Section 4: Apply Filtering Rules

Exclude irrelevant orders:
1. **Lager Aufträge** (warehouse orders)
2. **B&T Abholaufträge** (B&T internal pickups)

In [9]:
print("Applying Filtering Rules...")
print("="*80)

# Initialize cleaner with correct config path
from pathlib import Path
config_path = Path('../config/config.yaml')  # Relative to notebooks/ directory

# Import ConfigLoader if not already imported
from utils.traveco_utils import ConfigLoader, TravecomDataCleaner

# Initialize cleaner with explicit config path
config = ConfigLoader(config_path=str(config_path))
cleaner = TravecomDataCleaner(config=config)

# Apply filters
df_filtered = cleaner.apply_filtering_rules(df_historic)

# Summary
print(f"\nFiltering Summary:")
print(f"  Original records: {len(df_historic):,}")
print(f"  Filtered records: {len(df_filtered):,}")
print(f"  Excluded: {len(df_historic) - len(df_filtered):,} ({((len(df_historic) - len(df_filtered)) / len(df_historic)) * 100:.2f}%)")
print(f"  Retention rate: {(len(df_filtered) / len(df_historic)) * 100:.2f}%")

Applying Filtering Rules...

✂️  Applying Filtering Rules (Christian's Feedback - Oct 2025):
   Starting orders: 4,937,096
   ℹ️  No B&T pickup orders found (already filtered or not present)

   ✓ No orders filtered (all 4,937,096 orders passed filtering rules)

Filtering Summary:
  Original records: 4,937,096
  Filtered records: 4,937,096
  Excluded: 0 (0.00%)
  Retention rate: 100.00%


## Section 5: Feature Engineering

Apply the same feature engineering pipeline used for June 2025 data.

In [10]:
print("Applying Feature Engineering...")
print("="*80)

# Start with filtered data
df_featured = df_filtered.copy()

# ==============================================================================
# 1. Temporal Features
# ==============================================================================
print("\n1. Extracting temporal features...")
df_featured['Datum.Tour'] = pd.to_datetime(df_featured['Datum.Tour'])

df_featured['year'] = df_featured['Datum.Tour'].dt.year
df_featured['month'] = df_featured['Datum.Tour'].dt.month
df_featured['quarter'] = df_featured['Datum.Tour'].dt.quarter
df_featured['week'] = df_featured['Datum.Tour'].dt.isocalendar().week
df_featured['day_of_year'] = df_featured['Datum.Tour'].dt.dayofyear
df_featured['weekday'] = df_featured['Datum.Tour'].dt.weekday
df_featured['is_weekend'] = df_featured['weekday'].isin([5, 6]).astype(int)

print(f"   ✓ Added: year, month, quarter, week, day_of_year, weekday, is_weekend")

# ==============================================================================
# 2. Carrier Type Classification
# ==============================================================================
print("\n2. Identifying carrier types...")

INTERNAL_MAX = 8889
EXTERNAL_MIN = 9000

def classify_carrier(carrier_num):
    """Classify carrier as internal, external, or unknown"""
    if pd.isna(carrier_num):
        return 'unknown'
    
    try:
        carrier_num = float(str(carrier_num).replace('-', '').strip())
    except (ValueError, AttributeError):
        return 'unknown'
    
    if carrier_num <= INTERNAL_MAX:
        return 'internal'
    elif carrier_num >= EXTERNAL_MIN:
        return 'external'
    else:
        return 'unknown'

df_featured['carrier_type'] = df_featured['Nummer.Spedition'].apply(classify_carrier)
print(f"   ✓ Carrier types:")
print(df_featured['carrier_type'].value_counts())

# ==============================================================================
# 3. Order Type Classification
# ==============================================================================
print("\n3. Classifying order types...")

def classify_order_type(row):
    """Classify order types based on Auftrags-art and Tilde"""
    auftragsart = str(row.get('Auftrags-art', '')).lower()
    tilde = str(row.get('Tilde.Auftrag', '')).lower()
    
    if 'leergut' in auftragsart:
        return 'Leergut'
    elif tilde == 'ja':
        return 'Pickup/Multi-leg'
    elif 'retoure' in auftragsart or 'abholung' in auftragsart:
        return 'Retoure/Abholung'
    else:
        return 'Delivery'

df_featured['order_type'] = df_featured.apply(classify_order_type, axis=1)
print(f"   ✓ Order types:")
print(df_featured['order_type'].value_counts())

# ==============================================================================
# 4. Betriebszentralen Mapping
# ==============================================================================
print("\n4. Mapping Betriebszentralen (14 dispatch centers)...")

try:
    betriebszentralen_df = pd.read_csv('../data/raw/TRAVECO_Betriebszentralen.csv')
    
    # Ensure types match
    df_featured['Nummer.Auftraggeber'] = pd.to_numeric(
        df_featured['Nummer.Auftraggeber'],
        errors='coerce'
    ).astype('Int64')
    
    betriebszentralen_df['Nummer.Auftraggeber'] = pd.to_numeric(
        betriebszentralen_df['Nummer.Auftraggeber'],
        errors='coerce'
    ).astype('Int64')
    
    # Create mapping with only needed columns
    bz_mapping = betriebszentralen_df[['Nummer.Auftraggeber', 'Name1']].copy()
    bz_mapping = bz_mapping.rename(columns={'Name1': 'betriebszentrale_name'})
    
    # Merge
    df_featured = df_featured.merge(
        bz_mapping,
        on='Nummer.Auftraggeber',
        how='left',
        suffixes=('', '_bz')  # Add suffix to avoid conflicts
    )
    
    # Check if merge worked
    if 'betriebszentrale_name' in df_featured.columns:
        # Fill missing
        df_featured['betriebszentrale_name'] = df_featured['betriebszentrale_name'].fillna('Unknown')
        
        print(f"   ✓ Betriebszentralen mapped successfully")
        print(f"   ✓ Distribution:")
        bz_counts = df_featured['betriebszentrale_name'].value_counts()
        for bz, count in bz_counts.head(15).items():
            print(f"      {bz}: {count:,}")
    else:
        raise ValueError("betriebszentrale_name column not created after merge")

except Exception as e:
    print(f"   ⚠️  Could not load Betriebszentralen mapping: {e}")
    print(f"      Creating 'Unknown' placeholder...")
    df_featured['betriebszentrale_name'] = 'Unknown'

# ==============================================================================
# 5. Sparten Mapping
# ==============================================================================
print("\n5. Mapping Sparten (customer divisions)...")

try:
    # Try loading Sparten from each year
    sparten_files = [
        '../data/raw/2024/2024 Sparten.xlsx',
        '../data/raw/2023/2023 Sparten.xlsx',
        '../data/raw/2022/2022 Sparten.xlsx'
    ]
    
    sparten_df = None
    for f in sparten_files:
        if os.path.exists(f):
            sparten_df = pd.read_excel(f)
            print(f"   ✓ Loaded Sparten from {os.path.basename(f)}")
            break
    
    if sparten_df is not None:
        # Assuming first column is customer number, second is Sparte
        customer_col = sparten_df.columns[0]
        sparte_col = sparten_df.columns[1]
        
        # Convert to Int64 for matching
        sparten_df[customer_col] = pd.to_numeric(
            sparten_df[customer_col],
            errors='coerce'
        ).astype('Int64')
        
        if 'RKdNr' in df_featured.columns:
            df_featured['RKdNr_numeric'] = pd.to_numeric(
                df_featured['RKdNr'].astype(str).str.replace('-', '').str.strip(),
                errors='coerce'
            ).astype('Int64')
            
            # Create clean mapping
            sparten_mapping = sparten_df[[customer_col, sparte_col]].copy()
            sparten_mapping = sparten_mapping.rename(columns={sparte_col: 'sparte'})
            
            # Merge
            df_featured = df_featured.merge(
                sparten_mapping,
                left_on='RKdNr_numeric',
                right_on=customer_col,
                how='left',
                suffixes=('', '_sparten')  # Add suffix to avoid conflicts
            )
            
            # Drop the merge key if it's different from our key
            if customer_col in df_featured.columns and customer_col != 'RKdNr_numeric':
                df_featured = df_featured.drop(columns=[customer_col])
            
            # Check if merge worked
            if 'sparte' in df_featured.columns:
                df_featured['sparte'] = df_featured['sparte'].fillna('Keine Sparte')
                
                print(f"   ✓ Sparten mapped successfully")
                print(f"   ✓ Top 10 Sparten:")
                sparten_counts = df_featured['sparte'].value_counts()
                for sparte, count in sparten_counts.head(10).items():
                    print(f"      {sparte}: {count:,}")
            else:
                raise ValueError("sparte column not created after merge")
        else:
            print(f"   ⚠️  RKdNr column not found")
            df_featured['sparte'] = 'Unknown'
    else:
        print(f"   ⚠️  No Sparten file found")
        df_featured['sparte'] = 'Unknown'

except Exception as e:
    print(f"   ⚠️  Could not map Sparten: {e}")
    print(f"      Creating 'Unknown' placeholder...")
    df_featured['sparte'] = 'Unknown'

# ==============================================================================
# Summary
# ==============================================================================
print(f"\n{'='*80}")
print(f"Feature Engineering Complete!")
print(f"  Final columns: {len(df_featured.columns)}")
print(f"  Final records: {len(df_featured):,}")
print(f"  Memory usage: {df_featured.memory_usage(deep=True).sum() / 1024**3:.2f} GB")

Applying Feature Engineering...

1. Extracting temporal features...
   ✓ Added: year, month, quarter, week, day_of_year, weekday, is_weekend

2. Identifying carrier types...
   ✓ Carrier types:
carrier_type
internal    3726353
external    1123805
unknown       86938
Name: count, dtype: int64

3. Classifying order types...
   ✓ Order types:
order_type
Delivery            3448760
Leergut              905051
Pickup/Multi-leg     570874
Retoure/Abholung      12411
Name: count, dtype: int64

4. Mapping Betriebszentralen (14 dispatch centers)...
   ✓ Betriebszentralen mapped successfully
   ✓ Distribution:
      BZ Oberbipp: 1,450,642
      BZ Winterthur: 1,099,573
      BZ Sursee: 1,090,085
      BZ Landquart: 654,664
      BZ Herzogenbuchsee: 253,542
      B&T Winterthur: 190,812
      B&T Puidoux: 94,024
      BZ Sierre: 48,925
      BZ Puidoux: 24,740
      B&T Landquart: 17,084
      BZ Intermodal / Rail: 12,999
      Unknown: 6

5. Mapping Sparten (customer divisions)...
   ✓ Loaded Sp

## Section 6: Data Quality Validation

Final checks before saving.

In [11]:
print("Final Data Quality Validation")
print("="*80)

# Time coverage
print("\n1. Time Coverage:")
print(f"   Start: {df_featured['Datum.Tour'].min()}")
print(f"   End: {df_featured['Datum.Tour'].max()}")
print(f"   Total months: {df_featured['year_month'].nunique()}")

# Check for duplicates
print("\n2. Duplicate Check:")
if 'Auftragsschein-Nr.' in df_featured.columns:
    dupes = df_featured.duplicated(subset=['Auftragsschein-Nr.'], keep=False).sum()
    print(f"   Duplicate order IDs: {dupes:,}")
else:
    print("   ⚠️  Order ID column not found")

# Feature completeness
print("\n3. Feature Completeness:")
required_features = ['year', 'month', 'quarter', 'carrier_type', 'order_type', 
                     'betriebszentrale_name']
for feat in required_features:
    if feat in df_featured.columns:
        missing = int(df_featured[feat].isna().sum())  # Convert to int!
        pct = (missing / len(df_featured)) * 100
        print(f"   {feat}: {missing:,} missing ({pct:.2f}%)")
    else:
        print(f"   ❌ {feat}: NOT FOUND")

# Summary statistics
print("\n4. Summary Statistics:")
print(f"   Total orders: {len(df_featured):,}")
print(f"   Avg orders/month: {len(df_featured) / df_featured['year_month'].nunique():.0f}")
print(f"   Years covered: {sorted(df_featured['source_year'].unique())}")

Final Data Quality Validation

1. Time Coverage:
   Start: 2022-01-01 00:00:00
   End: 2024-12-31 00:00:00
   Total months: 36

2. Duplicate Check:
   ⚠️  Order ID column not found

3. Feature Completeness:
   year: 0 missing (0.00%)
   month: 0 missing (0.00%)
   quarter: 0 missing (0.00%)
   carrier_type: 0 missing (0.00%)
   order_type: 0 missing (0.00%)
   betriebszentrale_name: 0 missing (0.00%)

4. Summary Statistics:
   Total orders: 4,940,044
   Avg orders/month: 137223
   Years covered: [np.int64(2022), np.int64(2023), np.int64(2024)]


## Section 7: Save Consolidated Dataset

Save to processed folder in compressed format.

In [12]:
# ==============================================================================
# Section 7: Save Consolidated Dataset
# ==============================================================================
print("\n" + "="*80)
print("SAVING CONSOLIDATED DATASET...")
print("="*80)

from pathlib import Path
import os

# Create output directory
output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

# ==============================================================================
# 1. Save as Pickle (RECOMMENDED - handles all data types)
# ==============================================================================
output_pickle = output_dir / 'historic_orders_2022_2024.pkl'
print(f"\nSaving to {output_pickle}...")
df_featured.to_pickle(output_pickle)
file_size_mb = os.path.getsize(output_pickle) / 1024**2
print(f"✓ Saved Pickle: {file_size_mb:.1f} MB")

# ==============================================================================
# 2. Save as CSV.gz (backup - slower but portable)
# ==============================================================================
output_csv = output_dir / 'historic_orders_2022_2024.csv.gz'
print(f"\nSaving compressed CSV to {output_csv}...")
df_featured.to_csv(output_csv, index=False, compression='gzip')
file_size_mb = os.path.getsize(output_csv) / 1024**2
print(f"✓ Saved CSV.gz: {file_size_mb:.1f} MB")

# ==============================================================================
# 3. Try Parquet (optional - will skip if fails)
# ==============================================================================
print(f"\nAttempting Parquet save (optional)...")
try:
  # Convert ALL object columns to string for Parquet compatibility
  object_cols = df_featured.select_dtypes(include=['object']).columns
  df_parquet = df_featured.copy()

  for col in object_cols:
      df_parquet[col] = df_parquet[col].astype(str)

  output_parquet = output_dir / 'historic_orders_2022_2024.parquet'
  df_parquet.to_parquet(output_parquet, index=False, compression='snappy')
  file_size_mb = os.path.getsize(output_parquet) / 1024**2
  print(f"✓ Saved Parquet: {file_size_mb:.1f} MB")
except Exception as e:
  print(f"⚠️  Parquet save skipped: {e}")
  print("   (Pickle and CSV.gz versions are sufficient)")

# ==============================================================================
# Summary
# ==============================================================================
print(f"\n{'='*80}")
print(f"CONSOLIDATION COMPLETE!")
print(f"{'='*80}")
print(f"\nDataset ready for time series forecasting:")
print(f"  • Records: {len(df_featured):,}")
print(f"  • Time range: {df_featured['Datum.Tour'].min()} to {df_featured['Datum.Tour'].max()}")
print(f"  • Months: {df_featured['year'].nunique() * 12 if 'year' in df_featured.columns else 36}")
print(f"  • Features: {len(df_featured.columns)}")
print(f"  • Betriebszentralen: {df_featured['betriebszentrale_name'].nunique()}")
print(f"  • Carrier types: {df_featured['carrier_type'].nunique()}")
print(f"  • Order types: {df_featured['order_type'].nunique()}")

print(f"\nFiles saved:")
print(f"  ✓ Pickle:  {output_pickle} (FAST loading - recommended)")
print(f"  ✓ CSV.gz:  {output_csv} (Backup/portable)")
if 'output_parquet' in locals() and output_parquet.exists():
  print(f"  ✓ Parquet: {output_parquet} (Optional)")

print(f"\nNext step: Run notebook 08 for time series aggregation")
print(f"  Load command: df = pd.read_pickle('{output_pickle}')")


SAVING CONSOLIDATED DATASET...

Saving to ../data/processed/historic_orders_2022_2024.pkl...
✓ Saved Pickle: 3424.0 MB

Saving compressed CSV to ../data/processed/historic_orders_2022_2024.csv.gz...
✓ Saved CSV.gz: 421.4 MB

Attempting Parquet save (optional)...
⚠️  Parquet save skipped: Duplicate column names found: ['NummerKomplett.Auftrag', 'Nummer.Hauptauftrag', 'Nummer.Auftrag', 'Datum.Tour', 'Nummer.Tour', 'Tour Bezeichnung', 'Nummer.Auftraggeber', 'Id.Dispostelle', 'AuNr (formatiert)', 'AuNr (Original)', 'Auftrags-art', 'RKdNr', 'RKdArt', 'RKdName', 'RKdOrt', 'Nummer.Versender', 'Name.Versender', 'Versender Name 2', 'Strasse.Versender', 'Land.Versender', 'Ort.Versender', 'PLZ.Versender', 'Nummer.Empfänger', 'Name.Empfänger', 'Empfänger Name2', 'Land.Empfänger', 'Strasse.Empfänger', 'PLZ.Empfänger', 'Ort.Empfänger', 'Nummer.Beladestelle', 'Name.Beladestelle', 'Beladestelle Name 2', 'Land.Beladestelle', 'Strasse.Beladestelle', 'PLZ.Beladestelle', 'Ort.Beladestelle', 'Nummer.Entla

## Section 8: Quick Preview

Display sample records to verify data quality.

In [13]:
# ==============================================================================
# Section 8: Quick Preview & Reload Test
# ==============================================================================
print("\n" + "="*80)
print("TESTING RELOAD...")
print("="*80)

# Reload from Pickle
import pandas as pd
from pathlib import Path

pickle_path = Path('../data/processed/historic_orders_2022_2024.pkl')
print(f"\nReloading from {pickle_path}...")

import time
start = time.time()
df_reloaded = pd.read_pickle(pickle_path)
load_time = time.time() - start

print(f"✓ Loaded {len(df_reloaded):,} records in {load_time:.1f} seconds!")
print(f"  Columns: {len(df_reloaded.columns)}")
print(f"  Memory: {df_reloaded.memory_usage(deep=True).sum() / 1024**3:.2f} GB")

# Sample records
print("\n" + "="*80)
print("Sample Records (first 5):")
print("="*80)
display_cols = ['Datum.Tour', 'year', 'month', 'betriebszentrale_name',
              'order_type', 'carrier_type']
available_cols = [col for col in display_cols if col in df_reloaded.columns]
print(df_reloaded[available_cols].head())

print("\n✅ NOTEBOOK 07 COMPLETE!")
print("You can now proceed to Notebook 08 for time series aggregation.")


TESTING RELOAD...

Reloading from ../data/processed/historic_orders_2022_2024.pkl...
✓ Loaded 4,940,044 records in 6.9 seconds!
  Columns: 123
  Memory: 30.20 GB

Sample Records (first 5):
  Datum.Tour  year  month betriebszentrale_name        order_type carrier_type
0 2022-01-01  2022      1          BZ Landquart  Pickup/Multi-leg     external
1 2022-01-01  2022      1          BZ Landquart  Pickup/Multi-leg     external
2 2022-01-01  2022      1          BZ Landquart  Pickup/Multi-leg     external
3 2022-01-01  2022      1          BZ Landquart  Pickup/Multi-leg     external
4 2022-01-01  2022      1          BZ Landquart  Pickup/Multi-leg     external

✅ NOTEBOOK 07 COMPLETE!
You can now proceed to Notebook 08 for time series aggregation.
