# Infleeting Lead Times — Compound Deep Dive

Analyzes actual days per infleeting phase (ordered → produced → arrived → tech prep → ready → in_subscription) by brand and compound.

**Data source**: `datawarehouse-304513.cars_ops_postgres_public.car_events`  
**API source**: `https://api-ops-cars.finn.auto` (ops-cars API, documented in `finn-auto/redocly`)

---

**⚠️ Prerequisites:**
- Run the setup cell below first (installs packages + authenticates)
- Requires FINN Google Cloud access (BigQuery permissions on `datawarehouse-304513`)

In [None]:
# =============================================================================
# SETUP: Run this cell first (required for Colab/web)
# =============================================================================
# Installs all dependencies and authenticates with Google Cloud

import subprocess
import sys

def install_if_missing(package, import_name=None):
    """Make sure *package* can be imported, pip-installing it on demand.

    ``package`` is the pip distribution name. ``import_name`` is the module
    path used for the import probe; when omitted (or falsy) the distribution
    name itself is probed.
    """
    module_name = package if not import_name else import_name
    try:
        __import__(module_name)
    except ImportError:
        # Probe failed: install quietly with the interpreter running this cell
        print(f"Installing {package}...")
        pip_cmd = [sys.executable, "-m", "pip", "install", "-q", package]
        subprocess.check_call(pip_cmd)

# Required packages as (pip distribution name, importable module name).
# The module name must be spelled out whenever it differs from the pip name;
# otherwise the import probe can never succeed and pip runs on every execution.
REQUIRED_PACKAGES = [
    ("google-cloud-bigquery", "google.cloud.bigquery"),
    ("pandas", "pandas"),
    ("plotly", "plotly"),
    # Required for BigQuery DATETIME/TIMESTAMP handling; imports as db_dtypes
    ("db-dtypes", "db_dtypes"),
]
for pip_name, module_name in REQUIRED_PACKAGES:
    install_if_missing(pip_name, module_name)

# Authenticate in Colab environment.
# Only the import is guarded: an error raised while authenticating must
# surface instead of being reported as "using local credentials".
try:
    from google.colab import auth
except ImportError:
    # Not in Colab — assumes local gcloud auth
    print("✓ Using local credentials (ensure 'gcloud auth application-default login' was run)")
else:
    auth.authenticate_user()
    print("✓ Authenticated with Google Cloud (Colab)")

print("✓ All dependencies installed")

In [1]:
# =============================================================================
# IMPORTS & BIGQUERY CLIENT
# =============================================================================

from google.cloud import bigquery
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# BigQuery client bound to the data warehouse project
client = bigquery.Client(project='datawarehouse-304513')


def run_query(sql):
    """Run *sql* on BigQuery and hand back the rows as a pandas DataFrame.

    The download goes through the standard API; the BigQuery Storage API is
    switched off explicitly to avoid permission issues.
    """
    query_job = client.query(sql)
    row_iterator = query_job.result()  # blocks until the job has finished
    frame = row_iterator.to_dataframe(create_bqstorage_client=False)
    return frame


print(f"✓ BigQuery client initialized (project: {client.project})")

## 1. Brand / Compound Configuration

Current compound assignments from the ops-cars lead times config:

In [2]:
# (brand, infleeting compound id) pairs covered by this analysis.
# Order is kept stable because the SQL filter below is generated from it.
COMPOUND_MAP = [
    ('Mazda', 'cat_zuelpich'),
    ('Ford', 'cat_zuelpich'),
    ('Polestar', 'cat_zuelpich'),
    ('Honda', 'akb_kitzingen'),
    ('Kia', 'blg_bremerhaven'),
    ('Kia', 'blg_saalanderdonau'),
    ('Nissan', 'blg_neuss'),
    ('Mazda', 'blg_duisburg'),
    ('Cadillac', 'ceva_grevenbroich'),
    ('Cadillac', 'ceva_bad_salzungen'),
    ('Hyundai', 'blg_bremerhaven'),
    ('Hyundai', 'mosolf_rackwitz'),
    ('Hyundai', 'mosolf_cuxhaven'),
    ('Jeep', 'mosolf_goessnitz'),
    ('Jeep', 'mosolf_kippenheim'),
    ('Jeep', 'mosolf_etzin'),
    ('Fiat', 'mosolf_kippenheim'),
    ('Fiat', 'mosolf_etzin'),
    ('Alfa Romeo', 'mosolf_kippenheim'),
    ('Mercedes-Benz', 'neuhaus_gmbh_via_bald'),
    ('MG', 'carserviceerkens_wachtendonk'),
    ('MG', 'mosolf_wilhelmshaven'),
    ('BYD', 'mosolf_wilhelmshaven'),
    ('BMW', 'akb_kitzingen'),
    ('MINI', 'akb_kitzingen'),
    ('Dacia', 'blg_duisburg'),
    ('Renault', 'blg_duisburg'),
    ('Peugeot', 'mosolf_kippenheim'),
    ('Peugeot', 'mosolf_etzin'),
    ('Peugeot', 'siebrecht_uslar'),
    ('Citroen', 'mosolf_kippenheim'),
    ('DS', 'mosolf_kippenheim'),
    ('Opel', 'mosolf_etzin'),
    ('Opel', 'mosolf_kippenheim'),
    ('Opel', 'siebrecht_uslar'),
    ('Toyota', 'blg_saalanderdonau'),
    ('Seat', 'akb_zoerbig'),
    ('Cupra', 'akb_zoerbig'),
    ('Skoda', 'mosolf_rackwitz'),
    ('Audi', 'akb_kitzingen'),
]

# Render every pair as a SQL tuple literal for use inside an IN (...) filter
_sql_pairs = [f"('{brand}', '{compound_id}')" for brand, compound_id in COMPOUND_MAP]
filter_tuples = ', '.join(_sql_pairs)
print(f'{len(COMPOUND_MAP)} brand/compound combinations configured')

40 brand/compound combinations configured


## 2. Compound Lead Times — Avg & Median per Phase

In [3]:
# Per (brand, compound): average and approximate-median days for each
# infleeting phase.
#
# How the SQL is put together:
#   events_all       - the six infleeting events, with `oem` and
#                      `infleeting_compound_id` read from the snapshot JSON.
#   cars_active_2025 - cars with at least one of those events dated in 2025.
#   filtered         - ALL six-event rows of those cars; the join is on car_id
#                      only, so events outside 2025 are kept as well.
#   state_changes    - each event paired with the time of the car's previous
#                      event (LAG over time, partitioned by car).
#   with_days        - calendar-day difference between an event and the
#                      previous one; a car's first event is dropped.
#   compound_lookup  - compound recorded on the "arrived from supplier" event;
#                      the final SELECT prefers it over the event's own
#                      compound via COALESCE.
#
# The med_* columns take element 50 of APPROX_QUANTILES(..., 100), i.e. an
# approximate median.
#
# NOTE(review): each duration is labelled by the name of the *arriving* event
# only; the previous event's name is not checked. If a car skips or repeats a
# status, the measured span may not match the phase the column name suggests
# — confirm whether that is intended.
lead_times_query = f"""
WITH events_all AS (
  SELECT
    e.car_id, e.name, e.time,
    JSON_VALUE(e.snapshot, '$.oem') AS oem,
    JSON_VALUE(e.snapshot, '$.infleeting_compound_id') AS compound_id
  FROM `datawarehouse-304513.cars_ops_postgres_public.car_events` e
  WHERE e.name IN (
    'car_ordered','car_state_changed_produced','car_state_changed_arrived_from_supplier',
    'car_state_changed_tech_prep_done','car_state_changed_ready_to_deliver','car_state_changed_in_subscription'
  )
),
cars_active_2025 AS (
  SELECT DISTINCT car_id FROM events_all
  WHERE time >= '2025-01-01' AND time < '2026-01-01'
),
filtered AS (
  SELECT e.* FROM events_all e
  INNER JOIN cars_active_2025 c ON e.car_id = c.car_id
),
state_changes AS (
  SELECT car_id, name, time, oem, compound_id,
    LAG(time) OVER (PARTITION BY car_id ORDER BY time) AS prev_time
  FROM filtered
),
with_days AS (
  SELECT *, DATE_DIFF(DATE(time), DATE(prev_time), DAY) AS days_in_status
  FROM state_changes WHERE prev_time IS NOT NULL
),
compound_lookup AS (
  SELECT DISTINCT car_id, compound_id FROM filtered
  WHERE name = 'car_state_changed_arrived_from_supplier' AND compound_id IS NOT NULL
)
SELECT
  d.oem AS brand,
  COALESCE(cl.compound_id, d.compound_id) AS compound,
  COUNT(DISTINCT d.car_id) AS cars,

  ROUND(AVG(IF(d.name='car_state_changed_produced', d.days_in_status, NULL)),1) AS avg_ordered_to_produced,
  APPROX_QUANTILES(IF(d.name='car_state_changed_produced', d.days_in_status, NULL), 100 IGNORE NULLS)[OFFSET(50)] AS med_ordered_to_produced,

  ROUND(AVG(IF(d.name='car_state_changed_arrived_from_supplier', d.days_in_status, NULL)),1) AS avg_produced_to_arrived,
  APPROX_QUANTILES(IF(d.name='car_state_changed_arrived_from_supplier', d.days_in_status, NULL), 100 IGNORE NULLS)[OFFSET(50)] AS med_produced_to_arrived,

  ROUND(AVG(IF(d.name='car_state_changed_tech_prep_done', d.days_in_status, NULL)),1) AS avg_arrived_to_techprep,
  APPROX_QUANTILES(IF(d.name='car_state_changed_tech_prep_done', d.days_in_status, NULL), 100 IGNORE NULLS)[OFFSET(50)] AS med_arrived_to_techprep,

  ROUND(AVG(IF(d.name='car_state_changed_ready_to_deliver', d.days_in_status, NULL)),1) AS avg_techprep_to_ready,
  APPROX_QUANTILES(IF(d.name='car_state_changed_ready_to_deliver', d.days_in_status, NULL), 100 IGNORE NULLS)[OFFSET(50)] AS med_techprep_to_ready,

  ROUND(AVG(IF(d.name='car_state_changed_in_subscription', d.days_in_status, NULL)),1) AS avg_ready_to_sub,
  APPROX_QUANTILES(IF(d.name='car_state_changed_in_subscription', d.days_in_status, NULL), 100 IGNORE NULLS)[OFFSET(50)] AS med_ready_to_sub

FROM with_days d
LEFT JOIN compound_lookup cl ON d.car_id = cl.car_id
WHERE (d.oem, COALESCE(cl.compound_id, d.compound_id)) IN ({filter_tuples})
GROUP BY 1, 2
ORDER BY brand, compound
"""

# One result row per brand/compound combination.
# NOTE(review): `cars` is COUNT(DISTINCT car_id) per combination, so the sum
# printed below counts cars per combo rather than individual transitions —
# confirm the wording of the label.
df = run_query(lead_times_query)
print(f'{len(df)} brand/compound combos, {df["cars"].sum():,} total car transitions')
df

40 brand/compound combos, 48,549 total car transitions




Unnamed: 0,brand,compound,cars,avg_ordered_to_produced,med_ordered_to_produced,avg_produced_to_arrived,med_produced_to_arrived,avg_arrived_to_techprep,med_arrived_to_techprep,avg_techprep_to_ready,med_techprep_to_ready,avg_ready_to_sub,med_ready_to_sub
0,Alfa Romeo,mosolf_kippenheim,268,180.0,188,10.5,0,18.0,2,66.3,32,27.3,18
1,Audi,akb_kitzingen,2484,139.3,124,67.7,68,25.2,19,23.3,7,12.3,8
2,BMW,akb_kitzingen,4446,31.5,28,88.9,85,13.9,2,16.5,9,10.8,8
3,BYD,mosolf_wilhelmshaven,3078,14.9,0,34.3,29,4.4,0,5.9,0,35.9,29
4,Cadillac,ceva_bad_salzungen,18,0.0,0,11.1,11,7.9,0,31.0,18,10.7,7
5,Cadillac,ceva_grevenbroich,16,0.0,0,22.1,24,10.9,0,42.9,49,6.0,7
6,Citroen,mosolf_kippenheim,795,125.2,116,14.4,1,6.8,5,13.4,13,20.0,15
7,Cupra,akb_zoerbig,1628,67.8,64,53.0,42,9.1,0,25.8,19,13.6,9
8,DS,mosolf_kippenheim,150,89.0,69,14.2,14,10.7,5,19.4,25,47.6,36
9,Dacia,blg_duisburg,2051,14.6,0,52.5,62,19.1,21,15.2,9,18.9,14


## 3. Heatmap — Median Days per Phase

In [4]:
# Human-readable row label: "<brand> @ <compound with spaces>"
readable_compound = df['compound'].str.replace('_', ' ')
df['label'] = df['brand'] + ' @ ' + readable_compound

# (median column in df, display title) for every phase, in pipeline order
phases = [
    ('med_ordered_to_produced', 'Ordered→Produced'),
    ('med_produced_to_arrived', 'Produced→Arrived'),
    ('med_arrived_to_techprep', 'Arrived→Tech Prep'),
    ('med_techprep_to_ready', 'Tech Prep→Ready'),
    ('med_ready_to_sub', 'Ready→Subscription'),
]
median_cols = [source for source, _title in phases]
phase_cols = [title for _source, title in phases]

# Select the median columns and swap in their display titles
heat_df = df[['label', 'cars', *median_cols]].copy()
heat_df.columns = ['label', 'cars', *phase_cols]

# Shortest overall median lead time first
heat_df['total'] = heat_df[phase_cols].sum(axis=1)
heat_df = heat_df.sort_values(by='total', ascending=True)

heatmap_axis_titles = dict(x='Phase', y='Brand @ Compound', color='Median Days')
fig = px.imshow(
    heat_df[phase_cols].values,
    x=phase_cols,
    y=heat_df['label'].values,
    labels=heatmap_axis_titles,
    color_continuous_scale='RdYlGn_r',
    aspect='auto',
    text_auto=True,
)

# Grow the figure with the number of rows, but never below 600px
heatmap_height = max(600, len(heat_df) * 22)
fig.update_layout(
    title='Infleeting Lead Times — Median Days per Phase (2025)',
    height=heatmap_height,
    width=900,
    font=dict(size=11),
)
fig.show()

## 4. Stacked Bar — Total Median Lead Time by Brand/Compound

In [5]:
# One horizontal bar per brand/compound, one stacked segment per phase
bar_df = heat_df.sort_values(by='total', ascending=True).copy()

# Fixed colour per phase so the segments are comparable across bars
colors = {
    'Ordered→Produced': '#4e79a7',
    'Produced→Arrived': '#f28e2b',
    'Arrived→Tech Prep': '#e15759',
    'Tech Prep→Ready': '#76b7b2',
    'Ready→Subscription': '#59a14f',
}

fig = go.Figure()
for phase in phase_cols:
    # Missing medians are drawn as zero-length segments
    segment_days = bar_df[phase].fillna(0)
    segment = go.Bar(
        name=phase,
        orientation='h',
        y=bar_df['label'],
        x=segment_days,
        text=segment_days.astype(int),
        textposition='inside',
        marker_color=colors[phase],
    )
    fig.add_trace(segment)

bar_layout = dict(
    barmode='stack',
    title='Total Infleeting Lead Time — Median Days (2025)',
    xaxis_title='Days',
    height=max(600, len(bar_df) * 22),
    width=1000,
    legend=dict(orientation='h', yanchor='bottom', y=1.02),
    font=dict(size=11),
)
fig.update_layout(**bar_layout)
fig.show()

## 5. Avg vs Median Skew — Identifying Outlier-Heavy Combos

In [6]:
# A mean far above the median points at a heavy right tail (outliers)
skew_phases = [
    ('avg_ordered_to_produced', 'med_ordered_to_produced', 'Ordered→Produced'),
    ('avg_produced_to_arrived', 'med_produced_to_arrived', 'Produced→Arrived'),
    ('avg_arrived_to_techprep', 'med_arrived_to_techprep', 'Arrived→Tech Prep'),
    ('avg_techprep_to_ready', 'med_techprep_to_ready', 'Tech Prep→Ready'),
    ('avg_ready_to_sub', 'med_ready_to_sub', 'Ready→Subscription'),
]

skew_data = []
for _, row in df.iterrows():
    for avg_col, med_col, phase_name in skew_phases:
        avg_val, med_val = row[avg_col], row[med_col]
        # The ratio is only defined for a known average and a positive median
        if pd.isna(avg_val) or pd.isna(med_val) or not med_val > 0:
            continue
        skew_data.append({
            'brand': row['brand'],
            'compound': row['compound'],
            'phase': phase_name,
            'avg': avg_val,
            'median': med_val,
            'skew_ratio': round(avg_val / med_val, 1),
            'cars': row['cars'],
        })

skew_df = pd.DataFrame(skew_data)
print('Top 15 most skewed (avg/median ratio) — indicates outlier problems:')
skew_df.sort_values(by='skew_ratio', ascending=False).head(15)

Top 15 most skewed (avg/median ratio) — indicates outlier problems:


Unnamed: 0,brand,compound,phase,avg,median,skew_ratio,cars
120,Nissan,blg_neuss,Ordered→Produced,29.6,1,29.6,1240
61,Hyundai,mosolf_cuxhaven,Ordered→Produced,39.4,2,19.7,997
23,Citroen,mosolf_kippenheim,Produced→Arrived,14.4,1,14.4,795
1,Alfa Romeo,mosolf_kippenheim,Arrived→Tech Prep,18.0,2,9.0,268
11,BMW,akb_kitzingen,Arrived→Tech Prep,13.9,2,7.0,4446
99,MG,mosolf_wilhelmshaven,Tech Prep→Ready,13.7,2,6.8,3610
157,Polestar,cat_zuelpich,Tech Prep→Ready,29.8,5,6.0,583
81,Jeep,mosolf_kippenheim,Produced→Arrived,22.9,4,5.7,620
46,Fiat,mosolf_kippenheim,Tech Prep→Ready,159.8,39,4.1,55
138,Opel,siebrecht_uslar,Tech Prep→Ready,80.0,20,4.0,470


## 6. Single Car Deep Dive

In [7]:
def car_deep_dive(car_id: str):
    """Fetch and display the full infleeting timeline for a single car.

    Args:
        car_id: Identifier of the car as stored in ``car_events.car_id``.

    Returns:
        A DataFrame with one row per infleeting event (``event``, ``time``,
        ``days_in_prev_status``, ``status``) ordered by time, or ``None``
        when the car has no infleeting events. A short summary line is
        printed as a side effect.
    """
    # car_id is bound as a query parameter (@car_id) rather than pasted into
    # the SQL text, so quotes or SQL fragments in the argument cannot change
    # the statement that is executed.
    query = """
    WITH state_changes AS (
      SELECT
        car_id, name, time,
        JSON_VALUE(snapshot, '$.oem') AS oem,
        JSON_VALUE(snapshot, '$.model') AS model,
        JSON_VALUE(snapshot, '$.infleeting_compound_id') AS compound_id,
        JSON_VALUE(snapshot, '$.status') AS status,
        LAG(time) OVER (ORDER BY time) AS prev_time
      FROM `datawarehouse-304513.cars_ops_postgres_public.car_events`
      WHERE car_id = @car_id
        AND name IN (
          'car_ordered','car_state_changed_produced','car_state_changed_arrived_from_supplier',
          'car_state_changed_tech_prep_done','car_state_changed_ready_to_deliver','car_state_changed_in_subscription'
        )
    )
    SELECT
      name AS event,
      time,
      oem, model, compound_id, status,
      DATE_DIFF(DATE(time), DATE(prev_time), DAY) AS days_in_prev_status
    FROM state_changes
    ORDER BY time
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter('car_id', 'STRING', car_id),
        ]
    )
    # Same download path as run_query (standard API, Storage API disabled),
    # but with the parameter binding attached to the job.
    result = (
        client.query(query, job_config=job_config)
        .result()
        .to_dataframe(create_bqstorage_client=False)
    )
    if result.empty:
        print(f'No state change events found for car {car_id}')
        return None

    first_event = result.iloc[0]
    last_event = result.iloc[-1]
    oem = first_event['oem']
    model = first_event['model']
    # The compound is taken from the most recent event's snapshot
    compound = last_event['compound_id']
    print(f'Car {car_id}: {oem} {model} @ {compound}')
    print(f'Total: {(result["time"].max() - result["time"].min()).days}d from ordered to {last_event["status"]}')
    print()
    return result[['event', 'time', 'days_in_prev_status', 'status']]

# Example: the Nissan Qashqai we analyzed earlier
car_deep_dive('zxmotnpu')

Car zxmotnpu: Nissan Qashqai @ blg_neuss
Total: 45d from ordered to in_subscription




BigQuery Storage module not found, fetch data with the REST endpoint instead.



Unnamed: 0,event,time,days_in_prev_status,status
0,car_ordered,2025-12-30 07:17:55.642000+00:00,,ordered
1,car_state_changed_produced,2025-12-30 07:17:55.869000+00:00,0.0,produced
2,car_state_changed_arrived_from_supplier,2026-01-07 06:18:54.461000+00:00,8.0,arrived_from_supplier
3,car_state_changed_tech_prep_done,2026-01-26 08:20:47.248000+00:00,19.0,tech_preparation_done
4,car_state_changed_ready_to_deliver,2026-02-02 08:00:55.471000+00:00,7.0,ready_to_deliver
5,car_state_changed_in_subscription,2026-02-13 09:53:59.273000+00:00,11.0,in_subscription


## 7. Monthly Trend — Fleet-Wide Throughput

In [8]:
# Distinct cars reaching each infleeting status, per calendar month of 2025
trend_query = """
SELECT
  FORMAT_DATE('%Y-%m', DATE(time)) AS month,
  CASE name
    WHEN 'car_state_changed_produced' THEN 'Produced'
    WHEN 'car_state_changed_arrived_from_supplier' THEN 'Arrived'
    WHEN 'car_state_changed_tech_prep_done' THEN 'Tech Prep Done'
    WHEN 'car_state_changed_ready_to_deliver' THEN 'Ready to Deliver'
    WHEN 'car_state_changed_in_subscription' THEN 'In Subscription'
  END AS status,
  COUNT(DISTINCT car_id) AS cars
FROM `datawarehouse-304513.cars_ops_postgres_public.car_events`
WHERE name IN (
  'car_state_changed_produced','car_state_changed_arrived_from_supplier',
  'car_state_changed_tech_prep_done','car_state_changed_ready_to_deliver',
  'car_state_changed_in_subscription'
)
AND time >= '2025-01-01' AND time < '2026-01-01'
GROUP BY 1, 2
ORDER BY 1, 2
"""

trend_df = run_query(trend_query)

# One line per status, months on the x axis
trend_axis_labels = {'cars': 'Cars', 'month': 'Month'}
fig = px.line(
    trend_df,
    x='month',
    y='cars',
    color='status',
    markers=True,
    labels=trend_axis_labels,
    title='Monthly Cars Reaching Each Infleeting Phase (2025)',
)
trend_legend = dict(orientation='h', yanchor='bottom', y=1.02)
fig.update_layout(height=450, width=900, legend=trend_legend)
fig.show()


BigQuery Storage module not found, fetch data with the REST endpoint instead.



## 8. Bottleneck Detector — Worst Phases per Compound

In [9]:
# For every brand/compound, pick the phase with the largest median duration
median_phase_cols = [
    ('med_ordered_to_produced', 'Ordered→Produced'),
    ('med_produced_to_arrived', 'Produced→Arrived'),
    ('med_arrived_to_techprep', 'Arrived→Tech Prep'),
    ('med_techprep_to_ready', 'Tech Prep→Ready'),
    ('med_ready_to_sub', 'Ready→Subscription'),
]

bottlenecks = []
for _, row in df.iterrows():
    worst_phase, worst_val = None, 0
    for med_col, phase_name in median_phase_cols:
        val = row[med_col]
        # Only a strictly larger, non-missing median replaces the current worst
        if pd.isna(val) or not val > worst_val:
            continue
        worst_phase, worst_val = phase_name, val
    bottlenecks.append(dict(
        brand=row['brand'],
        compound=row['compound'],
        cars=row['cars'],
        bottleneck_phase=worst_phase,
        median_days=worst_val,
    ))

bn_df = pd.DataFrame(bottlenecks).sort_values(by='median_days', ascending=False)


def _bottleneck_label(r):
    """Build the "<brand> @ <compound>" axis label for one row."""
    return f"{r['brand']} @ {r['compound']}"


top_bottlenecks = bn_df.head(20)
fig = px.bar(
    top_bottlenecks,
    x='median_days',
    y=top_bottlenecks.apply(_bottleneck_label, axis=1),
    color='bottleneck_phase',
    orientation='h',
    title='Top 20 Bottlenecks — Worst Phase per Brand/Compound (Median Days)',
    labels={'median_days': 'Median Days', 'y': ''},
)
# Reversed y axis puts the largest bottleneck at the top
fig.update_layout(height=550, width=900, yaxis=dict(autorange='reversed'))
fig.show()

## 9. Expected vs. Actual Lead Times — Backtesting Accuracy

Compare configured lead times (from ops reference table) against actual measured medians.

**Note**: Many OEMs have conditional lead times (paid/unpaid, SA number, supplier status). This analysis uses the "standard" or "worst case" values where conditions apply. Conditional segmentation requires additional snapshot fields.

In [10]:
# Expected lead times from ops reference table (in calendar days)
# For conditional values, using the longer/unpaid/no-SA estimate as baseline
# Working days converted: 1 week = 7 days, 10 working days ≈ 14 days, 5 working days ≈ 7 days, 3 working days ≈ 5 days
#
# One tuple per brand/compound combination. The five numeric fields are the
# expected days per phase, in pipeline order; the last field is a free-text
# note. Rows whose phase values are all None carry no reference data (the
# note says why).

EXPECTED_LEAD_TIMES = [
    # (brand, compound, ordered→produced, produced→arrived, arrived→techprep, techprep→ready, ready→sub, notes)
    ('Mazda', 'cat_zuelpich', 35, 35, 28, 28, 28, 'Unpaid path; paid is faster'),
    ('Ford', 'cat_zuelpich', 35, 35, 21, 14, 5, ''),
    ('Polestar', 'cat_zuelpich', 35, 35, 21, 14, 5, ''),
    ('Honda', 'akb_kitzingen', 21, 21, 14, 14, 7, 'Outbound check not done path'),
    ('Kia', 'blg_bremerhaven', 56, 42, 21, 14, 7, ''),
    ('Kia', 'blg_saalanderdonau', 56, 42, 21, 14, 7, ''),
    ('Nissan', 'blg_neuss', 35, 21, 21, 14, 7, ''),
    ('Mazda', 'blg_duisburg', None, None, None, None, None, 'Only 1 car, no reference data'),
    ('Cadillac', 'ceva_grevenbroich', 21, 21, 20, 14, 7, 'Outbound check not done path'),
    ('Cadillac', 'ceva_bad_salzungen', 21, 21, 20, 14, 7, 'Outbound check not done path'),
    ('Hyundai', 'blg_bremerhaven', 42, 42, 35, 14, 7, ''),
    ('Hyundai', 'mosolf_rackwitz', 42, 42, 21, 14, 5, ''),
    ('Hyundai', 'mosolf_cuxhaven', 42, 42, 21, 14, 7, ''),
    ('Jeep', 'mosolf_goessnitz', 21, 21, 21, 14, 9, 'No SA number path'),
    ('Jeep', 'mosolf_kippenheim', 21, 21, 21, 14, 9, 'No SA number path'),
    ('Jeep', 'mosolf_etzin', 21, 21, 21, 14, 9, 'No SA number path'),
    ('Fiat', 'mosolf_kippenheim', 21, 21, 21, 14, 9, 'No SA number path'),
    ('Fiat', 'mosolf_etzin', 21, 21, 21, 14, 9, 'No SA number path'),
    ('Alfa Romeo', 'mosolf_kippenheim', 21, 21, 21, 14, 9, 'No SA number path'),
    ('Mercedes-Benz', 'neuhaus_gmbh_via_bald', 28, 28, 14, 14, 7, ''),
    ('MG', 'carserviceerkens_wachtendonk', 21, 21, 14, 14, 7, 'Often skips tech_prep'),
    ('MG', 'mosolf_wilhelmshaven', 21, 21, 21, 14, 9, 'Often skips tech_prep; no SA path'),
    ('BYD', 'mosolf_wilhelmshaven', 35, 35, 21, 14, 14, 'Often skips tech_prep; no SA + no release path'),
    ('BMW', 'akb_kitzingen', 70, 70, 14, 14, 7, 'Supplier status <170 path; often skips tech_prep'),
    ('MINI', 'akb_kitzingen', 70, 70, 14, 14, 7, 'Supplier status <170 path; often skips tech_prep'),
    ('Dacia', 'blg_duisburg', 35, 35, 35, 14, 14, 'Unpaid path'),
    ('Renault', 'blg_duisburg', 35, 35, 35, 14, 14, 'Unpaid path'),
    ('Peugeot', 'mosolf_kippenheim', 28, 28, 21, 14, 9, 'No SA number path'),
    ('Peugeot', 'mosolf_etzin', 28, 28, 21, 14, 9, 'No SA number path'),
    ('Peugeot', 'siebrecht_uslar', None, None, None, None, None, 'Only 2 cars, no reference data'),
    ('Citroen', 'mosolf_kippenheim', 28, 28, 21, 14, 9, 'No SA number path'),
    ('DS', 'mosolf_kippenheim', 28, 28, 21, 14, 9, 'No SA number path'),
    ('Opel', 'mosolf_etzin', 28, 28, 21, 14, 9, 'No SA number path'),
    ('Opel', 'mosolf_kippenheim', 28, 28, 21, 14, 9, 'No SA number path'),
    ('Opel', 'siebrecht_uslar', None, None, None, None, None, 'Only 9 cars, no reference data'),
    ('Toyota', 'blg_saalanderdonau', 28, 28, 21, 14, 7, ''),
    ('Seat', 'akb_zoerbig', 49, 49, 35, 28, 7, 'Unpaid path; outbound check not done'),
    ('Cupra', 'akb_zoerbig', 49, 49, 35, 28, 7, 'Unpaid path; outbound check not done'),
    ('Skoda', 'mosolf_rackwitz', 49, 49, 35, 28, 5, 'Unpaid path'),
    ('Audi', 'akb_kitzingen', 49, 49, 35, 28, 7, 'Unpaid path; outbound check not done'),
]

# Column names line up one-to-one with the fields of each tuple
expected_columns = [
    'brand',
    'compound',
    'exp_ordered_to_produced',
    'exp_produced_to_arrived',
    'exp_arrived_to_techprep',
    'exp_techprep_to_ready',
    'exp_ready_to_sub',
    'notes',
]
expected_df = pd.DataFrame(EXPECTED_LEAD_TIMES, columns=expected_columns)

print(f'{len(expected_df)} brand/compound combinations with expected lead times')

# Preview only the combinations that actually carry reference values
has_reference = expected_df['exp_ordered_to_produced'].notna()
expected_df[has_reference].head(10)

40 brand/compound combinations with expected lead times


Unnamed: 0,brand,compound,exp_ordered_to_produced,exp_produced_to_arrived,exp_arrived_to_techprep,exp_techprep_to_ready,exp_ready_to_sub,notes
0,Mazda,cat_zuelpich,35.0,35.0,28.0,28.0,28.0,Unpaid path; paid is faster
1,Ford,cat_zuelpich,35.0,35.0,21.0,14.0,5.0,
2,Polestar,cat_zuelpich,35.0,35.0,21.0,14.0,5.0,
3,Honda,akb_kitzingen,21.0,21.0,14.0,14.0,7.0,Outbound check not done path
4,Kia,blg_bremerhaven,56.0,42.0,21.0,14.0,7.0,
5,Kia,blg_saalanderdonau,56.0,42.0,21.0,14.0,7.0,
6,Nissan,blg_neuss,35.0,21.0,21.0,14.0,7.0,
8,Cadillac,ceva_grevenbroich,21.0,21.0,20.0,14.0,7.0,Outbound check not done path
9,Cadillac,ceva_bad_salzungen,21.0,21.0,20.0,14.0,7.0,Outbound check not done path
10,Hyundai,blg_bremerhaven,42.0,42.0,35.0,14.0,7.0,


### 9.1 Accuracy Analysis — Expected vs. Actual Medians

In [11]:
# Attach the configured expectations to the measured medians
comparison = df.merge(expected_df, how='left', on=['brand', 'compound'])

# (actual median column, expected column, display name) for each phase
phase_mappings = [
    ('med_ordered_to_produced', 'exp_ordered_to_produced', 'Ordered→Produced'),
    ('med_produced_to_arrived', 'exp_produced_to_arrived', 'Produced→Arrived'),
    ('med_arrived_to_techprep', 'exp_arrived_to_techprep', 'Arrived→Tech Prep'),
    ('med_techprep_to_ready', 'exp_techprep_to_ready', 'Tech Prep→Ready'),
    ('med_ready_to_sub', 'exp_ready_to_sub', 'Ready→Subscription'),
]

accuracy_rows = []
for _, row in comparison.iterrows():
    # Combinations without expected data are left out of the comparison
    if pd.isna(row['exp_ordered_to_produced']):
        continue

    for actual_col, expected_col, phase_name in phase_mappings:
        actual, expected = row[actual_col], row[expected_col]
        comparable = pd.notna(actual) and pd.notna(expected) and expected > 0
        if not comparable:
            continue
        # delta = actual - expected, so a positive delta means slower than expected
        delta = actual - expected
        pct_error = (delta / expected) * 100
        accuracy_rows.append(dict(
            brand=row['brand'],
            compound=row['compound'],
            phase=phase_name,
            expected_days=expected,
            actual_median=actual,
            delta_days=delta,
            pct_error=round(pct_error, 1),
            cars=row['cars'],
            notes=row['notes'],
        ))

accuracy_df = pd.DataFrame(accuracy_rows)

# Summary stats
abs_delta = accuracy_df["delta_days"].abs()
within_20_pct = accuracy_df["pct_error"].abs() <= 20
n_overestimated = (accuracy_df["delta_days"] < 0).sum()
n_underestimated = (accuracy_df["delta_days"] > 0).sum()

print('=== Prediction Accuracy Summary ===\n')
print(f'Total comparisons: {len(accuracy_df)}')
print(f'Mean absolute error: {abs_delta.mean():.1f} days')
print(f'Mean % error: {accuracy_df["pct_error"].mean():.1f}%')
print(f'\nAccurate (within ±20%): {within_20_pct.sum()} / {len(accuracy_df)} ({within_20_pct.mean()*100:.0f}%)')
print(f'Overestimated (actual < expected): {n_overestimated}')
print(f'Underestimated (actual > expected): {n_underestimated}')

=== Prediction Accuracy Summary ===

Total comparisons: 185
Mean absolute error: 20.7 days
Mean % error: 41.0%

Accurate (within ±20%): 31 / 185 (17%)
Overestimated (actual < expected): 92
Underestimated (actual > expected): 88


### 9.2 Worst Prediction Misses — Where We're Most Wrong

In [12]:
# Largest absolute gaps between expected and measured medians
print('Top 20 Prediction Misses (by absolute days off):\n')
miss_columns = ['brand', 'compound', 'phase', 'expected_days', 'actual_median', 'delta_days', 'pct_error', 'cars']
ranked_misses = accuracy_df.sort_values(by='delta_days', key=abs, ascending=False)
worst_misses = ranked_misses.head(20)
worst_misses[miss_columns]

Top 20 Prediction Misses (by absolute days off):



Unnamed: 0,brand,compound,phase,expected_days,actual_median,delta_days,pct_error,cars
50,Fiat,mosolf_etzin,Ordered→Produced,21.0,256,235.0,1119.0,237
85,Jeep,mosolf_etzin,Ordered→Produced,21.0,202,181.0,861.9,442
0,Alfa Romeo,mosolf_kippenheim,Ordered→Produced,21.0,188,167.0,795.2,268
150,Peugeot,mosolf_etzin,Ordered→Produced,28.0,140,112.0,400.0,340
30,Citroen,mosolf_kippenheim,Ordered→Produced,28.0,116,88.0,314.3,795
162,Polestar,cat_zuelpich,Arrived→Tech Prep,21.0,99,78.0,371.4,583
59,Fiat,mosolf_kippenheim,Ready→Subscription,9.0,85,76.0,844.4,55
5,Audi,akb_kitzingen,Ordered→Produced,49.0,124,75.0,153.1,2484
98,Jeep,mosolf_kippenheim,Tech Prep→Ready,14.0,78,64.0,457.1,620
94,Jeep,mosolf_goessnitz,Ready→Subscription,9.0,65,56.0,622.2,270


### 9.3 Accuracy Heatmap — Expected vs. Actual by Brand/Compound

In [13]:
# Wide table for the heatmap: one row per brand/compound, one column per
# phase, cell value = delta_days
accuracy_pivot = accuracy_df.pivot_table(
    values='delta_days',
    index=['brand', 'compound'],
    columns='phase',
    aggfunc='first',
)

# Put the phase columns in pipeline order, keeping only those present
phase_order = ['Ordered→Produced', 'Produced→Arrived', 'Arrived→Tech Prep', 'Tech Prep→Ready', 'Ready→Subscription']
present_phases = [p for p in phase_order if p in accuracy_pivot.columns]
accuracy_pivot = accuracy_pivot[present_phases]

# Row labels in "<brand> @ <compound>" form
accuracy_pivot_labels = [f'{brand} @ {compound}' for brand, compound in accuracy_pivot.index]

delta_axis_titles = dict(x='Phase', y='Brand @ Compound', color='Delta (days)')
fig = px.imshow(
    accuracy_pivot.values,
    x=accuracy_pivot.columns.tolist(),
    y=accuracy_pivot_labels,
    labels=delta_axis_titles,
    # Diverging scale centred on zero:
    # red = actual > expected (slower), blue = actual < expected (faster)
    color_continuous_scale='RdBu_r',
    color_continuous_midpoint=0,
    aspect='auto',
    text_auto=True,
)
delta_heatmap_height = max(600, len(accuracy_pivot) * 22)
fig.update_layout(
    title='Prediction Accuracy: Actual − Expected (days)<br><sub>Red = slower than predicted, Blue = faster than predicted</sub>',
    height=delta_heatmap_height,
    width=950,
    font=dict(size=11),
)
fig.show()

### 9.4 Accuracy by Phase — Which Phases Are Hardest to Predict?

In [14]:
def _mean_abs(values):
    """Mean of the absolute values of a Series."""
    return values.abs().mean()


# Per-phase error statistics, rounded to one decimal
phase_stats = {
    'delta_days': ['mean', 'std', _mean_abs],
    'pct_error': ['mean', _mean_abs],
}
phase_accuracy = accuracy_df.groupby('phase').agg(phase_stats).round(1)
phase_accuracy.columns = ['mean_delta', 'std_delta', 'mae_days', 'mean_pct_error', 'mape']

# Rows in pipeline order, limited to the phases that are present
ordered_phases = [p for p in phase_order if p in phase_accuracy.index]
phase_accuracy = phase_accuracy.reindex(ordered_phases)

print('Accuracy by Phase:\n')
print('MAE = Mean Absolute Error (days)')
print('MAPE = Mean Absolute Percentage Error\n')
phase_accuracy

Accuracy by Phase:

MAE = Mean Absolute Error (days)
MAPE = Mean Absolute Percentage Error



Unnamed: 0_level_0,mean_delta,std_delta,mae_days,mean_pct_error,mape
phase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ordered→Produced,24.1,65.7,48.3,109.7,171.5
Produced→Arrived,-8.0,18.4,16.2,-25.2,54.3
Arrived→Tech Prep,-9.5,17.6,15.3,-42.9,70.2
Tech Prep→Ready,4.8,17.0,12.6,42.5,81.9
Ready→Subscription,9.6,16.8,10.9,120.8,126.2


### 9.5 Recommendations — Where to Update Predictions

In [15]:
# Flag combinations where predictions need updating (>30% off and significant volume)
is_significant = (accuracy_df['pct_error'].abs() > 30) & (accuracy_df['cars'] >= 20)
significant_misses = accuracy_df[is_significant].sort_values('delta_days', key=abs, ascending=False)

print(f'Combinations needing prediction updates (>30% error, ≥20 cars): {len(significant_misses)}\n')

recs = significant_misses[['brand', 'compound', 'phase', 'expected_days', 'actual_median', 'delta_days', 'pct_error', 'cars', 'notes']].copy()
if recs.empty:
    print('All predictions within acceptable range!')
else:
    # Suggested new value: replace the expectation with the measured median
    recs['recommendation'] = [
        f"Change {int(expected)}d → {int(actual)}d"
        for expected, actual in zip(recs['expected_days'], recs['actual_median'])
    ]

# A notebook only renders the value of the LAST top-level expression of a
# cell; a bare `recs` nested inside the if-branch is evaluated and discarded.
# Ending the cell here makes the table appear (None renders nothing).
recs if not recs.empty else None

Combinations needing prediction updates (>30% error, ≥20 cars): 130

