In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the pre-cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('cleaned_data.csv', sep=',')

In [None]:
# Names of the DT_* columns that get parsed to datetime before any analysis.
date_columns = [
    'DT_COMMERCIAL_ORDER_FIRST_ENTRY_DATE',
    'DT_VEHICLE_FACTORY_PRODUCTION_DATE',
    'DT_VEHICLE_PASSED_TO_SALES_DATE',
    'DT_READY_TO_SHIP_FROM_LOGISTIC_PLANT_COMPOUND_DATE',
    'DT_SHIPPING_ORDER_TO_NSC_COMPOUND_CREATION_DATE',
    'DT_CUSTOMS_OFFICE_INBOUND_DATE',
    'DT_CUSTOMS_OFFICE_OUTBOUND_DATE',
    'DT_ARRIVAL_AT_DESTINATION_BY_TRANSPORTER_DATE',
    'DT_EXPECTED_DELIVERY_TO_FINAL_CUSTOMER_DATE',
]

In [None]:
# Parse each listed column to datetime; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
for date_col in date_columns:
    parsed = pd.to_datetime(df[date_col], errors='coerce')
    df[date_col] = parsed

In [None]:
# Calendar features derived from the order-entry timestamp.
order_entry = df['DT_COMMERCIAL_ORDER_FIRST_ENTRY_DATE']
df['ORDER_YEAR'] = order_entry.dt.year
df['ORDER_MONTH'] = order_entry.dt.month
df['ORDER_QUARTER'] = order_entry.dt.quarter
# ISO week number; around New Year it can belong to a different ISO year than ORDER_YEAR.
df['ORDER_WEEK'] = order_entry.dt.isocalendar().week
df['ORDER_DAY_OF_YEAR'] = order_entry.dt.dayofyear

### === TEMPORAL PATTERNS DEEP EDA ===

In [None]:
# Section 1 banner: title line followed by a 30-character rule.
print("1. DELAY TRENDS ANALYSIS", "=" * 30, sep="\n")

In [None]:
# Per (year, month) of order entry: count / sum / mean of Is_Delayed and
# mean / median / std of REG_Delay_Days, rounded to three decimals.
# NOTE(review): reading sum as "delayed orders" and mean as "delay rate"
# assumes Is_Delayed is a 0/1 flag - confirm.
monthly_delays = (
    df.groupby(by=['ORDER_YEAR', 'ORDER_MONTH'])
    .agg({
        'Is_Delayed': ['count', 'sum', 'mean'],
        'REG_Delay_Days': ['mean', 'median', 'std'],
    })
    .round(3)
)

In [None]:
# Inspect the column labels of the monthly table; passing lists of functions
# to agg yields two-level (column, statistic) labels.
monthly_delays.columns

In [None]:
# Print a caption, then let the notebook render the monthly table as the cell output.
print("\nMonthly Delay Statistics:")
monthly_delays

In [None]:
# Quarter-level means of the delay flag, REG_Delay_Days and
# TOTAL_LEAD_VS_EXPECTED_DAYS, rounded to three decimals.
seasonal_delays = (
    df.groupby(by='ORDER_QUARTER')
    .agg({
        'Is_Delayed': 'mean',
        'REG_Delay_Days': 'mean',
        'TOTAL_LEAD_VS_EXPECTED_DAYS': 'mean',
    })
    .round(3)
)

In [None]:
# Caption, then the quarterly table as the cell's rendered output.
print("\nSeasonal Delay Patterns:")
seasonal_delays

In [None]:
# Delay behaviour by weekday of order entry.
# None of the visible feature cells derives ORDER_DAY_OF_WEEK (only year, month,
# quarter, week and day-of-year are built), so on a fresh kernel the groupby
# below raises KeyError unless the loaded CSV already carries the column.
# Derive it from the order-entry date when it is missing; an existing column
# is left untouched.
if 'ORDER_DAY_OF_WEEK' not in df.columns:
    # pandas convention: 0 = Monday ... 6 = Sunday
    df['ORDER_DAY_OF_WEEK'] = df['DT_COMMERCIAL_ORDER_FIRST_ENTRY_DATE'].dt.dayofweek

dow_delays = df.groupby('ORDER_DAY_OF_WEEK').agg({
    'Is_Delayed': ['count', 'mean'],
    'REG_Delay_Days': 'mean'
}).round(3)
# Flatten the two-level labels into readable names (same order as the agg spec).
dow_delays.columns = ['Orders_Count', 'Delay_Rate', 'Avg_Delay_Days']

In [None]:
# Caption, then the weekday table as the cell's rendered output.
print("\nDay of Week Patterns:")
dow_delays

In [None]:
# Section 2 banner: supply-chain stage timing (title plus a 40-character rule).
print("\n\n2. SUPPLY CHAIN STAGE TEMPORAL ANALYSIS", "=" * 40, sep="\n")

In [None]:
# Whole-dataset mean / median / std for each stage-duration column,
# rounded to two decimals.
stage_columns = [
    'ORDER_TO_PRODUCTION_DAYS',
    'PRODUCTION_TO_SHIPPING_READY_DAYS',
    'SHIPPING_READY_TO_SHIPPING_DAYS',
    'SHIPPING_TRANSIT_DAYS',
    'CUSTOMS_CLEARANCE_DAYS',
]
stage_delays = df.agg(
    {stage: ['mean', 'median', 'std'] for stage in stage_columns}
).round(2)

In [None]:
# Render the stage statistics as computed (statistics as rows, stage columns
# as columns).
stage_delays

In [None]:
# Same statistics, flipped so each stage is a row and each statistic a column.
print("Supply Chain Stage Duration Analysis:")
stage_delays.transpose()

In [None]:
# Mean duration of each stage per (year, month) of order entry, two decimals.
# NOTE(review): CUSTOMS_CLEARANCE_DAYS is summarised in the overall stage table
# but not tracked here - confirm the omission is intentional.
monthly_stages = (
    df.groupby(by=['ORDER_YEAR', 'ORDER_MONTH'])
    .agg(dict.fromkeys(
        [
            'ORDER_TO_PRODUCTION_DAYS',
            'PRODUCTION_TO_SHIPPING_READY_DAYS',
            'SHIPPING_READY_TO_SHIPPING_DAYS',
            'SHIPPING_TRANSIT_DAYS',
        ],
        'mean',
    ))
    .round(2)
)

In [None]:
# Caption, then the five most recent (year, month) rows.
print("\nMonthly Stage Performance (last 5 months):")
monthly_stages.tail(5)

In [None]:
# Section 3 banner: brand and market breakdowns (title plus a 35-character rule).
print("\n\n3. BRAND × MARKET TEMPORAL ANALYSIS", "=" * 35, sep="\n")

In [None]:
# Per (brand, order quarter): count and mean of Is_Delayed plus mean
# REG_Delay_Days, rounded to three decimals.
brand_temporal = (
    df.groupby(by=['GN_BRAND_ORIGINAL_NAME', 'ORDER_QUARTER'])
    .agg({
        'Is_Delayed': ['count', 'mean'],
        'REG_Delay_Days': 'mean',
    })
    .round(3)
)

In [None]:
print("Brand Performance by Quarter:")
# Pick the three brands with the most orders, matching how the market and plant
# sections choose their top three. Taking the first three values in row order
# (unique()[:3]) is not a ranking and may include NaN, which has no group in
# brand_temporal. Only rows with an order quarter are counted, so every
# selected brand is guaranteed to appear in brand_temporal.
top_brands = (
    df.dropna(subset=['ORDER_QUARTER'])['GN_BRAND_ORIGINAL_NAME']
    .value_counts()
    .head(3)
    .index
)
for brand in top_brands:
    # set_axis returns a relabelled copy, so brand_temporal itself is untouched.
    brand_data = brand_temporal.loc[brand].set_axis(
        ['Orders', 'Delay_Rate', 'Avg_Delay_Days'], axis=1
    )
    print(f"\n{brand}:")
    print(brand_data)

In [None]:
# Per (market code, order quarter): count and mean of Is_Delayed, three decimals.
market_temporal = (
    df.groupby(by=['CD_MARKET_CODE', 'ORDER_QUARTER'])
    .agg({'Is_Delayed': ['count', 'mean']})
    .round(3)
)

In [None]:
# The three most frequent market codes (value_counts orders by frequency, descending).
top_markets = df['CD_MARKET_CODE'].value_counts().index[:3]

In [None]:
print("\nTop 3 Markets Quarterly Performance:")
for market in top_markets:
    # Only the lookup can raise KeyError, so keep the try body that narrow.
    try:
        market_data = market_temporal.loc[market]
    except KeyError:
        # A top market is missing from the grouped table when none of its rows
        # has an order quarter; report that instead of dropping it silently.
        print(f"\nMarket {market}: no quarterly data available")
        continue
    # set_axis returns a relabelled copy, so market_temporal itself is untouched.
    market_data = market_data.set_axis(['Orders', 'Delay_Rate'], axis=1)
    print(f"\nMarket {market}:")
    print(market_data)

In [None]:
# Section 4 banner: weekly order volumes (title plus a 25-character rule).
print("\n\n4. WEEKLY VOLUME PATTERNS", "=" * 25, sep="\n")

In [None]:
# Bucket each order into its weekly period and keep that period's start
# timestamp as the week key.
order_week_period = df['DT_COMMERCIAL_ORDER_FIRST_ENTRY_DATE'].dt.to_period('W')
df['ORDER_WEEK_DATE'] = order_week_period.dt.start_time

In [None]:
# Per (week, brand): number of non-null order codes and the mean of Is_Delayed,
# returned as a flat frame with the group keys as ordinary columns.
weekly_volumes = (
    df.groupby(by=['ORDER_WEEK_DATE', 'GN_BRAND_ORIGINAL_NAME'])
    .agg({
        'CD_COMMERCIAL_ORDER_CODE': 'count',
        'Is_Delayed': 'mean',
    })
    .reset_index()
)

In [None]:
# Short display names, listed in the column order the weekly aggregation produces.
weekly_volumes.columns = [
    'Week',         # ORDER_WEEK_DATE
    'Brand',        # GN_BRAND_ORIGINAL_NAME
    'Order_Count',  # count of CD_COMMERCIAL_ORDER_CODE
    'Delay_Rate',   # mean of Is_Delayed
]

In [None]:
print("Weekly Volume Statistics by Brand:")
# Per brand: spread of weekly order counts (mean / std / min / max) and the
# average of the weekly delay rates, rounded to two decimals.
weekly_summary = (
    weekly_volumes.groupby(by='Brand')
    .agg({
        'Order_Count': ['mean', 'std', 'min', 'max'],
        'Delay_Rate': 'mean',
    })
    .round(2)
)

In [None]:
# Render the per-brand weekly summary as the cell output.
weekly_summary

In [None]:
# Section 5 banner: external-factor breakdowns (title plus a 30-character rule).
print("\n\n5. EXTERNAL FACTORS CORRELATION", "=" * 30, sep="\n")

In [None]:
# Per (International_Status, order quarter): mean delay flag, mean
# REG_Delay_Days and mean SHIPPING_TRANSIT_DAYS, rounded to three decimals.
intl_patterns = (
    df.groupby(by=['International_Status', 'ORDER_QUARTER'])
    .agg(dict.fromkeys(
        ['Is_Delayed', 'REG_Delay_Days', 'SHIPPING_TRANSIT_DAYS'],
        'mean',
    ))
    .round(3)
)

In [None]:
# Print a caption, then let the notebook render the status-by-quarter table as the cell output.
print("International vs Domestic Patterns by Quarter:")
intl_patterns

In [None]:
# Per (plant logistic code, order quarter): count and mean of Is_Delayed plus
# mean ORDER_TO_PRODUCTION_DAYS, rounded to three decimals.
plant_temporal = (
    df.groupby(by=['CD_PLANT_LOGISTIC_CODE', 'ORDER_QUARTER'])
    .agg({
        'Is_Delayed': ['count', 'mean'],
        'ORDER_TO_PRODUCTION_DAYS': 'mean',
    })
    .round(3)
)

In [None]:
# The three plant codes with the most rows; print each one's quarterly figures.
top_plants = df['CD_PLANT_LOGISTIC_CODE'].value_counts().head(3).index
print("\nTop 3 Plants Quarterly Performance:")
for plant in top_plants:
    # Only the lookup can raise KeyError, so keep the try body that narrow.
    try:
        plant_data = plant_temporal.loc[plant]
    except KeyError:
        # A top plant is missing from the grouped table when none of its rows
        # has an order quarter; report that instead of dropping it silently.
        print(f"\nPlant {plant}: no quarterly data available")
        continue
    # set_axis returns a relabelled copy, so plant_temporal itself is untouched.
    plant_data = plant_data.set_axis(
        ['Orders', 'Delay_Rate', 'Avg_Production_Days'], axis=1
    )
    print(f"\nPlant {plant}:")
    print(plant_data)

In [None]:
# Section 6 banner: summary of temporal insights (title plus a 25-character rule).
print("\n\n6. KEY TEMPORAL INSIGHTS", "=" * 25, sep="\n")


In [None]:
# Mean of Is_Delayed per (year, month), then the last six groups in sorted order.
# NOTE(review): these are the six most recent months present in the data, which
# need not be six consecutive calendar months - confirm there are no gaps.
delay_trend = (
    df.groupby(by=['ORDER_YEAR', 'ORDER_MONTH'])['Is_Delayed']
    .mean()
)
recent_trend = delay_trend.iloc[-6:]
print("Recent 6-month delay trend:")
print(recent_trend)

In [None]:
# Volatility: standard deviation within each (year, month), then the average
# of those monthly standard deviations for each metric.
monthly_volatility = (
    df.groupby(by=['ORDER_YEAR', 'ORDER_MONTH'])
    .agg(dict.fromkeys(
        ['REG_Delay_Days', 'TOTAL_LEAD_VS_EXPECTED_DAYS'],
        'std',
    ))
    .mean()
)

In [None]:
# Report the averaged monthly standard deviations to two decimals.
print("\nDelay Volatility (Monthly Std Dev):")
print(f"Registration Delay Days: {monthly_volatility['REG_Delay_Days']:.2f}")
print(f"Total Lead Time vs Expected: {monthly_volatility['TOTAL_LEAD_VS_EXPECTED_DAYS']:.2f}")

In [None]:
# Mean of each missing-data indicator per (year, month), three decimals.
# NOTE(review): reading these means as "share of rows missing" assumes the
# indicators are 0/1 flags - confirm.
missing_temporal = (
    df.groupby(by=['ORDER_YEAR', 'ORDER_MONTH'])
    .agg(dict.fromkeys(
        [
            'MISSING_PRODUCTION_DATE_FLAG',
            'MISSING_READY_TO_SHIP',
            'MISSING_CUSTOMS_INBOUND',
        ],
        'mean',
    ))
    .round(3)
)

In [None]:
# Caption, then the three most recent (year, month) rows.
print("\nMissing Data Patterns Over Time (last 3 months):")
missing_temporal.tail(3)

In [None]:
# Closing banner and the follow-up work list, emitted one item per line.
closing_lines = (
    "\n=== ANALYSIS COMPLETE ===",
    "\nNext Steps:",
    "1. Create lag features (7, 14, 30 day rolling averages)",
    "2. Engineer seasonal decomposition features",
    "3. Build plant/brand-specific delay patterns",
    "4. Incorporate external data (weather, holidays)",
    "5. Develop time-aware train/validation splits",
)
print(*closing_lines, sep="\n")