# Simplified AutoML Training - Late Delivery Prediction POC

**Goal:** Train a regression model to predict AGE_REQ_DATE (days late/early) using AutoML.

This streamlined version focuses on essential steps:
1. Load data from semantic model
2. Engineer key features
3. Train AutoML model
4. Evaluate performance
5. Register to MLflow

**Target:** AGE_REQ_DATE (days late vs Customer Requested Delivery Date)
- Positive = Late
- Negative = Early
- Zero = On-time

## 1️⃣ Imports & Configuration

In [None]:
# Core libraries
import pandas as pd
import numpy as np
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import mlflow
import sempy.fabric as fabric
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Emit a confirmation line so the cell output shows the imports above resolved.
print("‚úÖ Imports complete")

## 2️⃣ Configuration

In [None]:
# --- Run configuration ------------------------------------------------------
# Semantic model that is queried for delivery data, and the name under which
# the trained model is registered.
DATASET = "DLV Aging Columns & Measures"
MODEL_NAME = "POC-LateDelivery-Regression-AutoML"

# Row cap used by the data-loading query; adjust based on capacity (20K-100K).
SAMPLE_SIZE = 50000

# --- AutoML settings --------------------------------------------------------
TIME_BUDGET = 180  # training budget in seconds (3 minutes)
TARGET_COLUMN = "AGE_REQ_DATE"

# Echo the active configuration as one four-line summary.
print(
    "\n".join(
        [
            f"üìä Dataset: {DATASET}",
            f"üéØ Target: {TARGET_COLUMN}",
            f"üìè Sample Size: {SAMPLE_SIZE:,} rows",
            f"‚è±Ô∏è  Training Budget: {TIME_BUDGET} seconds",
        ]
    )
)

## 3️⃣ Load Data from Semantic Model

In [None]:
# Load closed deliveries (GI Date is not blank).
#
# DAX query syntax has no WHERE clause, so the closed-delivery condition is
# applied with FILTER inside TOPN: rows with a blank GI Date are removed first,
# then the SAMPLE_SIZE most recently created of the remaining rows are kept.
# NOTE: TOPN can return more than SAMPLE_SIZE rows when there are ties on
# the ordering column at the cut-off.
dax_query = f"""
EVALUATE
TOPN(
    {SAMPLE_SIZE},
    FILTER(Aging, NOT(ISBLANK(Aging[GI Date]))),
    Aging[Delivery Created On], DESC
)
"""

print("üì• Loading data from semantic model...")
df = fabric.evaluate_dax(dataset=DATASET, dax_string=dax_query)

# Clean column names: for names containing '[', keep only the text after the
# last '[' and strip ']' (e.g. 'Aging[GI Date]' -> 'GI Date'); other names are
# left untouched.
df.columns = [col.split('[')[-1].replace(']', '') if '[' in col else col for col in df.columns]

print(f"‚úÖ Loaded {len(df):,} closed deliveries")
print(f"‚úÖ Columns: {df.shape[1]}")
print(f"\nFirst 3 rows:")
# Last expression of the cell, so the notebook renders the preview.
df.head(3)

## 4️⃣ Feature Engineering

In [None]:
# Derive calendar features from the delivery creation timestamp.
created_on = pd.to_datetime(df['Delivery Created On'])
df['created_date'] = created_on
df['created_dayofweek'] = created_on.dt.dayofweek  # Monday=0 ... Sunday=6
df['created_month'] = created_on.dt.month

# Whole days elapsed between creation and the moment this cell runs.
# NOTE(review): this is measured against the current clock, so the value for
# the same delivery changes with the run date -- confirm that scoring computes
# it the same way.
df['days_since_creation'] = (pd.Timestamp.now() - created_on).dt.days

print("‚úÖ Created temporal features:")
for feature_name in ("created_dayofweek", "created_month", "days_since_creation"):
    print(f"   - {feature_name}")

# Summary statistics of the regression target as a quick sanity check.
print(f"\nüìä Target Variable: {TARGET_COLUMN}")
print(df[TARGET_COLUMN].describe())

## 5️⃣ Select Features

In [None]:
# Define feature columns (12 key features)
feature_cols = [
    # Location & Organization
    'Plant',

    # Product
    'Brand',
    'Product Category',
    'Standard Or Custom',

    # Channel & Customer
    'Channel',
    'STRATEGIC_ACCOUNT',

    # Status & Process
    'Credit Status',
    'Distribution Status',

    # Carrier
    'EWM_CARRIER_CODE',

    # Temporal (derived)
    'created_dayofweek',
    'created_month',
    'days_since_creation',
]

# A row with a blank target cannot serve as a regression label, and NaN values
# in y make the scikit-learn metrics used for evaluation raise. Such rows are
# excluded here; when the target has no blanks this is a no-op.
has_target = df[TARGET_COLUMN].notna()
n_missing_target = int((~has_target).sum())
if n_missing_target:
    print(f"Dropped {n_missing_target:,} rows with missing {TARGET_COLUMN}")

# Separate features and target (copies, so later encoding does not touch df).
X = df.loc[has_target, feature_cols].copy()
y = df.loc[has_target, TARGET_COLUMN].copy()

print(f"‚úÖ Features: {len(feature_cols)} columns")
print(f"‚úÖ Samples: {len(X):,} rows")
print(f"\nFeature list:")
for i, col in enumerate(feature_cols, 1):
    print(f"   {i:2d}. {col}")

## 6️⃣ Encode Categorical Variables

In [None]:
# Columns holding labels rather than numbers; each is replaced by integer
# category codes.
categorical_cols = [
    'Plant',
    'Brand',
    'Product Category',
    'Standard Or Custom',
    'Channel',
    'STRATEGIC_ACCOUNT',
    'Credit Status',
    'Distribution Status',
    'EWM_CARRIER_CODE',
]

print("üîÑ Encoding categorical variables...")
# Only columns actually present in X are converted; the rest are skipped.
# NOTE(review): the codes are derived from the values present in this sample
# (pandas gives missing values the code -1), and the mapping is not saved --
# confirm how scoring reproduces the same encoding.
for name in (c for c in categorical_cols if c in X.columns):
    X[name] = X[name].astype('category').cat.codes

print(f"‚úÖ Encoded {len(categorical_cols)} categorical columns")
print(f"\nData types:")
print(X.dtypes)

## 7️⃣ Train/Test Split

In [None]:
# Hold out 20% of the rows for evaluation and train on the remaining 80%.
# random_state is fixed so the same rows land in each partition on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report partition sizes as absolute counts and as a share of all rows.
n_total = len(X)
n_train, n_test = len(X_train), len(X_test)

print(f"üìä Data Split:")
print(f"   Training:   {n_train:,} rows ({n_train/n_total*100:.1f}%)")
print(f"   Test:       {n_test:,} rows ({n_test/n_total*100:.1f}%)")
print(f"\n   Features:   {X_train.shape[1]}")

## 8️⃣ Train AutoML Model

In [None]:
# Search configuration passed to AutoML.fit: a regression task optimised for
# mean absolute error, restricted to three tree-based estimators, and bounded
# by the configured time budget.
settings = dict(
    time_budget=TIME_BUDGET,
    task="regression",
    metric="mae",
    estimator_list=["rf", "xgboost", "extra_tree"],
    log_file_name="automl_late_delivery.log",
    verbose=1,
)

automl = AutoML()
rule = "=" * 50

print("üöÄ Training AutoML model...")
print(f"   Time budget: {TIME_BUDGET} seconds")
print(f"   Metric: {settings['metric'].upper()}")
print(f"   Algorithms: {', '.join(settings['estimator_list'])}")
print("\n" + rule)

# Time the fit with wall-clock timestamps taken immediately before and after.
fit_started = datetime.now()
automl.fit(X_train, y_train, **settings)
training_time = (datetime.now() - fit_started).total_seconds()

print(rule)
print(f"‚úÖ Training complete in {training_time:.1f} seconds")
print(f"‚úÖ Best model: {automl.best_estimator}")
print(f"‚úÖ Best MAE: {automl.best_loss:.2f} days")

## 9️⃣ Evaluate Model Performance

In [None]:
# Score the held-out test rows with the fitted AutoML model.
print("üìä Evaluating model on test set...")
preds = automl.predict(X_test)

# Regression metrics, all expressed against the true target values.
mae = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))
r2 = r2_score(y_test, preds)

rule = "=" * 50
print("\n" + rule)
print("üìà MODEL PERFORMANCE")
print(rule)
print(f"Mean Absolute Error (MAE):  {mae:.2f} days")
print(f"Root Mean Squared Error:    {rmse:.2f} days")
print(f"R¬≤ Score:                   {r2:.3f}")
print(rule)

# Collapse the regression output to a binary late / not-late flag: a value
# strictly above the threshold counts as late. Accuracy is the share of test
# rows where the predicted flag equals the actual flag.
late_threshold = 0
y_test_late = (y_test > late_threshold).astype(int)
preds_late = (preds > late_threshold).astype(int)
flags_match = y_test_late == preds_late
late_accuracy = flags_match.mean()

print(f"\nüéØ Late vs On-time Classification:")
print(f"   Accuracy: {late_accuracy*100:.1f}%")

## 🔟 Visualize Results

In [None]:
# Two side-by-side diagnostics: predicted-vs-actual scatter on the left,
# residual histogram on the right.
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
scatter_ax, hist_ax = ax

# Left panel: each point is one test delivery; the dashed red diagonal marks
# where predicted equals actual.
actual_span = [y_test.min(), y_test.max()]
scatter_ax.scatter(y_test, preds, alpha=0.5, s=20)
scatter_ax.plot(actual_span, actual_span, 'r--', lw=2)
scatter_ax.set_xlabel('Actual Days Late', fontsize=12)
scatter_ax.set_ylabel('Predicted Days Late', fontsize=12)
scatter_ax.set_title(f'Actual vs Predicted (MAE: {mae:.2f} days)', fontsize=14)
scatter_ax.grid(alpha=0.3)

# Right panel: distribution of residuals (actual minus predicted), with a
# dashed red line at zero error.
errors = y_test - preds
hist_ax.hist(errors, bins=50, edgecolor='black', alpha=0.7)
hist_ax.axvline(x=0, color='r', linestyle='--', lw=2, label='Perfect Prediction')
hist_ax.set_xlabel('Prediction Error (days)', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)
hist_ax.set_title('Error Distribution', fontsize=14)
hist_ax.legend()
hist_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("\n‚úÖ Visualizations complete")

## 1️⃣1️⃣ Register Model to MLflow

In [None]:
# Log the evaluation results and the trained model to an MLflow run, then
# register that logged model in the model registry under MODEL_NAME.
print("üì¶ Registering model to MLflow...")

with mlflow.start_run() as run:
    # Test-set metrics computed in the evaluation step.
    mlflow.log_metrics(
        {
            "mae": mae,
            "rmse": rmse,
            "r2": r2,
            "late_accuracy": late_accuracy,
        }
    )

    # Settings needed to reproduce or compare this run.
    mlflow.log_params(
        {
            "sample_size": SAMPLE_SIZE,
            "time_budget": TIME_BUDGET,
            "best_estimator": automl.best_estimator,
            "n_features": len(feature_cols),
        }
    )

    # Log the best model found by AutoML. The URI is taken from the ModelInfo
    # returned by log_model rather than rebuilt from the run's artifact
    # location: it always points at the model that was just logged, and
    # registering from it (instead of a raw artifact path) lets the registry
    # keep the link back to this run.
    model_info = mlflow.sklearn.log_model(automl.model, "model")
    model_uri = model_info.model_uri

    run_id = run.info.run_id
    print(f"‚úÖ MLflow Run ID: {run_id}")

# Register to model registry (creates the next version of MODEL_NAME).
model_version = mlflow.register_model(model_uri, MODEL_NAME)

print(f"\n‚úÖ Model registered: {MODEL_NAME}")
print(f"‚úÖ Model URI: {model_uri}")
print(f"‚úÖ Version: {model_version.version}")

## ✅ Training Complete!

**Next Steps:**
1. Run notebook `03_batch_scoring_pipeline.ipynb` to generate predictions for open deliveries
2. View predictions in Power BI report

**Model Summary:**
- Model Name: POC-LateDelivery-Regression-AutoML
- Target: AGE_REQ_DATE (days late/early)
- Features: 12 key columns
- Algorithm: AutoML (Random Forest, XGBoost, Extra Trees)
- Performance: Check MAE, RMSE, R² scores above