## üì¶ Step 1: Install Dependencies

In [None]:
%pip install -U semantic-link --q

## üîß Step 2: Import Libraries

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Machine Learning
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Experiment tracking
import mlflow
from mlflow.tracking import MlflowClient

# Semantic Link - Connect to Power BI
import sempy.fabric as fabric

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# File handling for encoders
import pickle
import tempfile
import os
from datetime import datetime, timedelta

import sklearn

print("‚úÖ All libraries imported")
print(f"   scikit-learn version: {sklearn.__version__}")

## ‚öôÔ∏è Step 3: Configuration

In [None]:
# Workspace
ws = fabric.get_workspace_id()

# Semantic model name - UPDATE THIS
SEMANTIC_MODEL = "Deliveries Aging - Open and Closed"

# Model configuration
MODEL_NAME = "delivery_lateness_predictor_fixed"
TARGET_COLUMN = "AGE_REQ_DATE"

print("‚úÖ Configuration loaded")
print(f"   Workspace ID: {ws}")
print(f"   Semantic Model: {SEMANTIC_MODEL}")
print(f"   Model Name: {MODEL_NAME}")
print(f"   Target: {TARGET_COLUMN}")

## üìä Step 4: Load Training Data (Closed Deliveries)

In [None]:
# Query closed deliveries (those with GI Date)
dax_query = """
EVALUATE
FILTER(
    Aging,
    NOT(ISBLANK(Aging[GI Date])) &&
    NOT(ISBLANK(Aging[Req. Date Header]))
)
"""

df_closed = fabric.evaluate_dax(dataset=SEMANTIC_MODEL, dax_string=dax_query, workspace=ws)

# Clean column names (remove DAX table prefixes)
df_closed.columns = [col.split('[')[-1].replace(']', '') if '[' in col else col for col in df_closed.columns]

print(f"‚úÖ Loaded {len(df_closed):,} closed deliveries for training")
print(f"   Columns: {df_closed.shape[1]}")
print(f"\nüìä Sample data:")
df_closed.head()

## üîë Step 5: Feature Engineering with Saved Encoders

**CRITICAL FIX:** Using `LabelEncoder` instead of `.cat.codes` to ensure consistent encoding between training and scoring.

In [None]:
# Define features
feature_cols = [
    "Channel",
    "Delivery Priority",
    "EWM Shipping Condition",
    "Shipping Point",
    "Sold To Name 1",
    "Standard Or Custom",
    "Product Category"
]

# Define target
target_col = "AGE_REQ_DATE"

# Extract features and target
X = df_closed[feature_cols].copy()
y = df_closed[target_col].copy()

# ==============================================================================
# ENCODE CATEGORICAL VARIABLES WITH SAVED ENCODERS
# ==============================================================================
categorical_cols = feature_cols  # All features are categorical

# Create and fit encoders
encoders = {}
for col in categorical_cols:
    X[col] = X[col].fillna('Unknown')
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    encoders[col] = le
    print(f"‚úÖ Encoded {col}: {len(le.classes_)} unique categories")

# Save encoders to temporary file (will be logged to MLflow later)
temp_dir = tempfile.mkdtemp()
encoder_path = os.path.join(temp_dir, 'encoders.pkl')
with open(encoder_path, 'wb') as f:
    pickle.dump(encoders, f)

print(f"\n‚úÖ Saved {len(encoders)} encoders to: {encoder_path}")
print(f"‚úÖ Features: {X.shape[1]} columns, {X.shape[0]:,} rows")
print(f"‚úÖ Target: {y.shape[0]:,} values")

## üîÄ Step 6: Train/Test Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"‚úÖ Train: {len(X_train):,} samples")
print(f"‚úÖ Test:  {len(X_test):,} samples")
print(f"‚úÖ Features: {X_train.shape[1]}")

## ü§ñ Step 7: Train AutoML Model

In [None]:
# AutoML settings
automl = AutoML()

settings = {
    "time_budget": 60,  # 1 minute for quick training
    "task": "regression",
    "metric": "mae",
    "estimator_list": [
        "rf",        # RandomForestRegressor
        "xgboost",   # XGBoostRegressor
        "extra_tree" # ExtraTreesRegressor
    ],
    "log_file_name": "automl_training.log",
    "seed": 42
}

print("üöÄ Starting AutoML training...")
automl.fit(X_train=X_train, y_train=y_train, **settings)

print(f"\n‚úÖ Training complete!")
print(f"   Best model: {automl.best_estimator}")
print(f"   Best MAE: {automl.best_loss:.3f} days")

## üìà Step 8: Evaluate Model Performance

In [None]:
# Make predictions on test set
preds = automl.predict(X_test)

# Calculate metrics
mae = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))
r2 = r2_score(y_test, preds)

# Display results
print("\n" + "="*60)
print("MODEL PERFORMANCE")
print("="*60)
print(f"MAE:  {mae:.3f} days  ‚Üê Average prediction error")
print(f"RMSE: {rmse:.3f} days  ‚Üê Larger errors penalized more")
print(f"R¬≤:   {r2:.3f}        ‚Üê Variance explained (higher is better)")
print("="*60)

if mae < 1.0:
    print("\n‚úÖ EXCELLENT performance! MAE < 1 day is production-ready.")
elif mae < 2.0:
    print("\n‚úÖ GOOD performance! MAE < 2 days is acceptable.")
else:
    print("\n‚ö†Ô∏è Model may need improvement. Consider adding more features.")

## üíæ Step 9: Register Model in MLflow WITH Encoders

**CRITICAL:** This cell logs the encoders alongside the model so they can be loaded during scoring.

In [None]:
client = MlflowClient()

with mlflow.start_run(run_name=f"{MODEL_NAME}_training") as run:
    
    # Log training metrics
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    
    # Log parameters
    mlflow.log_param("n_features", len(feature_cols))
    mlflow.log_param("n_train_samples", len(X_train))
    mlflow.log_param("n_test_samples", len(X_test))
    mlflow.log_param("sklearn_version", sklearn.__version__)
    mlflow.log_param("estimator", automl.best_estimator)
    
    # LOG ENCODERS (CRITICAL FIX)
    mlflow.log_artifact(encoder_path, artifact_path="encoders")
    
    # Log the sklearn model
    mlflow.sklearn.log_model(
        sk_model=automl.model,
        artifact_path="model"
    )

    model_uri = f"runs:/{run.info.run_id}/model"
    run_id = run.info.run_id

# Register the model
registered = mlflow.register_model(
    model_uri=model_uri,
    name=MODEL_NAME
)

print(f"\n‚úÖ Model registered: {registered.name} v{registered.version}")
print(f"‚úÖ Encoders saved with model")
print(f"‚úÖ Run ID: {run_id}")
print(f"\nüéØ Model is ready for scoring!")

## üìä Step 10: Visualize Model Performance (Optional)

In [None]:
# Scatter plot: Predicted vs Actual
plt.figure(figsize=(10, 6))
plt.scatter(y_test, preds, alpha=0.5, s=10)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2, label='Perfect predictions')
plt.xlabel('Actual AGE_REQ_DATE (days)', fontsize=12)
plt.ylabel('Predicted AGE_REQ_DATE (days)', fontsize=12)
plt.title(f'Prediction vs Actual (MAE: {mae:.2f} days)', fontsize=14)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## üìä Step 11: Feature Importance (Optional)

In [None]:
if hasattr(automl.model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': automl.model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    plt.figure(figsize=(10, 6))
    sns.barplot(data=feature_importance, x='importance', y='feature', palette='viridis')
    plt.title('Feature Importance for Delivery Lateness Prediction', fontsize=14)
    plt.xlabel('Importance', fontsize=12)
    plt.ylabel('Feature', fontsize=12)
    plt.tight_layout()
    plt.show()
    
    print("\nTop 3 Most Important Features:")
    for idx, row in feature_importance.head(3).iterrows():
        print(f"  {row['feature']}: {row['importance']:.4f}")
else:
    print("Feature importances not available for this model type.")

---

# üéØ SCORING PHASE: Generate Predictions for Open Deliveries

---

## üîÑ Step 12: Load Model from MLflow

In [None]:
# Load the latest version of the model
model_uri = f"models:/{MODEL_NAME}/latest"
model = mlflow.sklearn.load_model(model_uri)

print(f"‚úÖ Model loaded: {type(model).__name__}")
print(f"   URI: {model_uri}")

## üì• Step 13: Load Open Deliveries for Scoring

In [None]:
# Query open deliveries (those WITHOUT GI Date)
dax_query = """
EVALUATE
FILTER(
    Aging,
    ISBLANK(Aging[GI Date]) &&
    NOT(ISBLANK(Aging[Delivery Created On])) &&
    NOT(ISBLANK(Aging[Req. Date Header]))
)
"""

df_open = fabric.evaluate_dax(dataset=SEMANTIC_MODEL, dax_string=dax_query, workspace=ws)

# Clean column names
df_open.columns = [col.split('[')[-1].replace(']', '') if '[' in col else col for col in df_open.columns]

print(f"‚úÖ Loaded {len(df_open):,} open deliveries for scoring")
print(f"   Columns: {df_open.shape[1]}")
print(f"\nüìä Sample data:")
df_open.head()

## üîë Step 14: Load Saved Encoders and Prepare Features

**CRITICAL FIX:** Loading the same encoders used during training to ensure consistent categorical encoding.

In [None]:
# ==============================================================================
# LOAD SAVED ENCODERS FROM MLFLOW
# ==============================================================================
print("üì¶ Loading saved encoders from MLflow...\n")

client = MlflowClient()
model_versions = client.search_model_versions(f"name='{MODEL_NAME}'")

if not model_versions:
    raise ValueError(f"‚ùå No model versions found for {MODEL_NAME}")

# Get latest version
latest = sorted(model_versions, key=lambda x: int(x.version))[-1]
run_id = latest.run_id

print(f"   Model name: {MODEL_NAME}")
print(f"   Model version: {latest.version}")
print(f"   Run ID: {run_id}")

# Download encoders artifact
encoder_artifact = client.download_artifacts(run_id, "encoders/encoders.pkl")
with open(encoder_artifact, 'rb') as f:
    encoders = pickle.load(f)

print(f"\n‚úÖ Loaded {len(encoders)} encoders\n")

# ==============================================================================
# APPLY SAVED ENCODERS TO SCORING DATA
# ==============================================================================
# Extract features
X_score = df_open[feature_cols].copy()

# Apply saved encoders
for col in categorical_cols:
    X_score[col] = X_score[col].fillna('Unknown').astype(str)
    
    if col in encoders:
        encoder = encoders[col]
        known_classes = set(encoder.classes_)
        
        # Transform using saved encoder, handle unknown categories
        def safe_encode(value):
            if value in known_classes:
                return encoder.transform([value])[0]
            else:
                # Unknown category -> assign max code + 1
                return len(encoder.classes_)
        
        X_score[col] = X_score[col].apply(safe_encode)
        unique_count = X_score[col].nunique()
        print(f"‚úÖ {col}: {unique_count} unique values (using saved encoder)")
    else:
        print(f"‚ö†Ô∏è No saved encoder for {col}")

print(f"\n‚úÖ Prepared {len(X_score):,} rows for scoring")
print(f"‚úÖ Features: {X_score.shape[1]} columns")

## üîç Step 15: Verify Encoding Quality (Diagnostic)

In [None]:
print("\n" + "="*60)
print("ENCODING QUALITY CHECK")
print("="*60)

print("\nUnique values per feature in scoring data:")
for col in X_score.columns:
    nunique = X_score[col].nunique()
    print(f"  {col}: {nunique} unique values")

unique_rows = X_score.drop_duplicates().shape[0]
total_rows = X_score.shape[0]
uniqueness_pct = (unique_rows / total_rows) * 100

print(f"\nüìä Unique row combinations: {unique_rows:,} out of {total_rows:,} ({uniqueness_pct:.1f}%)")

if unique_rows < 100:
    print("\n‚ö†Ô∏è WARNING: Very few unique rows - encoding may be broken!")
elif uniqueness_pct < 10:
    print("\n‚ö†Ô∏è WARNING: Low uniqueness - many deliveries have identical features")
else:
    print("\n‚úÖ GOOD: Encoding has sufficient variation!")

print("="*60)

## üéØ Step 16: Generate Predictions

In [None]:
# Generate predictions
predictions = model.predict(X_score)

# Add predictions to dataframe
df_open['predicted_age_req_date'] = predictions

# Calculate predicted ship date
df_open['predicted_ship_date'] = (
    pd.to_datetime(df_open['Req. Date Header']) + 
    pd.to_timedelta(df_open['predicted_age_req_date'], unit='d')
)

print(f"\n‚úÖ Generated {len(predictions):,} predictions")
print(f"\nüìä Prediction Statistics:")
print(f"   Mean:   {predictions.mean():.2f} days")
print(f"   Median: {np.median(predictions):.2f} days")
print(f"   Min:    {predictions.min():.2f} days (earliest)")
print(f"   Max:    {predictions.max():.2f} days (latest)")
print(f"   Std:    {predictions.std():.2f} days")

print(f"\nüìà Distribution:")
print(f"   Predicted Early (<0):     {(predictions < 0).sum():,} ({(predictions < 0).sum()/len(predictions)*100:.1f}%)")
print(f"   Predicted On-Time (0-3):  {((predictions >= 0) & (predictions <= 3)).sum():,} ({((predictions >= 0) & (predictions <= 3)).sum()/len(predictions)*100:.1f}%)")
print(f"   Predicted Late (>3):      {(predictions > 3).sum():,} ({(predictions > 3).sum()/len(predictions)*100:.1f}%)")
print(f"   Predicted Very Late (>5): {(predictions > 5).sum():,} ({(predictions > 5).sum()/len(predictions)*100:.1f}%)")

# Check for uniqueness
unique_predictions = len(set(predictions))
print(f"\nüîç Unique prediction values: {unique_predictions:,} out of {len(predictions):,}")

if unique_predictions < 10:
    print("\n‚ùå ERROR: Too few unique predictions - encoding is still broken!")
elif unique_predictions < 100:
    print("\n‚ö†Ô∏è WARNING: Limited prediction variety - check your data")
else:
    print("\n‚úÖ EXCELLENT: Predictions show good variation!")

## üëÄ Step 17: View Sample Predictions

In [None]:
# Display sample predictions
sample_cols = [
    'Delivery Number',
    'Sold To Name 1',
    'Delivery Priority',
    'Product Category',
    'Req. Date Header',
    'predicted_age_req_date',
    'predicted_ship_date'
]

available_sample_cols = [c for c in sample_cols if c in df_open.columns]
sample_df = df_open[available_sample_cols].head(20)

print("\nüìã Sample Predictions (First 20 rows):")
print("="*100)
sample_df

## üíæ Step 18: Save Predictions to Lakehouse

In [None]:
print("\n=== Saving Predictions to Lakehouse ===")

# Select essential columns for Power BI
output_cols = [
    'Delivery Number',
    'Sold To Name 1',
    'Delivery Priority',
    'Product Category',
    'Req. Date Header',
    'predicted_age_req_date',
    'predicted_ship_date'
]

# Filter to columns that exist
available_output_cols = [c for c in output_cols if c in df_open.columns]
predictions_df = df_open[available_output_cols].copy()

# Add metadata
predictions_df['prediction_timestamp'] = datetime.now()
predictions_df['model_name'] = MODEL_NAME
predictions_df['model_version'] = latest.version

# Clean column names for Delta table compatibility
predictions_df.columns = (
    predictions_df.columns
    .str.replace(' ', '_', regex=False)
    .str.replace('.', '', regex=False)
    .str.replace('-', '_', regex=False)
)

# Convert datetime columns to strings
date_cols = predictions_df.select_dtypes(include=['datetime64']).columns
for col in date_cols:
    predictions_df[col] = predictions_df[col].astype(str)

# Save to Lakehouse table
table_name = "delivery_lateness_predictions"
spark_df = spark.createDataFrame(predictions_df)
spark_df.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(table_name)

print(f"\n‚úÖ Saved {len(predictions_df):,} predictions to table: {table_name}")
print(f"‚úÖ Columns saved: {len(predictions_df.columns)}")
print(f"\nüìä Prediction Summary:")
print(f"   Early (<0 days):      {(df_open['predicted_age_req_date'] < 0).sum():,}")
print(f"   On-Time (0-3 days):   {((df_open['predicted_age_req_date'] >= 0) & (df_open['predicted_age_req_date'] <= 3)).sum():,}")
print(f"   Late (>3 days):       {(df_open['predicted_age_req_date'] > 3).sum():,}")
print(f"   Very Late (>5 days):  {(df_open['predicted_age_req_date'] > 5).sum():,}")

print(f"\nüéØ Predictions are ready for Power BI!")

---

## ‚úÖ Pipeline Complete!

**What was accomplished:**
1. ‚úÖ Trained AutoML model with MAE ~0.6 days
2. ‚úÖ Saved categorical encoders with the model
3. ‚úÖ Loaded encoders during scoring for consistency
4. ‚úÖ Generated unique predictions for each delivery
5. ‚úÖ Saved predictions to Lakehouse table

**Next Steps for Power BI:**

1. **Add Prediction Table** to your semantic model
2. **Create Relationship**: `Aging[Delivery Number]` ‚Üí `delivery_lateness_predictions[Delivery_Number]`
3. **Add DAX Calculated Columns**:

```dax
// Lateness Category
Lateness Category = 
SWITCH(
    TRUE(),
    delivery_lateness_predictions[predicted_age_req_date] < 0, "Early",
    delivery_lateness_predictions[predicted_age_req_date] <= 3, "On-Time",
    "Late"
)

// At Risk Flag
At Risk = 
delivery_lateness_predictions[predicted_age_req_date] > 3
```

4. **Build Dashboards** for:
   - At-risk deliveries requiring customer outreach
   - Predicted lateness by customer/product/shipping point
   - Daily operations prioritization

5. **Schedule This Notebook** to run daily at 6 AM for fresh predictions

---