# Azure Machine Learning AutoML Training Notebook (SDK v2)

This notebook is an interactive version of the AutoML training pipeline, built on Azure ML SDK v2 for its better storage integration and simpler APIs.

## Overview
- Load configuration from YAML
- Connect to Azure ML workspace
- Load and preprocess data
- Create or retrieve data assets
- Configure AutoML job
- Submit and monitor training
- Generate model explanations (optional; this notebook prints a template for doing so)

## 1. Import Required Libraries

In [None]:
from azure.ai.ml import MLClient, Input
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml import automl
from azure.ai.ml.entities import Environment, Data
from azure.identity import DefaultAzureCredential, AzureCliCredential, InteractiveBrowserCredential
import pandas as pd
import logging
import os
import yaml
from pathlib import Path
from datetime import datetime
from dotenv import load_dotenv
from feature_engineering import FeatureEngineer, validate_data

# Optional interpretability stack. A failed import does not abort the
# notebook; it only switches INTERPRETABILITY_AVAILABLE off.
try:
    from interpret.ext.blackbox import TabularExplainer
    import shap
    import matplotlib.pyplot as plt
    import seaborn as sns
except ImportError as import_error:
    INTERPRETABILITY_AVAILABLE = False
    print(f"⚠ Interpretability libraries not available: {import_error}")
else:
    INTERPRETABILITY_AVAILABLE = True
    print("✓ Interpretability libraries loaded")

print("✓ All required libraries imported")

## 2. Setup Logging Configuration

In [None]:
def setup_logging(config: dict) -> logging.Logger:
    """Configure root logging from the ``output`` section of the config.

    Reads three optional keys from ``config['output']``:
    ``log_level`` (default ``'INFO'``), ``log_to_file`` (default ``False``)
    and ``log_file_path`` (default ``'./logs/training.log'``).

    Args:
        config: Parsed configuration mapping. A missing or null ``output``
            section is treated as empty.

    Returns:
        The logger named after this module (``logging.getLogger(__name__)``).

    Raises:
        ValueError: If ``log_level`` is neither an integer nor the name of a
            standard ``logging`` level.
    """
    output_cfg = config.get('output') or {}
    log_level = output_cfg.get('log_level', 'INFO')
    log_to_file = output_cfg.get('log_to_file', False)
    log_file_path = output_cfg.get('log_file_path', './logs/training.log')

    # Resolve the level before touching any handlers so a bad value fails
    # early. Names are matched case-insensitively ("info" == "INFO") and
    # numeric levels are passed through unchanged.
    if isinstance(log_level, int) and not isinstance(log_level, bool):
        level = log_level
    else:
        level = getattr(logging, str(log_level).strip().upper(), None)
        if not isinstance(level, int):
            raise ValueError(
                f"Invalid log_level {log_level!r}; expected one of "
                "DEBUG, INFO, WARNING, ERROR, CRITICAL"
            )

    # Always log to the console; optionally mirror to a file.
    handlers = [logging.StreamHandler()]
    if log_to_file:
        # The log directory may not exist yet on a fresh checkout.
        Path(log_file_path).parent.mkdir(parents=True, exist_ok=True)
        handlers.append(logging.FileHandler(log_file_path))

    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=handlers,
        force=True  # Replace handlers left over from a previous cell run
    )

    return logging.getLogger(__name__)

print("✓ Logging setup function defined")

## 3. Load Configuration

In [None]:
# Load environment variables
load_dotenv()

# Load configuration from YAML. The encoding is pinned so the file is decoded
# the same way on every platform instead of using the locale default.
config_path = "config.yaml"
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# An empty YAML document parses to None and a top-level list or scalar is also
# possible. Fail here with a clear message rather than later on `config.get`.
if not isinstance(config, dict):
    raise ValueError(
        f"Configuration file '{config_path}' must contain a YAML mapping at the top level"
    )

# Setup logging
logger = setup_logging(config)

print("=" * 70)
print("Azure ML AutoML Training (SDK v2)")
print(f"Experiment: {config['experiment']['name']}")
print(f"Task Type: {config['automl']['task']}")
print("=" * 70)

## 4. Azure Authentication

In [None]:
def get_credential(auth_method: str):
    """Return the Azure credential object selected by ``auth_method``.

    ``"cli"`` yields an ``AzureCliCredential`` and ``"interactive"`` an
    ``InteractiveBrowserCredential``. Any other value falls back to
    ``DefaultAzureCredential``. The chosen method is echoed to stdout.
    """
    print(f"Authentication method: {auth_method}")

    # Each entry pairs the message to print with the credential class to build.
    known_methods = {
        "cli": ("Using Azure CLI authentication", AzureCliCredential),
        "interactive": ("Using Interactive Browser authentication", InteractiveBrowserCredential),
    }
    fallback = ("Using Default Azure Credential (tries multiple methods)", DefaultAzureCredential)

    banner, credential_cls = known_methods.get(auth_method, fallback)
    print(banner)
    return credential_cls()

# Resolve the configured authentication method ("default" when unset),
# lower-case it, and build the matching credential object.
auth_config = config.get('authentication', {})
configured_method = auth_config.get('method', 'default')
auth_method = configured_method.lower()
credential = get_credential(auth_method)

print("✓ Credentials obtained")

## 5. Connect to Azure ML Workspace

In [None]:
# Workspace coordinates come from environment variables.
subscription_id, resource_group, workspace_name = (
    os.getenv(variable_name)
    for variable_name in ("AZURE_SUBSCRIPTION_ID", "AZURE_RESOURCE_GROUP", "AZURE_ML_WORKSPACE")
)

# All three values are required; an unset or empty variable aborts here.
if not (subscription_id and resource_group and workspace_name):
    raise ValueError("Missing Azure credentials in .env file. Please check AZURE_SUBSCRIPTION_ID, AZURE_RESOURCE_GROUP, and AZURE_ML_WORKSPACE")

print("Connecting to Azure ML Workspace...")
workspace_scope = {
    "subscription_id": subscription_id,
    "resource_group_name": resource_group,
    "workspace_name": workspace_name,
}
ml_client = MLClient(credential=credential, **workspace_scope)

print(f"✓ Connected to workspace: {workspace_name}")
for detail in (f"  Resource Group: {resource_group}", f"  Subscription: {subscription_id}"):
    print(detail)

## 6. Data Loading and Preprocessing

In [None]:
def load_and_preprocess_data(config: dict) -> pd.DataFrame:
    """Load the input file, then validate, drop columns and engineer features.

    Steps, in order:
      1. Read ``config['data']['input_path']`` with the pandas reader that
         matches its extension (``.csv``, ``.parquet`` or ``.xlsx``, matched
         case-insensitively).
      2. If ``data.validation`` is non-empty, pass the frame through
         ``validate_data``.
      3. Drop the columns listed in ``data.columns_to_drop`` that are present
         in the frame; names that are absent are ignored.
      4. Unless ``feature_engineering.enabled`` is false, apply
         ``FeatureEngineer(...).apply_transformations``.

    Args:
        config: Parsed configuration mapping. ``data.input_path`` is required.

    Returns:
        The preprocessed DataFrame.

    Raises:
        ValueError: If the input path has an unsupported extension.
    """
    data_config = config['data']
    input_path = data_config['input_path']

    print(f"Loading data from {input_path}")

    # Pick the reader from the file extension. Matching on the lower-cased
    # text accepts 'DATA.CSV' and pathlib.Path values as well as 'data.csv'.
    readers = {
        '.csv': pd.read_csv,
        '.parquet': pd.read_parquet,
        '.xlsx': pd.read_excel,
    }
    path_text = str(input_path).lower()
    reader = next(
        (read_fn for extension, read_fn in readers.items() if path_text.endswith(extension)),
        None,
    )
    if reader is None:
        raise ValueError(f"Unsupported file format: {input_path}")
    df = reader(input_path)

    print(f"✓ Data loaded. Shape: {df.shape}")

    # Validate data (skipped when the section is missing, null or empty)
    validation_config = data_config.get('validation') or {}
    if validation_config:
        print("Validating data...")
        df = validate_data(df, validation_config)
        print(f"✓ Data validated. Shape after validation: {df.shape}")

    # Drop specified columns; only names actually present are dropped so a
    # stale entry in the config does not raise.
    columns_to_drop = data_config.get('columns_to_drop') or []
    existing_cols = [col for col in columns_to_drop if col in df.columns]
    if existing_cols:
        df = df.drop(columns=existing_cols)
        print(f"✓ Dropped {len(existing_cols)} columns")

    # Apply feature engineering (enabled unless explicitly switched off)
    fe_config = config.get('feature_engineering') or {}
    if fe_config.get('enabled', True):
        print("Applying feature engineering...")
        engineer = FeatureEngineer(fe_config)
        df = engineer.apply_transformations(df)
        print(f"✓ Feature engineering complete. Final shape: {df.shape}")

    return df

# Check if we should use existing dataset or load from file
data_config = config['data']
use_existing = data_config.get('use_existing_dataset', False)

if use_existing:
    # The registered asset is fetched later, so there is no local frame.
    print(f"Will use existing dataset: {data_config['dataset_name']}")
    df = None
else:
    df = load_and_preprocess_data(config)
    print("\nDataset Info:")
    # DataFrame.info() writes its report to stdout and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None".
    df.info()
    print("\nFirst few rows:")
    display(df.head())

## 7. Save Preprocessed Data (if applicable)

In [None]:
# Persist the preprocessed frame so it can be uploaded as a data asset.
# With no local frame (existing dataset in use) there is nothing to write.
if df is None:
    preprocessed_path = None
    print("Using existing dataset - no preprocessing needed")
else:
    output_config = config.get('output', {})
    keep_copy = output_config.get('save_preprocessed_data', True)

    if keep_copy:
        # Timestamped file inside the configured output directory
        preprocessed_dir = Path(output_config.get('preprocessed_data_path', './data/preprocessed'))
        preprocessed_dir.mkdir(parents=True, exist_ok=True)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        preprocessed_path = preprocessed_dir / f"preprocessed_data_{timestamp}.csv"
        saved_message = f"✓ Preprocessed data saved to: {preprocessed_path}"
    else:
        # Timestamped temporary file in the current working directory
        preprocessed_path = f"temp_preprocessed_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        saved_message = f"✓ Temporary file created: {preprocessed_path}"

    df.to_csv(preprocessed_path, index=False)
    print(saved_message)

## 8. Get or Create Data Asset

In [None]:
def get_or_create_data_asset(ml_client: MLClient, data_path: str, config: dict) -> Data:
    """Fetch a registered data asset or register a new one from ``data_path``.

    When ``config['data']['use_existing_dataset']`` is true the asset named
    ``config['data']['dataset_name']`` is fetched. ``dataset_version`` selects
    a specific version; a missing, blank or ``"latest"`` value (any casing)
    selects the asset labelled ``latest``. Otherwise a new ``URI_FILE`` asset
    is created from ``data_path`` and registered.

    Args:
        ml_client: Client for the target workspace.
        data_path: Path of the file to register. Required only when a new
            asset is created; ignored when an existing asset is used.
        config: Parsed configuration mapping with a ``data`` section.

    Returns:
        The fetched or newly registered data asset.

    Raises:
        ValueError: If a new asset must be created but ``data_path`` is empty.
        Exception: Any error raised while fetching an existing asset is
            reported on stdout and re-raised unchanged.
    """
    data_config = config['data']
    dataset_name = data_config['dataset_name']
    use_existing = data_config.get('use_existing_dataset', False)

    if use_existing:
        requested_version = data_config.get('dataset_version', 'latest')
        # A null/blank version or any casing of "latest" means "use the
        # latest label"; everything else is treated as an explicit version.
        wants_latest = (
            requested_version is None
            or str(requested_version).strip().lower() in ('', 'latest')
        )

        print(f"Getting existing data asset: {dataset_name} (version: {requested_version})")

        try:
            if wants_latest:
                data_asset = ml_client.data.get(name=dataset_name, label="latest")
            else:
                # YAML may deliver the version as an int; the API call here
                # is given a string.
                data_asset = ml_client.data.get(name=dataset_name, version=str(requested_version).strip())

            print(f"✓ Data asset obtained: {dataset_name} v{data_asset.version}")
            return data_asset

        except Exception as e:
            print(f"❌ Failed to get data asset '{dataset_name}': {str(e)}")
            print("Tip: Verify the data asset exists in Azure ML Studio")
            raise

    # Creating a new asset needs a source path. Fail with a clear message
    # instead of handing path=None to the SDK.
    if not data_path:
        raise ValueError(
            "data_path is required to create a new data asset when "
            "'use_existing_dataset' is false"
        )

    print(f"Creating new data asset from: {data_path}")

    # NOTE(review): the asset is registered as URI_FILE. AutoML tabular jobs
    # in SDK v2 are documented to expect MLTable training data — confirm this
    # asset type is accepted by the downstream job.
    data_asset = Data(
        name=dataset_name,
        description=data_config.get('dataset_description', 'Training dataset'),
        path=data_path,
        type=AssetTypes.URI_FILE
    )

    print("Uploading and registering data asset...")
    data_asset = ml_client.data.create_or_update(data_asset)

    print(f"✓ Data asset created: {dataset_name} v{data_asset.version}")
    return data_asset

# Resolve the data asset. The local path is forwarded as a string, or as None
# when no preprocessed file was written.
asset_source = str(preprocessed_path) if preprocessed_path else None
data_asset = get_or_create_data_asset(ml_client, asset_source, config)

print("\nData Asset Details:")
for field_label, field_value in (
    ("Name", data_asset.name),
    ("Version", data_asset.version),
    ("Type", data_asset.type),
):
    print(f"  {field_label}: {field_value}")

## 9. Get or Create Environment

In [None]:
def get_or_create_environment(
    ml_client: MLClient,
    env_name: str,
    conda_file_path: str,
    env_version: str = "1",
) -> Environment:
    """Return the named environment, registering it from a conda file if needed.

    Args:
        ml_client: Client for the target workspace.
        env_name: Name of the environment to fetch or create.
        conda_file_path: Local conda specification used when the environment
            has to be created.
        env_version: Version to fetch or create. Defaults to ``"1"``, the
            value this function previously hard-coded. Pass a new version
            after changing the conda file, otherwise the already-registered
            version is returned as-is.

    Returns:
        The existing or newly registered environment.

    Raises:
        FileNotFoundError: If the environment must be created and
            ``conda_file_path`` does not exist.
    """
    env_version = str(env_version)

    # Try to get existing environment
    try:
        print(f"Checking for existing environment: {env_name}:{env_version}")
        env = ml_client.environments.get(name=env_name, version=env_version)
        print(f"✓ Environment found: {env_name}:{env_version}")
        return env
    except Exception as e:
        # Any failure of the lookup (not only "not found") lands here and
        # falls through to creation; the original error text is printed.
        print(f"Environment not found, creating new one: {str(e)}")

    print(f"Creating new environment from: {conda_file_path}")

    # Verify the conda file exists before asking the service to build from it
    if not os.path.exists(conda_file_path):
        raise FileNotFoundError(f"Conda file not found: {conda_file_path}")

    # Conda dependencies are layered on top of the base image below
    env = Environment(
        name=env_name,
        description="AutoML training environment with SDK v2",
        conda_file=conda_file_path,
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
        version=env_version
    )

    print("Registering environment with Azure ML workspace...")
    env = ml_client.environments.create_or_update(env)

    print(f"✓ Environment created: {env_name}:{env.version}")
    return env

# Read the environment name and conda file from the config (with defaults),
# then fetch or register the environment.
environment_cfg = config.get('environment', {})
env_name = environment_cfg.get('name', 'secondary-cvd-risk')
conda_file = environment_cfg.get('conda_file', 'conda_env_v_1_0_0.yml')
environment = get_or_create_environment(ml_client, env_name, conda_file)

print("\nEnvironment Details:")
for field_label, field_value in (("Name", environment.name), ("Version", environment.version)):
    print(f"  {field_label}: {field_value}")

## 10. Create AutoML Job

In [None]:
def create_automl_job(ml_client: MLClient, data_asset: Data, config: dict, environment: Environment = None):
    """Build (but do not submit) an AutoML job from the configuration.

    Args:
        ml_client: Workspace client. Not used while building the job; kept so
            the signature stays stable for callers.
        data_asset: Registered data asset, referenced by the job as
            ``azureml:<name>:<version>``.
        config: Parsed configuration mapping. Reads ``automl.task``,
            ``automl.primary_metric``, the optional ``automl.training``,
            ``automl.featurization`` and ``automl.models`` sections,
            ``data.label_column``, ``compute.cluster_name`` and
            ``experiment.name``.
        environment: Optional environment. Only its name and version are
            printed here; it is not attached to the job.

    Returns:
        The configured AutoML job object.

    Raises:
        ValueError: If ``automl.task`` is not ``classification``,
            ``regression`` or ``forecasting``.
    """
    automl_cfg = config['automl']
    data_config = config['data']
    compute_config = config['compute']
    experiment_config = config['experiment']
    # Every training setting has a default, so a missing section is allowed.
    training_cfg = automl_cfg.get('training') or {}

    task_type = automl_cfg['task'].lower()

    print(f"Creating AutoML {task_type} job")
    if environment:
        print(f"Using environment: {environment.name}:{environment.version}")

    job_factories = {
        "classification": automl.classification,
        "regression": automl.regression,
        "forecasting": automl.forecasting,
    }
    if task_type not in job_factories:
        raise ValueError(f"Unsupported task type: {task_type}")

    # NOTE(review): AutoML tabular jobs in SDK v2 are documented to expect
    # MLTable training data; a URI_FILE input may be rejected at submission —
    # confirm against the asset type registered upstream.
    training_data_input = Input(
        type=AssetTypes.MLTABLE if data_asset.type == AssetTypes.MLTABLE else AssetTypes.URI_FILE,
        path=f"azureml:{data_asset.name}:{data_asset.version}"
    )

    # Arguments shared by all three task types
    job_kwargs = {
        "compute": compute_config['cluster_name'],
        "experiment_name": experiment_config['name'],
        "training_data": training_data_input,
        "target_column_name": data_config['label_column'],
        "primary_metric": automl_cfg['primary_metric'],
        "n_cross_validations": training_cfg.get('n_cross_validations', 5),
        "tags": {"framework": "AutoML", "sdk_version": "v2"},
    }
    # Explainability is requested for classification and regression only
    if task_type != "forecasting":
        job_kwargs["enable_model_explainability"] = True

    job = job_factories[task_type](**job_kwargs)

    # Set training limits. `max_concurrent_iterations` is a parallelism
    # setting, so it maps to `max_concurrent_trials`. Passing it as
    # `max_trials` would cap the TOTAL number of trials instead. A total cap
    # is applied only when `max_trials` is configured explicitly.
    limits = {
        "timeout_minutes": training_cfg.get('experiment_timeout_minutes', 60),
        "trial_timeout_minutes": training_cfg.get('iteration_timeout_minutes', 20),
        "max_concurrent_trials": training_cfg.get('max_concurrent_iterations', 4),
        "enable_early_termination": training_cfg.get('enable_early_stopping', True),
    }
    if training_cfg.get('max_trials') is not None:
        limits["max_trials"] = training_cfg['max_trials']
    job.set_limits(**limits)

    # Configure featurization. PyYAML parses an unquoted `off` as the boolean
    # False, so both spellings are accepted as "featurization off".
    featurization_mode = (automl_cfg.get('featurization') or {}).get('mode')
    if featurization_mode is False or str(featurization_mode).lower() == 'off':
        job.set_featurization(mode='off')

    # Set allowed/blocked models
    models_cfg = automl_cfg.get('models') or {}
    if models_cfg.get('allowed'):
        job.set_training(allowed_training_algorithms=models_cfg['allowed'])
    if models_cfg.get('blocked'):
        job.set_training(blocked_training_algorithms=models_cfg['blocked'])

    print(f"✓ AutoML job configured: {task_type.capitalize()}, Primary Metric: {automl_cfg['primary_metric']}")
    return job

# Build the job definition from the data asset, config and environment.
# Nothing is submitted at this point.
job = create_automl_job(
    ml_client=ml_client,
    data_asset=data_asset,
    config=config,
    environment=environment,
)
print("\n✓ AutoML job created and ready to submit")

## 11. Submit AutoML Job

In [None]:
# Submit the configured job and echo the identifiers returned by the service.
print("Submitting AutoML job...")

returned_job = ml_client.jobs.create_or_update(job)

rule = "=" * 70
submission_report = [
    rule,
    "✓ Job submitted successfully!",
    rule,
    f"  Job name: {returned_job.name}",
    f"  Job ID: {returned_job.id}",
    f"  Status: {returned_job.status}",
    f"  Studio URL: {returned_job.studio_url}",
    rule,
]
print("\n".join(submission_report))

## 12. Monitor Job Progress (Optional)

In [None]:
# Running this cell streams the job logs and blocks until the job completes.
# Skip it to monitor the job from the Studio URL instead.
rule = "=" * 70
monitoring_banner = (
    "Monitoring job progress...",
    "You can monitor the job in Azure ML Studio:",
    f"  {returned_job.studio_url}",
    "",
    "Waiting for job completion...",
    "(You can safely interrupt this cell and monitor in the portal)",
    rule,
)
for banner_line in monitoring_banner:
    print(banner_line)

# Blocks here until the job finishes
ml_client.jobs.stream(returned_job.name)

# Re-fetch the job to read its final status
job_status = ml_client.jobs.get(returned_job.name)

print(rule)
print(f"Job Status: {job_status.status}")
print(rule)

## 13. Get Job Results

In [None]:
# Fetch the job again and summarise its outcome.
final_job = ml_client.jobs.get(returned_job.name)

rule = "=" * 70
results_header = [
    rule,
    "Job Results",
    rule,
    f"Job Name: {final_job.name}",
    f"Status: {final_job.status}",
    f"Studio URL: {final_job.studio_url}",
    rule,
]
print("\n".join(results_header))

if final_job.status != "Completed":
    # Any status other than "Completed" is reported as-is
    print(f"\n⚠ Job finished with status: {final_job.status}")
    print("Check the Studio URL for more details.")
else:
    success_lines = (
        "\n✓ Training completed successfully!",
        "\nNext steps:",
        "1. View detailed results in Azure ML Studio",
        "2. Download the best model",
        "3. Generate model explanations (see next cell)",
        "4. Deploy the model for inference",
    )
    for success_line in success_lines:
        print(success_line)

## 14. Generate Model Explanations (Optional)

In [None]:
# This cell only prints guidance; it does not compute any explanations.
if INTERPRETABILITY_AVAILABLE:
    template_lines = (
        "Model explanation template:",
        "",
        "# After downloading the best model, you can generate explanations:",
        "",
        "# 1. Download and load the best model",
        "# best_model = ml_client.models.download(name='your_model_name', version='latest', download_path='./models')",
        "",
        "# 2. Load and preprocess the data (same as training)",
        "# df = load_and_preprocess_data(config)",
        "# feature_columns = [col for col in df.columns if col != data_config['label_column']]",
        "# X = df[feature_columns]",
        "# y = df[data_config['label_column']]",
        "",
        "# 3. Create SHAP explainer",
        "# explainer = shap.Explainer(best_model, X.sample(min(1000, len(X))))",
        "# shap_values = explainer(X.sample(min(100, len(X))))",
        "",
        "# 4. Generate SHAP plots",
        "# shap.summary_plot(shap_values, X.sample(min(100, len(X))), show=True)",
        "",
        "For complete explanation code, see generate_explanations.py",
    )
    print("\n".join(template_lines))
else:
    print("⚠ Interpretability libraries not available.")
    print("Install with: pip install interpret-community shap")

## 15. Cleanup (Optional)

In [None]:
# Remove the preprocessed CSV only when it was written as a temporary file,
# i.e. a path exists and `save_preprocessed_data` is switched off.
wrote_temp_file = bool(preprocessed_path) and not config.get('output', {}).get('save_preprocessed_data', True)

if not wrote_temp_file:
    print("No temporary files to clean up")
else:
    temp_file = Path(preprocessed_path)
    if temp_file.exists():
        temp_file.unlink()
        print(f"✓ Temporary file removed: {preprocessed_path}")