# Vertex AI Pipelines - End-to-End MLOps Orchestration

This notebook demonstrates pipeline orchestration for MLOps workflows. All pipelines below are executed with the simple local pipeline runner; integration with Vertex AI Pipelines for cloud execution is listed under "Next Steps" in the final summary.

## Features Covered
- Simple local pipeline execution
- Advanced Vertex AI Pipelines integration (outlined as a next step; not executed in this notebook)
- End-to-end MLOps workflow orchestration
- Pipeline monitoring and management
- Component composition and reusability

**Author:** MLOps Team  
**Version:** 1.0.0

In [None]:
import sys
import os
import logging
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# Add project root to path
# The parent directory is appended so that the `src` package imported in the
# next cell can be resolved (assumes the notebook's working directory is one
# level below the project root - TODO confirm).
sys.path.append('..')

# Configure logging
# INFO level with timestamp, logger name and level in every record.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

print("üì¶ Libraries imported successfully")

## 1. Pipeline Orchestration Setup

In [None]:
# Import pipeline orchestration modules
from src.pipelines import (
    SimplePipeline, LocalPipelineRunner, SimplePipelineConfig,
    PipelineStep, PipelineResult, StepStatus, PipelineType,
    create_pipeline_runner, run_sample_pipeline
)

from src.config import Config
from src.utils import setup_logging

# Initialize configuration
# `Config` is constructed with no arguments; what it loads is defined in
# src.config and is not visible here.
config = Config()
# Notebook-wide logger; it is used by the custom data quality check step
# defined in the "Advanced Pipeline Features" section.
logger = setup_logging(__name__)

print("üöÄ Pipeline orchestration modules imported")
# Show the working directory because the relative paths used in this notebook
# ('..' on sys.path, './models/...') are resolved against it.
print(f"üìÅ Working directory: {os.getcwd()}")

## 2. Simple Local Pipeline Execution

First, let's demonstrate simple local pipeline execution for development and testing.

In [None]:
# Create pipeline runner
# The same `runner` instance is reused by later cells to create and run the
# deployment and full MLOps pipelines and to supply steps for the custom one.
runner = create_pipeline_runner()

# Configure simple training pipeline
# NOTE(review): the exact semantics of `fail_fast` / `enable_retries` are
# defined by SimplePipelineConfig in src.pipelines and are not visible here -
# presumably "stop at the first failed step" and "retry failed steps"; confirm.
training_config = SimplePipelineConfig(
    name="iris_training_pipeline",
    description="Iris classification training pipeline",
    pipeline_type=PipelineType.TRAINING,
    parameters={
        'algorithm': 'random_forest',
        # The deployment pipeline configured later reads the model from this same path.
        'output_model_path': './models/iris_pipeline_model.joblib',
        'random_state': 42
    },
    fail_fast=True,
    enable_retries=True
)

# Echo the key settings so the cell output documents what will be run.
print(f"üìã Training pipeline configured: {training_config.name}")
print(f"üéØ Algorithm: {training_config.parameters['algorithm']}")
print(f"üíæ Output path: {training_config.parameters['output_model_path']}")

In [None]:
# Create training pipeline
# The runner builds the pipeline object from the config; the steps it contains
# are decided inside src.pipelines and are only listed here.
training_pipeline = runner.create_training_pipeline(training_config)

# Print a 1-based numbered list of the steps.
print(f"üèóÔ∏è  Pipeline created with {len(training_pipeline.steps)} steps:")
for i, step in enumerate(training_pipeline.steps, 1):
    print(f"   {i}. {step.name}: {step.description}")

# Repeat the execution-related settings taken from the config object.
print(f"\n‚öôÔ∏è  Pipeline configuration:")
print(f"   - Fail fast: {training_config.fail_fast}")
print(f"   - Retries enabled: {training_config.enable_retries}")
print(f"   - Pipeline type: {training_config.pipeline_type.value}")

In [None]:
# Execute training pipeline
print("üöÄ Starting pipeline execution...")
# Wall-clock timer measured in the notebook, independent of the duration the
# pipeline result reports about itself.
start_time = datetime.now()

# The pipeline is addressed by its config name, not by the pipeline object -
# presumably the runner registered it under that name when it was created;
# confirm against src.pipelines.
result = runner.run_pipeline(training_config.name)

execution_time = datetime.now() - start_time

print(f"\n‚úÖ Pipeline execution completed!")
print(f"üìä Results:")
print(f"   - Status: {result.status}")
print(f"   - Steps completed: {result.steps_completed}/{result.steps_total}")
print(f"   - Success rate: {result.success_rate:.1%}")
# "Duration" comes from the result object; "Execution time" is the notebook's
# own measurement taken around the run_pipeline call.
print(f"   - Duration: {result.duration_seconds:.2f}s")
print(f"   - Execution time: {execution_time.total_seconds():.2f}s")

In [None]:
# Display pipeline metrics
# `result` is the training pipeline result produced by the previous cell.
# Metrics are printed as a list and then drawn as one bar per metric.
if result.metrics:
    print("üìà Model Performance Metrics:")
    for metric_name, value in result.metrics.items():
        print(f"   - {metric_name.title()}: {value:.4f}")

    # Create metrics visualization
    fig, ax = plt.subplots(figsize=(10, 6))

    # One row per metric, with the metric name title-cased for display.
    metrics_df = pd.DataFrame([
        {'Metric': k.title(), 'Value': v}
        for k, v in result.metrics.items()
    ])

    bars = ax.bar(metrics_df['Metric'], metrics_df['Value'],
                  color=['#3498db', '#e74c3c', '#2ecc71', '#f39c12'])

    ax.set_title('Model Performance Metrics', fontsize=16, fontweight='bold')
    ax.set_ylabel('Score', fontsize=12)
    # Leave headroom above 1.0: the value labels are drawn at height + 0.01,
    # so with an upper limit of exactly 1 the label of a perfect score would
    # fall outside the axes. 1.1 matches the success-rate chart further down.
    ax.set_ylim(0, 1.1)

    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                f'{height:.3f}', ha='center', va='bottom', fontweight='bold')

    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    print("‚ÑπÔ∏è  No metrics available in pipeline result")

## 3. Deployment Pipeline

Now let's create and execute a deployment pipeline.

In [None]:
# Configure deployment pipeline
# `fail_fast` / `enable_retries` are not passed here, so whatever defaults
# SimplePipelineConfig defines apply (not visible in this notebook).
deployment_config = SimplePipelineConfig(
    name="iris_deployment_pipeline",
    description="Iris model deployment pipeline",
    pipeline_type=PipelineType.DEPLOYMENT,
    parameters={
        # Same path the training pipeline was configured to write its model to.
        'model_path': './models/iris_pipeline_model.joblib',
        'endpoint_name': 'iris-classification-endpoint',
        'machine_type': 'n1-standard-2',
        'min_replicas': 1,
        'max_replicas': 3
    }
)

# Create deployment pipeline
deployment_pipeline = runner.create_deployment_pipeline(deployment_config)

# Print a 1-based numbered list of the steps the runner generated.
print(f"üö¢ Deployment pipeline created with {len(deployment_pipeline.steps)} steps:")
for i, step in enumerate(deployment_pipeline.steps, 1):
    print(f"   {i}. {step.name}: {step.description}")

In [None]:
# Run the deployment pipeline and report its outcome
print("üöÄ Starting deployment pipeline...")

deployment_result = runner.run_pipeline(deployment_config.name)

print(f"\n‚úÖ Deployment pipeline completed!")
print(f"üìä Results:")
print(f"   - Status: {deployment_result.status}")
print(f"   - Steps completed: {deployment_result.steps_completed}/{deployment_result.steps_total}")
print(f"   - Success rate: {deployment_result.success_rate:.1%}")
print(f"   - Duration: {deployment_result.duration_seconds:.2f}s")

# Outputs are optional; short strings are shown verbatim, anything else
# (long strings, objects) is summarized by its type name.
if deployment_result.outputs:
    print(f"\nüîó Deployment Outputs:")
    for key, value in deployment_result.outputs.items():
        is_short_text = isinstance(value, str) and len(value) < 100
        shown = value if is_short_text else f"<{type(value).__name__}>"
        print(f"   - {key}: {shown}")

## 4. Full End-to-End MLOps Pipeline

Let's create and execute a complete end-to-end MLOps pipeline.

In [None]:
# Configure full MLOps pipeline
# How each parameter is interpreted is decided by the runner's full MLOps
# pipeline in src.pipelines; only the values are set here.
mlops_config = SimplePipelineConfig(
    name="full_mlops_pipeline",
    description="Complete end-to-end MLOps pipeline",
    pipeline_type=PipelineType.FULL_MLOPS,
    parameters={
        'data_source': 'iris_dataset',
        'algorithm': 'random_forest',
        'deploy_model': True,
        'endpoint_name': 'mlops-iris-endpoint',
        # NOTE(review): presumably the minimum score the model must reach
        # before it is deployed - confirm against the pipeline implementation.
        'model_validation_threshold': 0.85,
        'enable_monitoring': True
    },
    fail_fast=False,  # Continue on errors for demo
    enable_retries=True
)

# Create full MLOps pipeline
# `mlops_pipeline` is kept because the step analysis cell later inspects
# its `steps` after execution.
mlops_pipeline = runner.create_full_mlops_pipeline(mlops_config)

print(f"üåü Full MLOps pipeline created with {len(mlops_pipeline.steps)} steps:")
for i, step in enumerate(mlops_pipeline.steps, 1):
    print(f"   {i}. {step.name}: {step.description}")

# Echo the feature switches taken from the config parameters.
print(f"\n‚öôÔ∏è  Pipeline features:")
print(f"   - Model deployment: {mlops_config.parameters['deploy_model']}")
print(f"   - Monitoring: {mlops_config.parameters['enable_monitoring']}")
print(f"   - Algorithm: {mlops_config.parameters['algorithm']}")
print(f"   - Validation threshold: {mlops_config.parameters['model_validation_threshold']}")

In [None]:
# Execute full MLOps pipeline with progress tracking
# NOTE(review): no progress is printed by this cell itself; any step-by-step
# output would have to come from the runner (e.g. its logging) - confirm.
print("üöÄ Starting full MLOps pipeline...")
print("üìä Progress will be tracked step by step\n")

# Execute pipeline
mlops_result = runner.run_pipeline(mlops_config.name)

print(f"\nüéâ Full MLOps pipeline completed!")
print(f"üìä Final Results:")
# `.upper()` assumes `status` is a string (or str-like) - TODO confirm.
print(f"   - Overall Status: {mlops_result.status.upper()}")
print(f"   - Steps completed: {mlops_result.steps_completed}/{mlops_result.steps_total}")
print(f"   - Success rate: {mlops_result.success_rate:.1%}")
print(f"   - Total duration: {mlops_result.duration_seconds:.2f}s")
# `strftime` assumes start_time / end_time are set datetime objects; a result
# without them (None) would raise here - TODO confirm they are always filled.
print(f"   - Start time: {mlops_result.start_time.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"   - End time: {mlops_result.end_time.strftime('%Y-%m-%d %H:%M:%S')}")

## 5. Pipeline Performance Analysis

In [None]:
# Collect the three pipeline results under display names
pipeline_results = {
    'Training Pipeline': result,
    'Deployment Pipeline': deployment_result,
    'Full MLOps Pipeline': mlops_result
}

# Build one comparison row per pipeline from the fields of its result object
performance_data = [
    {
        'Pipeline': name,
        'Status': res.status,
        'Steps Completed': res.steps_completed,
        'Total Steps': res.steps_total,
        'Success Rate': res.success_rate,
        'Duration (s)': res.duration_seconds
    }
    for name, res in pipeline_results.items()
]

# The DataFrame is reused by the visualization cell that follows
performance_df = pd.DataFrame(performance_data)
print("üìä Pipeline Performance Comparison:")
print(performance_df.to_string(index=False))

In [None]:
# Draw the pipeline comparison as a 2x2 grid of charts
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
success_ax, duration_ax = axes[0]
steps_ax, status_ax = axes[1]
pipeline_labels = performance_df['Pipeline']

# Top-left: success rate per pipeline (y axis padded slightly above 1.0)
success_ax.bar(pipeline_labels, performance_df['Success Rate'],
               color=['#3498db', '#e74c3c', '#2ecc71'])
success_ax.set_title('Pipeline Success Rates', fontweight='bold')
success_ax.set_ylabel('Success Rate')
success_ax.set_ylim(0, 1.1)
success_ax.tick_params(axis='x', rotation=45)

# Top-right: execution duration per pipeline
duration_ax.bar(pipeline_labels, performance_df['Duration (s)'],
                color=['#f39c12', '#9b59b6', '#1abc9c'])
duration_ax.set_title('Pipeline Execution Duration', fontweight='bold')
duration_ax.set_ylabel('Duration (seconds)')
duration_ax.tick_params(axis='x', rotation=45)

# Bottom-left: completed vs. total steps as side-by-side bars
x = range(len(performance_df))
width = 0.35
completed_positions = [pos - width/2 for pos in x]
total_positions = [pos + width/2 for pos in x]
steps_ax.bar(completed_positions, performance_df['Steps Completed'], width,
             label='Completed', color='#2ecc71')
steps_ax.bar(total_positions, performance_df['Total Steps'], width,
             label='Total', color='#34495e')
steps_ax.set_title('Pipeline Steps Completion', fontweight='bold')
steps_ax.set_ylabel('Number of Steps')
steps_ax.set_xticks(x)
steps_ax.set_xticklabels(pipeline_labels, rotation=45)
steps_ax.legend()

# Bottom-right: share of pipelines per final status;
# green for 'completed', red for every other status
status_counts = performance_df['Status'].value_counts()
colors = []
for status in status_counts.index:
    colors.append('#2ecc71' if status == 'completed' else '#e74c3c')
status_ax.pie(status_counts.values, labels=status_counts.index, autopct='%1.1f%%',
              colors=colors, startangle=90)
status_ax.set_title('Pipeline Status Distribution', fontweight='bold')

plt.tight_layout()
plt.show()

## 6. Pipeline Management and Monitoring

In [None]:
# List all pipelines and their results
all_pipelines = runner.list_pipelines()
print(f"üìã Registered Pipelines ({len(all_pipelines)}):")

for i, pipeline_name in enumerate(all_pipelines, 1):
    # Use a dedicated name for the looked-up result so this loop does not
    # overwrite the notebook-level `result` (the training pipeline result
    # that the metrics and performance-comparison cells read).
    pipeline_result = runner.get_pipeline_result(pipeline_name)
    if pipeline_result:
        print(f"   {i}. {pipeline_name}")
        print(f"      - Status: {pipeline_result.status}")
        print(f"      - Duration: {pipeline_result.duration_seconds:.2f}s")
        print(f"      - Success Rate: {pipeline_result.success_rate:.1%}")
        if pipeline_result.error_message:
            # Show at most 100 characters and mark the cut with an ellipsis
            # only when the message was actually shortened.
            error_text = pipeline_result.error_message
            if len(error_text) > 100:
                error_text = error_text[:100] + '...'
            print(f"      - Error: {error_text}")
    else:
        # Registered but never run (or the runner kept no result for it).
        print(f"   {i}. {pipeline_name} (No results available)")
    print()

In [None]:
# Pipeline step analysis
# Prints a per-step table for the full MLOps pipeline and draws one colored
# bar per step showing its final status. Skipped when the pipeline has no steps.
if mlops_pipeline.steps:
    print("üîç Detailed Step Analysis (Full MLOps Pipeline):")

    step_analysis = []
    for i, step in enumerate(mlops_pipeline.steps, 1):
        # Steps without both timestamps are reported with a duration of 0.
        duration = 0
        if step.start_time and step.end_time:
            duration = (step.end_time - step.start_time).total_seconds()

        step_analysis.append({
            'Step': f"{i}. {step.name}",
            'Status': step.status.value,
            'Duration (s)': f"{duration:.2f}",
            'Retries': step.retry_count,
            # Descriptions longer than 50 characters are cut and marked with '...'.
            'Description': step.description[:50] + '...' if len(step.description) > 50 else step.description
        })

    step_df = pd.DataFrame(step_analysis)
    print(step_df.to_string(index=False))

    # Step status visualization
    status_colors = {
        'completed': '#2ecc71',
        'failed': '#e74c3c',
        'pending': '#f39c12',
        'running': '#3498db',
        'skipped': '#95a5a6'
    }

    fig, ax = plt.subplots(figsize=(12, 6))

    step_statuses = [step.status.value for step in mlops_pipeline.steps]
    # Statuses missing from the mapping fall back to a neutral dark color.
    colors = [status_colors.get(status, '#34495e') for status in step_statuses]

    # Every bar has height 1; only the color carries information.
    bars = ax.bar(range(len(step_statuses)), [1] * len(step_statuses), color=colors)
    ax.set_title('Pipeline Step Status Overview', fontsize=16, fontweight='bold')
    ax.set_xlabel('Pipeline Steps')
    ax.set_ylabel('Status')
    ax.set_xticks(range(len(mlops_pipeline.steps)))
    ax.set_xticklabels([f"{i+1}. {step.name}" for i, step in enumerate(mlops_pipeline.steps)],
                       rotation=45, ha='right')
    ax.set_yticks([])

    # Add legend
    # dict.fromkeys de-duplicates while keeping first-seen order, so the legend
    # entries appear in the same order on every run (a set of strings can
    # iterate in a different order from one interpreter session to the next).
    unique_statuses = list(dict.fromkeys(step_statuses))
    legend_elements = [plt.Rectangle((0,0),1,1, facecolor=status_colors.get(status, '#34495e'),
                                   label=status.title()) for status in unique_statuses]
    ax.legend(handles=legend_elements, loc='upper right')

    plt.tight_layout()
    plt.show()

## 7. Advanced Pipeline Features

This section demonstrates custom component integration: a hand-written data quality check is added as a step of a manually assembled pipeline.

In [None]:
# Custom pipeline step example
def custom_data_quality_check(**kwargs):
    """Custom data quality check step.

    Scores a tabular dataset on missing values and duplicated rows and
    passes the dataset through unchanged for the following step.

    Args:
        **kwargs: Step inputs. The dataset is taken from ``data`` or, when
            that is absent or ``None``, from ``processed_data``. It must
            provide the pandas ``DataFrame`` members used below
            (``columns``, ``isnull``, ``duplicated``, ``len``).

    Returns:
        dict with the keys

        * ``quality_metrics`` - ``null_percentage``, ``duplicate_percentage``
          and ``data_completeness``, each a float in the range 0-100;
        * ``quality_score`` - weighted score as a float in the range 0-1;
        * ``quality_passed`` - ``True`` when the score is above 0.8;
        * ``data`` - the input dataset, unchanged.

    Raises:
        ValueError: If no dataset is supplied, or the dataset has no cells
            (zero rows or zero columns), for which the percentages are
            undefined.
    """
    logger.info("Executing custom data quality check")

    # Explicit None checks: a DataFrame has no unambiguous truth value, and an
    # explicit `data=None` must still fall back to `processed_data`.
    data = kwargs.get('data')
    if data is None:
        data = kwargs.get('processed_data')
    if data is None:
        raise ValueError("No data provided for quality check")

    row_count = len(data)
    total_cells = row_count * len(data.columns)
    if total_cells == 0:
        # Avoid dividing by zero below.
        raise ValueError("Empty data provided for quality check")

    # Count once and convert to plain ints so the returned metrics are
    # built-in floats rather than NumPy scalars.
    null_cells = int(data.isnull().sum().sum())
    duplicate_rows = int(data.duplicated().sum())

    # Perform quality checks
    quality_metrics = {
        'null_percentage': (null_cells / total_cells) * 100,
        'duplicate_percentage': (duplicate_rows / row_count) * 100,
        'data_completeness': ((total_cells - null_cells) / total_cells) * 100
    }

    # Quality score: weighted mix of the three metrics, scaled to 0-1
    quality_score = (
        (100 - quality_metrics['null_percentage']) * 0.4 +
        (100 - quality_metrics['duplicate_percentage']) * 0.3 +
        quality_metrics['data_completeness'] * 0.3
    ) / 100

    quality_passed = quality_score > 0.8  # 80% threshold

    return {
        'quality_metrics': quality_metrics,
        'quality_score': quality_score,
        'quality_passed': quality_passed,
        'data': data  # Pass through data
    }

# Assemble a pipeline by hand instead of through one of the runner's factories
custom_config = SimplePipelineConfig(
    name="custom_quality_pipeline",
    description="Pipeline with custom data quality checks",
    parameters={'algorithm': 'logistic_regression'}
)

custom_pipeline = SimplePipeline(custom_config)

# Steps in execution order: the runner's stock loading, preprocessing and
# training functions, with the custom quality check placed right after loading.
custom_step_specs = (
    ("data_loading", "Load dataset", runner._data_loading_function),
    ("data_quality_check", "Perform custom data quality assessment", custom_data_quality_check),
    ("data_preprocessing", "Preprocess data", runner._preprocessing_function),
    ("model_training", "Train model", runner._training_function),
)
for step_name, step_description, step_function in custom_step_specs:
    custom_pipeline.add_step(PipelineStep(
        name=step_name,
        description=step_description,
        function=step_function
    ))

print(f"üõ†Ô∏è  Custom pipeline created with {len(custom_pipeline.steps)} steps")

In [None]:
# Execute custom pipeline
print("üöÄ Executing custom pipeline with quality checks...")

# Unlike the earlier pipelines, this one is executed directly on the pipeline
# object rather than through `runner.run_pipeline(name)`.
custom_result = custom_pipeline.execute()

print(f"\n‚úÖ Custom pipeline completed: {custom_result.status}")
print(f"üìä Results:")
print(f"   - Steps: {custom_result.steps_completed}/{custom_result.steps_total}")
print(f"   - Success rate: {custom_result.success_rate:.1%}")
print(f"   - Duration: {custom_result.duration_seconds:.2f}s")

# Display quality metrics if available
# `outputs` may be empty or None (the deployment cell guards it the same way),
# so normalize it before the membership test instead of risking a TypeError.
custom_outputs = custom_result.outputs or {}
if 'quality_metrics' in custom_outputs:
    quality_metrics = custom_outputs['quality_metrics']
    quality_score = custom_outputs['quality_score']

    print(f"\nüìè Data Quality Assessment:")
    print(f"   - Overall Quality Score: {quality_score:.1%}")
    print(f"   - Null Percentage: {quality_metrics['null_percentage']:.2f}%")
    print(f"   - Duplicate Percentage: {quality_metrics['duplicate_percentage']:.2f}%")
    print(f"   - Data Completeness: {quality_metrics['data_completeness']:.2f}%")
    print(f"   - Quality Check: {'‚úÖ PASSED' if custom_outputs['quality_passed'] else '‚ùå FAILED'}")

    # Visualize quality metrics: bar chart on the left, score gauge on the right
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Quality metrics bar chart
    metrics_names = list(quality_metrics.keys())
    metrics_values = list(quality_metrics.values())

    bars = ax1.bar(metrics_names, metrics_values,
                   color=['#e74c3c', '#f39c12', '#2ecc71'])
    ax1.set_title('Data Quality Metrics', fontweight='bold')
    ax1.set_ylabel('Percentage (%)')
    ax1.tick_params(axis='x', rotation=45)

    # Add value labels
    for bar in bars:
        height = bar.get_height()
        ax1.text(bar.get_x() + bar.get_width()/2., height + 1,
                f'{height:.1f}%', ha='center', va='bottom')

    # Quality score gauge: red below 60%, yellow below 80%, green otherwise
    colors = ['#e74c3c', '#f39c12', '#2ecc71']
    if quality_score < 0.6:
        color = colors[0]  # Red
    elif quality_score < 0.8:
        color = colors[1]  # Yellow
    else:
        color = colors[2]  # Green

    # Ring chart: the colored wedge is the score, the grey wedge the remainder.
    ax2.pie([quality_score, 1-quality_score],
           colors=[color, '#ecf0f1'],
           startangle=90,
           counterclock=False,
           wedgeprops={'width': 0.3})

    # Score printed in the middle of the ring.
    ax2.text(0, 0, f'{quality_score:.1%}',
            ha='center', va='center', fontsize=20, fontweight='bold')
    ax2.set_title('Overall Quality Score', fontweight='bold')

    plt.tight_layout()
    plt.show()

## 8. Pipeline Summary and Next Steps

In [None]:
# Final summary
print("üéØ Pipeline Orchestration Summary")
print("=" * 50)

# Query the runner once: one list of names and one result lookup per name,
# so the total and the success count are derived from the same snapshot.
registered_pipelines = runner.list_pipelines()
registered_results = [runner.get_pipeline_result(name) for name in registered_pipelines]

# The custom pipeline was executed directly on its own object, so it is
# counted separately from the pipelines the runner lists.
total_pipelines = len(registered_pipelines) + 1  # +1 for custom pipeline
successful_pipelines = sum(1 for res in registered_results
                           if res and res.status == 'completed')
successful_pipelines += 1 if custom_result.status == 'completed' else 0

print(f"üìä Pipeline Execution Statistics:")
print(f"   ‚Ä¢ Total pipelines executed: {total_pipelines}")
print(f"   ‚Ä¢ Successful executions: {successful_pipelines}")
print(f"   ‚Ä¢ Success rate: {(successful_pipelines/total_pipelines):.1%}")

print(f"\nüöÄ Pipeline Types Demonstrated:")
print(f"   ‚úÖ Training Pipeline - Data ‚Üí Model")
print(f"   ‚úÖ Deployment Pipeline - Model ‚Üí Production")
print(f"   ‚úÖ Full MLOps Pipeline - End-to-end workflow")
print(f"   ‚úÖ Custom Pipeline - Quality checks & validation")

print(f"\nüõ†Ô∏è  Features Demonstrated:")
print(f"   ‚úÖ Simple local pipeline execution")
print(f"   ‚úÖ Step-by-step progress tracking")
print(f"   ‚úÖ Error handling and retries")
print(f"   ‚úÖ Pipeline performance monitoring")
print(f"   ‚úÖ Custom component integration")
print(f"   ‚úÖ Data quality assessment")
print(f"   ‚úÖ Comprehensive visualization")

print(f"\nüéØ Next Steps:")
print(f"   1. Integrate with Vertex AI Pipelines for cloud execution")
print(f"   2. Add pipeline scheduling and automation")
print(f"   3. Implement pipeline versioning and rollback")
print(f"   4. Add comprehensive monitoring and alerting")
print(f"   5. Create reusable component library")
print(f"   6. Implement CI/CD integration")

print(f"\n‚ú® Pipeline orchestration demonstration complete!")