In [None]:
# Azure Databricks 2-Hour Crash Course

## 🎯 Learning Objectives (2 Hours Total)
- **30 min:** Unity Catalog fundamentals and three-level namespace
- **45 min:** Delta Lake creation, time travel, and MERGE operations
- **30 min:** Data Engineering basics (Jobs, Workflows, External Storage)
- **15 min:** Integration patterns for your solution accelerator

## 📚 Prerequisites
- Azure Databricks workspace access
- Basic SQL and Python knowledge
- Sample data available

---

# Part 1: Unity Catalog Crash Course (30 minutes)
</VSCode.Cell>
<VSCode.Cell id="#VSC-354e3eea" language="python">
# 🏛️ PART 1A: UNITY CATALOG EXPLORATION (10 minutes)
# Read-only tour of the metastore: prints the active catalog and schema, every
# visible catalog, and the schemas inside `main`. Each step has its own
# try/except so one failing query does not stop the rest of the cell.
print("🏛️ UNITY CATALOG EXPLORATION")
print("="*50)

# Check current catalog context
print("\n📍 Current Catalog Context:")
try:
    print(f"Current Catalog: {spark.sql('SELECT current_catalog()').collect()[0][0]}")
    print(f"Current Schema: {spark.sql('SELECT current_schema()').collect()[0][0]}")
except Exception as context_error:
    print(f"❌ Error getting current context: {context_error}")
    print("💡 Unity Catalog may not be enabled")

# List all catalogs
print("\n📚 Available Catalogs:")
try:
    catalogs = spark.sql("SHOW CATALOGS").collect()
    for catalog in catalogs:
        print(f"  📁 {catalog.catalog}")
except Exception as catalog_error:
    print(f"❌ Error listing catalogs: {catalog_error}")

# List schemas in main catalog
print("\n📋 Schemas in 'main' catalog:")
try:
    # The catalog name follows IN directly; an extra CATALOG keyword is not
    # part of the documented SHOW SCHEMAS syntax.
    schemas = spark.sql("SHOW SCHEMAS IN main").collect()
    for schema in schemas:
        print(f"  📂 main.{schema.databaseName}")
except Exception as main_error:
    print(f"❌ Error: {main_error}")
    print("💡 Trying default schema listing...")
    # Fall back to the schemas of whatever catalog the session is using.
    try:
        schemas = spark.sql("SHOW SCHEMAS").collect()
        for schema in schemas:
            print(f"  📂 {schema.databaseName}")
    except Exception as fallback_error:
        print(f"❌ Error with default schemas: {fallback_error}")
</VSCode.Cell>
<VSCode.Cell id="#VSC-8fa29ab2" language="python">
# 🏗️ PART 1B: CREATE YOUR OWN CATALOG STRUCTURE (10 minutes)
# Tries to create a dedicated catalog with bronze/silver/gold schemas and make
# it the working catalog. If any of that fails, falls back to `main`: the
# session is switched to `main` first so the unqualified CREATE SCHEMA
# statements really land in the catalog the messages name.
print("\n🏗️ CREATING CATALOG STRUCTURE FOR SOLUTION ACCELERATOR")
print("="*50)

# Create a catalog for your solution accelerator (if permissions allow)
catalog_name = "solution_accelerator"
schema_names = ["bronze", "silver", "gold"]

print(f"\n📁 Attempting to create catalog: {catalog_name}")
try:
    spark.sql(f"CREATE CATALOG IF NOT EXISTS {catalog_name}")
    print("  ✅ Catalog created/exists")
    current_catalog = catalog_name

    # Create schemas for medallion architecture
    for schema in schema_names:
        print(f"\n📂 Creating schema: {catalog_name}.{schema}")
        spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog_name}.{schema}")
        print(f"  ✅ Schema {catalog_name}.{schema} created/exists")

    # Set working catalog
    spark.sql(f"USE CATALOG {catalog_name}")
    print(f"\n✅ Now working in catalog: {catalog_name}")

except Exception as catalog_error:
    print(f"  ❌ Cannot create catalog (permissions): {catalog_error}")
    print("  💡 Using 'main' catalog instead")
    current_catalog = "main"

    # Switch the session before creating schemas; without this the unqualified
    # CREATE SCHEMA below targets whichever catalog happened to be active.
    try:
        spark.sql("USE CATALOG main")
        fallback_location = "main catalog"
    except Exception as use_error:
        print(f"  ⚠️ Could not switch to 'main' catalog: {use_error}")
        fallback_location = "the session's current catalog"

    # Try to create schemas in main catalog or use default
    for schema in schema_names:
        try:
            spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")
            print(f"  ✅ Schema {schema} created in {fallback_location}")
        except Exception as schema_error:
            print(f"  ⚠️ Could not create schema {schema}: {schema_error}")
            print(f"     Will use 'default' schema")

print(f"\n📍 Working Context:")
try:
    print(f"   Catalog: {spark.sql('SELECT current_catalog()').collect()[0][0]}")
    print(f"   Schema:  {spark.sql('SELECT current_schema()').collect()[0][0]}")
except Exception:
    # Only query failures are treated as "no context"; interrupts still propagate.
    print(f"   Using default context")
</VSCode.Cell>
<VSCode.Cell id="#VSC-c8de4372" language="python">
# 🔍 PART 1C: THREE-LEVEL NAMESPACE DEMO (10 minutes)
# Prints the catalog.schema.table naming scheme using the session's current
# catalog and schema, then lists up to five tables from the current schema.
print("\n🔍 THREE-LEVEL NAMESPACE DEMONSTRATION")
print("="*50)

# Demonstrate three-level namespace: catalog.schema.table
try:
    current_catalog = spark.sql("SELECT current_catalog()").collect()[0][0]
    current_schema = spark.sql("SELECT current_schema()").collect()[0][0]
except Exception:
    # Narrowed from a bare `except:` so Ctrl-C / kernel interrupts are not
    # swallowed; query failures still fall back to the placeholder names.
    current_catalog = "main"
    current_schema = "default"

print(f"📍 Three-Level Namespace Structure:")
print(f"   Catalog: {current_catalog}")
print(f"   Schema:  bronze, silver, gold (or {current_schema})")
print(f"   Table:   customer, product, orders")
print(f"   Full:    {current_catalog}.{current_schema}.customer")

# List tables in current schema
print(f"\n📊 Tables in current schema:")
try:
    tables = spark.sql("SHOW TABLES").collect()
    if tables:
        for table in tables[:5]:  # Show first 5
            print(f"   📋 {table.tableName}")
    else:
        print("   📭 No tables found (expected for new schemas)")
except Exception as e:
    print(f"   ❌ Error listing tables: {e}")

print("\n💡 Key Unity Catalog Concepts:")
print("   • Catalogs = Top-level containers (like databases)")
print("   • Schemas = Logical groupings within catalogs")
print("   • Tables = Data assets within schemas")
print("   • Full path: catalog.schema.table")
print("   • Unity Catalog provides governance, lineage, and access control")
</VSCode.Cell>
<VSCode.Cell id="#VSC-c3721e4c" language="markdown">
---

# Part 2: Delta Lake Deep Dive (45 minutes)

## What is Delta Lake?
- **ACID transactions** on data lakes
- **Time travel** - query historical versions
- **Schema evolution** - safely modify table structure
- **Optimizations** - Z-ordering, auto-compaction
- **Merges/Upserts** - efficiently update data
</VSCode.Cell>
<VSCode.Cell id="#VSC-ce8a9ba3" language="python">
# 🚀 PART 2A: CREATE SAMPLE DATA AND DELTA TABLES (15 minutes)
# Builds 100 synthetic customer records and loads them into the Spark
# DataFrame `customers_df`. If that fails, a one-row fallback DataFrame with
# the same columns is created instead.
print("🚀 CREATING SAMPLE DATA FOR DELTA LAKE DEMO")
print("="*50)

# Create sample customer data
import pandas as pd
from datetime import datetime, timedelta
import random

# Set seed for reproducible results
random.seed(42)

# One column order shared by the generated data and the fallback dataset.
customer_columns = ['customer_id', 'customer_name', 'email', 'city', 'registration_date', 'status', 'annual_spend']

# Generate sample data
customers_data = []
cities = ['Seattle', 'Portland', 'San Francisco', 'Los Angeles', 'Denver', 'Austin', 'Chicago', 'Boston']
statuses = ['Active', 'Inactive', 'Pending', 'VIP']

# Single reference time for every row: all registration dates are offsets from
# the same instant instead of drifting by the time the loop takes to run.
generated_at = datetime.now()

for i in range(100):
    customers_data.append({
        'customer_id': i + 1,
        'customer_name': f'Customer_{i+1:03d}',
        'email': f'customer{i+1}@example.com',
        'city': random.choice(cities),
        'registration_date': generated_at - timedelta(days=random.randint(1, 365)),
        'status': random.choice(statuses),
        'annual_spend': round(random.uniform(1000, 10000), 2)
    })

# Convert to Spark DataFrame
try:
    # Schema inference from dicts may order columns by key name rather than by
    # insertion order, so select the columns explicitly to match the fallback.
    customers_df = spark.createDataFrame(customers_data).select(*customer_columns)
    print("📊 Sample Customer Data Created:")
    customers_df.show(5)
    print(f"   Total Records: {customers_df.count()}")
    print("   ✅ Sample data generation successful")
except Exception as e:
    print(f"❌ Error creating sample data: {e}")
    # Create a simple fallback dataset
    print("Creating fallback dataset...")
    simple_data = [(1, "Customer_001", "test@example.com", "Seattle", datetime.now(), "Active", 5000.0)]
    customers_df = spark.createDataFrame(simple_data, customer_columns)
    print("✅ Fallback data created")
</VSCode.Cell>
<VSCode.Cell id="#VSC-a3299514" language="python">
# 🏗️ PART 2B: CREATE DELTA TABLE (10 minutes)
# Saves `customers_df` as the Delta table `customers_delta`, preferring the
# `bronze` schema when it can be selected. Sets `table_name`, which the time
# travel and MERGE cells use to address the table.
print("\n🏗️ CREATING DELTA TABLE")
print("="*50)

# Set working schema
try:
    current_catalog = spark.sql("SELECT current_catalog()").collect()[0][0]
    current_schema = spark.sql("SELECT current_schema()").collect()[0][0]

    # Try to use bronze schema if it exists
    try:
        spark.sql("USE SCHEMA bronze")
        current_schema = "bronze"
        print(f"✅ Using schema: {current_catalog}.bronze")
    except Exception:
        # USE SCHEMA failed, so the session stays in the schema read above.
        print(f"⚠️ Using current schema: {current_catalog}.{current_schema}")

except Exception as e:
    print(f"⚠️ Using default context: {e}")
    current_catalog = "main"
    current_schema = "default"

# Create Delta table
table_name = "customers_delta"

print(f"\n📋 Creating Delta table: {table_name}")
print(f"   Full path: {current_catalog}.{current_schema}.{table_name}")

try:
    # Write as Delta table
    customers_df.write \
        .format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable(table_name)

    print("✅ Delta table created successfully!")

    # Verify table creation by reading the saved table back, so the numbers
    # describe what was written rather than the in-memory DataFrame.
    print(f"\n🔍 Table Information:")
    saved_table = spark.table(table_name)
    print(f"   Rows: {saved_table.count()}")
    print(f"   Columns: {len(saved_table.columns)}")
    print(f"   Format: Delta Lake")
    # Display the table metadata (previously queried but never shown).
    table_info = spark.sql(f"DESCRIBE EXTENDED {table_name}")
    table_info.show(30, truncate=False)

except Exception as e:
    print(f"❌ Error creating Delta table: {e}")
    print("💡 This might be due to permissions or catalog configuration")

    # Try alternative approach - create in temp location
    try:
        print("\n🔄 Trying alternative approach...")
        temp_path = "/tmp/customers_delta"
        customers_df.write.format("delta").mode("overwrite").save(temp_path)
        spark.sql(f"CREATE TABLE IF NOT EXISTS {table_name} USING DELTA LOCATION '{temp_path}'")
        print("✅ Delta table created in temp location")
    except Exception as e2:
        print(f"❌ Alternative approach failed: {e2}")
</VSCode.Cell>
<VSCode.Cell id="#VSC-3d4541d4" language="python">
# ⏰ PART 2C: DELTA LAKE TIME TRAVEL (10 minutes)
# Creates two new table versions (an UPDATE, then an append) and queries the
# table as of each one. Version numbers are computed relative to the table's
# latest version when the cell starts instead of being hard-coded as 0 and 1,
# so the labels stay correct when the table already has history (for example
# after re-running the notebook). Uses `table_name` and `datetime` as set up by
# the earlier Part 2 cells.
print("\n⏰ DELTA LAKE TIME TRAVEL DEMO")
print("="*50)

try:
    # DESCRIBE HISTORY returns the newest commit first, so LIMIT 1 is the
    # version the table is at right now.
    base_version = spark.sql(f"DESCRIBE HISTORY {table_name} LIMIT 1").collect()[0]["version"]
    update_version = base_version + 1   # produced by the UPDATE below
    insert_version = base_version + 2   # produced by the append below

    # Show initial version
    print("📊 Initial table state:")
    initial_count = spark.sql(f"SELECT COUNT(*) as count FROM {table_name}").collect()[0][0]
    print(f"   Records: {initial_count}")

    # Make some updates to create versions
    print("\n🔄 Creating new versions for time travel demo...")

    # First new version: update some customers to VIP status
    print(f"Version {update_version}: Updating customer status to VIP...")
    spark.sql(f"""
        UPDATE {table_name}
        SET status = 'VIP', annual_spend = annual_spend * 1.5
        WHERE customer_id <= 10
    """)

    vip_count_v1 = spark.sql(f"SELECT COUNT(*) as count FROM {table_name} WHERE status = 'VIP'").collect()[0][0]
    print(f"   VIP customers after update: {vip_count_v1}")

    # Second new version: insert new customers
    print(f"\nVersion {insert_version}: Adding new customers...")
    new_customers = spark.createDataFrame([
        (101, 'Customer_101', 'customer101@example.com', 'Boston', datetime.now(), 'Active', 7500.0),
        (102, 'Customer_102', 'customer102@example.com', 'Chicago', datetime.now(), 'Active', 8200.0)
    ], ['customer_id', 'customer_name', 'email', 'city', 'registration_date', 'status', 'annual_spend'])

    new_customers.write.format("delta").mode("append").saveAsTable(table_name)

    final_count = spark.sql(f"SELECT COUNT(*) as count FROM {table_name}").collect()[0][0]
    print(f"   Total customers after insert: {final_count}")

    # Show table history
    print("\n📚 Delta Table History:")
    try:
        history_df = spark.sql(f"DESCRIBE HISTORY {table_name}")
        history_df.select("version", "timestamp", "operation").show(truncate=False)
    except Exception as e:
        print(f"❌ Could not show history: {e}")

    # Time travel examples
    print("\n⏰ Time Travel Examples:")

    try:
        # (label, SQL suffix) per snapshot; an empty suffix reads the live table.
        snapshots = [
            (f"Version {base_version} (original)", f" VERSION AS OF {base_version}"),
            (f"Version {update_version} (after VIP updates)", f" VERSION AS OF {update_version}"),
            ("Current version", ""),
        ]
        for label, version_clause in snapshots:
            print(f"\n📸 {label}:")
            snapshot = spark.sql(f"""
                SELECT COUNT(*) as total_count,
                       COUNT(CASE WHEN status='VIP' THEN 1 END) as vip_count
                FROM {table_name}{version_clause}
            """).collect()[0]
            print(f"   Total: {snapshot[0]}, VIP: {snapshot[1]}")

    except Exception as e:
        print(f"❌ Time travel queries failed: {e}")
        print("💡 Time travel might not be available in this environment")

except Exception as e:
    print(f"❌ Error in time travel demo: {e}")
    print("💡 Make sure the Delta table was created successfully first")
</VSCode.Cell>
<VSCode.Cell id="#VSC-d63f8880" language="python">
# 🔀 PART 2D: DELTA MERGE OPERATIONS (10 minutes)
# Builds a three-row change set, exposes it as the temp view
# `customer_updates`, and MERGEs it into the Delta table on customer_id.
# Matched rows are updated (registration_date is left untouched); unmatched
# rows are inserted. One handler at the bottom reports any failure.
print("\n🔀 DELTA MERGE OPERATIONS DEMO")
print("=" * 50)

try:
    # Incoming change set, one tuple per customer
    change_rows = [
        (5, 'Customer_005_Updated', 'updated5@example.com', 'Updated_City', datetime.now(), 'Premium', 12000.0),
        (10, 'Customer_010_Updated', 'updated10@example.com', 'Updated_City', datetime.now(), 'Premium', 15000.0),
        (103, 'Customer_103_New', 'customer103@example.com', 'Miami', datetime.now(), 'Active', 9500.0),  # New customer
    ]
    change_columns = ['customer_id', 'customer_name', 'email', 'city', 'registration_date', 'status', 'annual_spend']
    updates_df = spark.createDataFrame(change_rows, change_columns)

    print("📊 Updates to apply:")
    updates_df.show()

    # The MERGE statement reads its source by name, so register the DataFrame
    updates_df.createOrReplaceTempView("customer_updates")

    # Upsert: update on key match, insert otherwise
    print("\n🔀 Executing MERGE operation...")
    spark.sql(f"""
    MERGE INTO {table_name} as target
    USING customer_updates as source
    ON target.customer_id = source.customer_id
    WHEN MATCHED THEN
        UPDATE SET 
            customer_name = source.customer_name,
            email = source.email,
            city = source.city,
            status = source.status,
            annual_spend = source.annual_spend
    WHEN NOT MATCHED THEN
        INSERT (customer_id, customer_name, email, city, registration_date, status, annual_spend)
        VALUES (source.customer_id, source.customer_name, source.email, source.city, source.registration_date, source.status, source.annual_spend)
    """)
    print("✅ MERGE operation completed!")

    # Show the three rows the change set touched
    print("\n📊 Results after MERGE:")
    spark.sql(f"""
        SELECT customer_id, customer_name, city, status, annual_spend
        FROM {table_name} 
        WHERE customer_id IN (5, 10, 103)
        ORDER BY customer_id
    """).show()

    print("\n📈 Total count after MERGE:")
    count_row = spark.sql(f"SELECT COUNT(*) as total_customers FROM {table_name}").collect()[0]
    final_count = count_row[0]
    print(f"   Total customers: {final_count}")

    # Per-status count and average spend for the two top tiers
    print("\n💰 Premium customers summary:")
    spark.sql(f"""
        SELECT status, COUNT(*) as count, AVG(annual_spend) as avg_spend
        FROM {table_name}
        WHERE status IN ('Premium', 'VIP')
        GROUP BY status
        ORDER BY avg_spend DESC
    """).show()

except Exception as merge_error:
    print(f"❌ Error in MERGE operation: {merge_error}")
    print("💡 MERGE operations require Delta Lake table format")
</VSCode.Cell>
<VSCode.Cell id="#VSC-7fddcbf9" language="markdown">
---

# Part 3: Data Engineering Basics (30 minutes)

## Key Concepts:
- **Notebooks → Jobs** (scheduled execution)
- **Workflows** (multi-task orchestration)  
- **External Storage** (Azure Data Lake integration)
- **Delta Live Tables** (streaming ETL)
</VSCode.Cell>
<VSCode.Cell id="#VSC-10a1372e" language="python">
# 🔧 PART 3A: EXTERNAL STORAGE CONNECTION (10 minutes)
# Lists existing mounts, prints a conceptual ADLS Gen2 mount example, and then
# runs a small write/read/delete round trip that needs no mount.
print("🔧 EXTERNAL STORAGE CONNECTION")
print("="*50)

# Check existing mounts
print("📎 Current mounted storage:")
try:
    mounts = dbutils.fs.mounts()
    if mounts:
        for mount in mounts:
            print(f"   🔗 {mount.mountPoint} → {mount.source}")
    else:
        print("   📭 No external storage currently mounted")
except Exception as e:
    print(f"❌ Error checking mounts: {e}")

# Demo: How to mount Azure Data Lake (conceptual - requires credentials)
# The example is only printed, never executed. It reads the client secret from
# a secret scope instead of showing it inline, and includes the OAuth token
# endpoint that the client-credentials flow needs.
print(f"\n💡 Mounting Azure Data Lake Storage (ADLS) - Conceptual Example:")
mount_example = '''
# Mount ADLS Gen2 for your solution accelerator
dbutils.fs.mount(
    source = "abfss://container@storageaccount.dfs.core.windows.net/",
    mount_point = "/mnt/solution-accelerator",
    extra_configs = {
        "fs.azure.account.auth.type.storageaccount.dfs.core.windows.net": "OAuth",
        "fs.azure.account.oauth.provider.type.storageaccount.dfs.core.windows.net": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
        "fs.azure.account.oauth2.client.id.storageaccount.dfs.core.windows.net": "application-id",
        "fs.azure.account.oauth2.client.secret.storageaccount.dfs.core.windows.net": dbutils.secrets.get(scope="scope-name", key="service-credential-key"),
        "fs.azure.account.oauth2.client.endpoint.storageaccount.dfs.core.windows.net": "https://login.microsoftonline.com/directory-id/oauth2/token"
    }
)
'''
print(mount_example)

# Show how to work with mounted storage
print("\n📁 Working with mounted storage:")
print("   • List files: dbutils.fs.ls('/mnt/solution-accelerator/')")
print("   • Read CSV: spark.read.csv('/mnt/solution-accelerator/data.csv')")
print("   • Write Delta: df.write.format('delta').save('/mnt/solution-accelerator/delta-table')")

# For your solution accelerator integration:
print(f"\n🎯 For Your Solution Accelerator:")
print("   1. Mount Azure Data Lake where Fabric can access")
print("   2. Write Delta tables to mounted location")
print("   3. Fabric reads from same Azure Data Lake location")
print("   4. Creates seamless Databricks → Fabric pipeline")

# Demonstrate file operations that work without mounts
print(f"\n🛠️ Basic file operations (no mount required):")
sample_path = "/tmp/sample_data.csv"
sample_content = "id,name,value\n1,Sample,100\n2,Test,200"
try:
    # Create a sample file in DBFS
    dbutils.fs.put(sample_path, sample_content, overwrite=True)
    print(f"   ✅ Created sample file: {sample_path}")

    try:
        # Read it back as DataFrame
        df_sample = spark.read.csv(sample_path, header=True, inferSchema=True)
        df_sample.show()
    finally:
        # Clean up even when the read fails, so no stray file is left behind
        dbutils.fs.rm(sample_path)
        print("   ✅ Sample file removed")

except Exception as e:
    print(f"   ❌ Error in file operations: {e}")
</VSCode.Cell>
<VSCode.Cell id="#VSC-26a8e5f7" language="python">
# 📊 PART 3B: JOBS AND WORKFLOWS CONCEPTS (10 minutes)
# Conceptual cell: prints how a notebook becomes a scheduled job, an example
# multi-task job layout, and how job parameters are read. The only live call
# is the notebook-context lookup at the end.
print("\n📊 JOBS AND WORKFLOWS CONCEPTS")
print("="*50)

print("🔄 Converting Notebooks to Jobs:")
print("   1. Save your notebook (this one!)")
print("   2. Go to 'Workflows' in Databricks sidebar")
print("   3. Click 'Create Job'")
print("   4. Select this notebook as the task")
print("   5. Configure cluster and schedule")

print(f"\n⚙️ Job Configuration Example for Your Solution Accelerator:")
job_config = '''
Job Name: "Solution Accelerator Data Pipeline"
Tasks:
├── Task 1: "Data Ingestion" 
│   ├── Notebook: /Users/your-email/Data_Ingestion.ipynb
│   ├── Cluster: Job cluster (auto-terminating)
│   ├── Libraries: Delta Lake, pandas
│   └── Schedule: Daily at 2 AM
├── Task 2: "Data Transformation" (depends on Task 1)
│   ├── Notebook: /Users/your-email/Data_Transformation.ipynb  
│   ├── Parameters: {"source_table": "bronze.customers"}
│   └── Timeout: 30 minutes
└── Task 3: "Data Quality Check" (depends on Task 2)
    ├── Notebook: /Users/your-email/Data_Quality.ipynb
    ├── Alerts: Email on failure
    └── Retry: 3 attempts with exponential backoff
'''
print(job_config)

print(f"\n📈 Workflow Benefits:")
print("   ✅ Automated execution (no manual intervention)")
print("   ✅ Dependency management (tasks run in order)")
print("   ✅ Error handling and retries (resilient)")
print("   ✅ Monitoring and alerts (proactive)")
print("   ✅ Parameter passing between tasks (flexible)")
print("   ✅ Cost optimization (clusters auto-terminate)")

print(f"\n🎯 Real-world Use Cases:")
print("   • Daily data ingestion from source systems")
print("   • ETL pipelines with quality checks")
print("   • Model training and deployment")
print("   • Report generation and distribution")
print("   • Data validation and monitoring")

# Demonstrate job parameters concept
print(f"\n📝 Job Parameters Example:")
print("   In this notebook, you could access job parameters like:")
print("   source_date = dbutils.widgets.get('source_date')")
print("   table_name = dbutils.widgets.get('table_name')")
print("   environment = dbutils.widgets.get('environment')")

# Show current notebook info that would be useful for jobs
try:
    context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
    print(f"\n📍 Current Notebook Context:")
    print(f"   User: {context.tags().get('user').get()}")
    print(f"   Notebook: {context.notebookPath().get()}")
except Exception:
    # Narrowed from a bare `except:` so kernel interrupts are not swallowed;
    # any lookup failure still degrades to the informational message.
    print(f"\n📍 Notebook context not available in this environment")
</VSCode.Cell>
<VSCode.Cell id="#VSC-a0f12e84" language="python">
# 🌊 PART 3C: DELTA LIVE TABLES OVERVIEW (10 minutes)
# Print-only cell: a feature list, a sample bronze/silver/gold DLT pipeline
# (shown as text, not executed), and a comparison with hand-built ETL.
print("\n🌊 DELTA LIVE TABLES (DLT) OVERVIEW")
print("=" * 50)

print("💡 What is Delta Live Tables?")
for feature_line in (
    "   • Declarative ETL framework",
    "   • Automatically manages dependencies",
    "   • Built-in data quality monitoring",
    "   • Streaming and batch processing",
    "   • Automatic schema evolution",
    "   • Cost optimization with auto-scaling",
):
    print(feature_line)

# The pipeline below is display text only; nothing in it runs in this notebook.
print("\n📝 DLT Example for Your Solution Accelerator:")
dlt_example = '''
import dlt
from pyspark.sql.functions import *

# Bronze Layer - Raw data ingestion
@dlt.table(
    comment="Raw customer data from source systems",
    table_properties={"quality": "bronze"}
)
def bronze_customers():
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "csv")
        .option("cloudFiles.schemaLocation", "/mnt/schema/customers")
        .load("/mnt/raw-data/customers/")
    )

# Silver Layer - Cleaned and validated data  
@dlt.table(
    comment="Cleaned customer data with quality checks",
    table_properties={"quality": "silver"}
)
@dlt.expect("valid_email", "email IS NOT NULL AND email RLIKE '^[^@]+@[^@]+\\\\.[^@]+$'")
@dlt.expect_or_drop("customer_id_not_null", "customer_id IS NOT NULL")
@dlt.expect_or_fail("reasonable_annual_spend", "annual_spend >= 0 AND annual_spend <= 100000")
def silver_customers():
    return (
        dlt.read("bronze_customers")
        .select(
            col("customer_id").cast("int"),
            col("customer_name"), 
            col("email"), 
            col("city"), 
            col("registration_date").cast("date"),
            col("status"),
            col("annual_spend").cast("double")
        )
        .filter("customer_id IS NOT NULL")
        .withColumn("processed_timestamp", current_timestamp())
    )

# Gold Layer - Business-ready aggregated data
@dlt.table(
    comment="Customer metrics for business analytics",
    table_properties={"quality": "gold"}
)
def gold_customer_metrics():
    return (
        dlt.read("silver_customers")
        .groupBy("city", "status")
        .agg(
            count("customer_id").alias("customer_count"),
            avg("annual_spend").alias("avg_annual_spend"),
            max("registration_date").alias("latest_registration"),
            sum("annual_spend").alias("total_revenue")
        )
        .withColumn("metric_date", current_date())
    )
'''
print(dlt_example)

print("\n🎯 DLT Benefits for Your Architecture:")
for benefit_line in (
    "   ✅ Automatic Bronze → Silver → Gold pipeline",
    "   ✅ Built-in data quality monitoring and alerts",
    "   ✅ Real-time streaming capability",
    "   ✅ Automatic dependency resolution",
    "   ✅ Complete lineage tracking",
    "   ✅ Schema evolution without breaking pipelines",
    "   ✅ Automatic optimization and performance tuning",
):
    print(benefit_line)

# Two-column text layout: left is hand-built ETL, right is the DLT equivalent
print("\n🏗️ DLT vs Traditional ETL:")
comparison = '''
Traditional ETL:                    Delta Live Tables:
├── Manual dependency management    ├── Automatic dependency resolution
├── Custom error handling          ├── Built-in quality expectations  
├── Manual schema management       ├── Automatic schema evolution
├── Complex orchestration          ├── Declarative pipeline definition
├── Manual monitoring              ├── Built-in observability
└── Infrastructure management      └── Serverless execution
'''
print(comparison)

print("\n🚀 When to Use DLT:")
for use_case_line in (
    "   ✅ Complex ETL pipelines with multiple stages",
    "   ✅ Streaming data processing requirements",
    "   ✅ Strict data quality requirements",
    "   ✅ Need for automatic dependency management",
    "   ✅ Production-grade data pipelines",
):
    print(use_case_line)
</VSCode.Cell>
<VSCode.Cell id="#VSC-b30275c0" language="markdown">
---

# Part 4: Solution Accelerator Integration (15 minutes)

## How This Applies to Your Project
</VSCode.Cell>
<VSCode.Cell id="#VSC-8636aedc" language="python">
# 🎯 PART 4: SOLUTION ACCELERATOR INTEGRATION PATTERNS
# Print-only cell: the end-to-end data flow, an eight-week roadmap, and a
# recap of what the crash course covered.
print("🎯 SOLUTION ACCELERATOR INTEGRATION PATTERNS")
print("=" * 50)

print("🏗️ Your Architecture Integration:")
print("   Azure Databricks (Online Channel) → Azure Data Lake → Fabric Bronze → Silver → Gold")

# ✅ marks roadmap items this notebook touches, 🔄 marks work still to do
print("\n📋 Implementation Roadmap:")
roadmap = '''
Week 1-2: Databricks Setup
├── ✅ Create Unity Catalog structure (solution_accelerator.bronze/silver/gold)
├── ✅ Generate sample data in Delta format  
├── 🔄 Set up external storage mounts to Azure Data Lake
└── 🔄 Create basic ETL notebooks

Week 3-4: Data Pipeline Development  
├── 🔄 Build Bronze → Silver transformation jobs
├── 🔄 Implement data quality checks with Delta
├── 🔄 Set up automated workflows
└── 🔄 Create monitoring and alerting

Week 5-6: Fabric Integration
├── 🔄 Configure Fabric to read from same Azure Data Lake
├── 🔄 Implement cross-channel data merging in Fabric Silver tier
├── 🔄 Build Gold tier analytics in Fabric
└── 🔄 Create Power BI dashboards

Week 7-8: Production Readiness
├── 🔄 Implement Delta Live Tables for streaming
├── 🔄 Set up automated deployment pipelines  
├── 🔄 Add comprehensive monitoring
└── 🔄 Document integration patterns
'''
print(roadmap)

# Recap of the topics covered in this crash course
print("\n🎉 What You've Accomplished Today:")
for accomplishment in (
    "   ✅ Understanding Unity Catalog three-level namespace",
    "   ✅ Creating Delta tables with sample data",
    "   ✅ Practicing time travel and MERGE operations",
    "   ✅ Learning about Jobs, Workflows, and DLT",
    "   ✅ Understanding integration patterns for your project",
):
    print(accomplishment)
</VSCode.Cell>
<VSCode.Cell id="#VSC-59b70778" language="python">
# 📊 PRACTICAL NEXT STEPS FOR YOUR PROJECT
# Print-only cell: three numbered action lists followed by a block of
# recommended patterns grouped by theme.
print("\n📊 PRACTICAL NEXT STEPS FOR YOUR PROJECT")
print("=" * 50)

# Each entry is (heading, lines under that heading), printed in order.
action_lists = (
    ("🔧 Immediate Actions (This Week):", (
        "   1. ✅ Complete this crash course notebook",
        "   2. Import your existing product generation notebook",
        "   3. Convert generated data to Delta format",
        "   4. Practice with Unity Catalog structure",
        "   5. Create customer and order generation notebooks",
    )),
    ("\n🚀 Next Week Actions:", (
        "   1. Set up Azure Data Lake mount point",
        "   2. Create automated jobs for data processing",
        "   3. Build basic Bronze → Silver transformation",
        "   4. Test cross-channel data integration",
        "   5. Set up monitoring and quality checks",
    )),
    ("\n📈 Integration Strategy:", (
        "   1. Use Delta Lake as the reliable foundation",
        "   2. Leverage Unity Catalog for governance",
        "   3. External mounts for Fabric integration",
        "   4. Jobs/Workflows for automation",
        "   5. DLT for advanced production pipelines",
    )),
)
for heading, steps in action_lists:
    print(heading)
    for step in steps:
        print(step)

print("\n💡 Key Success Patterns for Your Solution Accelerator:")
patterns = '''
✅ Unity Catalog Governance:
   • Use catalog.schema.table naming consistently
   • Set up proper permissions and stewardship
   • Document business glossary terms

✅ Delta Lake Foundation:
   • All tables in Delta format for ACID transactions
   • Use time travel for debugging and auditing
   • Implement MERGE for efficient updates

✅ External Integration:
   • Mount Azure Data Lake for Fabric connectivity
   • Use consistent file formats across platforms
   • Implement proper error handling and monitoring

✅ Automation Ready:
   • Design notebooks for job conversion
   • Use parameters for flexibility
   • Implement proper logging and alerts

✅ Production Patterns:
   • Use DLT for complex pipelines
   • Implement comprehensive data quality checks
   • Set up proper monitoring and governance
'''
print(patterns)
</VSCode.Cell>
<VSCode.Cell id="#VSC-aedc9ce1" language="python">
# 🎓 KNOWLEDGE CHECK AND SUMMARY
# Prints the course recap, then checks whether the demo table
# `customers_delta` is visible in the current schema. Nothing is dropped; the
# cleanup command is only printed as a suggestion.
print("\n🎓 FINAL KNOWLEDGE CHECK - What You've Learned")
print("="*50)

print("📚 Unity Catalog Mastery (30 min):")
print("   ✅ Three-level namespace: catalog.schema.table")
print("   ✅ Create catalogs and schemas for medallion architecture")
print("   ✅ Understanding data governance and permissions")
print("   ✅ Integration with solution accelerator structure")

print(f"\n🚀 Delta Lake Expertise (45 min):")
print("   ✅ Create Delta tables from sample data")
print("   ✅ Time travel and versioning capabilities")
print("   ✅ MERGE operations for upserts and data management")
print("   ✅ ACID transactions and optimization features")
print("   ✅ Schema evolution and data quality")

print(f"\n🔧 Data Engineering Skills (30 min):")
print("   ✅ Convert notebooks to automated jobs")
print("   ✅ Set up workflows with dependencies")
print("   ✅ External storage integration patterns")
print("   ✅ Delta Live Tables for advanced ETL")
print("   ✅ Production-ready pipeline design")

print(f"\n🎯 Solution Accelerator Ready:")
print("   ✅ Technical foundation established")
print("   ✅ Integration patterns understood")
print("   ✅ Production-ready architecture knowledge")
print("   ✅ Hands-on experience with enterprise features")

print(f"\n📊 Performance Summary:")
print("   🎯 Target: 2 hours comprehensive learning")
print("   ✅ Unity Catalog: 30 minutes")
print("   ✅ Delta Lake: 45 minutes")
print("   ✅ Data Engineering: 30 minutes")
print("   ✅ Integration: 15 minutes")
print("   🎉 Total: 2 hours of intensive, practical learning!")

print(f"\n🚀 You're Now Ready To:")
print("   • Build enterprise-grade data pipelines")
print("   • Implement your solution accelerator architecture")
print("   • Integrate Databricks with Microsoft Fabric")
print("   • Create production-ready automated workflows")
print("   • Apply best practices for data governance")

# Clean up demo data (optional)
print(f"\n🗑️ Cleanup Options:")
print("   To remove demo table: DROP TABLE IF EXISTS customers_delta")
print("   To keep for practice: Table will remain for further exploration")

try:
    # SHOW TABLES without a schema looks in the session's current schema only.
    table_exists = spark.sql("SHOW TABLES LIKE 'customers_delta'").count() > 0
    if table_exists:
        print("   ✅ Demo table 'customers_delta' is available for continued practice")
    else:
        print("   ℹ️ Demo table was not created or is not visible")
except Exception:
    # Narrowed from a bare `except:` so kernel interrupts are not swallowed.
    print("   ℹ️ Unable to check table status")

print(f"\n🎊 CONGRATULATIONS!")
print("You've completed the Azure Databricks 2-Hour Crash Course!")
print("Ready to build amazing enterprise data solutions! 🚀")
</VSCode.Cell>

# Azure Databricks 2-Hour Crash Course

## 🎯 Learning Objectives (2 Hours Total)
- **30 min:** Unity Catalog fundamentals and three-level namespace
- **45 min:** Delta Lake creation, time travel, and MERGE operations
- **30 min:** Data Engineering basics (Jobs, Workflows, External Storage)
- **15 min:** Integration patterns for your solution accelerator

## 📚 Prerequisites
- Azure Databricks workspace access
- Basic SQL and Python knowledge
- Sample data available

---

# Part 1: Unity Catalog Crash Course (30 minutes)

In [None]:
# 🏛️ PART 1A: UNITY CATALOG EXPLORATION (10 minutes)
# Prints the active catalog and schema, every visible catalog, and the schemas
# inside `main`. Only the schema listing is guarded; the first two steps raise
# if their queries fail.
print("🏛️ UNITY CATALOG EXPLORATION")
print("="*50)

# Check current catalog context
print("\n📍 Current Catalog Context:")
print(f"Current Catalog: {spark.sql('SELECT current_catalog()').collect()[0][0]}")
print(f"Current Schema: {spark.sql('SELECT current_schema()').collect()[0][0]}")

# List all catalogs
print("\n📚 Available Catalogs:")
catalogs = spark.sql("SHOW CATALOGS").collect()
for catalog in catalogs:
    print(f"  📁 {catalog.catalog}")

# List schemas in main catalog
print("\n📋 Schemas in 'main' catalog:")
try:
    # The catalog name follows IN directly; an extra CATALOG keyword is not
    # part of the documented SHOW SCHEMAS syntax.
    schemas = spark.sql("SHOW SCHEMAS IN main").collect()
    for schema in schemas:
        print(f"  📂 main.{schema.databaseName}")
except Exception as e:
    print(f"  ❌ Error: {e}")
    print("  💡 Unity Catalog may not be enabled or you may not have access")

In [None]:
# 🏗️ PART 1B: CREATE YOUR OWN CATALOG STRUCTURE (10 minutes)
# Tries to create a dedicated catalog with bronze/silver/gold schemas and make
# it the working catalog. If any of that fails, falls back to `main`: the
# session is switched to `main` first so the unqualified CREATE SCHEMA
# statements really land in the catalog the messages name.
print("\n🏗️ CREATING CATALOG STRUCTURE FOR SOLUTION ACCELERATOR")
print("="*50)

# Create a catalog for your solution accelerator (if permissions allow)
catalog_name = "solution_accelerator"
schema_names = ["bronze", "silver", "gold"]

print(f"\n📁 Creating catalog: {catalog_name}")
try:
    spark.sql(f"CREATE CATALOG IF NOT EXISTS {catalog_name}")
    print("  ✅ Catalog created/exists")

    # Create schemas for medallion architecture
    for schema in schema_names:
        print(f"\n📂 Creating schema: {catalog_name}.{schema}")
        spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog_name}.{schema}")
        print(f"  ✅ Schema {catalog_name}.{schema} created/exists")

    # Set working catalog
    spark.sql(f"USE CATALOG {catalog_name}")
    print(f"\n✅ Now working in catalog: {catalog_name}")

except Exception as catalog_error:
    print(f"  ❌ Cannot create catalog (permissions): {catalog_error}")
    print("  💡 Using 'main' catalog instead")
    catalog_name = "main"

    # Switch the session before creating schemas; without this the unqualified
    # CREATE SCHEMA below targets whichever catalog happened to be active.
    try:
        spark.sql(f"USE CATALOG {catalog_name}")
        fallback_location = "main catalog"
    except Exception as use_error:
        print(f"  ⚠️ Could not switch to 'main' catalog: {use_error}")
        fallback_location = "the session's current catalog"

    # Try to create schemas in main catalog
    for schema in schema_names:
        try:
            spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")
            print(f"  ✅ Schema {schema} created in {fallback_location}")
        except Exception as schema_error:
            print(f"  ⚠️ Could not create schema {schema}: {schema_error}")

In [None]:
# 🔍 PART 1C: THREE-LEVEL NAMESPACE DEMO (10 minutes)
# Prints the catalog.schema.table naming scheme for the active catalog, the
# session's working context, and up to five tables from the current schema.
print("\n🔍 THREE-LEVEL NAMESPACE DEMONSTRATION")
print("=" * 50)

# Active catalog, used to build the example fully-qualified name
catalog_rows = spark.sql("SELECT current_catalog()").collect()
current_catalog = catalog_rows[0][0]

for namespace_line in (
    "📍 Three-Level Namespace Structure:",
    f"   Catalog: {current_catalog}",
    "   Schema:  bronze, silver, gold",
    "   Table:   customer, product, orders",
    f"   Full:    {current_catalog}.bronze.customer",
):
    print(namespace_line)

# Working context, queried again at print time
print("\n📋 Current Working Context:")
context_catalog = spark.sql('SELECT current_catalog()').collect()[0][0]
print(f"   Catalog: {context_catalog}")
context_schema = spark.sql('SELECT current_schema()').collect()[0][0]
print(f"   Schema:  {context_schema}")

# Tables visible in the current schema, capped at five
print("\n📊 Tables in current schema:")
try:
    tables = spark.sql("SHOW TABLES").collect()
    if not tables:
        print("   📭 No tables found (expected for new schemas)")
    else:
        for table_row in tables[:5]:
            print(f"   📋 {table_row.tableName}")
except Exception as listing_error:
    print(f"   ❌ Error listing tables: {listing_error}")

print("\n💡 Key Unity Catalog Concepts:")
for concept_line in (
    "   • Catalogs = Top-level containers (like databases)",
    "   • Schemas = Logical groupings within catalogs",
    "   • Tables = Data assets within schemas",
    "   • Full path: catalog.schema.table",
):
    print(concept_line)

---

# Part 2: Delta Lake Deep Dive (45 minutes)

## What is Delta Lake?
- **ACID transactions** on data lakes
- **Time travel** - query historical versions
- **Schema evolution** - safely modify table structure
- **Optimizations** - Z-ordering, auto-compaction
- **Merges/Upserts** - efficiently update data

In [None]:
# 🚀 PART 2A: CREATE SAMPLE DATA AND DELTA TABLES (15 minutes)
# Builds 100 synthetic customer records and loads them into the Spark
# DataFrame `customers_df`.
print("🚀 CREATING SAMPLE DATA FOR DELTA LAKE DEMO")
print("="*50)

# Create sample customer data
import pandas as pd
from datetime import datetime, timedelta
import random

# Seed the generator so every run produces the same cities and statuses.
random.seed(42)

customer_columns = ['customer_id', 'customer_name', 'email', 'city', 'registration_date', 'status']
cities = ['Seattle', 'Portland', 'San Francisco', 'Los Angeles', 'Denver']
statuses = ['Active', 'Inactive', 'Pending']

# Single reference time for every row: all registration dates are offsets from
# the same instant instead of drifting by the time the loop takes to run.
generated_at = datetime.now()

# Generate sample data
customers_data = []
for i in range(100):
    customers_data.append({
        'customer_id': i + 1,
        'customer_name': f'Customer_{i+1:03d}',
        'email': f'customer{i+1}@example.com',
        'city': random.choice(cities),
        'registration_date': generated_at - timedelta(days=random.randint(1, 365)),
        'status': random.choice(statuses)
    })

# Convert to Spark DataFrame
# Schema inference from dicts may order columns by key name rather than by
# insertion order, so select the columns explicitly for a stable layout.
customers_df = spark.createDataFrame(customers_data).select(*customer_columns)

print("📊 Sample Customer Data Created:")
customers_df.show(5)
print(f"   Total Records: {customers_df.count()}")

In [None]:
# 🏗️ PART 2B: CREATE DELTA TABLE (10 minutes)
# Writes customers_df (built in Part 2A) as a Delta table in the session's
# current schema. `table_name` is reused by the later cells.
print("\n🏗️ CREATING DELTA TABLE")
print("="*50)

# Set working schema
current_catalog = spark.sql("SELECT current_catalog()").collect()[0][0]
schema_name = "bronze" if current_catalog != "main" else "default"

try:
    spark.sql(f"USE SCHEMA {schema_name}")
    print(f"✅ Using schema: {current_catalog}.{schema_name}")
except Exception as e:
    # The switch failed, so the session stays in whatever schema it was already
    # using. Re-read it so the fully qualified name printed below matches where
    # saveAsTable() actually writes, and surface the real error.
    requested_schema = schema_name
    schema_name = spark.sql("SELECT current_schema()").collect()[0][0]
    print(f"⚠️ Could not switch to schema {requested_schema}: {e}")
    print(f"⚠️ Using current schema instead: {current_catalog}.{schema_name}")

# Create Delta table
table_name = "customers_delta"
full_table_name = f"{current_catalog}.{schema_name}.{table_name}"

print(f"\n📋 Creating Delta table: {full_table_name}")

# Write as Delta table. The unqualified name resolves against the current
# schema, which is the one reflected in full_table_name above.
customers_df.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable(table_name)

print("✅ Delta table created successfully!")

# Verify table creation
print(f"\n🔍 Table Information:")
spark.sql(f"DESCRIBE EXTENDED {table_name}").show(10, truncate=False)

In [None]:
# ⏰ PART 2C: DELTA LAKE TIME TRAVEL (10 minutes)
# Creates two new table versions (an UPDATE and an append), prints the table
# history, then queries the table as of each recorded version.
print("\n⏰ DELTA LAKE TIME TRAVEL DEMO")
print("="*50)


def _latest_version(delta_table):
    """Return the highest version number in the table's DESCRIBE HISTORY output."""
    history = spark.sql(f"DESCRIBE HISTORY {delta_table}")
    return history.agg({"version": "max"}).collect()[0][0]


def _show_status_counts(relation):
    """Show the total row count and the number of VIP rows for a table expression."""
    spark.sql(
        f"SELECT COUNT(*) as count, COUNT(CASE WHEN status='VIP' THEN 1 END) as vip_count FROM {relation}"
    ).show()


# Show initial version
print("📊 Initial table state:")
spark.sql(f"SELECT COUNT(*) as count FROM {table_name}").show()

# Versions are read from the table history instead of being assumed to be 0 and
# 1: if the table already has history (for example the notebook was run before
# without the cleanup step), the snapshot written in Part 2B is not version 0.
original_version = _latest_version(table_name)

# Make some updates to create versions
print("\n🔄 Creating new versions...")

# First change: update some customers
print("Change 1: Updating customer status...")
spark.sql(f"""
    UPDATE {table_name}
    SET status = 'VIP'
    WHERE customer_id <= 10
""")
vip_version = _latest_version(table_name)
print(f"   → table is now at version {vip_version}")

# Second change: insert new customers
print("Change 2: Adding new customers...")
new_customers = spark.createDataFrame([
    (101, 'Customer_101', 'customer101@example.com', 'Boston', datetime.now(), 'Active'),
    (102, 'Customer_102', 'customer102@example.com', 'Chicago', datetime.now(), 'Active')
], ['customer_id', 'customer_name', 'email', 'city', 'registration_date', 'status'])

new_customers.write.format("delta").mode("append").saveAsTable(table_name)
print(f"   → table is now at version {_latest_version(table_name)}")

# Show table history
print("\n📚 Delta Table History:")
history_df = spark.sql(f"DESCRIBE HISTORY {table_name}")
history_df.select("version", "timestamp", "operation", "operationParameters").show(truncate=False)

# Time travel examples
print("\n⏰ Time Travel Examples:")

print(f"\n📸 Version {original_version} (original):")
_show_status_counts(f"{table_name} VERSION AS OF {original_version}")

print(f"\n📸 Version {vip_version} (after VIP update):")
_show_status_counts(f"{table_name} VERSION AS OF {vip_version}")

print("\n📸 Current version:")
_show_status_counts(table_name)

In [None]:
# 🔀 PART 2D: DELTA MERGE OPERATIONS (10 minutes)
# Upserts three rows into the demo table with a single MERGE: customer ids 5 and
# 10 take the MATCHED branch, id 103 takes the NOT MATCHED (insert) branch.
print("\n🔀 DELTA MERGE OPERATIONS DEMO")
print("=" * 50)

# Column layout shared by every row of the updates dataset
update_columns = ['customer_id', 'customer_name', 'email', 'city', 'registration_date', 'status']

# Create updates dataset
updates_data = [
    (5, 'Customer_005_Updated', 'updated5@example.com', 'Updated_City', datetime.now(), 'Premium'),
    (10, 'Customer_010_Updated', 'updated10@example.com', 'Updated_City', datetime.now(), 'Premium'),
    (103, 'Customer_103_New', 'customer103@example.com', 'Miami', datetime.now(), 'Active'),  # New customer
]
updates_df = spark.createDataFrame(updates_data, update_columns)

print("📊 Updates to apply:")
updates_df.show()

# Expose the updates to SQL under the name the MERGE statement reads from
updates_df.createOrReplaceTempView("customer_updates")

# Perform MERGE operation. The MATCHED branch leaves registration_date untouched;
# the NOT MATCHED branch inserts all six columns.
merge_sql = f"""
MERGE INTO {table_name} as target
USING customer_updates as source
ON target.customer_id = source.customer_id
WHEN MATCHED THEN
    UPDATE SET 
        customer_name = source.customer_name,
        email = source.email,
        city = source.city,
        status = source.status
WHEN NOT MATCHED THEN
    INSERT (customer_id, customer_name, email, city, registration_date, status)
    VALUES (source.customer_id, source.customer_name, source.email, source.city, source.registration_date, source.status)
"""

print("\n🔀 Executing MERGE operation...")
spark.sql(merge_sql)
print("✅ MERGE operation completed!")

# Verify results: the three ids touched by the MERGE
print("\n📊 Results after MERGE:")
merged_rows = spark.sql(f"""
    SELECT customer_id, customer_name, city, status 
    FROM {table_name} 
    WHERE customer_id IN (5, 10, 103)
    ORDER BY customer_id
""")
merged_rows.show()

print("\n📈 Total count after MERGE:")
total_after_merge = spark.sql(f"SELECT COUNT(*) as total_customers FROM {table_name}")
total_after_merge.show()

---

# Part 3: Data Engineering Basics (30 minutes)

## Key Concepts:
- **Notebooks → Jobs** (scheduled execution)
- **Workflows** (multi-task orchestration)  
- **External Storage** (Azure Data Lake integration)
- **Delta Live Tables** (declarative ETL for streaming and batch data)

In [None]:
# 🔧 PART 3A: EXTERNAL STORAGE CONNECTION (10 minutes)
# Lists the workspace's existing mounts, then prints a conceptual (not executed)
# example of mounting ADLS Gen2 with a service principal.
print("🔧 EXTERNAL STORAGE CONNECTION")
print("="*50)

# Check existing mounts. Guarded like the other environment-dependent calls in
# this notebook so a failure here does not stop the rest of the cell.
print("📎 Current mounted storage:")
try:
    mounts = dbutils.fs.mounts()
    for mount in mounts:
        print(f"   🔗 {mount.mountPoint} → {mount.source}")
except Exception as e:
    mounts = []
    print(f"   ❌ Error listing mounts: {e}")
    print("   💡 Mounts may not be available on this compute")

# Demo: How to mount Azure Data Lake (conceptual - requires credentials)
# The example reads the client secret from a secret scope instead of a literal,
# and includes the token endpoint that the client-credentials provider needs.
print("\n💡 Mounting Azure Data Lake Storage (ADLS) - Conceptual Example:")
mount_example = '''
# Mount ADLS Gen2 for your solution accelerator
dbutils.fs.mount(
    source = "abfss://container@storageaccount.dfs.core.windows.net/",
    mount_point = "/mnt/solution-accelerator",
    extra_configs = {
        "fs.azure.account.auth.type.storageaccount.dfs.core.windows.net": "OAuth",
        "fs.azure.account.oauth.provider.type.storageaccount.dfs.core.windows.net": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
        "fs.azure.account.oauth2.client.id.storageaccount.dfs.core.windows.net": "application-id",
        "fs.azure.account.oauth2.client.secret.storageaccount.dfs.core.windows.net": dbutils.secrets.get(scope="<scope-name>", key="<service-credential-key-name>"),
        "fs.azure.account.oauth2.client.endpoint.storageaccount.dfs.core.windows.net": "https://login.microsoftonline.com/<directory-id>/oauth2/token"
    }
)
'''
print(mount_example)

# Show how to work with mounted storage
print("\n📁 Working with mounted storage:")
print("   • List files: dbutils.fs.ls('/mnt/solution-accelerator/')")
print("   • Read CSV: spark.read.csv('/mnt/solution-accelerator/data.csv')")
print("   • Write Delta: df.write.format('delta').save('/mnt/solution-accelerator/delta-table')")

# For your solution accelerator integration:
print("\n🎯 For Your Solution Accelerator:")
print("   1. Mount Azure Data Lake where Fabric can access")
print("   2. Write Delta tables to mounted location")
print("   3. Fabric reads from same Azure Data Lake location")
print("   4. Creates seamless Databricks → Fabric pipeline")

In [None]:
# 📊 PART 3B: JOBS AND WORKFLOWS CONCEPTS (10 minutes)
# Text-only cell: prints the UI steps for turning a notebook into a job, a
# sample three-task job layout, and the benefits of workflows.
print("\n📊 JOBS AND WORKFLOWS CONCEPTS")
print("=" * 50)

# Each argument is one output line; sep="\n" prints them one per line.
print(
    "🔄 Converting Notebooks to Jobs:",
    "   1. Save your notebook (this one!)",
    "   2. Go to 'Workflows' in Databricks sidebar",
    "   3. Click 'Create Job'",
    "   4. Select this notebook as the task",
    "   5. Configure cluster and schedule",
    sep="\n",
)

# Illustrative job definition shown as a tree; it is printed, not submitted.
job_config = '''
Job Name: "Solution Accelerator Data Pipeline"
Tasks:
├── Task 1: "Data Ingestion" 
│   ├── Notebook: /Users/your-email/Data_Ingestion.ipynb
│   ├── Cluster: Job cluster (auto-terminating)
│   └── Libraries: Delta Lake, pandas
├── Task 2: "Data Transformation" (depends on Task 1)
│   ├── Notebook: /Users/your-email/Data_Transformation.ipynb  
│   └── Parameters: {"source_table": "bronze.customers"}
└── Task 3: "Data Quality Check" (depends on Task 2)
    ├── Notebook: /Users/your-email/Data_Quality.ipynb
    └── Alerts: Email on failure
    
Schedule: Daily at 2 AM
Retry: 3 attempts with exponential backoff
'''
print("\n⚙️ Job Configuration Example for Your Solution Accelerator:")
print(job_config)

print(
    "\n📈 Workflow Benefits:",
    "   ✅ Automated execution",
    "   ✅ Dependency management",
    "   ✅ Error handling and retries",
    "   ✅ Monitoring and alerts",
    "   ✅ Parameter passing between tasks",
    sep="\n",
)

In [None]:
# 🌊 PART 3C: DELTA LIVE TABLES OVERVIEW (10 minutes)
# Text-only cell: prints a description of Delta Live Tables and a sample
# bronze → silver → gold pipeline definition. The sample is printed, not run.
print("\n🌊 DELTA LIVE TABLES (DLT) OVERVIEW")
print("="*50)

print("💡 What is Delta Live Tables?")
print("   • Declarative ETL framework")
print("   • Automatically manages dependencies")
print("   • Built-in data quality monitoring")
print("   • Streaming and batch processing")

# The sample imports everything it uses: `dlt` for the decorators and
# pyspark.sql.functions (as F) for the aggregates. A bare max(...) would resolve
# to the Python builtin and a bare count(...) would be undefined.
print("\n📝 DLT Example for Your Solution Accelerator:")
dlt_example = '''
import dlt
from pyspark.sql import functions as F

# Bronze Layer - Raw data ingestion
@dlt.table(
    comment="Raw customer data from source systems"
)
def bronze_customers():
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "csv")
        .option("cloudFiles.schemaLocation", "/mnt/schema/customers")
        .load("/mnt/raw-data/customers/")
    )

# Silver Layer - Cleaned and validated data  
@dlt.table(
    comment="Cleaned customer data with quality checks"
)
@dlt.expect("valid_email", "email IS NOT NULL AND email RLIKE '^[^@]+@[^@]+\\\\.[^@]+$'")
@dlt.expect_or_drop("customer_id_not_null", "customer_id IS NOT NULL")
def silver_customers():
    return (
        dlt.read("bronze_customers")
        .select("customer_id", "customer_name", "email", "city", "registration_date")
        .filter("customer_id IS NOT NULL")
    )

# Gold Layer - Business-ready aggregated data
@dlt.table(
    comment="Customer metrics for business analytics"
)
def gold_customer_metrics():
    return (
        dlt.read("silver_customers")
        .groupBy("city")
        .agg(
            F.count("customer_id").alias("customer_count"),
            F.max("registration_date").alias("latest_registration")
        )
    )
'''
print(dlt_example)

print("\n🎯 DLT Benefits for Your Architecture:")
print("   ✅ Automatic Bronze → Silver → Gold pipeline")
print("   ✅ Built-in data quality monitoring")
print("   ✅ Real-time streaming capability")
print("   ✅ Automatic dependency resolution")
print("   ✅ Lineage tracking")

---

# Part 4: Solution Accelerator Integration (15 minutes)

## How This Applies to Your Project

In [None]:
# 🎯 PART 4: SOLUTION ACCELERATOR INTEGRATION PATTERNS
# Text-only cell: prints the target data flow and an eight-week roadmap.

# Week-by-week plan, kept as one pre-formatted block so the tree characters line up.
roadmap = '''
Week 1-2: Databricks Setup
├── Create Unity Catalog structure (solution_accelerator.bronze/silver/gold)
├── Generate sample data in Delta format  
├── Set up external storage mounts to Azure Data Lake
└── Create basic ETL notebooks

Week 3-4: Data Pipeline Development  
├── Build Bronze → Silver transformation jobs
├── Implement data quality checks with Delta
├── Set up automated workflows
└── Create monitoring and alerting

Week 5-6: Fabric Integration
├── Configure Fabric to read from same Azure Data Lake
├── Implement cross-channel data merging in Fabric Silver tier
├── Build Gold tier analytics in Fabric
└── Create Power BI dashboards

Week 7-8: Production Readiness
├── Implement Delta Live Tables for streaming
├── Set up automated deployment pipelines  
├── Add comprehensive monitoring
└── Document integration patterns
'''

# Each argument is one output line; sep="\n" prints them one per line.
print(
    "🎯 SOLUTION ACCELERATOR INTEGRATION PATTERNS",
    "=" * 50,
    "🏗️ Your Architecture Integration:",
    "   Databricks (Online Channel) → Azure Data Lake → Fabric Bronze → Silver → Gold",
    "\n📋 Implementation Roadmap:",
    roadmap,
    sep="\n",
)

In [None]:
# 📊 PRACTICAL NEXT STEPS FOR YOUR PROJECT
# Text-only cell: prints three dated action lists followed by a summary of the
# patterns covered in the notebook.
print("\n📊 PRACTICAL NEXT STEPS FOR YOUR PROJECT")
print("=" * 50)

# Each argument is one output line; sep="\n" prints them one per line.
print(
    "🔧 Immediate Actions (Today):",
    "   1. ✅ Run this crash course notebook",
    "   2. Create your sample data as Delta tables",
    "   3. Practice MERGE operations with your product/customer data",
    "   4. Set up basic Unity Catalog structure",
    sep="\n",
)

print(
    "\n🚀 This Week:",
    "   1. Import your existing product generation notebook",
    "   2. Convert generated data to Delta format",
    "   3. Create customer and order generation notebooks",
    "   4. Build basic Bronze → Silver transformation",
    sep="\n",
)

print(
    "\n📈 Next Week:",
    "   1. Set up Azure Data Lake mount point",
    "   2. Create automated jobs for data processing",
    "   3. Test Fabric integration with Delta tables",
    "   4. Build monitoring and quality checks",
    sep="\n",
)

# One line per building block introduced in Parts 1-3.
patterns = '''
✅ Unity Catalog for governance: catalog.schema.table naming
✅ Delta Lake for reliability: ACID transactions, time travel
✅ External mounts for integration: Databricks ↔ Fabric via ADLS
✅ Jobs/Workflows for automation: scheduled, monitored pipelines
✅ DLT for advanced ETL: streaming, quality, lineage
'''
print("\n💡 Key Success Patterns:")
print(patterns)

In [None]:
# 🎓 KNOWLEDGE CHECK AND SUMMARY
# Prints a recap of each part of the course, then drops the demo Delta table
# created in Part 2B.
print("\n🎓 KNOWLEDGE CHECK - What You've Learned")
print("="*50)

print("📚 Unity Catalog (30 min):")
print("   ✅ Three-level namespace: catalog.schema.table")
print("   ✅ Create catalogs and schemas for medallion architecture")
print("   ✅ Understanding data governance and permissions")

print("\n🚀 Delta Lake (45 min):")
print("   ✅ Create Delta tables from sample data")
print("   ✅ Time travel and versioning capabilities")
print("   ✅ MERGE operations for upserts")
print("   ✅ ACID transactions and optimization")

print("\n🔧 Data Engineering (30 min):")
print("   ✅ Convert notebooks to automated jobs")
print("   ✅ Set up workflows with dependencies")
print("   ✅ External storage integration patterns")
print("   ✅ Delta Live Tables for advanced ETL")

print("\n🎯 Ready for Your Solution Accelerator:")
print("   ✅ Technical foundation established")
print("   ✅ Integration patterns understood")
print("   ✅ Production-ready architecture knowledge")
print("   ✅ Hands-on experience with key features")

print("\n🚀 Total Learning Time: 2 hours")
print("🎉 You're now ready to build your enterprise solution accelerator!")

# Clean up demo data (optional)
print("\n🗑️ Cleaning up demo table (optional):")
try:
    spark.sql(f"DROP TABLE IF EXISTS {table_name}")
    print("   ✅ Demo table removed")
except NameError as e:
    # table_name only exists once the Part 2B cell has run in this session.
    print(f"   ⚠️ Skipping cleanup: {e}")
except Exception as e:
    # Report the actual failure instead of assuming it is a permissions issue.
    print(f"   ⚠️ Could not remove demo table: {e}")