# Synthetic Tabular Data Generation Job

This notebook generates synthetic tabular data using:
- **dbldatagen** for structured data generation
- **ai_query()** for GenAI Text columns
- **Databricks volumes** for storage

## Parameters
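The following parameters are passed from the app via job widgets:

- `table_name` – name of the generated table (used in the output filename)
- `row_count` – number of rows to generate
- `columns` – JSON array of column configurations
- `company_name`, `company_sector` – company context reported in the job summary
- `timestamp` – timestamp suffix for the output filename
- `endpoint_name` – model serving endpoint used by `ai_query()`
- `volume_path` – target volume, given as `catalog.schema.volume`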


In [None]:
# Cell 1: Import required libraries
import json
import os
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, lit
from dbldatagen import DataGenerator, fakerText

# Confirm that the imports resolved and log when this run started
print(
    "📦 Libraries imported successfully\n"
    f"   - Execution time: {datetime.now()}"
)


In [None]:
# Cell 2: Get job parameters via widgets
# Databricks automatically creates widgets from job parameters

# Single source of truth for every job parameter: widget name -> (default, label).
# The default is used both as the widget's initial value and as the fallback
# when that widget cannot be created or read.
_PARAM_SPECS = {
    "table_name": ("sample_table", "Table Name"),
    "row_count": ("1000", "Row Count"),
    "columns": ("[]", "Columns JSON"),
    "company_name": ("Sample Company", "Company Name"),
    "company_sector": ("Technology", "Company Sector"),
    "timestamp": (datetime.now().strftime("%Y%m%d_%H%M%S"), "Timestamp"),
    "endpoint_name": ("databricks-gpt-oss-120b", "LLM Endpoint"),
    "volume_path": ("conor_smith.synthetic_data_app.synthetic_data_volume", "Volume Path"),
}

# Start from the defaults; each successfully read widget overrides its entry.
_params = {_name: _spec[0] for _name, _spec in _PARAM_SPECS.items()}
_used_fallback = False

try:
    _widgets = dbutils.widgets
except (NameError, AttributeError) as e:
    # No widget API available at all: every parameter keeps its default.
    print(f"❌ Error getting parameters: {e}")
    _widgets = None
    _used_fallback = True

if _widgets is not None:
    for _name, (_default, _label) in _PARAM_SPECS.items():
        # Handle each parameter on its own so one failure does not discard
        # the values that were passed correctly.
        try:
            # Create the widget with its default (overridden by job parameters)
            _widgets.text(_name, _default, _label)
            _params[_name] = _widgets.get(_name)
        except Exception as e:
            print(f"❌ Error getting parameter '{_name}': {e}")
            _used_fallback = True

table_name = _params["table_name"]
columns_json = _params["columns"]
company_name = _params["company_name"]
company_sector = _params["company_sector"]
timestamp = _params["timestamp"]
endpoint_name = _params["endpoint_name"]
volume_path = _params["volume_path"]

# row_count is the only parameter that needs conversion; a malformed value
# falls back to the default row count without affecting the other parameters.
try:
    row_count = int(_params["row_count"])
except (TypeError, ValueError) as e:
    print(f"❌ Error getting parameters: invalid row_count {_params['row_count']!r} ({e})")
    row_count = int(_PARAM_SPECS["row_count"][0])
    _used_fallback = True

if _used_fallback:
    print("⚠️  Using fallback default values")

print("🎯 Job Parameters Retrieved:")
print(f"   - Table name: {table_name}")
print(f"   - Row count: {row_count}")
print(f"   - Company: {company_name} ({company_sector})")
print(f"   - Timestamp: {timestamp}")
print(f"   - Endpoint: {endpoint_name}")
print(f"   - Volume: {volume_path}")
print(f"   - Columns JSON length: {len(columns_json)} characters")


In [None]:
# Cell 3: Parse and validate column configurations


def _column_detail_lines(col_config):
    """Return the extra log lines describing one column configuration.

    Only the 'Integer', 'Date', 'GenAI Text' and 'Custom Values' data types
    have extra details; every other type yields an empty list.
    """
    col_type = col_config.get('data_type', 'Unknown')
    ordered = col_config.get('ordered_values', False)

    if col_type == 'Integer':
        min_val = col_config.get('min_value', 'not set')
        max_val = col_config.get('max_value', 'not set')
        return [f"      → Range: {min_val} to {max_val}", f"      → Ordered: {ordered}"]

    if col_type == 'Date':
        min_date = col_config.get('min_date', 'not set')
        max_date = col_config.get('max_date', 'not set')
        return [f"      → Date range: {min_date} to {max_date}", f"      → Ordered: {ordered}"]

    if col_type == 'GenAI Text':
        # str() so a null/non-string prompt can still be previewed
        prompt = str(col_config.get('prompt', 'not set'))
        max_tokens = col_config.get('max_tokens', 'not set')
        return [
            f"      → Prompt: {prompt[:50]}{'...' if len(prompt) > 50 else ''}",
            f"      → Max tokens: {max_tokens}",
        ]

    if col_type == 'Custom Values':
        values = col_config.get('custom_values') or []
        weights = col_config.get('use_weights', False)
        return [
            f"      → Values: {values[:3]}{'...' if len(values) > 3 else ''}",
            f"      → Weighted: {weights}",
            f"      → Ordered: {ordered}",
        ]

    return []


try:
    columns = json.loads(columns_json)
    # Later cells call .get() on every entry, so reject any other JSON shape here
    if not isinstance(columns, list) or not all(isinstance(_cfg, dict) for _cfg in columns):
        raise ValueError("columns must be a JSON array of objects")
except (TypeError, ValueError) as e:
    print(f"❌ Error parsing columns: {e}")
    print(f"   Raw columns_json: {columns_json}")
    # Use fallback columns
    columns = [
        {"name": "id", "data_type": "Integer", "min_value": 1, "max_value": 100, "ordered_values": False},
        {"name": "name", "data_type": "First Name"}
    ]
    print(f"   → Using {len(columns)} fallback columns")
else:
    print(f"✅ Parsed {len(columns)} column configurations")

    # Show column details. The loop variable is deliberately not called `col`,
    # which would rebind the `col` function imported from pyspark.sql.functions.
    for _position, _cfg in enumerate(columns, start=1):
        print(f"   {_position}. {_cfg.get('name', 'unnamed')} ({_cfg.get('data_type', 'Unknown')})")
        try:
            for _line in _column_detail_lines(_cfg):
                print(_line)
        except Exception as detail_error:
            # Logging is informational only: never discard a valid configuration over it
            print(f"      ⚠️  Could not display details: {detail_error}")

    # Add sample columns if none provided
    if not columns:
        print("⚠️  No columns configured, adding sample columns for testing")
        columns = [
            {"name": "id", "data_type": "Integer", "min_value": 1, "max_value": 1000, "ordered_values": False},
            {"name": "created_date", "data_type": "Date", "min_date": "2023-01-01", "max_date": "2024-12-31", "ordered_values": False},
            {"name": "first_name", "data_type": "First Name"},
            {"name": "last_name", "data_type": "Last Name"},
            {"name": "bio", "data_type": "GenAI Text", "prompt": "Write a short professional bio for <first_name> <last_name>", "max_tokens": 100}
        ]
        print(f"   → Added {len(columns)} sample columns")


In [None]:
# Cell 4: Initialize Spark and create DataGenerator
try:
    # Reuse the active session when there is one; otherwise build a new one
    _active_session = SparkSession.getActiveSession()
    spark = (
        _active_session
        if _active_session is not None
        else SparkSession.builder.appName("TabularDataGeneration").getOrCreate()
    )

    print("⚡ Spark session initialized")
    print(f"   - Spark version: {spark.version}")

    # One partition per 1000 rows, clamped to the range 1..8
    partitions_requested = max(1, min(8, row_count // 1000))
    spark.conf.set("spark.sql.shuffle.partitions", f"{partitions_requested}")

    print(f"🔧 Spark optimized for {row_count} rows → {partitions_requested} partitions")

    # The generator spec that the following cell adds columns to
    data_gen = DataGenerator(spark, rows=row_count, partitions=partitions_requested)
    print("🏗️  DataGenerator created")

except Exception as e:
    # Nothing downstream can run without Spark and the generator: log, then fail the cell
    print(f"❌ Error initializing Spark/DataGenerator: {e}")
    raise e


In [None]:
# Cell 5: Add columns to DataGenerator
import random
from datetime import datetime

import dbldatagen as dg

genai_columns = []
failed_columns = []  # names of columns that raised while being added

# Faker-backed column types:
#   data_type -> (faker provider, log label, plural noun for the log, fallback values)
# The fallback values are used only if adding the faker-based column raises.
_FAKER_COLUMN_TYPES = {
    'First Name': ("first_name", "First name", "names",
                   ["James", "Mary", "John", "Patricia", "Robert", "Jennifer", "Michael", "Linda",
                    "William", "Elizabeth", "David", "Barbara", "Richard", "Susan", "Joseph", "Jessica"]),
    'Last Name': ("last_name", "Last name", "names",
                  ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis",
                   "Rodriguez", "Martinez", "Hernandez", "Lopez", "Gonzalez", "Wilson"]),
    'Email': ("email", "Email", "emails",
              ["user@example.com", "test@domain.org", "sample@site.net", "admin@company.com", "contact@business.co"]),
    'Phone Number': ("phone_number", "Phone number", "numbers",
                     ["555-0100", "555-0101", "555-0102", "555-0103", "555-0104"]),
    'Address': ("address", "Address", "addresses",
                ["123 Main St, New York, NY 10001", "456 Oak Ave, Los Angeles, CA 90210",
                 "789 Pine Rd, Chicago, IL 60601", "321 Elm Dr, Houston, TX 77001"]),
    'City': ("city", "City", "cities",
             ["New York", "Los Angeles", "Chicago", "Houston", "Phoenix", "Philadelphia", "San Antonio", "San Diego"]),
    'Country': ("country", "Country", "countries",
                ["United States", "Canada", "Mexico", "Brazil", "Argentina", "United Kingdom", "France", "Germany"]),
    'Country Code': ("country_code", "Country code", "codes",
                     ["US", "CA", "MX", "BR", "AR", "GB", "FR", "DE"]),
    'Postcode': ("postcode", "Postcode", "postcodes",
                 ["10001", "90210", "60601", "77001", "85001", "19101", "78201", "92101"]),
    'Street Address': ("street_address", "Street address", "addresses",
                       ["123 Main St", "456 Oak Ave", "789 Pine Rd", "321 Elm Dr", "654 Cedar Ln"]),
    'Company': ("company", "Company", "companies",
                ["Acme Corp", "Global Industries", "Tech Solutions", "Data Systems", "Innovation Labs", "Future Enterprises"]),
    # Fallback uses well-known test card numbers
    'Credit Card Number': ("credit_card_number", "Credit card number", "numbers",
                           ["4111-1111-1111-1111", "4000-0000-0000-0002", "5555-5555-5555-4444", "3782-822463-10005"]),
    'Credit Card Provider': ("credit_card_provider", "Credit card provider", "providers",
                             ["Visa", "Mastercard", "American Express", "Discover"]),
}

# Coordinate column types: data_type -> (faker provider, absolute bound of the fallback range)
_COORDINATE_COLUMN_TYPES = {
    'Latitude': ("latitude", 90.0),
    'Longitude': ("longitude", 180.0),
}

_DEFAULT_MIN_DATE = datetime(2020, 1, 1)
_DEFAULT_MAX_DATE = datetime(2024, 12, 31)
_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"


def _is_random(col_config):
    """Return True unless the column configuration sets ordered_values to a truthy value."""
    return not col_config.get('ordered_values', False)


def _is_blank(value):
    """Return True for None and for values whose string form is empty or whitespace."""
    return value is None or not str(value).strip()


def _add_with_fallback(data_gen, col_name, provider, faker_msg, make_fallback, fallback_msg):
    """Add a faker text column; if that raises, add a fixed-values column instead.

    `make_fallback` is called only when the faker attempt raises. `fallback_msg`
    is a format string with a `{count}` placeholder. Returns the generator.
    """
    try:
        data_gen = data_gen.withColumn(col_name, text=fakerText(provider))
        print(f"   ✅ {faker_msg}")
    except Exception:
        values = make_fallback()
        data_gen = data_gen.withColumn(col_name, values=values)
        print(f"   ✅ {fallback_msg.format(count=len(values))}")
    return data_gen


def _parse_date_bounds(min_date, max_date):
    """Parse two 'YYYY-MM-DD' strings; use the default range if either is invalid."""
    try:
        min_dt = datetime.strptime(min_date, '%Y-%m-%d')
        max_dt = datetime.strptime(max_date, '%Y-%m-%d')
        print(f"   - Parsed dates: {min_dt} to {max_dt}")
    except (TypeError, ValueError) as date_error:
        print(f"   ⚠️  Invalid date format: {date_error}")
        min_dt, max_dt = _DEFAULT_MIN_DATE, _DEFAULT_MAX_DATE
        print(f"   - Using default dates: {min_dt} to {max_dt}")
    return min_dt, max_dt


def _add_date_column(data_gen, col_name, col_config):
    """Add a date column, degrading to a timestamp column and then a constant string.

    Every attempt uses the same parsed range, so the logged range is always the
    range that is actually applied. Returns the generator.
    """
    random_vals = _is_random(col_config)
    min_dt, max_dt = _parse_date_bounds(
        col_config.get('min_date', '2020-01-01'),
        col_config.get('max_date', '2024-12-31'),
    )
    begin = min_dt.strftime(_DATETIME_FORMAT)
    end = max_dt.strftime(_DATETIME_FORMAT)
    shown_range = f"{min_dt:%Y-%m-%d} to {max_dt:%Y-%m-%d}"

    try:
        print(f"   - Creating DateRange with: '{begin}' to '{end}'")
        date_range = dg.DateRange(begin, end)
        print("   - DateRange created successfully")
        data_gen = data_gen.withColumn(col_name, "date", data_range=date_range, random=random_vals)
        print(f"   ✅ Date: {shown_range} (random={random_vals})")
        return data_gen
    except Exception as date_gen_error:
        print(f"   ❌ Error with DateRange generation: {date_gen_error}")
        print("   🔄 Falling back to timestamp generation...")

    try:
        # Fallback: generate timestamps over the configured range (through the end of the last day)
        end_of_last_day = max_dt.replace(hour=23, minute=59, second=59).strftime(_DATETIME_FORMAT)
        data_gen = data_gen.withColumn(col_name, "timestamp",
                                       begin=begin,
                                       end=end_of_last_day,
                                       interval="1 day", random=random_vals)
        print(f"   ✅ Timestamp fallback: {shown_range} (random={random_vals})")
        return data_gen
    except Exception as fallback_error:
        print(f"   ❌ Timestamp fallback also failed: {fallback_error}")

    # Final fallback: a constant string column holding the start date
    constant_date = f"{min_dt:%Y-%m-%d}"
    data_gen = data_gen.withColumn(col_name, "string", values=[constant_date])
    print(f"   ⚠️  Using string fallback with constant date: {constant_date}")
    return data_gen


def _add_custom_values_column(data_gen, col_name, col_config):
    """Add a column drawn from user-supplied values, optionally weighted.

    Blank values are dropped. When the weights list has one entry per supplied
    value, each weight is dropped together with its blank value so the remaining
    weights stay attached to the right values; otherwise the leading weights
    are used. Returns the generator.
    """
    custom_values = col_config.get('custom_values') or ['']
    custom_weights = col_config.get('custom_weights') or [1]
    use_weights = col_config.get('use_weights', False)
    random_vals = _is_random(col_config)

    kept = [i for i, value in enumerate(custom_values) if not _is_blank(value)]
    filtered_values = [custom_values[i] for i in kept]
    if len(custom_weights) == len(custom_values):
        filtered_weights = [custom_weights[i] for i in kept]
    else:
        filtered_weights = custom_weights[:len(filtered_values)]

    if not filtered_values:
        filtered_values = ['DefaultValue']
        filtered_weights = [1]

    if use_weights and len(filtered_weights) == len(filtered_values):
        data_gen = data_gen.withColumn(col_name, values=filtered_values, weights=filtered_weights, random=random_vals)
        print(f"   ✅ Custom values with weights: {len(filtered_values)} values (random={random_vals})")
    else:
        data_gen = data_gen.withColumn(col_name, values=filtered_values, random=random_vals)
        print(f"   ✅ Custom values: {len(filtered_values)} values (random={random_vals})")
    return data_gen


for col_config in columns:
    col_name = col_config.get('name', 'unnamed_column')
    col_type = col_config.get('data_type', 'Integer')

    print(f"📊 Adding column '{col_name}' ({col_type})")

    try:
        if col_type == 'Integer':
            min_val = col_config.get('min_value', 1)
            max_val = col_config.get('max_value', 100)
            random_vals = _is_random(col_config)
            data_gen = data_gen.withColumn(col_name, "integer", minValue=min_val, maxValue=max_val, random=random_vals)
            print(f"   ✅ Integer: {min_val} to {max_val} (random={random_vals})")

        elif col_type == 'Date':
            data_gen = _add_date_column(data_gen, col_name, col_config)

        elif col_type in _FAKER_COLUMN_TYPES:
            provider, label, noun, fallback_values = _FAKER_COLUMN_TYPES[col_type]
            data_gen = _add_with_fallback(
                data_gen, col_name, provider,
                f"{label} with faker",
                lambda values=fallback_values: list(values),
                f"{label} with predefined list ({{count}} {noun})",
            )

        elif col_type in _COORDINATE_COLUMN_TYPES:
            provider, bound = _COORDINATE_COLUMN_TYPES[col_type]
            data_gen = _add_with_fallback(
                data_gen, col_name, provider,
                f"{col_type} with faker (decimal values)",
                # Fallback: ten random values inside the valid range for this coordinate
                lambda bound=bound: [round(random.uniform(-bound, bound), 6) for _ in range(10)],
                f"{col_type} with predefined range ({{count}} values)",
            )

        elif col_type == 'GenAI Text':
            # Add placeholder, will process with ai_query later
            data_gen = data_gen.withColumn(col_name, "string", values=[""])
            genai_columns.append(col_config)
            print("   ✅ GenAI placeholder (will use ai_query)")

        elif col_type == 'Custom Values':
            data_gen = _add_custom_values_column(data_gen, col_name, col_config)

        else:
            print(f"   ⚠️  Unknown column type '{col_type}', skipping")

    except Exception as col_error:
        # Best effort: keep going with the remaining columns, but remember the failure
        print(f"   ❌ Error adding column '{col_name}': {col_error}")
        failed_columns.append(col_name)

print(f"\n📋 Summary: {len(columns)} total columns, {len(genai_columns)} GenAI columns")
if failed_columns:
    print(f"⚠️  {len(failed_columns)} column(s) could not be added: {failed_columns}")


In [None]:
# Cell 6: Generate initial DataFrame and process GenAI columns
# Turn the generator spec into a Spark DataFrame
print(f"🏗️  Building DataFrame with {row_count} rows...")
df = data_gen.build()

_generated_rows = df.count()
_generated_columns = df.columns
print(f"✅ DataFrame created: {_generated_rows} rows × {len(_generated_columns)} columns")
print(f"   - Columns: {_generated_columns}")

# Preview the first rows without truncating long values
print("\n📊 Sample Data (first 3 rows):")
df.show(n=3, truncate=False)

# Process GenAI Text columns with ai_query
if genai_columns:
    print(f"\n🤖 Processing {len(genai_columns)} GenAI Text columns with ai_query")
    
    def substitute_column_references_spark(prompt_template, columns):
        """Build a Spark SQL string expression from a prompt template.

        `<name>` placeholders whose name matches a configured column become a
        reference to that column (cast to string, with SQL NULL rendered as the
        text 'NULL'). All other text, including placeholders that match no
        column, is emitted as an escaped SQL string literal.

        Args:
            prompt_template: Prompt text, optionally containing `<column_name>` placeholders.
            columns: Column configuration dicts; their 'name' values are the
                columns a placeholder may reference.

        Returns:
            A SQL expression string: a single term, or `concat(...)` of several.
        """
        import re

        def sql_literal(text):
            # Escape backslashes first so the escapes added for quotes are not doubled.
            # NOTE(review): relies on Spark's default backslash escaping in string
            # literals (spark.sql.parser.escapedStringLiterals=false).
            escaped = text.replace("\\", "\\\\").replace("'", "\\'")
            return f"'{escaped}'"

        def sql_identifier(name):
            # Backtick-quote so names with spaces or reserved words resolve
            return "`" + name.replace("`", "``") + "`"

        # Computed once rather than once per placeholder
        valid_columns = {cfg.get('name', 'unnamed_column') for cfg in columns}

        parts = []
        current_pos = 0
        for match in re.finditer(r'<([^<>]+)>', prompt_template):
            referenced = match.group(1)

            # Literal text between the previous placeholder and this one
            if match.start() > current_pos:
                parts.append(sql_literal(prompt_template[current_pos:match.start()]))

            if referenced in valid_columns:
                parts.append(f"coalesce(cast({sql_identifier(referenced)} as string), 'NULL')")
            else:
                # Unknown reference: keep the placeholder text verbatim
                parts.append(sql_literal(f"<{referenced}>"))

            current_pos = match.end()

        # Trailing literal text, or the whole template when it has no placeholders
        if current_pos < len(prompt_template) or not parts:
            parts.append(sql_literal(prompt_template[current_pos:]))

        return f"concat({', '.join(parts)})" if len(parts) > 1 else parts[0]
    
    # Process each GenAI column
    for col_config in genai_columns:
        col_name = col_config.get('name', 'unnamed_column')
        prompt_template = col_config.get('prompt', '')

        if not prompt_template:
            # Without a prompt there is nothing to send; report it instead of skipping silently
            print(f"\n⚠️  GenAI column '{col_name}' has no prompt, leaving placeholder values")
            continue

        print(f"\n🎯 Processing GenAI column '{col_name}'")
        print(f"   - Prompt: {prompt_template[:80]}{'...' if len(prompt_template) > 80 else ''}")

        try:
            # Enhanced prompt for table context
            enhanced_prompt = f"{prompt_template} Note: This will be text data in a table so omit all special formatting."

            # Create dynamic prompt with column substitution
            prompt_expression = substitute_column_references_spark(enhanced_prompt, columns)
            print("   - Spark expression created")

            # The endpoint name comes from a job parameter, so escape it before
            # embedding it in the SQL string literal
            safe_endpoint = endpoint_name.replace("\\", "\\\\").replace("'", "\\'")

            # Replace the placeholder column with the ai_query expression
            print(f"   - Executing ai_query with endpoint: {endpoint_name}")
            df = df.withColumn(
                col_name,
                expr(f"ai_query(endpoint => '{safe_endpoint}', request => {prompt_expression})")
            )

            print(f"   ✅ ai_query completed for '{col_name}'")

            # Show sample generated text (first 100 characters of two rows)
            sample_rows = df.select(col_name).limit(2).collect()
            for i, row in enumerate(sample_rows):
                generated = str(row[col_name])
                text_sample = generated[:100] + ('...' if len(generated) > 100 else '')
                print(f"   - Sample {i+1}: {text_sample}")

        except Exception as ai_error:
            print(f"   ❌ Error processing GenAI column '{col_name}': {ai_error}")
    
    print(f"\n✅ All GenAI columns processed")
else:
    print(f"\nℹ️  No GenAI columns to process")


In [None]:
# Cell 7: Save to Volume and complete job


def _volume_directory(volume_path):
    """Return the /Volumes/... directory for a volume reference.

    A path that already starts with '/Volumes/' is returned as-is (minus any
    trailing slash), even if it contains dots. Otherwise
    'catalog.schema.volume' becomes '/Volumes/catalog/schema/volume'; with
    fewer than three dot-separated parts, the dots are replaced by slashes.
    """
    if volume_path.startswith('/Volumes/'):
        return volume_path.rstrip('/')

    parts = volume_path.split('.')
    if len(parts) >= 3:
        catalog, schema, volume = parts[:3]
        return f"/Volumes/{catalog}/{schema}/{volume}"

    # Fallback if format is unexpected
    return f"/Volumes/{volume_path.replace('.', '/')}"


try:
    filename = f"{table_name}_{timestamp}.csv"
    print("💾 Saving data to volume...")
    print(f"   - Filename: {filename}")
    print(f"   - Volume parameter: {volume_path}")

    # Convert to Pandas for clean CSV creation
    print("📊 Converting to Pandas...")
    pandas_df = df.toPandas()
    print(f"   ✅ Pandas DataFrame: {len(pandas_df)} rows × {len(pandas_df.columns)} columns")

    # Write to temporary location
    temp_path = f"/tmp/{filename}"
    pandas_df.to_csv(temp_path, index=False)
    print(f"   ✅ Temporary file created: {temp_path}")

    # Expected format: catalog.schema.volume → /Volumes/catalog/schema/volume
    corrected_volume_path = _volume_directory(volume_path)
    volume_file_path = f"{corrected_volume_path}/{filename}"
    print(f"📤 Corrected volume path: {corrected_volume_path}")
    print(f"📤 Full file path: {volume_file_path}")

    saved_to_volume = False
    try:
        # Ensure volume directory exists
        print("📁 Ensuring volume directory exists...")
        dbutils.fs.mkdirs(corrected_volume_path)

        # Copy file to volume
        dbutils.fs.cp(f"file://{temp_path}", volume_file_path)
        saved_to_volume = True
        print("   ✅ Successfully saved to volume!")
    except Exception as volume_error:
        print(f"   ❌ Error copying to volume: {volume_error}")
        print(f"   📁 File remains at: {temp_path}")
        volume_file_path = temp_path

    if saved_to_volume:
        # Verification and cleanup are best effort: a failure here must not be
        # reported as a failed copy.
        try:
            file_info = dbutils.fs.ls(volume_file_path)
            file_size = file_info[0].size if file_info else 0
            print(f"   📋 File size: {file_size:,} bytes")
        except Exception:
            print("   ⚠️  Could not verify file size")

        try:
            dbutils.fs.rm(f"file://{temp_path}")
            print("   🧹 Cleaned up temporary file")
        except Exception as cleanup_error:
            print(f"   ⚠️  Could not remove temporary file {temp_path}: {cleanup_error}")

    # Final summary. Only claim success when the file actually reached the volume.
    if saved_to_volume:
        print("\n🎉 Job Completed Successfully!")
    else:
        print("\n⚠️  Job completed, but the file was NOT saved to the volume")
    print("📋 Final Summary:")
    print(f"   - Table: {table_name}")
    # Report from the data that was written rather than re-running the Spark plan
    print(f"   - Rows: {len(pandas_df):,}")
    print(f"   - Columns: {len(pandas_df.columns)}")
    print(f"   - GenAI columns: {len(genai_columns)}")
    print(f"   - Company: {company_name} ({company_sector})")
    print(f"   - File: {volume_file_path}")
    print(f"   - Completed: {datetime.now()}")

    # Show final sample, taken from the rows that were written to the CSV
    print("\n📊 Final Data Sample:")
    print(pandas_df.head(5).to_string(index=False))

except Exception as save_error:
    print(f"❌ Error during save: {save_error}")
    print("   Job may have succeeded but file save failed")
    raise
