# Master ETL Pipeline for Job Data Processing

**Enterprise-Ready Data Cleaning Pipeline**

This notebook provides a comprehensive, user-friendly interface for processing multiple job data CSV files into a single, clean master dataset.

## Features
- 🔄 **Multi-file processing** - Upload and process multiple CSV files simultaneously
- 📊 **Schema validation** - Automatic detection and handling of schema inconsistencies  
- 🎯 **Master file output** - Consolidated dataset with job_id as primary key
- 📈 **Progress tracking** - Real-time processing updates with visual feedback
- ☁️ **BigQuery integration** - Optional direct upload to Google BigQuery
- 📋 **Comprehensive reporting** - Detailed data quality and transformation reports

## Instructions
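**For non-technical users:** Run cells 1-6 in order. Each cell displays its own controls: upload your CSV files and set the output options in step 2, then click the button shown in each of steps 3-6 (Analyze Schemas, Process All Files, Create Master Dataset, Export Master Dataset) to produce and download your cleaned master dataset.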

## 1. Environment Setup & Validation

Setting up the processing environment and validating all required packages.

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Core data processing
import pandas as pd
import numpy as np
import io
import logging
from functools import reduce
import re
from datetime import datetime

# Interactive widgets and display
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output, Markdown

# File handling
import glob
import os
from pathlib import Path

# BigQuery is optional: record whether the client library could be imported.
try:
    from google.cloud import bigquery
except ImportError:
    BIGQUERY_AVAILABLE = False
    print("ℹ️ BigQuery not available (install google-cloud-bigquery for BigQuery features)")
else:
    BIGQUERY_AVAILABLE = True
    print("✅ BigQuery integration available")

# The cleaning logic lives in a sibling module; later cells check this flag.
try:
    from etl_pipeline_logic import assess_raw_data, clean_csv_data
except ImportError:
    ETL_LOGIC_AVAILABLE = False
    print("❌ ETL logic module not found. Ensure 'etl_pipeline_logic.py' is in the same directory.")
else:
    ETL_LOGIC_AVAILABLE = True
    print("✅ ETL processing logic loaded successfully")

# Report the version of each core package that was imported above.
required_packages = {'pandas': pd, 'numpy': np, 'ipywidgets': widgets}
# NOTE(review): nothing in this cell ever appends to this list, so the
# "missing packages" branch below cannot currently be reached.
missing_packages = []

for package_name, package_module in required_packages.items():
    if hasattr(package_module, '__version__'):
        version = package_module.__version__
        print(f"✅ {package_name}: {version}")
    else:
        print(f"✅ {package_name}: Available")

ENVIRONMENT_READY = not missing_packages
if ENVIRONMENT_READY:
    print("\n🚀 Environment ready for processing! All systems operational.")
else:
    print(f"\n❌ Missing packages: {', '.join(missing_packages)}")
    print("Please install missing packages before proceeding.")

# Per-session log file; force=True replaces handlers left over from earlier runs.
log_filename = f"etl_session_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
logging.basicConfig(
    force=True,
    handlers=[
        logging.FileHandler(log_filename, mode='w'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
print(f"📝 Session log: {log_filename}")

## 2. File Upload & Configuration

Upload your CSV files and configure processing options.

In [None]:
if ENVIRONMENT_READY and ETL_LOGIC_AVAILABLE:
    # --- Upload control -------------------------------------------------
    file_uploader = widgets.FileUpload(
        description='Upload CSV Files',
        accept='.csv',
        multiple=True,
        layout=widgets.Layout(width='100%'),
        style={'description_width': 'initial'},
    )

    # --- Name used for the exported master file -------------------------
    output_name = widgets.Text(
        description='Master file name:',
        value='master_job_data',
        layout=widgets.Layout(width='300px'),
        style={'description_width': 'initial'},
    )

    # --- BigQuery settings (only built when the client library imported) -
    if BIGQUERY_AVAILABLE:
        enable_bigquery = widgets.Checkbox(
            description='Upload to BigQuery',
            value=False,
            style={'description_width': 'initial'},
        )

        bq_project = widgets.Text(
            description='GCP Project ID:',
            value='',
            layout=widgets.Layout(width='300px'),
            style={'description_width': 'initial'},
        )

        bq_dataset = widgets.Text(
            description='Dataset Name:',
            value='job_data',
            layout=widgets.Layout(width='300px'),
            style={'description_width': 'initial'},
        )

        bq_table = widgets.Text(
            description='Table Name:',
            value='cleaned_jobs',
            layout=widgets.Layout(width='300px'),
            style={'description_width': 'initial'},
        )

    # --- Processing options ---------------------------------------------
    handle_schema_mismatch = widgets.Dropdown(
        description='Schema handling:',
        options=['Auto-align', 'Strict validation', 'Skip mismatched'],
        value='Auto-align',
        style={'description_width': 'initial'},
    )

    merge_strategy = widgets.Dropdown(
        description='Merge strategy:',
        options=[
            'Outer join (keep all records)',
            'Inner join (matching records only)',
            'No merge (separate files)',
        ],
        value='Outer join (keep all records)',
        style={'description_width': 'initial'},
    )

    # --- Render everything in reading order ------------------------------
    display(HTML('<h3>📁 File Upload</h3>'))
    display(file_uploader)

    display(HTML('<h3>⚙️ Configuration</h3>'))
    config_box = widgets.VBox([
        widgets.HBox([
            widgets.VBox([output_name, handle_schema_mismatch]),
            widgets.VBox([merge_strategy]),
        ]),
    ])
    display(config_box)

    if BIGQUERY_AVAILABLE:
        display(HTML('<h3>☁️ BigQuery Options (Optional)</h3>'))
        bq_box = widgets.VBox([
            enable_bigquery,
            widgets.HBox([bq_project, bq_dataset, bq_table]),
        ])
        display(bq_box)

    # Output area reserved for status messages.
    status_output = widgets.Output()
    display(HTML('<h3>📊 Processing Status</h3>'))
    display(status_output)
else:
    # Setup failed in the first cell: show a banner instead of the controls.
    display(HTML('<div style="background-color: #ffebee; padding: 15px; border-radius: 5px; border-left: 4px solid #f44336;"><strong>⚠️ Cannot proceed:</strong> Environment setup failed or ETL logic unavailable. Please resolve issues above.</div>'))

## 3. Schema Analysis & Validation

Analyzing uploaded files for schema consistency and data quality.

In [None]:
def analyze_file_schemas(uploaded_files):
    """Inspect each uploaded file and collect its schema and basic file facts.

    Args:
        uploaded_files: The value of the ``FileUpload`` widget. Both shapes are
            accepted: a mapping ``{file_name: {'content': ...}}`` (ipywidgets 7)
            or a sequence of dicts that each carry ``'name'`` and ``'content'``
            (ipywidgets 8 and later).

    Returns:
        A ``(schemas, file_info)`` tuple of dicts keyed by file name.
        ``schemas[name]`` holds ``columns``, ``dtypes``, ``row_count_sample``
        (number of rows in the sample, at most 5) and the raw ``assessment``.
        ``file_info[name]`` holds ``size_mb``, ``encoding``, ``delimiter`` and
        ``issues``. If anything fails for a file, both entries are
        ``{'error': <message>}`` instead.
    """
    # Normalise the two FileUpload value shapes into (name, entry) pairs.
    if isinstance(uploaded_files, dict):
        entries = list(uploaded_files.items())
    else:
        entries = [(item['name'], item) for item in uploaded_files]

    schemas = {}
    file_info = {}

    for file_name, file_info_dict in entries:
        try:
            # Newer ipywidgets hand over a memoryview; bytes() is a no-op for bytes.
            content = bytes(file_info_dict['content'])

            # Quick assessment supplies the delimiter/encoding guesses used below.
            assessment = assess_raw_data(content, file_name)
            delimiter = assessment.get('likely_delimiter', ',')
            encoding = assessment.get('working_encoding', 'utf-8')

            # Only the first few rows are needed to learn the column layout.
            df_sample = pd.read_csv(
                io.BytesIO(content),
                delimiter=delimiter,
                encoding=encoding,
                nrows=5
            )

            schemas[file_name] = {
                'columns': list(df_sample.columns),
                'dtypes': df_sample.dtypes.to_dict(),
                'row_count_sample': len(df_sample),
                'assessment': assessment
            }

            file_info[file_name] = {
                'size_mb': len(content) / (1024 * 1024),
                'encoding': encoding,
                'delimiter': delimiter,
                'issues': assessment.get('issues_found', [])
            }

        except Exception as e:
            # One unreadable file must not stop analysis of the others.
            schemas[file_name] = {'error': str(e)}
            file_info[file_name] = {'error': str(e)}

    return schemas, file_info

def check_schema_compatibility(schemas, min_common_columns=3):
    """Decide whether the analysed files share enough columns to be merged.

    Args:
        schemas: Mapping of file name to schema dict. Entries without a
            ``'columns'`` key (for example ``{'error': ...}``) are ignored.
        min_common_columns: Minimum number of columns that every readable
            file must have in common. Defaults to 3.

    Returns:
        ``(compatible, issues)`` where ``compatible`` is True when no issue
        was found and ``issues`` is a list of human-readable messages.
    """
    # Nothing to compare with zero or one file.
    if len(schemas) <= 1:
        return True, []

    column_sets = [
        set(schema['columns'])
        for schema in schemas.values()
        if 'columns' in schema
    ]

    compatibility_issues = []

    # Comparison only makes sense when at least two files were readable.
    if len(column_sets) > 1:
        common_columns = set.intersection(*column_sets)
        if len(common_columns) < min_common_columns:
            compatibility_issues.append(
                f"Files have very few common columns ({len(common_columns)})"
            )

    return not compatibility_issues, compatibility_issues

# Controls for the schema-analysis step: an output area plus its trigger button.
schema_output = widgets.Output()

analyze_button = widgets.Button(
    layout=widgets.Layout(width='200px'),
    button_style='info',
    description='🔍 Analyze Schemas',
)


def on_analyze_click(b):
    """Button callback: analyse the uploaded files and render the findings.

    Shows a per-file table and a compatibility banner inside ``schema_output``
    and stores the results in the module-level ``schema_analysis_results``.
    """
    global schema_analysis_results

    with schema_output:
        clear_output()

        # Guard: there is nothing to analyse before an upload.
        if not file_uploader.value:
            display(HTML('<div style="background-color: #fff3e0; padding: 10px; border-radius: 5px;"><strong>ℹ️ No files uploaded yet.</strong> Please upload CSV files first.</div>'))
            return

        display(HTML('<div style="background-color: #e8f5e8; padding: 10px; border-radius: 5px;"><strong>🔍 Analyzing schemas...</strong></div>'))

        try:
            schemas, file_info = analyze_file_schemas(file_uploader.value)
            compatible, issues = check_schema_compatibility(schemas)

            display(HTML('<h4>📋 File Information</h4>'))

            # One table row per file: name, status, column count, size.
            table_rows = []
            for file_name, schema in schemas.items():
                info = file_info.get(file_name, {})

                if 'error' in schema:
                    row = [file_name, f"❌ Error: {schema['error']}", "N/A", "N/A"]
                else:
                    if info.get('issues'):
                        status = f"⚠️ {len(info['issues'])} issues"
                    else:
                        status = "✅ Valid"
                    row = [
                        file_name,
                        status,
                        f"{len(schema['columns'])} columns",
                        f"{info.get('size_mb', 0):.2f} MB",
                    ]

                table_rows.append(row)

            display(pd.DataFrame(table_rows, columns=['File Name', 'Status', 'Columns', 'Size']))

            # Compatibility banner: red with the issue list, otherwise green.
            if not compatible:
                display(HTML(f'<div style="background-color: #ffebee; padding: 10px; border-radius: 5px; margin-top: 10px;"><strong>⚠️ Schema Issues:</strong><br>{"<br>".join(issues)}</div>'))
            else:
                display(HTML('<div style="background-color: #e8f5e8; padding: 10px; border-radius: 5px; margin-top: 10px;"><strong>✅ Schema Compatibility:</strong> Files are compatible for merging</div>'))

            # Keep the results around for the following steps.
            schema_analysis_results = {
                'schemas': schemas,
                'file_info': file_info,
                'compatible': compatible,
                'issues': issues,
            }

        except Exception as e:
            display(HTML(f'<div style="background-color: #ffebee; padding: 10px; border-radius: 5px;"><strong>❌ Analysis Error:</strong> {str(e)}</div>'))


analyze_button.on_click(on_analyze_click)

# Only show the controls when the first cell reported a usable environment.
if ENVIRONMENT_READY and ETL_LOGIC_AVAILABLE:
    display(analyze_button)
    display(schema_output)

## 4. Data Processing & Cleaning

Processing all uploaded files with real-time progress tracking.

In [None]:
def process_files_with_progress(uploaded_files, config):
    """Assess and clean every uploaded file while updating two progress bars.

    Args:
        uploaded_files: The value of the ``FileUpload`` widget. Both shapes are
            accepted: a mapping ``{file_name: {'content': ...}}`` (ipywidgets 7)
            or a sequence of dicts that each carry ``'name'`` and ``'content'``
            (ipywidgets 8 and later).
        config: Processing configuration. Accepted for interface
            compatibility; this function does not read it.

    Returns:
        Dict keyed by file name. Successful entries contain ``success=True``,
        ``dataframe``, ``report``, ``assessment``, ``original_rows`` (row count
        of the cleaned frame) and ``job_ids_found``. Failed entries contain
        ``success=False`` and an ``error`` message.
    """
    # Normalise the two FileUpload value shapes into (name, entry) pairs.
    if isinstance(uploaded_files, dict):
        entries = list(uploaded_files.items())
    else:
        entries = [(item['name'], item) for item in uploaded_files]

    results = {}

    # Create progress widgets
    overall_progress = widgets.IntProgress(
        value=0,
        min=0,
        max=len(entries),
        description='Overall:',
        bar_style='info',
        style={'bar_color': '#2196F3'},
        layout=widgets.Layout(width='100%')
    )

    current_file_label = widgets.HTML(value="<b>Ready to start processing...</b>")
    file_progress = widgets.IntProgress(
        value=0,
        min=0,
        max=100,
        description='Current file:',
        bar_style='info',
        style={'bar_color': '#4CAF50'},
        layout=widgets.Layout(width='100%')
    )

    progress_display = widgets.VBox([
        widgets.HTML("<h4>📈 Processing Progress</h4>"),
        overall_progress,
        current_file_label,
        file_progress
    ])

    display(progress_display)

    # Process each file
    for i, (file_name, file_info_dict) in enumerate(entries):
        current_file_label.value = f"<b>Processing: {file_name}</b>"
        file_progress.value = 0

        try:
            # Newer ipywidgets hand over a memoryview; bytes() is a no-op for bytes.
            content = bytes(file_info_dict['content'])

            # Step 1: assessment
            file_progress.value = 20
            assessment = assess_raw_data(content, file_name)

            # Step 2: cleaning
            file_progress.value = 50
            cleaned_df, report_df = clean_csv_data(content, file_name, assessment)
            file_progress.value = 100

            if cleaned_df is not None:
                # Add source file column for traceability
                cleaned_df['source_file'] = file_name

                results[file_name] = {
                    'success': True,
                    'dataframe': cleaned_df,
                    'report': report_df,
                    'assessment': assessment,
                    'original_rows': len(cleaned_df),
                    'job_ids_found': cleaned_df['job_id'].notna().sum() if 'job_id' in cleaned_df.columns else 0
                }
            else:
                results[file_name] = {
                    'success': False,
                    'error': 'Failed to process file',
                    'assessment': assessment
                }

        except Exception as e:
            # Record the failure and carry on with the remaining files.
            results[file_name] = {
                'success': False,
                'error': str(e)
            }

        # Update overall progress
        overall_progress.value = i + 1

    current_file_label.value = f"<b>✅ Processing complete! Processed {len(entries)} files.</b>"
    overall_progress.bar_style = 'success'
    file_progress.bar_style = 'success'

    return results

# Processing button and controls
process_button = widgets.Button(
    description='🚀 Process All Files',
    button_style='success',
    layout=widgets.Layout(width='200px')
)

processing_output = widgets.Output()

def on_process_click(b):
    """Button callback: clean all uploaded files and show a summary.

    Collects the current widget settings into a config dict, runs
    ``process_files_with_progress`` and stores its return value in the
    module-level ``processing_results`` for the later steps. Output is
    rendered inside ``processing_output``.
    """
    global processing_results

    with processing_output:
        clear_output()

        if not file_uploader.value:
            display(HTML('<div style="background-color: #fff3e0; padding: 10px; border-radius: 5px;"><strong>ℹ️ No files to process.</strong> Please upload CSV files first.</div>'))
            return

        # Get configuration
        config = {
            'output_name': output_name.value,
            'schema_handling': handle_schema_mismatch.value,
            'merge_strategy': merge_strategy.value
        }

        # The BigQuery widgets are module globals that exist only when the
        # configuration cell built them, so check globals(): locals() never
        # contains them inside this function.
        if BIGQUERY_AVAILABLE and 'enable_bigquery' in globals():
            config['bigquery'] = {
                'enabled': enable_bigquery.value,
                'project': bq_project.value,
                'dataset': bq_dataset.value,
                'table': bq_table.value
            }

        # Start processing
        logger.info(f"Starting batch processing of {len(file_uploader.value)} files")

        processing_results = process_files_with_progress(file_uploader.value, config)

        # Display summary
        successful_files = [name for name, result in processing_results.items() if result.get('success', False)]
        failed_files = [name for name, result in processing_results.items() if not result.get('success', False)]

        display(HTML('<h4>📊 Processing Summary</h4>'))

        if successful_files:
            total_rows = sum(processing_results[name]['original_rows'] for name in successful_files)
            total_job_ids = sum(processing_results[name]['job_ids_found'] for name in successful_files)

            display(HTML(f'''
            <div style="background-color: #e8f5e8; padding: 15px; border-radius: 5px; margin: 10px 0;">
                <strong>✅ Successfully processed {len(successful_files)} files:</strong><br>
                📊 Total rows: {total_rows:,}<br>
                🆔 Job IDs extracted: {total_job_ids:,}<br>
                📁 Files: {", ".join(successful_files)}
            </div>
            '''))

        if failed_files:
            display(HTML(f'''
            <div style="background-color: #ffebee; padding: 15px; border-radius: 5px; margin: 10px 0;">
                <strong>❌ Failed to process {len(failed_files)} files:</strong><br>
                📁 Files: {", ".join(failed_files)}
            </div>
            '''))

            # One line per failed file with the recorded error message.
            for file_name in failed_files:
                error_msg = processing_results[file_name].get('error', 'Unknown error')
                display(HTML(f'<div style="margin-left: 20px; color: #d32f2f;"><strong>{file_name}:</strong> {error_msg}</div>'))

process_button.on_click(on_process_click)

if ENVIRONMENT_READY and ETL_LOGIC_AVAILABLE:
    display(process_button)
    display(processing_output)

## 5. Master File Creation & Merging

Creating the consolidated master dataset from all processed files.

In [None]:
def _merge_on_job_id(left, right, how):
    """Join two frames on ``job_id`` and coalesce the columns they share."""
    shared = [
        col for col in right.columns
        if col != 'job_id' and col in left.columns
    ]
    merged = pd.merge(left, right, on='job_id', how=how, suffixes=('', '_dup'))

    # Keep the left value where present, otherwise fall back to the right
    # one, so rows that exist only in `right` keep their data.
    for col in shared:
        merged[col] = merged[col].combine_first(merged[f"{col}_dup"])

    return merged.drop(columns=[f"{col}_dup" for col in shared])


def create_master_dataset(processing_results, merge_strategy):
    """Combine the cleaned per-file frames into one master frame.

    Args:
        processing_results: Mapping of file name to a result dict. Only
            entries with a truthy ``'success'`` and a non-empty
            ``'dataframe'`` are used.
        merge_strategy: One of the merge-strategy dropdown labels.
            ``'Outer join (keep all records)'`` and
            ``'Inner join (matching records only)'`` join the frames on
            ``job_id``; any other value concatenates them row-wise.

    Returns:
        ``(master_df, message)``. ``master_df`` is ``None`` when there is
        nothing to merge or the merge raised; ``message`` then explains why.
        A single usable frame is returned as-is without merging.
    """
    # Get successful dataframes
    successful_dfs = [
        result['dataframe']
        for result in processing_results.values()
        if result.get('success')
        and result.get('dataframe') is not None
        and not result['dataframe'].empty
    ]

    if not successful_dfs:
        return None, "No valid dataframes to merge"

    if len(successful_dfs) == 1:
        return successful_dfs[0], "Single file - no merging needed"

    # Dropdown label -> (pandas join type, text used in the summary message).
    join_modes = {
        'Outer join (keep all records)': ('outer', 'Outer join merge'),
        'Inner join (matching records only)': ('inner', 'Inner join merge'),
    }

    try:
        if merge_strategy in join_modes:
            how, label = join_modes[merge_strategy]
            master_df = successful_dfs[0]
            for df in successful_dfs[1:]:
                master_df = _merge_on_job_id(master_df, df, how)
            merge_info = f"{label} of {len(successful_dfs)} files"
        else:
            # No merge: stack all frames on top of each other.
            master_df = pd.concat(successful_dfs, ignore_index=True)
            merge_info = f"Concatenated {len(successful_dfs)} files without merging"

        return master_df, merge_info

    except Exception as e:
        # For example a frame without a job_id column cannot be joined.
        return None, f"Merge failed: {str(e)}"

def generate_master_report(master_df, processing_results):
    """Summarise the master frame as a nested dict of report sections.

    Always returns a ``'Dataset Overview'`` section. Adds
    ``'Records per Source File'`` when a ``source_file`` column exists and
    ``'Data Quality Alerts'`` when any column has more than 10% nulls.
    ``processing_results`` is accepted but not read.
    """
    row_total = len(master_df)
    has_job_id = 'job_id' in master_df.columns
    has_source = 'source_file' in master_df.columns

    overview = {
        'Total Rows': f"{row_total:,}",
        'Total Columns': len(master_df.columns),
    }
    if has_job_id:
        overview['Unique Job IDs'] = f"{master_df['job_id'].nunique():,}"
    else:
        overview['Unique Job IDs'] = 'N/A'
    if has_source:
        overview['Source Files'] = master_df['source_file'].nunique()
    else:
        overview['Source Files'] = 'N/A'

    report_data = {'Dataset Overview': overview}

    # Row counts broken down by originating file.
    if has_source:
        report_data['Records per Source File'] = master_df['source_file'].value_counts().to_dict()

    # Flag columns whose null count exceeds a tenth of the rows.
    null_counts = master_df.isnull().sum()
    sparse_columns = null_counts[null_counts > row_total * 0.1]

    if not sparse_columns.empty:
        report_data['Data Quality Alerts'] = {
            'Columns with >10% missing data': sparse_columns.to_dict()
        }

    return report_data

# Controls for the merge step: an output area plus its trigger button.
master_output = widgets.Output()

create_master_button = widgets.Button(
    layout=widgets.Layout(width='220px'),
    button_style='warning',
    description='🎯 Create Master Dataset',
)


def on_create_master_click(b):
    """Button callback: merge the processed files and show the outcome.

    Reads the module-level ``processing_results``, builds the master frame
    with the selected merge strategy, stores frame, report and merge message
    in the module-level ``master_dataset`` and renders a report and preview
    inside ``master_output``.
    """
    global master_dataset

    with master_output:
        clear_output()

        # Guard: the processing step must have run at least once.
        if 'processing_results' not in globals():
            display(HTML('<div style="background-color: #fff3e0; padding: 10px; border-radius: 5px;"><strong>ℹ️ No processed data available.</strong> Please process files first.</div>'))
            return

        display(HTML('<div style="background-color: #e3f2fd; padding: 10px; border-radius: 5px;"><strong>🎯 Creating master dataset...</strong></div>'))

        master_df, merge_info = create_master_dataset(processing_results, merge_strategy.value)

        # A missing frame means the merge step reported a problem.
        if master_df is None:
            display(HTML(f'<div style="background-color: #ffebee; padding: 10px; border-radius: 5px;"><strong>❌ Failed to create master dataset:</strong> {merge_info}</div>'))
            return

        report = generate_master_report(master_df, processing_results)

        # Hand the result over to the export step.
        master_dataset = {
            'dataframe': master_df,
            'report': report,
            'merge_info': merge_info,
        }

        display(HTML(f'''
        <div style="background-color: #e8f5e8; padding: 15px; border-radius: 5px; margin: 10px 0;">
            <strong>✅ Master dataset created successfully!</strong><br>
            📊 {merge_info}<br>
            📈 Final dataset: {len(master_df):,} rows × {len(master_df.columns)} columns
        </div>
        '''))

        # Render each report section: heading first, then its entries.
        display(HTML('<h4>📋 Master Dataset Report</h4>'))
        for heading, body in report.items():
            display(HTML(f'<h5>{heading}</h5>'))
            if not isinstance(body, dict):
                display(HTML(f'{body}<br>'))
                continue
            for label, entry in body.items():
                display(HTML(f'<strong>{label}:</strong> {entry}<br>'))

        display(HTML('<h4>👀 Data Preview (First 5 Rows)</h4>'))
        display(master_df.head())

        logger.info(f"Master dataset created: {len(master_df)} rows, {len(master_df.columns)} columns")


create_master_button.on_click(on_create_master_click)

# Only show the controls when the first cell reported a usable environment.
if ENVIRONMENT_READY and ETL_LOGIC_AVAILABLE:
    display(create_master_button)
    display(master_output)

## 6. Export & Download

Save your master dataset and reports.

In [None]:
def save_master_dataset(master_data, output_name, enable_bq=False, bq_config=None):
    """Write the master dataset and its report to disk, optionally to BigQuery.

    Args:
        master_data: Dict with ``'dataframe'``, ``'report'`` (mapping of
            section name to a dict or a plain value) and ``'merge_info'``.
        output_name: Prefix for the generated file names; a timestamp and
            extension are appended.
        enable_bq: When True (and BigQuery is available and ``bq_config`` is
            given) the frame is also loaded into BigQuery.
        bq_config: Dict with ``'project'``, ``'dataset'`` and ``'table'``.

    Returns:
        Dict with ``csv_file``, ``report_file`` and ``csv_size_mb``. When a
        BigQuery upload was attempted it also has ``bigquery_success`` plus
        either ``bigquery_table`` or ``bigquery_error``.
    """
    df = master_data['dataframe']

    # One timestamp for both files so the CSV and its report always pair up.
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    csv_filename = f"{output_name}_{stamp}.csv"
    report_filename = f"{output_name}_report_{stamp}.txt"

    # Save to CSV
    df.to_csv(csv_filename, index=False)

    # Build the report text first, then write it in one go.
    lines = [
        "MASTER DATASET PROCESSING REPORT\n",
        "=" * 50 + "\n\n",
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
        f"Merge Info: {master_data['merge_info']}\n\n",
    ]
    for section, data in master_data['report'].items():
        lines.append(f"{section}:\n")
        lines.append("-" * len(section) + "\n")
        if isinstance(data, dict):
            lines.extend(f"  {key}: {value}\n" for key, value in data.items())
        else:
            lines.append(f"  {data}\n")
        lines.append("\n")

    # Explicit UTF-8: report values can contain non-ASCII text (for example
    # file names), and the platform default encoding may not cover it.
    with open(report_filename, 'w', encoding='utf-8') as f:
        f.writelines(lines)

    results = {
        'csv_file': csv_filename,
        'report_file': report_filename,
        'csv_size_mb': os.path.getsize(csv_filename) / (1024 * 1024)
    }

    # BigQuery upload if enabled
    if enable_bq and BIGQUERY_AVAILABLE and bq_config:
        try:
            client = bigquery.Client(project=bq_config['project'])

            # Create dataset if it doesn't exist
            dataset_id = f"{bq_config['project']}.{bq_config['dataset']}"
            try:
                client.get_dataset(dataset_id)
            except Exception:
                # Lookup failed, so try to create it. A real problem
                # (credentials, permissions) will surface from create_dataset
                # and be reported by the outer handler.
                dataset = bigquery.Dataset(dataset_id)
                client.create_dataset(dataset)

            # Upload to BigQuery
            table_id = f"{dataset_id}.{bq_config['table']}"

            job_config = bigquery.LoadJobConfig(
                write_disposition="WRITE_TRUNCATE",  # Overwrite table
            )

            job = client.load_table_from_dataframe(df, table_id, job_config=job_config)
            job.result()  # Wait for completion

            results['bigquery_success'] = True
            results['bigquery_table'] = table_id

        except Exception as e:
            # The local files are already written; report the upload failure
            # to the caller instead of raising.
            results['bigquery_success'] = False
            results['bigquery_error'] = str(e)

    return results

# Export controls
export_button = widgets.Button(
    description='💾 Export Master Dataset',
    button_style='success',
    layout=widgets.Layout(width='220px')
)

download_links = widgets.HTML()
export_output = widgets.Output()

def on_export_click(b):
    """Button callback: export the master dataset and show download links.

    Reads the module-level ``master_dataset``, writes the CSV and report via
    ``save_master_dataset`` (including the BigQuery upload when it is enabled
    and a project ID is set) and renders the outcome inside ``export_output``.
    """
    with export_output:
        clear_output()

        if 'master_dataset' not in globals():
            display(HTML('<div style="background-color: #fff3e0; padding: 10px; border-radius: 5px;"><strong>ℹ️ No master dataset available.</strong> Please create master dataset first.</div>'))
            return

        display(HTML('<div style="background-color: #e3f2fd; padding: 10px; border-radius: 5px;"><strong>💾 Exporting master dataset...</strong></div>'))

        # Get BigQuery config if available
        bq_config = None
        enable_bq = False

        # The BigQuery widgets are module globals that exist only when the
        # configuration cell built them, so check globals(): locals() never
        # contains them inside this function.
        if BIGQUERY_AVAILABLE and 'enable_bigquery' in globals() and enable_bigquery.value:
            if bq_project.value:
                enable_bq = True
                bq_config = {
                    'project': bq_project.value,
                    'dataset': bq_dataset.value,
                    'table': bq_table.value
                }
            else:
                # Tell the user why the requested upload is not happening.
                display(HTML('<div style="background-color: #fff3e0; padding: 10px; border-radius: 5px;"><strong>⚠️ BigQuery upload skipped:</strong> no GCP Project ID was provided.</div>'))

        # Export
        try:
            results = save_master_dataset(
                master_dataset,
                output_name.value,
                enable_bq=enable_bq,
                bq_config=bq_config
            )

            # Display success message
            display(HTML(f'''
            <div style="background-color: #e8f5e8; padding: 15px; border-radius: 5px; margin: 10px 0;">
                <strong>✅ Export completed successfully!</strong><br><br>
                📄 <strong>CSV File:</strong> {results['csv_file']} ({results['csv_size_mb']:.2f} MB)<br>
                📋 <strong>Report:</strong> {results['report_file']}<br>
            </div>
            '''))

            # BigQuery results
            if enable_bq:
                if results.get('bigquery_success'):
                    display(HTML(f'''
                    <div style="background-color: #e3f2fd; padding: 15px; border-radius: 5px; margin: 10px 0;">
                        <strong>☁️ BigQuery Upload Successful!</strong><br>
                        📊 Table: {results['bigquery_table']}<br>
                        🔗 View in BigQuery console
                    </div>
                    '''))
                else:
                    display(HTML(f'''
                    <div style="background-color: #fff3e0; padding: 15px; border-radius: 5px; margin: 10px 0;">
                        <strong>⚠️ BigQuery Upload Failed:</strong><br>
                        {results.get('bigquery_error', 'Unknown error')}
                    </div>
                    '''))

            # Update download links
            download_links.value = f'''
            <div style="background-color: #f5f5f5; padding: 15px; border-radius: 5px; margin: 10px 0;">
                <h4>📥 Download Files</h4>
                🔗 <a href="{results['csv_file']}" download>Download Master Dataset CSV</a><br>
                🔗 <a href="{results['report_file']}" download>Download Processing Report</a>
            </div>
            '''

            logger.info(f"Export completed: {results['csv_file']} ({results['csv_size_mb']:.2f} MB)")

        except Exception as e:
            display(HTML(f'<div style="background-color: #ffebee; padding: 10px; border-radius: 5px;"><strong>❌ Export failed:</strong> {str(e)}</div>'))

export_button.on_click(on_export_click)

if ENVIRONMENT_READY and ETL_LOGIC_AVAILABLE:
    display(export_button)
    display(download_links)
    display(export_output)

# Final summary
display(HTML('''
<div style="background-color: #f8f9fa; padding: 20px; border-radius: 5px; margin: 20px 0; border-left: 4px solid #28a745;">
    <h3>🎉 Processing Complete!</h3>
    <p><strong>Your master dataset is ready!</strong> The consolidated file contains all job data from your uploaded CSV files, cleaned and standardized for analysis.</p>
    
    <h4>📁 Files Generated:</h4>
    <ul>
        <li><strong>Master CSV:</strong> Cleaned and merged dataset ready for analysis</li>
        <li><strong>Processing Report:</strong> Detailed information about data quality and transformations</li>
        <li><strong>Session Log:</strong> Technical log of all processing steps</li>
    </ul>
    
    <h4>🔄 Next Steps:</h4>
    <ul>
        <li>Review the processing report for data quality insights</li>
        <li>Import the master CSV into your preferred analysis tool</li>
        <li>Set up automated processing for regular data updates</li>
    </ul>
</div>
'''))