# Comprehensive and Interactive ETL Pipeline (Corrected)

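This notebook provides a user-friendly interface to a powerful and comprehensive data cleaning engine. The engine lives in `etl_pipeline_logic.py`, which must be in the same directory as this notebook.

**Instructions:** Run the cells in order from top to bottom. Each step uses variables created by the steps before it.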

### 1. Package Validation

In [None]:
import sys, importlib.util, importlib.metadata
from IPython.display import display, HTML
print("Performing package validation...")
# Versions this notebook was written against. They are treated as a recommended
# minimum: an older install produces a warning rather than a failure.
# NOTE(review): confirm these are meant as minimums and not exact pins.
required = {'pandas': '1.3.5', 'numpy': '1.21.5', 'ipywidgets': '7.6.5'}


def _numeric_version(text):
    """Return the leading numeric components of a version string as a tuple of ints.

    Parsing stops at the first dot-separated component that does not start with
    a digit, so '1.21.5' -> (1, 21, 5) and '2.0.0rc1' -> (2, 0, 0).
    """
    parts = []
    for piece in text.split('.'):
        digits = ''
        for ch in piece:
            if not ch.isdigit():
                break
            digits += ch
        if not digits:
            break
        parts.append(int(digits))
    return tuple(parts)


all_ok = True
for pkg, ver in required.items():
    if importlib.util.find_spec(pkg) is None:
        all_ok = False
        display(HTML(f'❌ <b>{pkg}</b>: Not found. Please install.'))
        continue
    try:
        installed = importlib.metadata.version(pkg)
    except importlib.metadata.PackageNotFoundError:
        # The module is importable, but no installed distribution metadata is
        # available for it, so the version cannot be reported.
        display(HTML(f"✅ <b>{pkg}</b>: Installed (Version: unknown)"))
        continue
    if _numeric_version(installed) < _numeric_version(ver):
        display(HTML(f"⚠️ <b>{pkg}</b>: Installed (Version: {installed}), older than the reference version {ver}"))
    else:
        display(HTML(f"✅ <b>{pkg}</b>: Installed (Version: {installed})"))
if not all_ok:
    display(HTML('<b style="color:red;">❌ Missing packages. Please install them.</b>'))

### 2. Setup Pipeline

In [None]:
print("Importing ETL logic and setting up environment...")
# Import everything the later cells need and record the outcome in
# `pipeline_ready`, which the upload cell checks before building its widget.
try:
    from etl_pipeline_logic import assess_raw_data, clean_csv_data, generate_cleaning_report
    import ipywidgets as widgets
    import pandas as pd
    import io
    from functools import reduce
except ImportError as e:
    # ImportError.name is the module that failed to import (it may be None).
    # Only point at the local file when that is actually what is missing;
    # otherwise name the package so the user installs the right thing.
    if e.name in (None, 'etl_pipeline_logic'):
        hint = "Ensure 'etl_pipeline_logic.py' is in the same directory."
    else:
        hint = f"Install the missing package '{e.name}' and re-run this cell."
    print(f"❌ Critical import error: {e}. {hint}")
    pipeline_ready = False
else:
    print("✅ ETL engine imported successfully.")
    pipeline_ready = True

### 3. Upload Data

In [None]:
# Only offer the upload widget when the setup cell reported success.
if not pipeline_ready:
    print("Cannot proceed. Pipeline setup failed in the previous step.")
else:
    # Multi-file CSV picker; later cells read the selection from `uploader.value`.
    uploader = widgets.FileUpload(
        accept='.csv',
        multiple=True,
        description='Upload CSVs',
    )
    display(uploader)

### 4. Assess Raw Data

In [None]:
def normalize_uploads(value):
    """Return ``[(file_name, content_bytes), ...]`` for a ``FileUpload.value``.

    Defined in this cell so the cell can run without helper definitions from
    other cells. Two shapes of ``value`` are handled:

    * ipywidgets 7.x: a dict mapping file name -> {'metadata': ..., 'content': bytes}
    * ipywidgets 8.x: a tuple of dicts carrying 'name' and 'content' keys
      (content is a memoryview)
    """
    if isinstance(value, dict):
        entries = [(name, info['content']) for name, info in value.items()]
    else:
        entries = [(item['name'], item['content']) for item in value]
    # bytes() turns a memoryview into plain bytes and leaves bytes unchanged.
    return [(name, bytes(content)) for name, content in entries]


# Run the assessment for every uploaded file and keep the results, keyed by
# file name, in `assessments` for the cleaning step.
if 'uploader' in locals() and uploader.value:
    assessments = {}
    for file_name, file_content in normalize_uploads(uploader.value):
        assessments[file_name] = assess_raw_data(file_content, file_name)
    print("\nAssessment phase complete. See 'etl_cleaning_log.txt' for detailed logs.")
else:
    print("Please upload files in the previous step before assessing.")

### 5. Clean and Merge Data

In [None]:
def normalize_uploads(value):
    """Return ``[(file_name, content_bytes), ...]`` for a ``FileUpload.value``.

    Defined in this cell so the cell can run without helper definitions from
    other cells. Two shapes of ``value`` are handled:

    * ipywidgets 7.x: a dict mapping file name -> {'metadata': ..., 'content': bytes}
    * ipywidgets 8.x: a tuple of dicts carrying 'name' and 'content' keys
      (content is a memoryview)
    """
    if isinstance(value, dict):
        entries = [(name, info['content']) for name, info in value.items()]
    else:
        entries = [(item['name'], item['content']) for item in value]
    # bytes() turns a memoryview into plain bytes and leaves bytes unchanged.
    return [(name, bytes(content)) for name, content in entries]


# Clean every uploaded file, keep the frames that carry at least one job_id,
# and outer-merge them on job_id into `final_cleaned_df`.
if 'uploader' not in locals() or not uploader.value:
    print("Please upload files first.")
elif 'assessments' not in locals():
    # The loop below looks up each file in `assessments`; without it the cell
    # would stop with a NameError instead of telling the user what to do.
    print("Please run the 'Assess Raw Data' step before cleaning.")
else:
    cleaned_dfs = []
    original_dfs = {}
    print("Starting data cleaning process...")

    for file_name, file_content in normalize_uploads(uploader.value):
        # Keep an untouched parse of the upload for reference. A file that
        # cannot be parsed here must not abort the remaining files.
        try:
            original_dfs[file_name] = pd.read_csv(io.BytesIO(file_content), on_bad_lines='skip')
        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as e:
            print(f"⚠️ Warning: Could not read raw '{file_name}' for reference: {e}")

        # None when the file was uploaded after the assessment step last ran.
        assessment = assessments.get(file_name)
        cleaned_df = clean_csv_data(file_content, file_name, assessment)

        if cleaned_df is not None and 'job_id' in cleaned_df.columns and cleaned_df['job_id'].notna().any():
            cleaned_dfs.append(cleaned_df)
        else:
            print(f"⚠️ Warning: Could not clean or find job_ids in '{file_name}'. It will be excluded from the merge.")

    final_cleaned_df = None
    if not cleaned_dfs:
        print("❌ No valid dataframes with job_ids were produced. Merge step skipped.")
    elif len(cleaned_dfs) == 1:
        final_cleaned_df = cleaned_dfs[0]
        print("✅ Only one valid dataframe. No merge needed.")
    else:
        try:
            # NOTE(review): non-key columns shared between files get pandas'
            # default _x/_y suffixes; with several files sharing column names
            # these may collide — confirm the inputs have distinct columns.
            final_cleaned_df = reduce(lambda left, right: pd.merge(left, right, on='job_id', how='outer'), cleaned_dfs)
            print(f"✅ Successfully merged {len(cleaned_dfs)} dataframes into a final dataset with shape {final_cleaned_df.shape}.")
        except Exception as e:
            print(f"❌ Error during merging: {e}")

### 6. Generate Final Report

In [None]:
# Print a short summary of the merged dataset, if the merge step produced one.
if locals().get('final_cleaned_df') is None:
    print("No cleaned data available to report on.")
else:
    # After a merge the result no longer maps back to a single uploaded file,
    # so the report is generated on the merged frame itself.
    print("\n--- FINAL REPORT ON MERGED DATA ---")
    # Simplified report: the merged frame is passed as both the first and the
    # second argument.
    report = generate_cleaning_report(final_cleaned_df, final_cleaned_df, 'merged_dataset')
    cleaned_summary = report['cleaned_dataset']
    print(f"  Final Rows: {cleaned_summary['rows']}")
    print(f"  Final Columns: {cleaned_summary['columns']}")
    print(f"  Memory Usage: {cleaned_summary['memory_usage_mb']:.2f} MB")
    print("\nSee 'etl_cleaning_log.txt' for full details.")