# Comprehensive and Interactive ETL Pipeline (Corrected)

This notebook provides a user-friendly interface to a powerful and comprehensive data cleaning engine.
**Instructions:** Run the cells in order from top to bottom. After the upload widget appears in step 3, select your CSV files before running step 4.

### 1. Package Validation

In [1]:
import sys, importlib.util, importlib.metadata
from IPython.display import display, HTML
# Package validation: confirm each package the notebook relies on can be imported
# and show the installed version next to the version listed in `required`.
print("Performing package validation...")
# NOTE(review): presumably the versions this notebook was developed against;
# they are displayed for comparison only and are not enforced.
required = {'pandas': '1.3.5', 'numpy': '1.21.5', 'ipywidgets': '7.6.5'}
all_ok = True
for pkg, ver in required.items():
    if importlib.util.find_spec(pkg) is None:
        all_ok = False
        display(HTML(f'❌ <b>{pkg}</b>: Not found. Please install.'))
        continue
    try:
        installed = importlib.metadata.version(pkg)
    except importlib.metadata.PackageNotFoundError:
        # The module is importable but ships no distribution metadata;
        # report that instead of aborting the whole validation cell.
        installed = 'unknown'
    display(HTML(f"✅ <b>{pkg}</b>: Installed (Version: {installed}; expected: {ver})"))
if not all_ok:
    display(HTML('<b style="color:red;">❌ Missing packages. Please install them.</b>'))

Performing package validation...


### 2. Setup Pipeline

In [7]:
print("Importing ETL logic and setting up environment...")
# Everything the later steps need is imported here. An ImportError from any of
# these lines marks the pipeline as not ready instead of surfacing a traceback.
try:
    from etl_pipeline_logic import assess_raw_data, clean_csv_data, generate_cleaning_report
    import ipywidgets as widgets
    import pandas as pd
    import io
    from functools import reduce
except ImportError as import_error:
    pipeline_ready = False
    print(f"❌ Critical import error: {import_error}. Ensure 'etl_pipeline_logic.py' is in the same directory.")
else:
    # Reached only when every import above succeeded.
    pipeline_ready = True
    print("✅ ETL engine imported successfully.")

Importing ETL logic and setting up environment...
✅ ETL engine imported successfully.


### 3. Upload Data

In [8]:
# Show the CSV upload widget, but only when the setup step reported success.
if not pipeline_ready:
    print("Cannot proceed. Pipeline setup failed in the previous step.")
else:
    uploader = widgets.FileUpload(accept='.csv', multiple=True, description='Upload CSVs')
    display(uploader)

FileUpload(value={}, accept='.csv', description='Upload CSVs', multiple=True)

### 4. Assess Raw Data

In [13]:
# Step 4: run the raw-data assessment on every uploaded CSV, then echo the log file.
if 'uploader' in locals() and uploader.value:
    assessments = {}
    # FileUpload.value differs between ipywidgets releases: 7.x exposes a dict keyed
    # by file name whose entries hold ['metadata']['name'] and ['content']; 8.x
    # exposes a tuple of dicts with 'name' and a memoryview 'content'. Accept both.
    uploaded_files = uploader.value.values() if isinstance(uploader.value, dict) else uploader.value
    for file_upload in uploaded_files:
        file_name = file_upload['metadata']['name'] if 'metadata' in file_upload else file_upload['name']
        # bytes() leaves 7.x content unchanged and materialises the 8.x memoryview.
        file_content = bytes(file_upload['content'])
        assessments[file_name] = assess_raw_data(file_content, file_name)
    print("\nAssessment phase complete. See 'etl_cleaning_log.txt' for detailed logs.")

    # Read the log file, closing the handle before anything is printed.
    try:
        with open('etl_cleaning_log.txt', 'r') as f:
            log_content = f.read()
    except FileNotFoundError:
        print("\nError: etl_cleaning_log.txt not found.")
    else:
        print("\n--- Content of etl_cleaning_log.txt ---")
        print(log_content)
        print("--- End of etl_cleaning_log.txt ---")

else:
    print("Please upload files in the previous step before assessing.")

2025-08-28 18:28:19,791 - INFO - ASSESSING: slc_data_parttwo_2024.csv
2025-08-28 18:28:19,806 - INFO - ASSESSING: slc_data_partone_2024.csv



Assessment phase complete. See 'etl_cleaning_log.txt' for detailed logs.

--- Content of etl_cleaning_log.txt ---
2025-08-28 18:27:48,354 - INFO - ASSESSING: slc_data_parttwo_2024.csv
2025-08-28 18:27:48,382 - INFO - ASSESSING: slc_data_partone_2024.csv
2025-08-28 18:28:19,791 - INFO - ASSESSING: slc_data_parttwo_2024.csv
2025-08-28 18:28:19,806 - INFO - ASSESSING: slc_data_partone_2024.csv

--- End of etl_cleaning_log.txt ---


### 5. Clean and Merge Data

In [15]:
# Step 5: clean every uploaded CSV, keep the frames that yielded job_ids, and
# outer-merge them on 'job_id' into final_cleaned_df (None when nothing is usable).
if 'uploader' in locals() and uploader.value:
    cleaned_dfs = []
    original_dfs = {}
    # Assessments are produced by step 4. Fall back to an empty mapping so this
    # cell does not raise NameError when that step was skipped; files without an
    # assessment are cleaned with assessment=None, as they already were before.
    available_assessments = assessments if 'assessments' in locals() else {}
    print("Starting data cleaning process...")

    # FileUpload.value is a dict keyed by file name in ipywidgets 7.x and a tuple
    # of dicts (with 'name' and a memoryview 'content') in 8.x. Accept both.
    uploaded_files = uploader.value.values() if isinstance(uploader.value, dict) else uploader.value
    for file_upload in uploaded_files:
        file_name = file_upload['metadata']['name'] if 'metadata' in file_upload else file_upload['name']
        # bytes() leaves 7.x content unchanged and materialises the 8.x memoryview.
        file_content = bytes(file_upload['content'])

        # Keep an untouched parse of the upload alongside the cleaned version.
        original_dfs[file_name] = pd.read_csv(io.BytesIO(file_content), on_bad_lines='skip')

        assessment = available_assessments.get(file_name)
        cleaned_df = clean_csv_data(file_content, file_name, assessment)

        # Only frames with at least one non-null job_id can take part in the merge.
        if cleaned_df is not None and 'job_id' in cleaned_df.columns and cleaned_df['job_id'].notna().any():
            cleaned_dfs.append(cleaned_df)
        else:
            print(f"⚠️ Warning: Could not clean or find job_ids in '{file_name}'. It will be excluded from the merge.")

    final_cleaned_df = None
    if not cleaned_dfs:
        print("❌ No valid dataframes with job_ids were produced. Merge step skipped.")
    elif len(cleaned_dfs) == 1:
        final_cleaned_df = cleaned_dfs[0]
        print("✅ Only one valid dataframe. No merge needed.")
    else:
        try:
            # Pairwise outer join on job_id, folded left to right over all frames.
            # NOTE(review): columns shared by several files other than job_id get
            # pandas' _x/_y suffixes here; confirm that is the intended result.
            final_cleaned_df = reduce(lambda left, right: pd.merge(left, right, on='job_id', how='outer'), cleaned_dfs)
            print(f"✅ Successfully merged {len(cleaned_dfs)} dataframes into a final dataset with shape {final_cleaned_df.shape}.")
        except Exception as e:
            # Best-effort: report the failure and leave final_cleaned_df as None.
            print(f"❌ Error during merging: {e}")
else:
    print("Please upload files first.")

2025-08-28 18:28:50,644 - INFO - STARTING DATA CLEANING FOR: slc_data_parttwo_2024.csv
2025-08-28 18:28:50,677 - INFO - Loaded 1213 raw rows.
2025-08-28 18:28:50,739 - INFO - Removing 3 likely summary/header rows from data.
2025-08-28 18:28:50,753 - INFO - Found URL column: 'full_url'. Extracting job_id.
2025-08-28 18:28:50,773 - INFO - Successfully extracted 1210 job IDs.


Starting data cleaning process...


2025-08-28 18:28:51,007 - INFO - CLEANING COMPLETE for slc_data_parttwo_2024.csv. Final shape: (1210, 28)
2025-08-28 18:28:51,031 - INFO - STARTING DATA CLEANING FOR: slc_data_partone_2024.csv
2025-08-28 18:28:51,056 - INFO - Loaded 632 raw rows.
2025-08-28 18:28:51,095 - INFO - Found URL column: 'full_url'. Extracting job_id.
2025-08-28 18:28:51,100 - INFO - Successfully extracted 632 job IDs.
2025-08-28 18:28:51,254 - INFO - CLEANING COMPLETE for slc_data_partone_2024.csv. Final shape: (632, 22)


✅ Successfully merged 2 dataframes into a final dataset with shape (1842, 49).


### 6. Generate Final Report

In [17]:
# Step 6: print a short summary of the merged dataset.
if 'final_cleaned_df' not in locals() or final_cleaned_df is None:
    print("No cleaned data available to report on.")
else:
    # After the merge there is no simple mapping back to original_dfs, so the
    # report is generated from the merged frame alone.
    print("\n--- FINAL REPORT ON MERGED DATA ---")
    # NOTE(review): the merged frame is passed for both DataFrame arguments, so any
    # before/after figure in the report presumably compares the frame with itself
    # (which would explain a rows_removed of 0) - confirm against generate_cleaning_report.
    report = generate_cleaning_report(final_cleaned_df, final_cleaned_df, 'merged_dataset')
    cleaned_summary = report['cleaned_dataset']
    print(f"  Final Rows: {cleaned_summary['rows']}")
    print(f"  Final Columns: {cleaned_summary['columns']}")
    # The improvements section is optional; print it only when it is present.
    if 'improvements' in report:
        if 'rows_removed' in report['improvements']:
            print(f"  Rows Removed During Cleaning: {report['improvements']['rows_removed']}")
    print("\nSee 'etl_cleaning_log.txt' for full details.")

2025-08-28 18:29:37,949 - INFO - GENERATING REPORT FOR: merged_dataset



--- FINAL REPORT ON MERGED DATA ---
  Final Rows: 1842
  Final Columns: 49
  Rows Removed During Cleaning: 0

See 'etl_cleaning_log.txt' for full details.


In [18]:
# Write the merged dataset to a CSV file in the working directory, without the index.
if 'final_cleaned_df' not in locals() or final_cleaned_df is None:
    print("\n❌ No cleaned and merged data available to save.")
else:
    output_filename = 'cleaned_merged_data.csv'
    final_cleaned_df.to_csv(output_filename, index=False)
    print(f"\n✅ Cleaned and merged data saved to '{output_filename}'")


✅ Cleaned and merged data saved to 'cleaned_merged_data.csv'
