In [29]:
# !pip install pandas scikit-learn ipywidgets

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import ipywidgets as widgets
from IPython.display import display, clear_output
import io


In [30]:
# Output area that the upload callback writes its status and preview into.
upload_output = widgets.Output()

# File picker restricted to a single .csv file.
upload = widgets.FileUpload(
    accept='.csv',
    multiple=False,
)

def handle_upload(change):
    """Load the uploaded CSV into the notebook-level ``df`` / ``original_df``.

    Registered as an observer on the ``upload`` widget's ``value`` trait.
    On success ``df`` holds the parsed frame and ``original_df`` an
    independent copy of it; on failure (nothing selected, unreadable CSV)
    both globals are left untouched and a message is shown instead.

    Parameters
    ----------
    change : dict
        Trait-change notification passed by ``observe``. It is not read;
        the current selection is taken from ``upload.value``.
    """
    global df, original_df
    upload_output.clear_output()

    # ipywidgets 7 exposes the selection as {filename: file_dict}, while
    # ipywidgets 8 exposes a tuple of file dicts. Normalise both to a list.
    uploaded = upload.value
    if isinstance(uploaded, dict):
        files = list(uploaded.values())
    else:
        files = list(uploaded or ())

    with upload_output:
        if not files:
            # The trait also changes when the selection is cleared.
            print("ℹ️ No file selected.")
            return

        # bytes in ipywidgets 7, memoryview in 8; BytesIO accepts both.
        content = files[0]['content']
        try:
            loaded = pd.read_csv(io.BytesIO(content), na_values='?')  # handle '?' as NaN
        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
            # Report inside the widget; an exception raised from a widget
            # callback is otherwise not shown to the user.
            print(f"❌ Could not read the uploaded file as CSV: {exc}")
            return

        # Publish only after a successful parse so a bad upload does not
        # clobber a previously loaded frame.
        df = loaded
        original_df = loaded.copy()
        print(f"✅ Loaded file with {df.shape[0]} rows and {df.shape[1]} columns.")
        display(df.head())

# Re-run the loader every time the widget's `value` trait changes.
upload.observe(handle_upload, names='value')

# Render heading, file picker and status area together.
display(
    widgets.HTML("<h3>📤 Upload your CSV file (adult.csv)</h3>"),
    upload,
    upload_output,
)


HTML(value='<h3>📤 Upload your CSV file (adult.csv)</h3>')

FileUpload(value={}, accept='.csv', description='Upload')

Output()

In [31]:
# One checkbox per optional preprocessing stage; all start enabled.
fill_missing = widgets.Checkbox(
    description='Fill Missing Values',
    value=True,
)
drop_insert_col = widgets.Checkbox(
    description='Drop/Insert Columns',
    value=True,
)
label_output = widgets.Checkbox(
    description='Label Output Column',
    value=True,
)
feature_scaling = widgets.Checkbox(
    description='Feature Scaling',
    value=True,
)
encode_categorical = widgets.Checkbox(
    description='Convert Categorical to Numeric',
    value=True,
)

# Heading followed by the toggles, stacked vertically.
options_box = widgets.VBox(
    children=[
        widgets.HTML("<h3>⚙️ Select Preprocessing Steps</h3>"),
        fill_missing,
        drop_insert_col,
        label_output,
        feature_scaling,
        encode_categorical,
    ]
)

display(options_box)


VBox(children=(HTML(value='<h3>⚙️ Select Preprocessing Steps</h3>'), Checkbox(value=True, description='Fill Mi…

In [32]:
# Status area for the preprocessing log, and the button that triggers the run.
process_output = widgets.Output()
process_button = widgets.Button(
    description='🚀 Run Preprocessing',
    button_style='success',
)

def _fill_missing_values(frame):
    """Return a copy of ``frame`` with missing values filled column by column.

    Numeric (non-boolean) columns are filled with their mean, all other
    columns with their most frequent value. A column with no observed value
    at all is left as-is because there is nothing to fill it with.
    """
    filled = frame.copy()
    for col in filled.columns:
        series = filled[col]
        if not series.isna().any():
            continue
        is_numeric = (pd.api.types.is_numeric_dtype(series)
                      and not pd.api.types.is_bool_dtype(series))
        if is_numeric:
            replacement = series.mean()
        else:
            modes = series.mode()
            # mode() is empty when every entry is missing.
            replacement = modes.iloc[0] if not modes.empty else None
        if replacement is None or pd.isna(replacement):
            continue
        # Assign the result back instead of `frame[col].fillna(..., inplace=True)`:
        # the chained in-place form acts on a temporary and does not update
        # the frame under pandas Copy-on-Write.
        filled[col] = series.fillna(replacement)
    return filled


def _text_columns(frame):
    """List, in frame order, the columns holding object or string data."""
    return [
        col for col in frame.columns
        if pd.api.types.is_object_dtype(frame[col])
        or pd.api.types.is_string_dtype(frame[col])
    ]


def run_preprocessing(b):
    """Run the selected preprocessing steps and publish ``df_processed``.

    Button callback. Starts from a fresh copy of ``original_df`` on every
    click, applies each stage whose checkbox is ticked (fill missing values,
    drop ``fnlwgt`` / add ``engagement_score``, label ``income``, scale
    numeric features, one-hot encode text columns), logs progress into
    ``process_output`` and stores the result in the global ``df_processed``.

    Parameters
    ----------
    b : widgets.Button
        The clicked button; not used.
    """
    global df_processed
    process_output.clear_output()

    with process_output:
        try:
            working = original_df.copy()
        except NameError:
            # Nothing uploaded yet; report it the same way export_file
            # reports a missing preprocessing run.
            print("❌ You must upload a CSV file first.")
            return

        print(f"🔍 Starting with {working.shape[0]} rows and {working.shape[1]} columns")

        if fill_missing.value:
            print("\n🧹 Filling missing values...")
            working = _fill_missing_values(working)
            print("✅ Missing values filled.")

        if drop_insert_col.value:
            print("\n🛠 Removing or inserting columns...")
            if 'fnlwgt' in working.columns:
                working = working.drop(columns=['fnlwgt'])
                print("🗑️ Dropped column: fnlwgt")
            if 'hours-per-week' in working.columns and 'education-num' in working.columns:
                working['engagement_score'] = working['hours-per-week'] * working['education-num']
                print("➕ Inserted column: engagement_score")

        label_col = None
        if label_output.value and 'income' in working.columns:
            print("\n🏷 Labeling output column 'income'...")
            le = LabelEncoder()
            working['income_label'] = le.fit_transform(working['income'])
            label_col = 'income_label'
            print("✅ Labeled as 'income_label'")

        if feature_scaling.value:
            print("\n📏 Scaling numeric features...")
            # The label created above holds class ids, not a feature:
            # standardising it would turn the ids into arbitrary floats.
            numeric_cols = [
                col for col in working.select_dtypes(include=np.number).columns
                if col != label_col
            ]
            if numeric_cols:
                scaler = StandardScaler()
                working[numeric_cols] = scaler.fit_transform(working[numeric_cols])
                print("✅ Numeric features scaled.")
            else:
                # fit_transform raises on a frame with zero columns.
                print("ℹ️ No numeric columns found.")

        if encode_categorical.value:
            print("\n🔄 Encoding categorical columns...")
            cat_cols = _text_columns(working)
            if len(cat_cols):
                working = pd.get_dummies(working, columns=cat_cols, drop_first=True)
                print(f"✅ Encoded columns: {list(cat_cols)}")
            else:
                print("ℹ️ No categorical columns found.")

        df_processed = working
        print("\n🎯 Final processed DataFrame:")
        display(df_processed.head())

# Hook the click handler up, then show the button above its log area.
process_button.on_click(run_preprocessing)
display(
    process_button,
    process_output,
)


Button(button_style='success', description='🚀 Run Preprocessing', style=ButtonStyle())

Output()

In [33]:
# Status area for the export step, and the button that triggers it.
export_output = widgets.Output()
export_button = widgets.Button(
    description='💾 Export Processed CSV',
    button_style='info',
)

def export_file(b):
    """Write ``df_processed`` to ``processed_adult.csv`` with a serial number.

    Button callback. Exports a copy of the global ``df_processed`` with a
    1-based ``serial_number`` column placed first; ``df_processed`` itself
    is not modified. Status and failures are reported in ``export_output``.

    Parameters
    ----------
    b : widgets.Button
        The clicked button; not used.
    """
    export_output.clear_output()

    with export_output:
        # Only the lookup is guarded, so an unrelated NameError further down
        # is not misreported as "preprocessing not run".
        try:
            processed = df_processed
        except NameError:
            print("❌ You must run preprocessing first.")
            return

        df_export = processed.copy()
        if 'serial_number' in df_export.columns:
            # insert() raises ValueError on a duplicate column name (e.g. when
            # a previously exported file was uploaded again); replace the
            # stale numbering with a fresh one instead.
            df_export = df_export.drop(columns=['serial_number'])
            print("ℹ️ Replaced existing 'serial_number' column.")
        # Add Serial Number before exporting
        df_export.insert(0, 'serial_number', range(1, len(df_export) + 1))

        try:
            df_export.to_csv("processed_adult.csv", index=False)
        except OSError as exc:
            # Surface write failures in the widget instead of losing them in
            # the callback.
            print(f"❌ Could not write 'processed_adult.csv': {exc}")
            return

        print("✅ File saved successfully as 'processed_adult.csv' with serial numbers!")
        display(df_export.head())

# Hook the click handler up, then show the button above its status area.
export_button.on_click(export_file)
display(
    export_button,
    export_output,
)


Button(button_style='info', description='💾 Export Processed CSV', style=ButtonStyle())

Output()