In [1]:
# ==========================================================
# DATA TRANSFORMATION (Adult dataset)
# File: C:\Users\abhin\Downloads\adult_with_headers (1).csv
# Tasks:
#  - load data, handle missing values ('?')
#  - label-encode binary columns (sex, income)
#  - one-hot encode multi-category columns
#  - bin age
#  - standardize and normalize selected numeric columns
#  - save transformed outputs
# ==========================================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

# ---------- 1) Load dataset ----------
path = r"C:\Users\abhin\Downloads\adult_with_headers (1).csv"
df = pd.read_csv(path)
print("Loaded:", path)
print("Shape before cleaning:", df.shape)
display(df.head())

# ---------- 2) Missing values ----------
# Replace '?' with NaN (common in adult dataset)
df.replace("?", np.nan, inplace=True)
print("\nMissing values (per column) BEFORE dropna():")
print(df.isnull().sum())

# Simple strategy: drop rows with any missing values (safe for assignment/demo)
df_clean = df.dropna().reset_index(drop=True)
print("\nShape after dropping missing rows:", df_clean.shape)

# ---------- 3) Column types (sanity) ----------
print("\nColumns and dtypes:")
print(df_clean.dtypes)

# ---------- 4) Identify categorical and numeric columns ----------
categorical_cols = df_clean.select_dtypes(include="object").columns.tolist()
numeric_cols = df_clean.select_dtypes(include=["int64","float64"]).columns.tolist()

print("\nCategorical columns:", categorical_cols)
print("Numeric columns:", numeric_cols)

# ---------- 5) Label encode binary categorical columns ----------
# We'll label-encode 'sex' and 'income' (assuming income is binary like '<=50K' / '>50K')
label_cols = []
for col in ['sex', 'income']:
    if col in categorical_cols and df_clean[col].nunique() == 2:
        label_cols.append(col)

print("\nColumns chosen for label encoding (binary):", label_cols)
le = LabelEncoder()
for col in label_cols:
    df_clean[col] = le.fit_transform(df_clean[col].astype(str))

# Remove encoded columns from categorical list for one-hot stage
one_hot_cols = [c for c in categorical_cols if c not in label_cols]
print("Columns chosen for one-hot encoding:", one_hot_cols)

# ---------- 6) One-hot encoding for remaining categorical columns ----------
df_encoded = pd.get_dummies(df_clean, columns=one_hot_cols, drop_first=True)
print("\nShape after one-hot encoding:", df_encoded.shape)
display(df_encoded.head())

# ---------- 7) Age binning ----------
# create an age bin column using original 'age' (numeric)
if 'age' in df_encoded.columns:
    df_encoded['Age_bin'] = pd.cut(df_encoded['age'],
                                   bins=[0,25,50,100],
                                   labels=['Young','Middle-aged','Senior'])
    print("\nAge_bin value counts:")
    print(df_encoded['Age_bin'].value_counts())

# ---------- 8) Scaling and normalization ----------
scale_cols = [c for c in ['age','hours_per_week','fnlwgt'] if c in df_encoded.columns]
print("\nColumns to scale/normalize:", scale_cols)

# Standardization
scaler = StandardScaler()
df_scaled = df_encoded.copy()
if scale_cols:
    df_scaled[scale_cols] = scaler.fit_transform(df_scaled[scale_cols])
    print("\nAfter Standard Scaling (sample):")
    display(df_scaled[scale_cols].head())

# Min-Max Normalization
minmax = MinMaxScaler()
df_normalized = df_encoded.copy()
if scale_cols:
    df_normalized[scale_cols] = minmax.fit_transform(df_normalized[scale_cols])
    print("\nAfter Min-Max Normalization (sample):")
    display(df_normalized[scale_cols].head())

# ---------- 9) Save outputs ----------
out_base = r"C:\Users\abhin\Downloads\adult_transformed"
df_encoded.to_csv(out_base + "_onehot.csv", index=False)
df_scaled.to_csv(out_base + "_standard_scaled.csv", index=False)
df_normalized.to_csv(out_base + "_minmax_normalized.csv", index=False)

print("\nSaved files:")
print(out_base + "_onehot.csv")
print(out_base + "_standard_scaled.csv")
print(out_base + "_minmax_normalized.csv")

# ---------- 10) Final preview ----------
print("\nFinal transformed data preview (one-hot + label-encoded):")
display(df_encoded.head())

print("\nAll transformations completed.")

Loaded: C:\Users\abhin\Downloads\adult_with_headers (1).csv
Shape before cleaning: (32561, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K



Missing values (per column) BEFORE dropna():
age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

Shape after dropping missing rows: (32561, 15)

Columns and dtypes:
age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object

Categorical columns: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'income']
Numeric columns: ['age', 'fnlwgt', 'educati

Unnamed: 0,age,fnlwgt,education_num,sex,capital_gain,capital_loss,hours_per_week,income,workclass_ Federal-gov,workclass_ Local-gov,...,native_country_ Portugal,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia
0,39,77516,13,1,2174,0,40,0,False,False,...,False,False,False,False,False,False,False,True,False,False
1,50,83311,13,1,0,0,13,0,False,False,...,False,False,False,False,False,False,False,True,False,False
2,38,215646,9,1,0,0,40,0,False,False,...,False,False,False,False,False,False,False,True,False,False
3,53,234721,7,1,0,0,40,0,False,False,...,False,False,False,False,False,False,False,True,False,False
4,28,338409,13,0,0,0,40,0,False,False,...,False,False,False,False,False,False,False,False,False,False



Age_bin value counts:
Age_bin
Middle-aged    19690
Senior          6460
Young           6411
Name: count, dtype: int64

Columns to scale/normalize: ['age', 'hours_per_week', 'fnlwgt']

After Standard Scaling (sample):


Unnamed: 0,age,hours_per_week,fnlwgt
0,0.030671,-0.035429,-1.063611
1,0.837109,-2.222153,-1.008707
2,-0.042642,-0.035429,0.245079
3,1.057047,-0.035429,0.425801
4,-0.775768,-0.035429,1.408176



After Min-Max Normalization (sample):


Unnamed: 0,age,hours_per_week,fnlwgt
0,0.30137,0.397959,0.044302
1,0.452055,0.122449,0.048238
2,0.287671,0.397959,0.138113
3,0.493151,0.397959,0.151068
4,0.150685,0.397959,0.221488



Saved files:
C:\Users\abhin\Downloads\adult_transformed_onehot.csv
C:\Users\abhin\Downloads\adult_transformed_standard_scaled.csv
C:\Users\abhin\Downloads\adult_transformed_minmax_normalized.csv

Final transformed data preview (one-hot + label-encoded):


Unnamed: 0,age,fnlwgt,education_num,sex,capital_gain,capital_loss,hours_per_week,income,workclass_ Federal-gov,workclass_ Local-gov,...,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia,Age_bin
0,39,77516,13,1,2174,0,40,0,False,False,...,False,False,False,False,False,False,True,False,False,Middle-aged
1,50,83311,13,1,0,0,13,0,False,False,...,False,False,False,False,False,False,True,False,False,Middle-aged
2,38,215646,9,1,0,0,40,0,False,False,...,False,False,False,False,False,False,True,False,False,Middle-aged
3,53,234721,7,1,0,0,40,0,False,False,...,False,False,False,False,False,False,True,False,False,Senior
4,28,338409,13,0,0,0,40,0,False,False,...,False,False,False,False,False,False,False,False,False,Middle-aged



All transformations completed.
