In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv"
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
           'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
           'hours-per-week', 'country', 'income']

df = pd.read_csv(url, names=columns, na_values='?')

print("Dataset Loaded Successfully!")
display(df.head())

Dataset Loaded Successfully!


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

print(f"Numerical Columns: {list(numerical_cols)}")
print(f"Categorical Columns: {list(categorical_cols)}")

Numerical Columns: ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
Categorical Columns: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'country', 'income']


In [4]:
le = LabelEncoder()
df['income_encoded'] = le.fit_transform(df['income'])

print("\n--- Label Encoding ---")
print("Original:", df['income'].unique())
print("Encoded:", df['income_encoded'].unique())


--- Label Encoding ---
Original: ['<=50K' '>50K']
Encoded: [0 1]


In [5]:
df_encoded = pd.get_dummies(df, columns=['sex', 'race'], drop_first=True)

print("\n--- One-Hot Encoding Top 5 Rows ---")
# Notice how 'sex' became 'sex_Male' (0 or 1)
display(df_encoded[['age', 'sex_Male', 'income_encoded']].head())
scaler = StandardScaler()

# Select features to scale
cols_to_scale = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'hours-per-week']

# Create a copy to keep original data for comparison
df_scaled = df_encoded.copy()
df_scaled[cols_to_scale] = scaler.fit_transform(df_scaled[cols_to_scale])


--- One-Hot Encoding Top 5 Rows ---


Unnamed: 0,age,sex_Male,income_encoded
0,39,True,0
1,50,True,0
2,38,True,0
3,53,True,0
4,28,False,0


In [6]:
print("\n--- Before Scaling (Raw Data) ---")
display(df_encoded[cols_to_scale].head(3))

print("\n--- After Scaling (Standardized) ---")
# Values will now be centered around 0 (Mean=0, Std=1)
display(df_scaled[cols_to_scale].head(3))
df_scaled.to_csv("processed_adult_data.csv", index=False)
print("\nProcessed dataset saved!")


--- Before Scaling (Raw Data) ---


Unnamed: 0,age,fnlwgt,education-num,capital-gain,hours-per-week
0,39,77516,13,2174,40
1,50,83311,13,0,13
2,38,215646,9,0,40



--- After Scaling (Standardized) ---


Unnamed: 0,age,fnlwgt,education-num,capital-gain,hours-per-week
0,0.025996,-1.061979,1.136512,0.146932,-0.034087
1,0.828308,-1.007104,1.136512,-0.144804,-2.213032
2,-0.046942,0.246034,-0.419335,-0.144804,-0.034087



Processed dataset saved!
