# Assignment 9 – Data Preprocessing & Feature Engineering (Adult Dataset)

Dataset: Adult Census Income Dataset  
Goal: Practice missing value handling, scaling, encoding, and feature engineering.


In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder


In [5]:
# Load dataset
df = pd.read_csv(r"/content/adult_with_headers.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
# Basic exploration
print("Shape:", df.shape)
df.info()
df.describe(include="all").T


Shape: (32561, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
age,32561.0,,,,38.581647,13.640433,17.0,28.0,37.0,48.0,90.0
workclass,32561.0,9.0,Private,22696.0,,,,,,,
fnlwgt,32561.0,,,,189778.366512,105549.977697,12285.0,117827.0,178356.0,237051.0,1484705.0
education,32561.0,16.0,HS-grad,10501.0,,,,,,,
education_num,32561.0,,,,10.080679,2.57272,1.0,9.0,10.0,12.0,16.0
marital_status,32561.0,7.0,Married-civ-spouse,14976.0,,,,,,,
occupation,32561.0,15.0,Prof-specialty,4140.0,,,,,,,
relationship,32561.0,6.0,Husband,13193.0,,,,,,,
race,32561.0,5.0,White,27816.0,,,,,,,
sex,32561.0,2.0,Male,21790.0,,,,,,,


In [7]:
# Handle missing values
# In Adult dataset, missing values are often marked as '?'
df.replace("?", np.nan, inplace=True)

print("Missing values per column:\n", df.isna().sum())

# Drop rows with missing target
df = df.dropna(subset=["income"])

# Impute categorical with mode, numerical with median
for col in df.columns:
    if df[col].dtype == "object":
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].median(), inplace=True)

print("Missing values after handling:\n", df.isna().sum())


Missing values per column:
 age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64
Missing values after handling:
 age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


## Scaling Numerical Features

In [8]:
# Scaling numerical features
num_cols = df.select_dtypes(include=np.number).columns.tolist()

std_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()

df_std_scaled = df.copy()
df_std_scaled[num_cols] = std_scaler.fit_transform(df[num_cols])

df_minmax_scaled = df.copy()
df_minmax_scaled[num_cols] = minmax_scaler.fit_transform(df[num_cols])

df_std_scaled[num_cols].head(), df_minmax_scaled[num_cols].head()


(        age    fnlwgt  education_num  capital_gain  capital_loss  \
 0  0.030671 -1.063611       1.134739      0.148453      -0.21666   
 1  0.837109 -1.008707       1.134739     -0.145920      -0.21666   
 2 -0.042642  0.245079      -0.420060     -0.145920      -0.21666   
 3  1.057047  0.425801      -1.197459     -0.145920      -0.21666   
 4 -0.775768  1.408176       1.134739     -0.145920      -0.21666   
 
    hours_per_week  
 0       -0.035429  
 1       -2.222153  
 2       -0.035429  
 3       -0.035429  
 4       -0.035429  ,
         age    fnlwgt  education_num  capital_gain  capital_loss  \
 0  0.301370  0.044302       0.800000       0.02174           0.0   
 1  0.452055  0.048238       0.800000       0.00000           0.0   
 2  0.287671  0.138113       0.533333       0.00000           0.0   
 3  0.493151  0.151068       0.400000       0.00000           0.0   
 4  0.150685  0.221488       0.800000       0.00000           0.0   
 
    hours_per_week  
 0        0.397959  

### Scaling Techniques – When to Use

**Standard Scaling**
- Centers data to mean 0 and std 1
- Preferred for algorithms assuming normal distribution (Logistic Regression, SVM, K-Means)

**Min-Max Scaling**
- Scales values between 0 and 1
- Preferred when features have fixed bounds or for neural networks


## Encoding Categorical Features

In [9]:
# Encoding techniques
cat_cols = df.select_dtypes(include="object").columns.tolist()
cat_cols.remove("income")

# Columns with <5 categories → One-Hot
low_card_cols = [c for c in cat_cols if df[c].nunique() < 5]
high_card_cols = [c for c in cat_cols if df[c].nunique() >= 5]

df_encoded = df.copy()
df_encoded = pd.get_dummies(df_encoded, columns=low_card_cols, drop_first=True)

# Label Encoding for high-cardinality columns
le = LabelEncoder()
for col in high_card_cols:
    df_encoded[col] = le.fit_transform(df_encoded[col])

df_encoded.head()


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,income,sex_ Male
0,39,7,77516,9,13,4,1,1,4,2174,0,40,39,<=50K,True
1,50,6,83311,9,13,2,4,0,4,0,0,13,39,<=50K,True
2,38,4,215646,11,9,0,6,1,4,0,0,40,39,<=50K,True
3,53,4,234721,1,7,2,6,0,2,0,0,40,39,<=50K,True
4,28,4,338409,9,13,2,10,5,2,0,0,40,5,<=50K,False


### Encoding Techniques – Pros & Cons

**One-Hot Encoding**
✔ No ordinal assumption  
✔ Works well for nominal categories  
✖ Increases dimensionality

**Label Encoding**
✔ Simple and memory-efficient  
✔ Useful for tree-based models  
✖ Introduces artificial ordering


## Feature Engineering

In [11]:
# Feature Engineering
# 1. Capital Gain Indicator
df_encoded["has_capital_gain"] = (df["capital_gain"] > 0).astype(int)

# 2. Age Bucket Feature
df_encoded["age_bucket"] = pd.cut(
    df["age"], bins=[0,25,45,65,100], labels=["Young","Adult","Senior","Elder"]
)

df_encoded["age_bucket"] = LabelEncoder().fit_transform(df_encoded["age_bucket"])

df_encoded[["has_capital_gain","age_bucket"]].head()

Unnamed: 0,has_capital_gain,age_bucket
0,1,0
1,0,2
2,0,0
3,0,2
4,0,0


In [13]:
# Log transformation on skewed feature
df_encoded["log_capital_gain"] = np.log1p(df["capital_gain"])

df_encoded[["capital_gain","log_capital_gain"]].head()

Unnamed: 0,capital_gain,log_capital_gain
0,2174,7.684784
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0
