# Feature Engineering Notebook
This notebook creates model-ready features from the cleaned wage dataset.

In [12]:
import numpy as np
import pandas as pd

In [13]:
data_path = 'data/cleaned_wage_data.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,age,education_years,education_level,occupation,industry,hours_per_week,wage
0,25,16,Bachelors,Data Analyst,Technology,40,58000
1,31,18,Masters,Software Engineer,Technology,45,92000
2,42,12,High School,Machine Operator,Manufacturing,50,47000
3,37,14,Associate,Nurse,Healthcare,36,76000
4,29,16,Bachelors,Financial Analyst,Finance,42,81000


## Create Experience Proxy and Log Wage
Adjust column names below if your dataset uses different labels.

In [22]:
age_col = 'age'
education_years_col = 'education_years'
wage_col = 'wage'

if age_col in df.columns and education_years_col in df.columns:
    df['experience_proxy'] = df[age_col] - df[education_years_col]
    df['experience_proxy'] = df['experience_proxy'].clip(lower=0)

if wage_col in df.columns:
    df['log_wage'] = np.log1p(df[wage_col])

df[[col for col in ['experience_proxy', 'log_wage'] if col in df.columns]].head()

Unnamed: 0,experience_proxy,log_wage
0,9,10.968216
1,13,11.429555
2,30,10.757924
3,23,11.238502
4,13,11.302217


## Handle Sparse Categories

In [23]:
categorical_cols = df.select_dtypes(exclude=[np.number]).columns
min_frequency = 0.01

for col in categorical_cols:
    freq = df[col].value_counts(normalize=True)
    rare_values = freq[freq < min_frequency].index
    df[col] = df[col].replace(rare_values, 'Other')

df[categorical_cols].nunique().sort_values(ascending=False).head(10)

occupation         10
industry            8
education_level     7
dtype: int64

## Encode Features

In [24]:
target_col = 'log_wage' if 'log_wage' in df.columns else 'wage'
feature_df = df.drop(columns=[target_col], errors='ignore')

encoded_df = pd.get_dummies(feature_df, drop_first=True)

if target_col in df.columns:
    encoded_df[target_col] = df[target_col]

print('Encoded shape:', encoded_df.shape)
encoded_df.head()

Encoded shape: (10, 28)


Unnamed: 0,age,education_years,hours_per_week,wage,experience_proxy,education_level_Bachelors,education_level_Doctorate,education_level_High School,education_level_Masters,education_level_Some College,...,occupation_Sales Representative,occupation_Software Engineer,industry_Finance,industry_Government,industry_Healthcare,industry_Manufacturing,industry_Professional Services,industry_Retail,industry_Technology,log_wage
0,25,16,40,58000,9,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,10.968216
1,31,18,45,92000,13,False,False,False,True,False,...,False,True,False,False,False,False,False,False,True,11.429555
2,42,12,50,47000,30,False,False,True,False,False,...,False,False,False,False,False,True,False,False,False,10.757924
3,37,14,36,76000,23,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,11.238502
4,29,16,42,81000,13,True,False,False,False,False,...,False,False,True,False,False,False,False,False,False,11.302217


In [26]:
output_path = 'data/engineered_wage_data.csv'
encoded_df.to_csv(output_path, index=False)
print(f'Saved engineered data to {output_path}')

Saved engineered data to data/engineered_wage_data.csv
