In [1]:
import pandas as pd
import numpy as np

# Load the raw dataset.
# NOTE(review): the rendered output of this cell starts with an "Unnamed: 0"
# column; if that is a row index that was saved into the CSV, read it with
# index_col=0 instead of carrying it into the cleaned file — confirm.
df = pd.read_csv("data/raw_data.csv")

# Parse 'date'. errors="coerce" turns unparseable values into NaT, and rows
# whose date ends up missing are dropped.
# The trailing .copy() makes the result an independent frame, so the column
# assignments that follow do not trigger pandas' SettingWithCopyWarning.
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df = df.dropna(subset=["date"]).copy()

# Missing 'income' values are filled with the median of the observed incomes.
income_median = df["income"].median()
df["income"] = df["income"].fillna(income_median)

# Negative ages are treated as invalid: blank them out first so they do not
# influence the median, then fill every missing age with that median.
negative_age = df["age"] < 0
df.loc[negative_age, "age"] = np.nan
age_median = df["age"].median()
df["age"] = df["age"].fillna(age_median)

# Missing categorical values get an explicit placeholder label.
for column, placeholder in (("region", "Unknown"), ("gender", "Other")):
    df[column] = df[column].fillna(placeholder)

# Outlier removal for 'expenses' using the IQR rule: keep only rows whose
# value lies within 1.5 * IQR below the first quartile and above the third.
Q1 = df["expenses"].quantile(0.25)
Q3 = df["expenses"].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
# between() is inclusive on both ends (same as >= lower and <= upper).
# Rows with a missing 'expenses' value fail the comparison and are dropped too.
# .copy() detaches the filtered rows from the original frame so that adding
# columns afterwards does not trigger pandas' SettingWithCopyWarning.
df = df[df["expenses"].between(lower, upper)].copy()

# Feature Engineering
# Income relative to age. An age of exactly 0 would make the ratio +/-inf,
# so zero ages are replaced by NaN in the divisor and the ratio becomes NaN
# for those rows instead of an infinite value in the saved dataset.
df["income_per_age"] = df["income"] / df["age"].replace(0, np.nan)
# log1p computes log(1 + x), which stays finite for an expense of 0.
# NOTE(review): values <= -1 would produce -inf/NaN here — confirm that
# 'expenses' is non-negative after the outlier filter.
df["log_expenses"] = np.log1p(df["expenses"])

# Convert the categorical text columns to pandas' 'category' dtype.
# Note: CSV is plain text, so this dtype is not stored in the file written
# below; it only affects the in-memory frame.
for column in ("region", "gender"):
    df[column] = df[column].astype("category")

# Persist the cleaned dataset without the row index, then preview the first rows.
cleaned_path = "data/cleaned_data.csv"
df.to_csv(cleaned_path, index=False)
df.head()


Unnamed: 0,date,region,gender,age,income,expenses,income_per_age,log_expenses
0,2020-01-01,West,Female,42.0,47000.902271,29462.55491,1119.069102,10.290909
1,2020-01-02,Unknown,Male,44.0,51125.258026,38129.571124,1161.937682,10.548772
2,2020-01-03,East,Female,46.0,73133.839191,20459.382523,1589.866069,9.926246
3,2020-01-04,Unknown,Male,48.0,77067.456325,20463.95387,1605.572007,9.926469
4,2020-01-05,Unknown,Other,46.0,39713.936198,19307.49406,863.346439,9.8683
