# In [None]:
import pandas as pd

# --- Helpers ---
def clean_income(raw: pd.Series) -> pd.Series:
    """Convert a raw income column (e.g. ``"$84,835.00 "``) to numbers.

    Every character other than a digit or ``.`` is removed before parsing,
    which drops currency symbols, thousands separators and whitespace.
    Values that still cannot be parsed (including missing ones) become NaN.

    NOTE: the pattern also removes a leading ``-``, so a negative amount
    would come out positive.
    """
    digits_only = raw.astype(str).str.replace(r"[^\d.]", "", regex=True)
    return pd.to_numeric(digits_only, errors='coerce')


def impute_income(df: pd.DataFrame,
                  group_cols=('Education', 'Marital_Status')) -> pd.Series:
    """Return ``df['Income']`` with missing values filled in.

    A missing income is replaced by the mean income of the row's
    ``group_cols`` group; whatever is still missing afterwards (group has no
    observed income, or one of the group keys is itself missing) is replaced
    by the median of the partially filled column.

    Only missing entries are touched: observed incomes are returned
    unchanged, even on rows whose group keys are missing. ``df`` itself is
    not modified.
    """
    keys = list(group_cols)
    group_mean = df.groupby(keys)['Income'].transform('mean')
    # Rows with a missing key belong to no group. Blank their group mean
    # explicitly so the result does not depend on how the installed pandas
    # version treats null groups inside transform().
    group_mean = group_mean.where(df[keys].notna().all(axis=1))

    # fillna (rather than assigning the transform result directly) guarantees
    # that existing income values are never overwritten.
    filled = df['Income'].fillna(group_mean)
    return filled.fillna(filled.median())


# --- Pipeline ---
# __name__ is "__main__" both when this runs as a script and inside a notebook
# kernel, so the steps below still execute there and `df` stays a module-level
# name; the guard only keeps the file I/O from running on import.
if __name__ == "__main__":
    # Load data
    df = pd.read_csv('../data/marketing_data.csv')

    # Strip whitespace from column names
    df.columns = df.columns.str.strip()

    # --- Income Cleaning ---
    df['Income'] = clean_income(df['Income'])

    # --- Date Cleaning ---
    # Unparseable dates become NaT instead of raising.
    df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], errors='coerce')  # Optional: add format='%d-%m-%Y'

    # --- Category Review ---
    print("Unique Education values:", df['Education'].unique())
    print("Unique Marital Status values:", df['Marital_Status'].unique())

    # --- Fill Missing Income ---
    df['Income'] = impute_income(df)

    # Final check
    print(df['Income'].isnull().sum())  # Should be 0

    # Save
    df.to_csv('../data/cleaned_data.csv', index=False)
    print("✅ Cleaned data saved with valid Income values")
