# 02 — Imputation Strategies (Baseline vs Model)

We implement and compare two approaches:

1. **Baseline (Group Median)**: impute `lusage` within `(year, month, mozip)`; fallback to `(zipcode, month)`; fallback to global median.
2. **Model-based (Ridge Regression)**: predict `lusage` from `mozip`, `zipcode`, `month`, `size_sqft`, `children`, `owner`, `hhsize*`, `income*`, and `lusage1-6`.

We'll compare distributional shifts and holdout MAE/R² for the model.

In [None]:
import pandas as pd, numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
import matplotlib.pyplot as plt

# Load the raw table (path is relative to the notebook's working directory).
csv_path = "../data/data_test.csv"
df = pd.read_csv(csv_path)

# Baseline
# All baseline work happens on a copy, so `df` itself never receives baseline columns.
dfb = df.copy()
lusage_obs = dfb['lusage']

# Fallback ladder for the group-median imputer, most specific first:
#   med1 -> median of `lusage` per (year, month, mozip)
#   med2 -> median of `lusage` per (zipcode, month)
#   global_med -> median of `lusage` over the whole table
med1 = lusage_obs.groupby([dfb[key] for key in ('year', 'month', 'mozip')]).median()
med2 = lusage_obs.groupby([dfb[key] for key in ('zipcode', 'month')]).median()
global_med = lusage_obs.median()

def fill_row(r):
    """Return the baseline value of `lusage` for a single row.

    An observed `lusage` is returned unchanged. Otherwise the first non-missing
    candidate from this ladder is used:

    1. `med1` looked up by `(year, month, mozip)`
    2. `med2` looked up by `(zipcode, month)`
    3. `global_med`

    Parameters
    ----------
    r : mapping-like row (e.g. a `pd.Series` from `DataFrame.apply(axis=1)`)
        Must expose `lusage`, `year`, `month`, `mozip`; `zipcode` is read only
        when the first lookup yields nothing.

    Returns
    -------
    The observed or imputed value. This is still missing if `global_med` is
    itself missing.
    """
    observed = r['lusage']
    if pd.notnull(observed):
        return observed

    # Most specific group first; a key absent from the index falls back to NaN.
    candidate = med1.get((r['year'], r['month'], r['mozip']), np.nan)
    if not pd.isna(candidate):
        return candidate

    # Coarser group, consulted only when the first lookup produced nothing usable.
    candidate = med2.get((r['zipcode'], r['month']), np.nan)
    if not pd.isna(candidate):
        return candidate

    return global_med

# Row-wise pass: every row keeps its observed `lusage` or receives a fallback median.
baseline_values = dfb.apply(fill_row, axis="columns")
dfb['lusage_imp'] = baseline_values

In [None]:
# Model
# Tunables for the Ridge imputer and its holdout evaluation.
RIDGE_ALPHA = 1.0
HOLDOUT_FRACTION = 0.2
SPLIT_SEED = 42

features = ['mozip','zipcode','month','size_sqft','children','owner','hhsize2','hhsize3','hhsize4','hhsize5plus',
            'income2','income3','income4','income5','income6','income7','income8','income9',
            'lusage1','lusage2','lusage3','lusage4','lusage5','lusage6']

# Columns passed through as numbers vs. columns that get one-hot encoded.
num = ['month','size_sqft','lusage1','lusage2','lusage3','lusage4','lusage5','lusage6']
cat = ['mozip','zipcode','children','owner','hhsize2','hhsize3','hhsize4','hhsize5plus',
       'income2','income3','income4','income5','income6','income7','income8','income9']


def build_imputer_pipeline(num_cols, cat_cols, alpha=RIDGE_ALPHA):
    """Return a fresh, unfitted preprocessing + Ridge pipeline.

    Numeric columns are median-imputed. Categorical columns are imputed with
    their most frequent value and then one-hot encoded, with categories unseen
    at fit time ignored at predict time. The regressor is `Ridge(alpha=alpha)`.

    A builder is used so the holdout model and the final imputation model are
    separate objects with an identical configuration.
    """
    categorical_steps = Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ])
    preprocessor = ColumnTransformer([
        ("num", SimpleImputer(strategy="median"), num_cols),
        ("cat", categorical_steps, cat_cols),
    ])
    return Pipeline([("prep", preprocessor), ("model", Ridge(alpha=alpha))])


# Only rows with an observed target can be used for fitting and scoring.
mask = df['lusage'].notna()
X = df.loc[mask, features]
y = df.loc[mask, 'lusage']

# Holdout evaluation: `pipe` is fit on the training split only, so the printed
# R2/MAE are out-of-sample estimates.
Xtr, Xte, ytr, yte = train_test_split(
    X, y, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED
)
pipe = build_imputer_pipeline(num, cat)
pre = pipe.named_steps["prep"]  # keeps the name `pre` bound to the preprocessor
pipe.fit(Xtr, ytr)
pred = pipe.predict(Xte)
print("R2:", r2_score(yte,pred), "MAE:", mean_absolute_error(yte,pred))

# Impute
Xm = df.loc[~mask, features]
if len(Xm) > 0:
    # Refit an identically configured pipeline on ALL observed rows before
    # imputing: the holdout split exists only to estimate error, and reusing the
    # train-only fit would throw away HOLDOUT_FRACTION of the labelled data.
    pipe_full = build_imputer_pipeline(num, cat)
    pipe_full.fit(X, y)
    # Written only on rows where `lusage` is missing; observed rows stay NaN
    # in `lusage_imp_model`.
    df.loc[~mask,'lusage_imp_model'] = pipe_full.predict(Xm)

In [None]:
# Compare distributions before/after
HIST_BINS = 40

fig, ax = plt.subplots()

# Observed values only.
df['lusage'].dropna().hist(bins=HIST_BINS, alpha=0.5, label='original', ax=ax)

# The baseline column is written to `dfb` (the copy used for the group-median
# imputer), not to `df`; reading it from `df` raises KeyError.
dfb['lusage_imp'].hist(bins=HIST_BINS, alpha=0.5, label='baseline', ax=ax)

# `lusage_imp_model` exists only when some rows were missing `lusage`.
if 'lusage_imp_model' in df:
    # Observed values with model predictions filling the gaps, so this series is
    # a completed column like the baseline one and the overlay is like-for-like.
    model_completed = df['lusage'].fillna(df['lusage_imp_model'])
    model_completed.hist(bins=HIST_BINS, alpha=0.5, label='model', ax=ax)

ax.set_xlabel("lusage")
ax.set_ylabel("row count")
ax.legend()
ax.set_title("lusage distributions (original vs imputed)")
plt.show()