# AIHC 5615 — Homework 5: King County House Prices
_Autogenerated on 2025-11-10 18:17:51_

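This notebook implements Problems 1–5. Update `DATA_PATH` to point to your local copy of `kc_house_data.csv`; the first code cell loads the full dataset and splits it 80/20 into training and test sets.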

In [None]:

import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`.

    Computed as the square root of scikit-learn's `mean_squared_error`.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Location of the King County CSV. The literal below is the fallback; set the
# KC_HOUSE_DATA environment variable to point elsewhere without editing the notebook.
DATA_PATH = os.environ.get(
    'KC_HOUSE_DATA',
    r'C:\Users\M298134\Desktop\AIHC 5615\Week 1\data\kc_house_data.csv',  # <-- update
)

# Fail with an actionable message instead of printing a warning and then
# crashing inside read_csv anyway.
try:
    df_full = pd.read_csv(DATA_PATH)
except FileNotFoundError as exc:
    raise FileNotFoundError(
        f'Could not read {DATA_PATH!r}: update DATA_PATH (or set KC_HOUSE_DATA) '
        'to the location of kc_house_data.csv'
    ) from exc

# 80/20 split with a fixed seed. Each part is copied so that later cells can
# add or convert columns on df_train without pandas' SettingWithCopyWarning.
df_train, df_test = (
    part.copy()
    for part in train_test_split(df_full, test_size=0.2, random_state=42)
)
print('Train shape:', df_train.shape, ' Test shape:', df_test.shape)


## Problem 1 — Interactions

In [None]:

continuous_var, categorical_var = 'sqft_living', 'waterfront'

# Convert the grouping column to a categorical dtype so its levels can be read
# from `.cat.categories` below. `assign` builds a new frame rather than writing
# into the object returned by train_test_split, which can trigger pandas'
# SettingWithCopyWarning.
df_train = df_train.assign(
    **{categorical_var: df_train[categorical_var].astype('category')}
)

# m0: additive model (one slope, level-specific intercept shift).
m0 = smf.ols(f'price ~ {continuous_var} + C({categorical_var})', data=df_train).fit()
print(m0.summary())
print('\nCoefficients:\n', m0.params)

# m1: `*` expands to both main effects plus their interaction.
m1 = smf.ols(f'price ~ {continuous_var} * C({categorical_var})', data=df_train).fit()
print('\nWith interaction:\n', m1.summary())

# Side-by-side coefficients; terms that exist only in m1 show NaN for m0.
print('\nCompare:\n', pd.DataFrame({
    'no_interaction': m0.params.reindex(m1.params.index),
    'with_interaction': m1.params,
}))

# Plot the data and one fitted line per level of the categorical variable.
xg = np.linspace(df_train[continuous_var].min(), df_train[continuous_var].max(), 200)
levels = df_train[categorical_var].cat.categories
params = m1.params
b0 = params.get('Intercept', 0.)
b1 = params.get(continuous_var, 0.)

plt.figure()
for lvl in levels:
    mask = df_train[categorical_var] == lvl
    plt.scatter(
        df_train.loc[mask, continuous_var],
        df_train.loc[mask, 'price'],
        alpha=0.35,
        label=f'{categorical_var}={lvl}',
    )

# The first level is the reference category: intercept and slope only.
plt.plot(xg, b0 + b1*xg, label=f'Fit: {categorical_var}={levels[0]}')

# Other levels add an intercept shift (bc) and a slope shift (bi).
# A coefficient name that is not found falls back to 0, i.e. no shift.
for lvl in levels[1:]:
    bc = params.get(f'C({categorical_var})[T.{lvl}]', 0.)
    bi = params.get(f'{continuous_var}:C({categorical_var})[T.{lvl}]', 0.)
    plt.plot(xg, b0 + b1*xg + bc + bi*xg, label=f'Fit: {categorical_var}={lvl}')

plt.xlabel(continuous_var)
plt.ylabel('price')
plt.title('Interaction lines')
plt.legend()
plt.show()


## Problem 2 — Log Transforms

In [None]:

# Compare the raw and log-transformed price distributions:
# one histogram each, followed by the sample skewness of each.
price_obs = df_train['price'].dropna()
log_price_obs = np.log(price_obs)

for values, label in ((price_obs, 'price'), (log_price_obs, 'log(price)')):
    plt.figure()
    plt.hist(values, bins=50)
    plt.title(label)
    plt.show()

print('Skew price:', stats.skew(price_obs))
print('Skew log(price):', stats.skew(log_price_obs))


In [None]:

# For each numeric predictor: histogram, histogram of its log (only when every
# observed value is strictly positive), and a scatter against price.
num_predictors = ['sqft_living', 'sqft_lot', 'bedrooms', 'bathrooms']
for col in num_predictors:
    s = df_train[col].dropna()

    plt.figure()
    plt.hist(s, bins=50)
    plt.title(col)
    plt.show()

    all_positive = (s > 0).all()
    if all_positive:
        plt.figure()
        plt.hist(np.log(s), bins=50)
        plt.title(f'log({col})')
        plt.show()

    plt.figure()
    plt.scatter(df_train[col], df_train['price'], alpha=0.3)
    plt.xlabel(col)
    plt.ylabel('price')
    plt.title(f'price vs {col}')
    plt.show()


In [None]:

cont_vars, cat_vars = ['sqft_living','sqft_lot','bedrooms','bathrooms'], ['view','condition','grade']

# Take an explicit copy *before* touching any column, so every assignment in
# this cell writes to a frame we own (avoids pandas' SettingWithCopyWarning).
df_train = df_train.copy()
for c in cat_vars:
    df_train[c] = df_train[c].astype('category')


def safe_log(s):
    """Natural log of `s` after raising any value below 1 up to 1.

    The clip keeps the result finite and non-negative for zero or negative
    inputs.
    """
    return np.log(s.clip(lower=1))


# M1: price on the raw predictors.
m1 = smf.ols('price ~ ' + ' + '.join(cont_vars + [f'C({c})' for c in cat_vars]), data=df_train).fit()
print(m1.summary())
# The formula API drops rows with missing values, so look up the observed
# prices by the fitted values' index instead of assuming equal lengths.
rmse_m1 = rmse(df_train.loc[m1.fittedvalues.index, 'price'], m1.fittedvalues)
print('RMSE M1:', rmse_m1)

# M2: log(price) on logged area predictors.
df_train['log_price'] = np.log(df_train['price'])
df_train['log_sqft_living'] = safe_log(df_train['sqft_living'])
df_train['log_sqft_lot'] = safe_log(df_train['sqft_lot'])
m2 = smf.ols('log_price ~ log_sqft_living + log_sqft_lot + bedrooms + bathrooms + ' + ' + '.join([f'C({c})' for c in cat_vars]), data=df_train).fit()
print(m2.summary())

# Naive back-transform to dollars (plain exp, no retransformation-bias
# correction) so M2's RMSE is on the same scale as M1's.
pred_price = np.exp(m2.fittedvalues)
rmse_m2 = rmse(df_train.loc[pred_price.index, 'price'], pred_price)
print('RMSE M2 (back-transform):', rmse_m2)

# Note: the two R2 values are on different response scales (price vs log_price).
print(pd.DataFrame({'R2': [m1.rsquared, m2.rsquared], 'RMSE': [rmse_m1, rmse_m2]}, index=['M1', 'M2']))


In [None]:

# Residuals of the log-price model (observed minus fitted, on the log scale)
# plotted against the fitted values, with a dashed zero reference line.
resid = df_train['log_price'] - m2.fittedvalues

plt.figure()
plt.scatter(m2.fittedvalues, resid, alpha=0.3)
plt.axhline(0, ls='--')
plt.xlabel('Fitted (log-price)')
plt.ylabel('Residuals')
plt.title('Residual plot')
plt.show()


## Problem 3 — Feature Engineering

In [None]:

# Map of the training homes, coloured by log(price).
plt.figure()
plt.scatter(df_train['long'], df_train['lat'], c=np.log(df_train['price']), alpha=0.4)
plt.xlabel('long')
plt.ylabel('lat')
plt.title('long vs lat (color=log price)')
plt.colorbar(label='log(price)')
plt.show()

# Reference point for the distance feature.
# NOTE(review): presumably picked by eye from the map above as the centre of
# the high-price area — confirm.
p_lat, p_long = 47.63, -122.22

# r = straight-line distance from the reference point, in raw degree units
# (latitude and longitude differences are weighted equally).
d_lat = df_train['lat'] - p_lat
d_long = df_train['long'] - p_long
df_train['r'] = np.sqrt(d_lat**2 + d_long**2)
print(df_train['r'].describe())

# Simple regression of log(price) on r; requires the `log_price` column.
mr = smf.ols('log_price ~ r', data=df_train).fit()
print(mr.summary())

# Fitted line evaluated over the observed range of r.
xg = np.linspace(df_train['r'].min(), df_train['r'].max(), 200)
yg = mr.params['Intercept'] + mr.params['r']*xg

plt.figure()
plt.scatter(df_train['r'], df_train['log_price'], alpha=0.3)
plt.plot(xg, yg)
plt.xlabel('r')
plt.ylabel('log(price)')
plt.title('log(price) ~ r')
plt.show()


## Problem 4 — Standardization

In [None]:

# Build a complete-case modelling frame from the columns used downstream.
target = 'log_price'
needed = ['price','log_price','lat','long','r','log_sqft_living','log_sqft_lot','bedrooms','bathrooms','view','condition','grade']
work = df_train[needed].dropna().copy()
print('Work shape:', work.shape)

numeric_features = ['log_sqft_living','log_sqft_lot','bedrooms','bathrooms','r']
categorical_features = ['view','condition','grade']

# Standardize the numeric columns; one-hot encode the categoricals, dropping
# the first level of each.
scale_step = ('num', StandardScaler(), numeric_features)
encode_step = ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
pre = ColumnTransformer([scale_step, encode_step])
pipe = Pipeline([('pre', pre), ('linreg', LinearRegression())])

X = work[numeric_features + categorical_features].copy()
y = work[target].copy()

# Fit and score on the same (training) rows, so these are in-sample metrics.
pipe.fit(X, y)
yhat = pipe.predict(X)
print('Baseline R2:', r2_score(y, yhat), ' RMSE:', rmse(y, yhat))


In [None]:

# Add log_sqft_living x grade interaction columns: one product column per
# grade dummy (first grade level dropped), appended after X's own columns.
gd = pd.get_dummies(X['grade'], prefix='grade', drop_first=True)
interaction_cols = {
    'log_sqft_living_x_' + col: X['log_sqft_living'] * gd[col]
    for col in gd.columns
}
X2 = X.assign(**interaction_cols)
extra = list(interaction_cols)

# Same preprocessing as the baseline, with the interaction columns
# standardized alongside the other numeric features.
numeric_all = ['log_sqft_living','log_sqft_lot','bedrooms','bathrooms','r'] + extra
pre2 = ColumnTransformer([
    ('num', StandardScaler(), numeric_all),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), ['view','condition','grade']),
])
pipe2 = Pipeline([('pre', pre2), ('linreg', LinearRegression())])

# In-sample fit and metrics, directly comparable to the baseline's.
pipe2.fit(X2, y)
y2 = pipe2.predict(X2)
print('With interaction R2:', r2_score(y, y2), ' RMSE:', rmse(y, y2))


## Problem 5 — Variable Selection

In [None]:

def neg_rmse(y_true, y_pred):
    """Negated RMSE, so that a larger score means a better fit."""
    return -rmse(y_true, y_pred)


rmse_scorer = make_scorer(neg_rmse)

# Design matrix after preprocessing (scaled numerics + one-hot categoricals).
pre2.fit(X2, y)
X_full = pre2.transform(X2)
y_full = y.copy()
n_features = X_full.shape[1]

base = LinearRegression(); direction = 'forward'  # or 'backward'


def evaluate_subset(idx):
    """Score a linear model restricted to the columns in `idx`.

    Returns a dict with the subset size `k`, the in-sample `R2` and `RMSE`,
    and `CV_RMSE`, the mean 5-fold cross-validated RMSE.
    """
    X_sub = X_full[:, idx]
    yp = LinearRegression().fit(X_sub, y_full).predict(X_sub)
    cv_scores = cross_val_score(LinearRegression(), X_sub, y_full, scoring=rmse_scorer, cv=5)
    return {
        'k': len(idx),
        'R2': r2_score(y_full, yp),
        'RMSE': rmse(y_full, yp),
        'CV_RMSE': -cv_scores.mean(),  # scorer is negated RMSE
    }


results = []
# SequentialFeatureSelector requires n_features_to_select < n_features, so the
# search stops at n_features - 1; asking for all features raises ValueError.
# Each k reruns the greedy search from scratch, which makes this cell slow.
for k in range(1, n_features):
    sfs = SequentialFeatureSelector(base, n_features_to_select=k, direction=direction, scoring=rmse_scorer, cv=5)
    sfs.fit(X_full, y_full)
    results.append(evaluate_subset(np.where(sfs.get_support())[0]))
# k == n_features needs no selection: it is simply the full model.
results.append(evaluate_subset(np.arange(n_features)))

df_results = pd.DataFrame(results); display(df_results.head())

# In-sample R2 never decreases as features are added, so it would always pick
# the largest k. Choose the subset size by cross-validated RMSE instead.
best = df_results.loc[df_results['CV_RMSE'].idxmin()]
print('Best k:', int(best.k), ' R2:', best.R2, ' RMSE:', best.RMSE, ' CV RMSE:', best.CV_RMSE)

plt.figure()
plt.plot(df_results.k, df_results.R2, marker='o')
plt.xlabel('k'); plt.ylabel('R2'); plt.title('R2 vs k'); plt.grid(True)
plt.show()

plt.figure()
plt.plot(df_results.k, df_results.RMSE, marker='o', label='in-sample')
plt.plot(df_results.k, df_results.CV_RMSE, marker='s', label='5-fold CV')
plt.xlabel('k'); plt.ylabel('RMSE'); plt.title('RMSE vs k'); plt.grid(True); plt.legend()
plt.show()
