# AIHC 5615 — Homework 5: King County House Prices
_Autogenerated on 2025-11-10 17:17:01_


This notebook follows the assignment prompts exactly. 
It uses **pandas**, **numpy**, **matplotlib**, **scipy**, **statsmodels**, and **scikit-learn** (for the train/test split, standardization, one-hot encoding, and error metrics).
Update `DATA_PATH` in the setup cell below to point to your King County CSV file (e.g., `kc_house_data.csv`).
The setup cell always makes an 80/20 train/test split of that file (`random_state=42`), and every later cell works on the **train** portion; if your file is already a training subset, assign it to `df_train` directly instead of splitting.


In [None]:

# ==========================
# Setup & Data Loading
# ==========================
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Show plots inline if in Jupyter
# (No special style/colors per your class rules)
# %matplotlib inline  # Uncomment if running in classic Jupyter

# ---- UPDATE THIS PATH ----
# Set this to the CSV file of your dataset (either training set or full dataset).
# The KC_HOUSE_DATA environment variable, when set, overrides the default below so the
# notebook can run on another machine without editing this cell.
DATA_PATH = os.environ.get(
    'KC_HOUSE_DATA',
    r'C:\Users\M298134\Desktop\AIHC 5615\Week 1\data\kc_house_data.csv',  # <-- change if needed
)

# Fail fast with an actionable message instead of printing a warning and then
# crashing inside pd.read_csv a few lines later.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(
        f'DATA_PATH does not exist: {DATA_PATH!r}. Update the path to your data file.'
    )
print('Found data file at:', DATA_PATH)

# Load
df_full = pd.read_csv(DATA_PATH)

# Typical columns in the King County dataset:
# 'price','bedrooms','bathrooms','sqft_living','sqft_lot','floors','waterfront','view',
# 'condition','grade','sqft_above','sqft_basement','yr_built','yr_renovated',
# 'zipcode','lat','long','sqft_living15','sqft_lot15'

# If you already have a training subset, put it in df_train directly.
# Otherwise, split and use the training portion per instructions.
df_train, df_test = train_test_split(df_full, test_size=0.2, random_state=42)

# Later cells add and recast columns on df_train. Taking explicit copies makes the
# split frames independent of df_full and avoids pandas' SettingWithCopyWarning.
df_train = df_train.copy()
df_test = df_test.copy()
print('Train shape:', df_train.shape, ' Test shape:', df_test.shape)



## Problem 1 — Interactions
**(A)** Select one of your four continuous predictors and one of your three categorical predictors. Fit a multiple linear regression for **price** on those two variables. Report the fitted coefficients.


In [None]:

# ==========================
# Problem 1A
# ==========================

# Choose ONE continuous and ONE categorical predictor here.
# You can change these if your chosen set is different.
continuous_var = 'sqft_living'   # example continuous predictor
categorical_var = 'waterfront'   # example categorical (0/1)

# Ensure categorical is treated as category.
# `assign` builds a new frame and rebinds df_train, rather than writing a column into
# the object returned by train_test_split (which can raise SettingWithCopyWarning).
# Later cells rely on this column having category dtype (they read `.cat.categories`).
df_train = df_train.assign(
    **{categorical_var: df_train[categorical_var].astype('category')}
)

# Model without interaction: one common slope, plus a level shift per category.
formula_no_inter = f'price ~ {continuous_var} + C({categorical_var})'
model_no_inter = smf.ols(formula=formula_no_inter, data=df_train).fit()
print('Model without interaction:')
print(model_no_inter.summary())

print('\nFitted coefficients (no interaction):')
print(model_no_inter.params)



**(B)** Now add an interaction term to that model. Compare the coefficients and explain the meaning of the interaction term in context. Do you think it's useful here?


In [None]:

# ==========================
# Problem 1B
# ==========================
# `a * b` in the formula expands to a + b + a:b, i.e. main effects plus interaction.
formula_inter = f'price ~ {continuous_var} * C({categorical_var})'
model_inter = smf.ols(formula=formula_inter, data=df_train).fit()

print('Model with interaction:')
print(model_inter.summary())

# Side-by-side coefficients. The no-interaction model has no interaction row, so
# aligning it to the interaction model's index leaves NaN there.
interaction_coefs = model_inter.params
aligned_baseline_coefs = model_no_inter.params.reindex(interaction_coefs.index)
coef_compare = pd.DataFrame({
    'no_interaction': aligned_baseline_coefs,
    'with_interaction': interaction_coefs,
})

print('\nCoefficient comparison:')
print(coef_compare)

print('\nInterpretation helper:')
print(f"""
- The interaction term modifies the slope of {continuous_var} for different levels of {categorical_var}.
- If the interaction coefficient is positive, the effect (slope) of {continuous_var} on price is larger for the indicated level of {categorical_var}, and vice versa.
""")



**(C)** Make a scatterplot of **price** against the continuous predictor and add **two regression lines** (one per level of the categorical variable) from the interaction model.


In [None]:

# ==========================
# Problem 1C
# ==========================
# Scatter of price vs. the continuous predictor, one colour per category level,
# overlaid with the fitted line for each level from the interaction model.
levels = df_train[categorical_var].cat.categories

fig, ax = plt.subplots()
for lvl in levels:
    mask = df_train[categorical_var] == lvl
    ax.scatter(
        df_train.loc[mask, continuous_var],
        df_train.loc[mask, 'price'],
        alpha=0.4,
        label=f'{categorical_var}={lvl}',
    )

x_grid = np.linspace(df_train[continuous_var].min(), df_train[continuous_var].max(), 100)

# Let the fitted model produce the lines, instead of rebuilding them from coefficient
# names such as 'C(var)[T.level]'. Looking those names up with a 0.0 default would
# silently draw a wrong line whenever a name failed to match.
for lvl in levels:
    grid = pd.DataFrame({
        continuous_var: x_grid,
        # Same category set as the training column so the formula encodes it identically.
        categorical_var: pd.Categorical([lvl] * len(x_grid), categories=levels),
    })
    ax.plot(x_grid, model_inter.predict(grid), label=f'Fit line: {categorical_var}={lvl}')

ax.set_xlabel(continuous_var)
ax.set_ylabel('price')
ax.legend()
ax.set_title('Price vs. ' + continuous_var + ' with interaction-based lines')
plt.show()



## Problem 2 — Log Transforms
**(A)** Decide if it makes sense to log-transform the response (**price**). Explain with graphs or statistics.


In [None]:

# ==========================
# Problem 2A
# ==========================
# Compare the distribution of price with that of log(price).
observed_price = df_train['price'].dropna()
logged_price = np.log(observed_price)

for values, title in (
    (observed_price, 'Histogram of price'),
    (logged_price, 'Histogram of log(price)'),
):
    fig = plt.figure()
    plt.hist(values, bins=50)
    plt.title(title)
    plt.show()

# Sample skewness before and after the transform (values near 0 indicate symmetry).
print('Skewness price:', stats.skew(observed_price))
print('Skewness log(price):', stats.skew(logged_price))



**(B)** Consider each of your **four chosen numerical predictors**. Is a log-transform reasonable for any? Explain.
(The cell below provides a helper to inspect skew and simple scatterplots. Update the list to your four numerical predictors.)


In [None]:

# ==========================
# Problem 2B
# ==========================
num_predictors = ['sqft_living', 'sqft_lot', 'bedrooms', 'bathrooms']  # <-- change to your chosen 4

# For each predictor: raw histogram, log histogram (only when every value is
# strictly positive), then a scatter against price.
for col in num_predictors:
    values = df_train[col].dropna()

    fig = plt.figure()
    plt.hist(values, bins=50)
    plt.title(f'Histogram of {col}')
    plt.show()

    has_non_positive = (values <= 0).any()
    if has_non_positive:
        # log is undefined at 0 and below, so the transformed histogram is skipped.
        print(f'Skipping log({col}) histogram because of non-positive values.')
    else:
        fig = plt.figure()
        plt.hist(np.log(values), bins=50)
        plt.title(f'Histogram of log({col})')
        plt.show()

    # Quick scatter vs price
    fig = plt.figure()
    plt.scatter(df_train[col], df_train['price'], alpha=0.3)
    plt.xlabel(col)
    plt.ylabel('price')
    plt.title(f'price vs {col}')
    plt.show()



**(C)** Fit two multiple linear regressions for price based on **all seven** of your predictors (continuous + categorical).
- Model 1: untransformed variables  
- Model 2: apply the transformations you chose in (A) and (B)

Compare R² and RMSE. Which fits better?


In [None]:

# ==========================
# Problem 2C
# ==========================
# Choose your 7 predictors here. Example selection:
cont_vars = ['sqft_living', 'sqft_lot', 'bedrooms', 'bathrooms']    # 4 continuous
cat_vars  = ['view', 'condition', 'grade']                           # 3 categorical (treated as categorical below)

# Work on an explicit copy BEFORE adding or recasting columns, so that none of the
# assignments below write into a slice of another frame.
df_train = df_train.copy()

# Ensure categorical dtype
for c in cat_vars:
    df_train[c] = df_train[c].astype('category')

# ---- Model 1: Untransformed price & predictors ----
formula1 = 'price ~ ' + ' + '.join(cont_vars + [f'C({c})' for c in cat_vars])
m1 = smf.ols(formula=formula1, data=df_train).fit()
print('Model 1 (untransformed) summary:')
print(m1.summary())

# RMSE on train.
# np.sqrt(MSE) is used instead of mean_squared_error(..., squared=False): the
# `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse1 = float(np.sqrt(mean_squared_error(df_train['price'], m1.fittedvalues)))
print('Train RMSE (Model 1):', rmse1)

# ---- Model 2: Transformed based on choices ----
# Example choices (adjust as needed):
# - Use log(price)
# - log(sqft_living), log(sqft_lot) if positive
df_train['log_price'] = np.log(df_train['price'])

def safe_log(s):
    """Return the natural log of Series `s` after flooring its values at 1.

    Flooring avoids log(0) = -inf. Note that any value below 1 is mapped to
    log(1) = 0, so this is only appropriate for columns whose values are
    expected to be >= 1.
    """
    return np.log(s.clip(lower=1))  # avoid log(0)

df_train['log_sqft_living'] = safe_log(df_train['sqft_living'])
df_train['log_sqft_lot']    = safe_log(df_train['sqft_lot'])

# Keep bedrooms, bathrooms untransformed for this example
formula2 = 'log_price ~ log_sqft_living + log_sqft_lot + bedrooms + bathrooms + ' + ' + '.join([f'C({c})' for c in cat_vars])
m2 = smf.ols(formula=formula2, data=df_train).fit()
print('\nModel 2 (transformed) summary:')
print(m2.summary())

# Compare on train. Model 2 predicts log(price), so its fitted values are
# exponentiated to put the errors back on the price scale (a rough comparison:
# exp of a fitted log value is not the fitted mean price).
pred_log = m2.fittedvalues
pred_price_m2 = np.exp(pred_log)
rmse2 = float(np.sqrt(mean_squared_error(df_train['price'], pred_price_m2)))
print('Train RMSE (Model 2, back-transformed):', rmse2)

# m1.rsquared and m2.rsquared are measured against different responses (price vs.
# log-price) and are not directly comparable, so also report R^2 for Model 2 on the
# price scale. For Model 1 the price-scale R^2 is its own R^2.
r2_price_m2 = r2_score(df_train['price'], pred_price_m2)

print('\nComparison:')
print(pd.DataFrame(
    {
        'R2': [m1.rsquared, m2.rsquared],
        'R2_price_scale': [m1.rsquared, r2_price_m2],
        'RMSE': [rmse1, rmse2],
    },
    index=['Model 1 (price)', 'Model 2 (log-price)'],
))



**(D)** Make a **residual plot** for the model with transformed variables. Comment on the fit.


In [None]:

# ==========================
# Problem 2D
# ==========================
# Residuals on the log scale: observed log-price minus the transformed model's fit.
fitted_log_price = m2.fittedvalues
resid = df_train['log_price'].sub(fitted_log_price)

fig, ax = plt.subplots()
ax.scatter(fitted_log_price, resid, alpha=0.3)
ax.axhline(0, linestyle='--')  # zero-residual reference line
ax.set_xlabel('Fitted (log-price)')
ax.set_ylabel('Residuals')
ax.set_title('Residual plot — transformed model')
plt.show()



## Problem 3 — Feature Engineering (Location)
**(A)** Scatterplot of **longitude vs latitude** colored by price (or log price). Explain what it shows.


In [None]:

# ==========================
# Problem 3A
# ==========================
# Map-like view of the houses: position from long/lat, colour from log(price).
fig, ax = plt.subplots()
points = ax.scatter(
    df_train['long'],
    df_train['lat'],
    c=np.log(df_train['price']),
    alpha=0.4,
)
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')
ax.set_title('Longitude vs Latitude (color = log(price))')
fig.colorbar(points, ax=ax, label='log(price)')
plt.show()



**(B)** Create a **radial distance** feature from the point with maximum prices (approx): latitude 47.63, longitude -122.22.
Use: $r = \sqrt{(x - p)^2 + (y - q)^2}$ where $(x, y) = (\text{lat}, \text{long})$ and $(p, q) = (47.63, -122.22)$.


In [None]:

# ==========================
# Problem 3B
# ==========================
# Reference point (approximate location of the highest prices).
p_lat, p_long = 47.63, -122.22

# Euclidean distance in raw degree units from the reference point.
lat_offset = df_train['lat'] - p_lat
long_offset = df_train['long'] - p_long
df_train['r'] = np.sqrt(lat_offset**2 + long_offset**2)

print(df_train['r'].describe())



**(C)** Fit a simple linear regression for **price (or log price)** on `r`. Plot the scatter with regression line. Comment on usefulness.


In [None]:

# ==========================
# Problem 3C
# ==========================
# Simple linear regression of log-price (used for stability) on radial distance r.
model_r = smf.ols('log_price ~ r', data=df_train).fit()
print(model_r.summary())

# Evaluate the fitted line on an evenly spaced grid over the observed range of r.
intercept_r = model_r.params['Intercept']
slope_r = model_r.params['r']
xg = np.linspace(df_train['r'].min(), df_train['r'].max(), 200)
yg = intercept_r + slope_r * xg

# Scatter + line
fig, ax = plt.subplots()
ax.scatter(df_train['r'], df_train['log_price'], alpha=0.3)
ax.plot(xg, yg)
ax.set_xlabel('r (radial distance)')
ax.set_ylabel('log(price)')
ax.set_title('log(price) vs r with fitted line')
plt.show()



## Problem 4 — Standardization & Model Building
**Goal:** Build the best possible model using your selected predictors (the seven from before), the engineered `r`, any interactions you wish, and your chosen transformations.


In [None]:

# ==========================
# Problem 4A — Choose Predictors & Build DataFrame
# ==========================
# Response used by the Problem 4 models; adjust as desired.
target = 'log_price'  # or 'price'

# Earlier predictor choices (transformed where chosen) plus the engineered distance r.
selected_cont = [
    'log_sqft_living',
    'log_sqft_lot',
    'bedrooms',
    'bathrooms',
    'r',
]
selected_cat = ['view', 'condition', 'grade']

# Interaction to demonstrate later: log_sqft_living crossed with grade, where grade
# is treated categorically via one-hot columns. The crossed features themselves are
# created in a later cell.
interaction_pairs = [('log_sqft_living', 'grade')]

# Working frame: only the columns used downstream, with incomplete rows removed.
needed = [
    'price', 'log_price',
    'lat', 'long', 'r',
    'log_sqft_living', 'log_sqft_lot',
    'bedrooms', 'bathrooms',
    'view', 'condition', 'grade',
]
work = df_train.loc[:, needed].dropna().copy()

print('Working data shape:', work.shape)
work.head()


In [None]:

# ==========================
# Problem 4 — Standardized Linear Model (sklearn)
# ==========================
# Baseline pipeline (no interaction terms; those are added in the next cell):
#   - OneHotEncode categorical
#   - Standardize numeric
#   - LinearRegression

numeric_features = ['log_sqft_living', 'log_sqft_lot', 'bedrooms', 'bathrooms', 'r']
categorical_features = ['view', 'condition', 'grade']

pre = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    # drop='first' removes one dummy per feature so the encoded columns are not
    # collinear with the intercept.
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
])

linreg = LinearRegression()

pipe = Pipeline([('pre', pre), ('linreg', linreg)])

X = work[numeric_features + categorical_features].copy()
y = work[target].copy()

# Fit baseline (no custom interaction terms yet); metrics below are in-sample.
pipe.fit(X, y)
y_hat = pipe.predict(X)

print('Sklearn baseline (standardized)')
print('R^2:', r2_score(y, y_hat))
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# argument was deprecated in scikit-learn 1.4 and removed in 1.6.
print('RMSE:', float(np.sqrt(mean_squared_error(y, y_hat))))


In [None]:

# ==========================
# Problem 4 — Add an example interaction (log_sqft_living x grade)
# ==========================
# Create interaction columns manually on X with one-hot of grade.
X2 = X.copy()

# One-hot grade for manual interactions. Each product column equals log_sqft_living
# for rows at that grade level and 0 elsewhere, which gives every (non-reference)
# grade level its own slope on log_sqft_living.
grade_dummies = pd.get_dummies(X2['grade'], prefix='grade', drop_first=True)
for col in grade_dummies.columns:
    X2[f'log_sqft_living_x_{col}'] = X2['log_sqft_living'] * grade_dummies[col]

# Update preprocessor so the extra interaction columns are standardized with the
# other numeric features.
extra_numeric = [c for c in X2.columns if c.startswith('log_sqft_living_x_')]
numeric_all = ['log_sqft_living', 'log_sqft_lot', 'bedrooms', 'bathrooms', 'r'] + extra_numeric

pre2 = ColumnTransformer([
    ('num', StandardScaler(), numeric_all),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
])

pipe2 = Pipeline([('pre', pre2), ('linreg', LinearRegression())])
pipe2.fit(X2, y)
y2_hat = pipe2.predict(X2)

print('Sklearn model with interaction (log_sqft_living x grade)')
print('R^2:', r2_score(y, y2_hat))
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# argument was deprecated in scikit-learn 1.4 and removed in 1.6.
print('RMSE:', float(np.sqrt(mean_squared_error(y, y2_hat))))



### Notes for your write-up
- Explain your variable choices and why each transformation was used.
- Interpret the interaction term(s) in the context of house prices.
- Comment on diagnostics (residuals, heteroscedasticity, nonlinearity).
- State which model you prefer based on R², RMSE, and visual checks.
