# Lab 5: Regression Analysis - House Price Prediction
## Interactive Notebook

### Learning Objectives

1. Implement linear regression
2. Create polynomial features
3. Apply regularization (Ridge, Lasso, ElasticNet)
4. Perform residual analysis
5. Compare and select best model

**Estimated Time:** 4-5 hours

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy import stats
import warnings
# Silence all warnings so the notebook output stays readable.
# NOTE(review): this blanket filter may also hide solver convergence
# warnings from the regularized models -- consider narrowing it while
# debugging a fit.
warnings.filterwarnings('ignore')

# Apply the 'seaborn-v0_8-whitegrid' matplotlib style to subsequent figures.
plt.style.use('seaborn-v0_8-whitegrid')
# Confirmation banner once every import above has succeeded.
print('✅ Libraries loaded!')

## Part 1: Load California Housing Data

In [None]:
# Load dataset
# Build one DataFrame from the feature matrix and append the regression
# target as a final 'Price' column.
housing = fetch_california_housing()
df = pd.DataFrame(
    housing.data,
    columns=housing.feature_names,
).assign(Price=housing.target)

# Quick summary: overall shape, the feature names, and the target's meaning.
print(
    f'Dataset: {df.shape}',
    f'\nFeatures: {list(housing.feature_names)}',
    f'\nTarget: Median house value (in $100,000s)',
    sep='\n',
)
# Last expression of the cell: the notebook renders the first rows.
df.head()

### 📝 Task 1: Exploratory Data Analysis

In [None]:
# Exploratory data analysis on `df` (the feature columns plus 'Price').

# TODO: Display basic statistics
# Hint: a per-column numeric summary of `df` is enough here.
# YOUR CODE HERE

# TODO: Check for missing values
# Hint: count the null entries in each column of `df`.
# YOUR CODE HERE

# TODO: Create correlation matrix
# A figure is opened first so the plot drawn in the placeholder below lands
# on it; the title and show() calls then apply to that same figure.
plt.figure(figsize=(12, 10))
# Hint: seaborn is imported as `sns`; draw the correlation heatmap here.
# YOUR CODE HERE
plt.title('Feature Correlation Matrix')
plt.show()

### 📝 Task 2: Visualize Target Distribution

In [None]:
# TODO: Create histogram of house prices
# Two panels side by side: the raw price distribution on the left and a
# normal Q-Q plot of the same values on the right.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
hist_ax, qq_ax = axes
prices = df['Price']

# Left panel: histogram of the target
hist_ax.hist(prices, bins=50, edgecolor='black', alpha=0.7)
hist_ax.set(
    xlabel='Price ($100,000s)',
    ylabel='Frequency',
    title='Distribution of House Prices',
)

# Right panel: Q-Q plot against a normal distribution; the title is set
# after probplot has drawn on the axes.
stats.probplot(prices, dist="norm", plot=qq_ax)
qq_ax.set_title('Q-Q Plot')

plt.tight_layout()
plt.show()

## Part 2: Data Preparation

In [None]:
# TODO: Split data
# Separate the predictor columns from the 'Price' target column.
X = df.drop('Price', axis=1)
y = df['Price']

# Hold out part of the data for evaluation (train_test_split is imported at
# the top of the notebook). Hint: fixing random_state makes reruns repeatable.
X_train, X_test, y_train, y_test = # YOUR CODE HERE

# TODO: Scale features
# Hint: learn the scaling statistics from the training split only and reuse
# them to transform the test split, so no test information leaks into training.
scaler = StandardScaler()
X_train_scaled = # YOUR CODE HERE
X_test_scaled = # YOUR CODE HERE

# Sanity check: report the shapes of the (unscaled) train and test splits.
print(f'Training set: {X_train.shape}')
print(f'Test set: {X_test.shape}')

## Part 3: Linear Regression (Baseline)

In [None]:
# TODO: Train linear regression
# Baseline model. The comparison cell in Part 5 evaluates `lr` on
# X_test_scaled, so fit it on the scaled training features.
lr = LinearRegression()
# YOUR CODE HERE

# TODO: Make predictions
# Predictions for the held-out test split; they are compared with y_test below.
y_pred_lr = # YOUR CODE HERE

# TODO: Calculate metrics
# RMSE is the square root of the mean squared error, which puts the error
# back into the units of the target.
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))
mae = mean_absolute_error(y_test, y_pred_lr)
r2 = r2_score(y_test, y_pred_lr)

# Report the three test-set metrics to four decimal places.
print('Linear Regression Results:')
print(f'RMSE: {rmse:.4f}')
print(f'MAE: {mae:.4f}')
print(f'R²: {r2:.4f}')

### Residual Analysis

In [None]:
# TODO: Calculate residuals
# Residual = observed test price minus the baseline model's prediction.
residuals = y_test - y_pred_lr

# Plot residuals
# Three diagnostic panels side by side.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
scatter_ax, hist_ax, qq_ax = axes

# Panel 1: residuals against predictions, with a dashed zero reference line
scatter_ax.scatter(y_pred_lr, residuals, alpha=0.5)
scatter_ax.axhline(0, color='red', linestyle='--')
scatter_ax.set(
    xlabel='Predicted Values',
    ylabel='Residuals',
    title='Residual Plot',
)

# Panel 2: histogram of the residuals
hist_ax.hist(residuals, bins=50, edgecolor='black')
hist_ax.set(
    xlabel='Residuals',
    ylabel='Frequency',
    title='Distribution of Residuals',
)

# Panel 3: normal Q-Q plot of the residuals; the title is set after
# probplot has drawn on the axes.
stats.probplot(residuals, dist="norm", plot=qq_ax)
qq_ax.set_title('Q-Q Plot of Residuals')

plt.tight_layout()
plt.show()

## Part 4: Polynomial Regression

In [None]:
# TODO: Create polynomial features (degree=2)
# include_bias=False leaves out the constant column from the expansion.
poly = PolynomialFeatures(degree=2, include_bias=False)
# Hint: fit the expansion on the training features, then apply the same
# transform to the test features. Presumably the scaled matrices are the
# input, since the print below compares against X_train_scaled's column count.
X_train_poly = # YOUR CODE HERE
X_test_poly = # YOUR CODE HERE

# Compare the feature count before and after the expansion.
print(f'Original features: {X_train_scaled.shape[1]}')
print(f'Polynomial features: {X_train_poly.shape[1]}')

# TODO: Train model with polynomial features
# Unregularized fit on the expanded features; it is stored as `lr_poly` and
# reused in the Part 5 model comparison.
lr_poly = LinearRegression()
# YOUR CODE HERE


## Part 5: Regularization

### Ridge, Lasso, and ElasticNet

In [None]:
# Regularized models. The comparison loop at the bottom of this cell scores
# Ridge, Lasso and ElasticNet on X_test_poly, so fit each of them on the
# polynomial training features.

# TODO: Train Ridge regression
ridge = Ridge(alpha=1.0)
# YOUR CODE HERE

# TODO: Train Lasso regression
lasso = Lasso(alpha=0.1)
# YOUR CODE HERE

# TODO: Train ElasticNet
elastic = ElasticNet(alpha=0.1, l1_ratio=0.5)
# YOUR CODE HERE

# Compare all models
# Display name -> fitted estimator; the names become the row labels of the
# results table.
models = {
    'Linear Regression': lr,
    'Polynomial (degree=2)': lr_poly,
    'Ridge': ridge,
    'Lasso': lasso,
    'ElasticNet': elastic
}

results = []
for name, model in models.items():
    # Each model must be scored on the same feature representation it was
    # trained on. Only the baseline uses the plain scaled features; the
    # polynomial model and the three regularized models all use the degree-2
    # expansion. (Handing the polynomial model X_test_scaled would fail with
    # a feature-count mismatch at predict time.)
    X_test_use = X_test_scaled if name == 'Linear Regression' else X_test_poly

    # Calculate metrics
    # Hint: predict on X_test_use, then append one record per model to
    # `results` so the DataFrame below gets one row per model.
    # YOUR CODE HERE

print(pd.DataFrame(results))

---

# 🎯 Practice Questions

1. Which model performs best? Why?
2. What do the residual plots tell us?
3. When should you use regularization?
4. What's the difference between Ridge and Lasso?

---

# 📝 Summary

✅ Implemented linear regression
✅ Created polynomial features
✅ Applied regularization
✅ Analyzed residuals
✅ Compared model performance

**Excellent work on regression! 🎉**