# 📉 Regression Pipeline: House Price Prediction

Complete regression workflow for predicting house prices.

**Level**: Intermediate  
**Time Required**: ~40 minutes

In [None]:
import sys
# Prepend '../../' (resolved against the current working directory) to the
# module search path so the project package imported below can be found
# without being installed. Presumably this points at the repository root;
# confirm against the notebook's location.
sys.path.insert(0, '../../')

# Wildcard import: brings the project's public names into this namespace.
# Presumably the source of DataLoader and Plotter, which are used later in
# this notebook and have no other import here.
from data_science_master_system import *
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Installs an "ignore" filter for every warning category for the rest of the
# session, including warnings that could be useful while debugging.
warnings.filterwarnings('ignore')

print("âœ… Ready!")

In [None]:
# Read the house-price CSV through the project's DataLoader. The path is
# relative, so it is resolved against the current working directory.
csv_path = '../data/csv/house_prices.csv'

loader = DataLoader()
df = loader.read(csv_path)

# Report the dataset's shape, then preview the first rows (the trailing
# expression is what the notebook renders as the cell output).
print(f"Dataset: {df.shape}")
df.head()

## 1. Data Analysis

In [None]:
# Side-by-side histograms of the target: raw prices on the left and their
# natural log on the right, so the two scales can be compared visually.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One entry per panel, left to right: (values, bar colour, title, x-axis label)
panels = [
    (df['price'], 'steelblue', 'Price Distribution', 'Price ($)'),
    (np.log(df['price']), 'green', 'Log Price Distribution', 'Log Price'),
]

for ax, (values, colour, title, xlabel) in zip(axes, panels):
    values.hist(bins=30, ax=ax, color=colour)
    ax.set_title(title)
    ax.set_xlabel(xlabel)

plt.tight_layout()
plt.show()

In [None]:
# Correlate every numeric column with the target and rank the results.
numeric_df = df.select_dtypes(include=[np.number])

# Take the 'price' column of the correlation matrix, then remove the target's
# trivial correlation with itself before sorting from highest to lowest.
price_corr = numeric_df.corr()['price']
correlations = price_corr.drop('price').sort_values(ascending=False)

print("ðŸ“Š Top Correlations with Price:")
print(correlations.head(10))

## 2. Feature Engineering

In [None]:
# Derive additional columns from the raw data. `assign` returns a new frame,
# so `df` itself is left unmodified; every expression reads from `df`.
df_feat = df.assign(
    # Price per square foot of living area. It is computed from the target,
    # which is why the data-preparation cell drops it from the model inputs.
    price_per_sqft=df['price'] / df['sqft_living'],
    # Property age relative to the fixed reference year 2024.
    age=2024 - df['year_built'],
    # 1 when a renovation year is recorded (value > 0), otherwise 0.
    is_renovated=(df['year_renovated'] > 0).astype(int),
    # Luxury indicators: a copy of the waterfront column, plus a flag for
    # grade 9 and above.
    is_waterfront=df['waterfront'],
    is_high_grade=(df['grade'] >= 9).astype(int),
    # Bedrooms per bathroom; the 0.5 offset presumably guards against a zero
    # denominator for listings with no bathrooms.
    bed_bath_ratio=df['bedrooms'] / (df['bathrooms'] + 0.5),
)

print(f"Features after engineering: {df_feat.shape[1]}")

In [None]:
# Build the model inputs and target, then hold out 20% of rows for testing.

# Columns excluded from the inputs: the row identifier, the target itself,
# and the feature derived directly from the target.
drop_cols = ['house_id', 'price', 'price_per_sqft']

y = df_feat['price']
X = df_feat.drop(columns=drop_cols)

# Models are trained on log(price), whose distribution is closer to normal;
# predictions must be exponentiated to get back to the original price scale.
y_log = np.log(y)

# Fixed random_state keeps the split reproducible between runs.
split = train_test_split(X, y_log, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

print(f"Training: {X_train.shape}")

## 3. Model Training and Comparison

In [None]:
from data_science_master_system.models.traditional.traditional_ml import RegressionModel

# Candidate regressors, keyed by the label shown in the comparison table.
models = {
    'Random Forest': RegressionModel('random_forest', n_estimators=100),
    'Gradient Boosting': RegressionModel('gradient_boosting', n_estimators=100),
    'Ridge': RegressionModel('ridge'),
}


def _fit_and_cross_validate(label, regressor):
    """Fit `regressor` on the training split and summarise its 5-fold CV R2.

    The wrapper is fitted first so it can be used for prediction later; the
    cross-validation then runs on the wrapper's `underlying_model`.
    """
    regressor.fit(X_train, y_train)
    fold_r2 = cross_val_score(
        regressor.underlying_model, X_train, y_train, cv=5, scoring='r2'
    )
    return {
        'Model': label,
        'CV R2 Mean': fold_r2.mean(),
        'CV R2 Std': fold_r2.std(),
    }


# One summary row per candidate, in the dictionary's insertion order.
results = [
    _fit_and_cross_validate(label, regressor)
    for label, regressor in models.items()
]

# Rank candidates from highest to lowest mean cross-validated R2.
results_df = pd.DataFrame(results).sort_values('CV R2 Mean', ascending=False)
print("\nðŸ“Š Model Comparison:")
display(results_df)

## 4. Evaluate Best Model

In [None]:
from data_science_master_system.evaluation.metrics import RegressionMetrics

# Use the model that actually ranked best in cross-validation instead of a
# hard-coded choice. `idxmax` picks the row with the highest mean CV R2, so
# this does not depend on how `results_df` happens to be sorted.
# NOTE(review): the feature-importance cell calls `best.feature_importance`;
# confirm RegressionModel supports that for every candidate (e.g. Ridge).
best_name = results_df.loc[results_df['CV R2 Mean'].idxmax(), 'Model']
best = models[best_name]
print(f"Selected model: {best_name}")

# Predict on the held-out test set (predictions are on the log-price scale,
# because the models were trained on log(price)).
y_pred_log = best.predict(X_test)

# Undo the log transform so the metrics below are in dollars.
y_pred = np.exp(y_pred_log)
y_true = np.exp(y_test)

# Calculate metrics on the original price scale.
metrics = RegressionMetrics.calculate(y_true, y_pred)

print("ðŸ“ˆ Test Set Performance:")
print(f"  RÂ² Score: {metrics['r2']:.4f}")
print(f"  RMSE: ${metrics['rmse']:,.0f}")
print(f"  MAE: ${metrics['mae']:,.0f}")
# Assumes metrics['mape'] is a fraction (0-1) rather than a percentage —
# TODO confirm against RegressionMetrics.
print(f"  MAPE: {metrics['mape']*100:.1f}%")

In [None]:
# Diagnostic plots on the original price scale: fit quality on the left,
# error distribution on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_fit, ax_resid = axes

# Predicted vs actual. The dashed red diagonal marks perfect predictions, so
# points above it are over-predictions and points below are under-predictions.
ax_fit.scatter(y_true, y_pred, alpha=0.5)
price_range = [y_true.min(), y_true.max()]
ax_fit.plot(price_range, price_range, 'r--', lw=2)
ax_fit.set(
    xlabel='Actual Price',
    ylabel='Predicted Price',
    title='Predicted vs Actual',
)

# Residuals are actual minus predicted: positive means the model
# under-predicted the price.
residuals = y_true - y_pred
ax_resid.hist(residuals, bins=30, color='steelblue', edgecolor='white')
ax_resid.set(xlabel='Residual', title='Residual Distribution')

plt.tight_layout()
plt.show()

## 5. Feature Importance

In [None]:
# Ask the selected model for its 15 highest-ranked features. The return type
# is defined by the project's RegressionModel and is not visible here.
importance = best.feature_importance(top_n=15)

# Render the ranking with the project's Plotter helper, then show the figure.
plotter = Plotter()
fig = plotter.feature_importance(importance, title='Top 15 Features for Price Prediction')
plt.show()

In [None]:
# Save the selected model to 'house_price_model.joblib'. The path is relative,
# so the file lands in the current working directory. The on-disk format is
# decided by the project's `save` method — presumably joblib, per the extension.
best.save('house_price_model.joblib')
print("âœ… Model saved!")

## 🎯 Key Takeaways

1. Log transform for skewed targets
2. Domain-specific feature engineering
3. Cross-validation for robust evaluation
4. Residual analysis for diagnostics