# Housing Model Training Pipeline + Hyperparameter Tuners

## 1. Import Libraries
Import all necessary libraries for data processing, transformation, modeling, and evaluation.

In [3]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

## 2. Define Custom Transformer
The `ClusterSimilarity` transformer creates geographic cluster features using KMeans clustering.

In [4]:
class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """
    Creates features based on similarity to geographic clusters.

    KMeans is fitted on the input columns; every sample is then described by
    exp(-gamma * d**2) for each cluster centre, where d is the Euclidean
    distance between the sample and that centre.

    Parameters:
    n_clusters : Number of clusters to create
    gamma : Controls the influence radius of clusters
    random_state : Random seed for reproducibility
    """
    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    # NOTE: get_params / set_params are deliberately NOT overridden.
    # BaseEstimator derives them from the __init__ signature, and its
    # set_params rejects unknown parameter names instead of silently
    # creating a new attribute (which would hide typos in a search grid).

    def fit(self, X, y=None, sample_weight=None):
        """Fit KMeans on X (array-like or DataFrame); y is ignored."""
        X = np.asarray(X)
        self.kmeans_ = KMeans(n_clusters=self.n_clusters,
                              random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return an (n_samples, n_clusters) array of similarities to each centre."""
        X = np.asarray(X)
        centers = self.kmeans_.cluster_centers_
        # Broadcast (n_samples, 1, n_features) against (1, n_clusters, n_features)
        # to get every sample-to-centre distance in one shot.
        distances = np.linalg.norm(
            X[:, np.newaxis, :] - centers[np.newaxis, :, :], axis=2
        )
        return np.exp(-self.gamma * distances ** 2)

    def get_feature_names_out(self, input_features=None):
        """Name one output column per fitted cluster centre."""
        n_fitted = self.kmeans_.cluster_centers_.shape[0]
        return np.array(
            [f"cluster_{i}_similarity" for i in range(n_fitted)], dtype=object
        )

## 3. Load and Prepare Data
Load the housing dataset and create stratified train/test splits based on income categories.

In [5]:
def load_housing_data():
    """Load California housing dataset from remote URL.

    The tarball is downloaded only when it is not already on disk, and it is
    (re-)extracted whenever the CSV itself is missing, so an interrupted or
    cleaned-up extraction no longer leaves the loader permanently broken.

    Returns:
        pandas.DataFrame read from ``datasets/housing/housing.csv``.
    """
    datasets_dir = Path("datasets")
    tarball_path = datasets_dir / "housing.tgz"
    csv_path = datasets_dir / "housing" / "housing.csv"

    if not csv_path.is_file():
        if not tarball_path.is_file():
            datasets_dir.mkdir(parents=True, exist_ok=True)
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            # The archive comes from the network: use the safe "data"
            # extraction filter when this Python version provides it.
            if hasattr(tarfile, "data_filter"):
                housing_tarball.extractall(path=datasets_dir, filter="data")
            else:
                housing_tarball.extractall(path=datasets_dir)
    return pd.read_csv(csv_path)

# Read the raw dataset into memory
housing = load_housing_data()

# Bucket median income into five bands; the bands are only used to keep the
# train/test split representative of the income distribution
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
                               labels=[1, 2, 3, 4, 5])

# 80/20 split, stratified on the income band, with a fixed seed
strat_train_set, strat_test_set = train_test_split(
    housing, test_size=0.2, stratify=housing["income_cat"], random_state=42)

# From here on `housing` holds the training predictors only (target and the
# helper income band removed); the target is kept separately as the labels
housing = strat_train_set.drop(columns=["median_house_value", "income_cat"])
housing_labels = strat_train_set["median_house_value"].copy()

## 4. Define Transformation Pipelines
Create custom transformation pipelines for different feature types including ratio features, log transformations, and categorical encoding.

In [13]:
# Helper functions for ratio features
def column_ratio(X):
    """Divide the first column of X by the second, keeping a 2-D column result."""
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Return the single output feature name used for a ratio column.

    Both arguments are accepted for the FunctionTransformer callback
    signature but are not used.
    """
    output_names = ["ratio"]
    return output_names

# Pipeline for ratio features
def ratio_pipeline():
    """Build a fresh pipeline: median-impute, take the column ratio, then standardize."""
    imputer = SimpleImputer(strategy="median")
    ratio_step = FunctionTransformer(column_ratio, feature_names_out=ratio_name)
    scaler = StandardScaler()
    return make_pipeline(imputer, ratio_step, scaler)

# Log-transformed features: median-impute, take the natural log, standardize
log_pipeline = make_pipeline(SimpleImputer(strategy="median"),
                             FunctionTransformer(np.log, feature_names_out="one-to-one"),
                             StandardScaler())

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode (categories unseen during fit are ignored at transform time)
cat_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"),
                             OneHotEncoder(handle_unknown="ignore"))

# Fallback for any other numeric feature: median-impute and standardize
default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"),
                                     StandardScaler())

# Geographic similarity transformer: 10 clusters, seeded for reproducibility
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1.0, random_state=42)

## 5. Apply Preprocessing
Transform the training data and verify the output feature matrix.

In [14]:
# Assemble the full preprocessing step from the building blocks defined above.
# It is defined in this cell so that the notebook runs from a fresh kernel
# (previously `preprocessing` was never defined, giving a NameError on
# Restart & Run All).
preprocessing = ColumnTransformer([
    ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
    ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
    ("people_per_house", ratio_pipeline(), ["population", "households"]),
    ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                           "households", "median_income"]),
    ("geo_similarity", cluster_simil, ["latitude", "longitude"]),
    ("cat", cat_pipeline, ["ocean_proximity"])
], remainder=default_num_pipeline)  # any column not listed above

# Apply the full preprocessing pipeline
housing_prepared = preprocessing.fit_transform(housing)

# Display resulting feature matrix shape
print("Preprocessed data shape:", housing_prepared.shape)
print("\nExpected features breakdown:")
print("  - 3 ratio features")
print("  - 5 log-transformed features")
print("  - 10 geographic similarity features")
print("  - 5 one-hot encoded categories")
print("  - 1 remaining numeric feature")
print("Total: 24 features")

Preprocessed data shape: (16512, 24)

Expected features breakdown:
  - 3 ratio features
  - 5 log-transformed features
  - 10 geographic similarity features
  - 5 one-hot encoded categories
  - 1 remaining numeric feature
Total: 24 features


---
# **Exercise**
* Try adding a SelectFromModel transformer in the preparation pipeline
to select only the most important attributes.

### **Define the Custom Transformer**
**Create `KNNPriceFeature`**
* This transformer learns the price patterns of nearby locations using `KNeighborsRegressor`.
In `fit()`, it trains on latitude/longitude and `median_house_value`.  
In `transform()`, it outputs the predicted price for any (lat, long) location.


In [15]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsRegressor

class KNNPriceFeature(BaseEstimator, TransformerMixin):
    """
    Custom transformer that predicts housing prices using KNeighborsRegressor
    based on latitude and longitude. Outputs prediction as a new feature.

    Leakage note: predicting on the very rows the regressor was fitted on
    makes every row one of its own nearest neighbours (at distance zero), so
    the "feature" would essentially reproduce the training label. To avoid
    that, ``fit_transform`` returns out-of-fold predictions for the training
    rows, while ``transform`` (used on unseen data) uses the regressor fitted
    on all training rows. Consequently ``fit_transform(X, y)`` intentionally
    differs from ``fit(X, y).transform(X)``.

    Parameters:
    n_neighbors : Number of neighbours used by the regressor
    weights : Neighbour weighting scheme passed to KNeighborsRegressor
    cv : Cross-validation setting used for the out-of-fold training feature
    """
    def __init__(self, n_neighbors=5, weights='distance', cv=5):
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.cv = cv

    def _make_knn(self):
        """Build an unfitted regressor from the current hyperparameters."""
        return KNeighborsRegressor(n_neighbors=self.n_neighbors, weights=self.weights)

    def fit(self, X, y):
        """Fit the neighbour regressor on X (coordinates) and y (prices)."""
        if y is None:
            raise ValueError("KNNPriceFeature requires y (the target) to fit.")
        if hasattr(X, "values"):
            X = X.values
        self.knn_ = self._make_knn()
        self.knn_.fit(X, y)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on all rows, but return out-of-fold predictions for those rows."""
        # Local import keeps this block self-contained in the notebook.
        from sklearn.model_selection import cross_val_predict

        if y is None:
            raise ValueError("KNNPriceFeature requires y (the target) to fit.")
        if hasattr(X, "values"):
            X = X.values
        # Each row is predicted by a model that never saw that row's label.
        out_of_fold = cross_val_predict(self._make_knn(), X, y, cv=self.cv)
        # Final model for transform() on unseen data uses every training row.
        self.fit(X, y)
        return np.asarray(out_of_fold).reshape(-1, 1)

    def transform(self, X):
        """Return the predicted price for each row of X as a single column."""
        if hasattr(X, "values"):
            X = X.values
        return self.knn_.predict(X).reshape(-1, 1)

    def get_feature_names_out(self, input_features=None):
        """Name of the single output column."""
        return np.array(["knn_price"], dtype=object)

### **Add It to the Preprocessing Pipeline**
**Add to Pipeline**
* We include the `KNNPriceFeature` as one of the transformation steps.  
It uses only the `latitude` and `longitude` columns, and returns one new feature: predicted price from neighbors.

In [16]:
# Instantiate the custom transformer
knn_price_feature = KNNPriceFeature(n_neighbors=5)

# The KNN feature is a price prediction in the target's raw units, whereas
# every other branch below is standardized or bounded. The downstream SVR is
# sensitive to feature scale, so standardize this branch as well.
knn_price_pipeline = make_pipeline(knn_price_feature, StandardScaler())

# Add to existing pipeline
preprocessing_with_knn = ColumnTransformer([
    ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
    ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
    ("people_per_house", ratio_pipeline(), ["population", "households"]),
    ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                           "households", "median_income"]),
    ("geo_similarity", cluster_simil, ["latitude", "longitude"]),
    ("geo_knn_price", knn_price_pipeline, ["latitude", "longitude"]),
    ("cat", cat_pipeline, ["ocean_proximity"])
], remainder=default_num_pipeline)


### Step 1: **Set Up Feature Selector Using SelectFromModel**

**Feature Selector**
* We use `SelectFromModel` to keep only the most important features, ranked by a Random Forest's feature importances. The threshold is set to `"median"`, which keeps the features whose importance is at least the median importance (roughly the top 50%).

In [17]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

# Rank features with a seeded 100-tree random forest and use the median
# importance as the cut-off (keeps roughly the top 50% of features)
feature_selector = SelectFromModel(
    RandomForestRegressor(n_estimators=100, random_state=42),
    threshold="median",
)

### Step 2: **Build Full Pipeline with Feature Selection**
**Full Pipeline**
* We create a unified pipeline that chains preprocessing (including the KNN price feature), feature selection, and the model into a single estimator.

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

# End-to-end estimator; steps run in order:
#   1. preprocessing      (column transformer incl. the KNN price feature)
#   2. feature_selection  (importance-based selector)
#   3. svr                (support vector regressor with default settings)
full_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessing_with_knn),
    ("feature_selection", feature_selector),
    ("svr", SVR()),
])

### Step 3: **Run RandomizedSearchCV on the New Pipeline**
**Train with RandomizedSearchCV**
* We search over hyperparameters and train the entire pipeline using 3-fold CV. All steps (including feature selection) are executed inside the CV folds.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

# Search space for the SVR step (keys use the "<step>__<param>" convention)
param_distributions = {
    "svr__kernel": ["linear", "rbf"],
    "svr__C": loguniform(0.01, 100),
    "svr__gamma": loguniform(0.001, 1),  # used only if kernel is 'rbf'
}

# Work on the first 5,000 training rows to keep the search affordable
X_small = housing.iloc[:5000]
y_small = housing_labels.iloc[:5000]

# 20 random candidates x 3 folds, scored by (negated) RMSE, on all CPU cores
random_search = RandomizedSearchCV(
    full_pipeline,
    param_distributions=param_distributions,
    n_iter=20,
    cv=3,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

random_search.fit(X_small, y_small)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


### Step 4: **Evaluate Best Model on Test Set**
**Test Evaluation**
* We evaluate the final pipeline (including automatic feature selection) on the full test set and compute RMSE.

In [None]:
# Prepare raw test data (same columns dropped as for the training set)
X_test = strat_test_set.drop(["median_house_value", "income_cat"], axis=1)
y_test = strat_test_set["median_house_value"].copy()

# Predict with best pipeline found by the search; it takes the raw test
# frame and applies preprocessing and feature selection itself
final_model = random_search.best_estimator_
y_pred = final_model.predict(X_test)

# Compute RMSE manually as the square root of the mean squared error.
# `np` and `mean_squared_error` are already imported in the import cell at
# the top of the notebook, so they are not re-imported here.
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Final Test Set RMSE:", test_rmse)