# Final Regression Pipeline (v2) 🚀
This notebook demonstrates the complete, clean Machine Learning pipeline:
- Load data
- Feature engineering
- Standardization
- Final model training (custom Gradient Descent)
- Evaluation on the test set
- Saving & loading the model
- Sample predictions

This is the final pipeline demo.

In [3]:
import sys
sys.path.append("..")

from src.config import Config
from src.data_loader import load_raw_data
from src.feature_engineering import apply_feature_engineering
from src.preprocessing import compute_standardization_params, apply_standardization
from src.gradient_descent import LinearRegressionGD
from src.evaluation import evaluate_regression
from src.model_io import save_model, load_model

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
# Load data
# Build the project configuration object and use it to read the raw dataset.
# `config` is reused by later cells (split settings, model hyperparameters,
# and the save step), so it is created once here.
config = Config()
df = load_raw_data(config)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
# FEATURE ENGINEERING
# Derive the engineered feature table from the raw frame. The result is kept
# under its own name (`df_fe`); `df` is not reassigned by this cell.
# Judging by the recorded preview, the result carries one-hot
# `ocean_proximity_*` columns plus log, ratio, and squared-income features.
df_fe = apply_feature_engineering(df)
# Preview the first rows as the cell output.
df_fe.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,...,ocean_proximity_NEAR OCEAN,log_median_income,log_total_rooms,log_total_bedrooms,log_population,log_households,rooms_per_household,bedrooms_per_room,population_per_household,median_income_sq
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,False,...,False,2.23272,6.781058,4.867534,5.777652,4.844187,6.984127,0.146591,2.555556,69.308955
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,False,...,False,2.230165,8.86785,7.009409,7.784057,7.037906,6.238137,0.155797,2.109842,68.913242
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,False,...,False,2.11111,7.291656,5.252273,6.20859,5.181784,8.288136,0.129516,2.80226,52.669855
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,False,...,False,1.893579,7.150701,5.463832,6.326149,5.393628,5.817352,0.184458,2.547945,31.844578
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,False,...,False,1.578195,7.395108,5.638355,6.338594,5.560682,6.281853,0.172096,2.181467,14.793254


In [6]:
# TRAIN–TEST SPLIT
# `median_house_value` is the regression target; every other column of the
# engineered frame is used as a feature.
y = df_fe["median_house_value"]
X = df_fe.drop("median_house_value", axis="columns")

# The test fraction and the random state both come from the shared config.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=config.test_size,
    random_state=config.random_state,
)

In [7]:
# STANDARDIZATION
# The scaling parameters are computed from the training split only; those
# same parameters are then applied to the training split and the test split.
std_params = compute_standardization_params(X_train)
X_train_scaled, X_test_scaled = (
    apply_standardization(split, std_params) for split in (X_train, X_test)
)

In [8]:
# TRAIN THE FINAL MODEL
# Hyperparameters are read from the shared config rather than hard-coded in
# the notebook.
model = LinearRegressionGD(
    learning_rate=config.learning_rate,
    n_iterations=config.n_iterations,
    l1_lambda=config.l1_lambda,
    l2_lambda=config.l2_lambda,
)

# The model is fit on NumPy arrays, so the scaled frame and the target series
# are converted with `.to_numpy()` first.
# The trailing semicolon stops IPython from echoing the value returned by
# `fit()`; its repr contains a memory address that changes on every run and
# only adds noise to the saved notebook.
model.fit(X_train_scaled.to_numpy(), y_train.to_numpy());

<src.gradient_descent.LinearRegressionGD at 0x120494cb0>

In [9]:
# EVALUATE
# Predict on the test split (scaled with the parameters computed from the
# training split) and compare the predictions against the true targets.
y_pred = model.predict(X_test_scaled.to_numpy())
metrics = evaluate_regression(y_test.to_numpy(), y_pred)
# Display the metrics as the cell output; in the recorded run this is a dict
# with the keys 'mae', 'rmse', and 'r2'.
metrics

{'mae': np.float64(53437.27086606446),
 'rmse': np.float64(74661.01403801827),
 'r2': np.float64(0.5746162373926889)}

In [10]:
# SAVE THE MODEL
# Write the fitted model to disk along with the standardization parameters,
# the names of the scaled feature columns, and the config used for this run.
save_model(
    model=model,
    std_params=std_params,
    feature_names=X_train_scaled.columns.to_list(),
    config=config,
    filepath="../models/model_v2.npz",
)

Model saved to ../models/model_v2.npz


In [11]:
# LOAD THE MODEL
# Read the artifact back from the same path the save cell wrote to.
loaded = load_model("../models/model_v2.npz")
# Display what was loaded; in the recorded run this is a dict whose entries
# include 'weights', 'bias', 'mean', and 'std'.
loaded

{'weights': array([-57826.83544705, -59797.61296197,  14947.4033175 ,  -1514.54418978,
         -3509.91815718,  -3899.20373423,  12331.77077569, 142110.80929029,
        -14814.33283928,   1785.70045111,  -2966.00483433,   -443.74191774,
        -26543.20421371,  23940.48127755,   9309.5374842 , -67265.01521471,
         35874.17362743,   2008.11629612,  20796.94878161,   5309.28434462,
        -39377.3862873 ]),
 'bias': array(207194.69373789),
 'mean': array([-1.19582290e+02,  3.56431492e+01,  2.86082849e+01,  2.64200478e+03,
         5.38496851e+02,  1.42645300e+03,  4.99986919e+02,  3.88075426e+00,
         3.16557655e-01,  2.42248062e-04,  1.12281977e-01,  1.26332364e-01,
         1.51885634e+00,  7.63265619e+00,  6.05690111e+00,  7.02683555e+00,
         5.98635761e+00,  5.43523502e+00,  2.12857974e-01,  3.09696119e+00,
         1.86863693e+01]),
 'std': array([2.00559281e+00, 2.13660060e+00, 1.26021177e+01, 2.17458089e+03,
        4.18994408e+02, 1.13702195e+03, 3.80956428e+02,

## Final Notes

- The full pipeline works end-to-end.
- The model can be trained, saved, loaded, and used for predictions.
- Feature engineering and standardization are stable and reusable.
- Evaluation metrics demonstrate the final model’s performance.