# **Ridge and Lasso Regression for HDB Resale Price Prediction**

# **Import Required Libraries**


In [27]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, HalvingGridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# **Load and Preprocess Data**
We drop the 'Town' and 'Address' columns, before splitting the dataset into train-test and applying one-hot encoding to categorical features.


In [29]:
# Load Data
df = pd.read_csv('../datasets/Final_ResaleData.csv')
df = df.drop(columns=['Town', 'Address'])

# Split Dataset into Train (80%) and Test (20%) Sets
X = df.drop(columns=['Price'])
y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing pipeline for categorical features (One-Hot Encoding)
categorical_columns = ["Flat_Type"]
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)],
    remainder='passthrough'
)


# **Train Ridge Regression Model**
Ridge Regression is a regularized version of Linear Regression that adds L2 penalty to prevent overfitting. We'll use a pipeline with preprocessing and hyperparameter tuning to find the best configuration.

Ridge Regression hyperparameters to tune:
- 'model__alpha': Regularization strength (higher values = more regularization)

After finetuning, the best hyperparameters are:
- 'model__alpha': 0.5


In [37]:
# Pipeline for Ridge Regression
ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', Ridge())
])

# Hyperparameter Tuning for Ridge Regression
ridge_params = {
    'model__alpha': [0, 0.5, 1, 2, 10, 100],
    }

# HalvingGridSearchCV for Ridge Regression
ridge_grid_search = HalvingGridSearchCV(estimator=ridge_pipeline, param_grid=ridge_params, cv=3, factor=2, scoring='neg_mean_squared_error', verbose=1)
ridge_grid_search.fit(X_train, y_train)

# Best Parameters for Ridge Regression
best_ridge_params = ridge_grid_search.best_params_
print(f'Best Ridge Regression Parameters: {best_ridge_params}')


n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 42573
max_resources_: 170294
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 6
n_resources: 42573
Fitting 3 folds for each of 6 candidates, totalling 18 fits


  return f(*arrays, *other_args, **kwargs)


----------
iter: 1
n_candidates: 3
n_resources: 85146
Fitting 3 folds for each of 3 candidates, totalling 9 fits
----------
iter: 2
n_candidates: 2
n_resources: 170292
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Best Ridge Regression Parameters: {'model__alpha': 1}


# **Evaluate Ridge Regression Model**
Evaluation is carried out using RMSE, MAE and R^2 to assess prediction accuracy


In [10]:
ridge_best_model = ridge_grid_search.best_estimator_
y_pred_ridge = ridge_best_model.predict(X_test)

rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print(f"Ridge Regression RMSE: {rmse_ridge}")
print(f"Ridge Regression MAE: {mae_ridge}")
print(f"Ridge Regression R^2: {r2_ridge}")


Ridge Regression RMSE: 74851.41687980082
Ridge Regression MAE: 57887.12240357618
Ridge Regression R^2: 0.7993305588235631


# **Train Lasso Regression Model**
Lasso Regression is a regularized version of Linear Regression that adds L1 penalty to prevent overfitting and perform feature selection. In contrast to Ridge Regression, Lasso Regression can drive some coefficients to zero, therefore effectively removing features that do not contribute to the prediction of the dependent variable, reducing model complexity while not losing much on the model's ability to generalize on unseen data

Lasso Regression hyperparameters to tune:
- 'model__alpha': Regularization strength (higher values = more regularization, more features removed)

After finetuning, the best hyperparameters are:
- 'model__alpha': 0.5

In [40]:
# Pipeline for Lasso Regression

lasso_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', Lasso())
])

# Hyperparameter Tuning for Lasso Regressio
lasso_params = {
    'model__alpha': [1,5,10,100],
    'model__max_iter': [5000], # to ensure convergence
}

# HalvingGridSearchCV for Lasso Regression
lasso_grid_search = HalvingGridSearchCV(estimator=lasso_pipeline, param_grid=lasso_params, cv=3, factor=2, scoring='neg_mean_squared_error', verbose=1)
lasso_grid_search.fit(X_train, y_train)

# Best Parameters for Lasso Regression
best_lasso_params = lasso_grid_search.best_params_
print(f'Best Lasso Regression Parameters: {best_lasso_params}')

# Check which features were selected (non-zero coefficients)
lasso_best_model = lasso_grid_search.best_estimator_
feature_names = lasso_best_model.named_steps['preprocessor'].get_feature_names_out()
coefficients = lasso_best_model.named_steps['model'].coef_

print(f'\nFeature selection analysis:')
print(f'Total features: {len(feature_names)}')
print(f'Features with non-zero coefficients: {np.sum(coefficients != 0)}')
print(f'Features with zero coefficients: {np.sum(coefficients == 0)}')


n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 42573
max_resources_: 170294
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 4
n_resources: 42573
Fitting 3 folds for each of 4 candidates, totalling 12 fits


  model = cd_fast.enet_coordinate_descent(


----------
iter: 1
n_candidates: 2
n_resources: 85146
Fitting 3 folds for each of 2 candidates, totalling 6 fits
----------
iter: 2
n_candidates: 1
n_resources: 170292
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best Lasso Regression Parameters: {'model__alpha': 1, 'model__max_iter': 5000}

Feature selection analysis:
Total features: 16
Features with non-zero coefficients: 16
Features with zero coefficients: 0


# **Evaluate Lasso Regression Model**
Evaluation is carried out using RMSE, MAE and R^2 to assess prediction accuracy


In [41]:
lasso_best_model = lasso_grid_search.best_estimator_
y_pred_lasso = lasso_best_model.predict(X_test)

rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

print(f"Lasso Regression RMSE: {rmse_lasso}")
print(f"Lasso Regression MAE: {mae_lasso}")
print(f"Lasso Regression R^2: {r2_lasso}")


Lasso Regression RMSE: 74851.3232367543
Lasso Regression MAE: 57887.580503181474
Lasso Regression R^2: 0.7993310609192306
