<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Project 2: Model selection

> Authors: Lim Zheng Gang, Eugene Matthew Cheong, Pius Yee

---

### Notebook 3. Model selection
##### In Notebook 3, we focus on model building. We compare three regression techniques (Linear Regression, Ridge Regression, and Lasso Regression) to identify the model that best fits our dataset.

In [14]:
# import libraries

from datetime import datetime
import math

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd


import scipy
from scipy.stats import pearsonr
from scipy.spatial.distance import cityblock

from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge, Lasso
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.linear_model import LassoCV

import category_encoders as ce

import pickle

# Notebook display settings: show every column of wide frames and print
# floats with 10 digits of precision.
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.precision', 10),
):
    pd.set_option(_option_name, _option_value)

##### Read CSV file

In [15]:
# Load the final dataset produced by the cleaning and feature-engineering
# notebooks; the first CSV column is used as the row index.
hdb_df = pd.read_csv(
    "../datasets/hdb_final.csv",
    index_col=0,
)

# Preview the first rows as the cell output.
hdb_df.head()

Unnamed: 0,id,resale_price,mid,floor_area_sqft,max_floor_lvl,mrt_nearest_distance,mature,planning_area_category_Group1,planning_area_category_GroupA,planning_area_category_GroupCM,planning_area_category_GroupCQS,planning_area_category_GroupJB,planning_area_category_GroupPWC,planning_area_category_GroupYH,tenure,tenure_buckets_0-10,year_category_Group0,year_category_Group1,year_category_Group2,is_premium,is_terrace,is_superlargeterrace,flat_type_1 ROOM,flat_type_2 ROOM,is_pre_war,from_centre_distance,mh
0,88471,680000.0,11,968.76,25,330.0830689708,1,0,0,0,0,0,0,0,10,1,0,1,0,0,0,0,0,0,0,5.3777515099,1.0377431231
1,122598,665000.0,8,1399.32,9,903.6597028521,1,1,0,0,0,0,0,0,25,0,0,0,0,0,0,0,0,0,0,8.0532982362,0.8558839198
2,170897,838000.0,14,1550.016,16,1334.2511968565,0,0,0,0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,12.225337458,1.786452583
3,86070,550000.0,3,1108.692,11,907.4534838706,1,1,0,0,0,0,0,0,20,0,0,0,0,0,0,0,0,0,0,9.3372050947,0.9595222727
4,153632,298000.0,2,893.412,4,412.343031726,0,0,0,0,0,0,0,1,30,0,0,0,1,0,0,0,0,0,0,15.6602881769,1.5022491892


##### Create a function that takes a dataset and a model as input, fits the model, and returns its evaluation scores.

In [16]:
#define a function for model testing

def val_score_test(dataset, model, k=5):
    """Fit ``model`` on a train split of ``dataset`` and report evaluation scores.

    Features are every column except ``id`` and ``resale_price``, standardised
    with ``StandardScaler``. The target is ``log1p(resale_price)``, so the R2
    and cross-validation scores are on the log scale, while MSE, RMSE and MAE
    are converted back to the original price scale with ``expm1``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns ``id`` and ``resale_price``; all remaining
        columns are used as features.
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score``. It is fitted in
        place on the train split, so it remains fitted after the call.
    k : int, default 5
        Number of folds used for the cross-validation score on the train split.

    Returns
    -------
    dict
        Keys ``"R2"``, ``"R2 for train set"``, ``"R2 for test set"``,
        ``"cross val"``, ``"MSE"``, ``"RMSE"`` and ``"MAE"``; every value is a
        float rounded to 4 decimal places.
    """
    def _rounded(value):
        # Cast to a builtin float so the dict prints as plain numbers.
        return round(float(value), 4)

    scaler = StandardScaler()

    # define X and y
    X = dataset.drop(columns=['id', 'resale_price'])
    y = dataset['resale_price'].values

    # NOTE(review): the scaler is fitted on the full dataset *before* the split,
    # so test-set means/variances leak into training. This is left as-is because
    # the scaler exported to scaler.pkl later in this notebook is also fitted on
    # the full data and must stay consistent with the model fitted here.
    X_scaled_df = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

    # split train and test (target is modelled on the log1p scale)
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled_df, np.log1p(y), random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # expm1 is the exact inverse of log1p: back to the original price scale.
    price_true = np.expm1(y_test)
    price_pred = np.expm1(y_pred)

    # RMSE is derived from the MSE rather than via the `squared=False` keyword,
    # which newer scikit-learn releases no longer accept.
    mse = metrics.mean_squared_error(price_true, price_pred)
    rmse = np.sqrt(mse)

    # return the result
    return {
        "R2": _rounded(metrics.r2_score(y_test, y_pred)),
        "R2 for train set": _rounded(model.score(X_train, y_train)),
        "R2 for test set": _rounded(model.score(X_test, y_test)),
        "cross val": _rounded(cross_val_score(model, X_train, y_train, cv=k).mean()),
        "MSE": _rounded(mse),
        "RMSE": _rounded(rmse),
        "MAE": _rounded(metrics.mean_absolute_error(price_true, price_pred)),
    }


##### Scores for Linear Regression
##### Linear Regression fits a linear equation relating the features to the target, allowing you to predict the target from the feature values. It is great for understanding relationships, but can be prone to overfitting on complex data.

In [17]:
# Plain linear regression baseline, constructed with scikit-learn's default arguments.
lr = LinearRegression()

In [18]:
# Fit and score the linear model; the call also leaves `lr` fitted on the train split.
val_score_test(hdb_df,lr)

{'R2': 0.9015,
 'R2 for train set': 0.903,
 'R2 for test set': 0.9015,
 'cross val': 0.9029,
 'MSE': 2103703193.9542,
 'RMSE': 45866.1443,
 'MAE': 33525.4368}

##### Scores for Ridge Regression
##### Like linear regression with a twist - it penalizes the sum of squared coefficients, shrinking them all but never to zero. This helps reduce the impact of irrelevant features and improves performance on datasets with correlated features.

In [19]:
# Ridge regression constructed with scikit-learn's default arguments (no alpha is passed).
rr = Ridge()

In [20]:
# Fit and score the ridge model; the call also leaves `rr` fitted on the train split.
val_score_test(hdb_df,rr)

{'R2': 0.9015,
 'R2 for train set': 0.903,
 'R2 for test set': 0.9015,
 'cross val': 0.9029,
 'MSE': 2103705914.3107,
 'RMSE': 45866.174,
 'MAE': 33525.4158}

##### Scores for Lasso Regression
##### Similar to linear regression, but it penalizes large coefficients, shrinking some to zero. This can automatically select important features and reduces model complexity, making it useful for feature selection and handling high-dimensional data.

In [21]:
# Candidate regularisation strengths for the lasso: 100 values spaced evenly
# on a log scale between 1e-4 and 1e4. LassoCV chooses among them when fitted.
alpha_space = np.logspace(start=-4, stop=4, num=100)
lasso = LassoCV(
    alphas=alpha_space,
    random_state=42,
)

In [22]:
# Fit and score the lasso model; the call also leaves `lasso` fitted on the train split.
val_score_test(hdb_df,lasso)

{'R2': 0.9015,
 'R2 for train set': 0.903,
 'R2 for test set': 0.9015,
 'cross val': 0.9029,
 'MSE': 2101720795.2332,
 'RMSE': 45844.5285,
 'MAE': 33518.2454}

### Summary table for model scores

In [23]:
# Create a summary table for scores between models.
# Each model is evaluated exactly once; every evaluation refits the model and
# runs cross-validation, so repeated calls are expensive (especially LassoCV).
lr_result = val_score_test(hdb_df, lr)
rr_result = val_score_test(hdb_df, rr)
lasso_result = val_score_test(hdb_df, lasso)

# Metric names become the row labels; all three result dicts share the same keys.
score = list(lr_result)

# Values are rendered as strings so the table shows them exactly as rounded,
# independent of the global pandas display precision.
lr_score = [str(value) for value in lr_result.values()]
rr_score = [str(value) for value in rr_result.values()]
lasso_score = [str(value) for value in lasso_result.values()]

pd.DataFrame({"Score": score, "Linear Regression ": lr_score, "Ridge Regression ": rr_score, "Lasso Regression ": lasso_score}).set_index("Score")

Unnamed: 0_level_0,Linear Regression,Ridge Regression,Lasso Regression
Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
R2,0.9015,0.9015,0.9015
R2 for train set,0.903,0.903,0.903
R2 for test set,0.9015,0.9015,0.9015
cross val,0.9029,0.9029,0.9029
MSE,2103703193.9542,2103705914.3107,2101720795.2332
RMSE,45866.1443,45866.174,45844.5285
MAE,33525.4368,33525.4158,33518.2454


### Conclusion
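##### Based on the scores above, Lasso Regression is selected as our model because it has slightly better error metrics, especially RMSE; the R2 and cross-validation scores are identical across the three models to four decimal places.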

##### Export pickle files for use in the Kaggle submission

In [24]:
# Feature matrix: every column except the row id and the target.
X = hdb_df.drop(columns=['id','resale_price'])

# Fit the scaler on the full feature matrix. fit() hands back the fitted
# scaler object, so X_fit refers to the scaler itself, not to transformed data.
scaler = StandardScaler()
X_fit = scaler.fit(X)

# Persist the fitted scaler so the same scaling can be reapplied at submission time.
with open('../datasets/scaler.pkl', 'wb') as scaler_handle:
    pickle.dump(X_fit, scaler_handle)

In [25]:
# To write out the pickle file
# `lasso` is only fitted as a side effect of val_score_test(), so guard against
# hidden notebook state: refuse to export an estimator that was never fitted.
if not hasattr(lasso, 'coef_'):
    raise RuntimeError(
        "`lasso` has not been fitted yet; run the model-evaluation cells before exporting."
    )

with open('../datasets/lasso.pkl', 'wb') as handle:
    pickle.dump(lasso, handle)