# Import Libraries

In [43]:
# Import Libaries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pd.set_option('display.max_columns', None)

# Load Dataset

In [3]:
# Load in Dataset
combined_resale_encode_df = pd.read_pickle('../assets/data_clean/resale_combine_encode_clean.pkl')

In [4]:
combined_resale_encode_df.info(verbose = True) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903393 entries, 0 to 903392
Data columns (total 354 columns):
 #    Column                                 Dtype  
---   ------                                 -----  
 0    floor_area_sqm                         float64
 1    lease_commence_date                    int64  
 2    resale_price                           float64
 3    remaining_lease                        int64  
 4    longitude                              float64
 5    latitude                               float64
 6    month                                  int64  
 7    year                                   int64  
 8    P1_employed                            float64
 9    P1_unemployed                          float64
 10   P1_inactive                            float64
 11   P2_pre_primary                         float64
 12   P2_primary                             float64
 13   P2_secondary                           float64
 14   P2_post_secondary                 

# Plot Baseline Model

## Linear Regression Model

In [5]:
# Get all columns
all_column = combined_resale_encode_df.columns.tolist()

# Remove resale price from all_column 
all_column.remove('resale_price')

# Set feature list
features = all_column

# Set Feature
X = combined_resale_encode_df[features]

# Set Target Variable
y = combined_resale_encode_df['resale_price']

In [6]:
# Create Train Test Split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [7]:
# Instantiate Linear Regression Model
lr = LinearRegression()

In [18]:
# Standardise the predictors
# Instantiating Standard Scaler
sc = StandardScaler()

# Standardise Predictors
Z_train = sc.fit_transform(X_train) # standard normal distribution
Z_test = sc.transform(X_test)

In [19]:
# Fit the model
lr.fit(Z_train,y_train)

LinearRegression()

In [20]:
# Generate predictions.
preds = lr.predict(Z_test)

In [22]:
y_pred_train = lr.predict(Z_train)
y_pred_test = lr.predict(Z_test)

rmse_train = mean_squared_error(y_true =y_train, y_pred=y_pred_train, squared=False)
rmse_test = mean_squared_error(y_true =y_test, y_pred=y_pred_test, squared=False)

print(rmse_train, rmse_test)

62486.22454507746 62321.13657681616


In [23]:
# Check the R^2 on the training and testing sets.

print(f'R^2 on testing set: {r2_score(y_train, y_pred_train)}')
print(f'R^2 on training set: {r2_score(y_test, y_pred_test)}')

R^2 on testing set: 0.8559408558006292
R^2 on training set: 0.8558502965751387


- R2 Score for both Training and Testing data is 0.85. 
    - The score is quite similar, indicating that the data is not overfitted.
    

In [12]:
# Train Score
print(f'score on testing set: {lr.score(X_train,y_train)}')

# Test Score
print(f'score on training set: {lr.score(X_test,y_test)}')

score on testing set: 0.8559439725489412
score on training set: 0.8558532662425785


In [14]:
# Cross Validation Score
cross_val_score(lr, X_train, y_train, cv=5).mean()

0.8558644380920495

- Cross Validation and Test Score is quite simiilar in this case. 
- This indicates that its is a representative dataset.

In [17]:
# Scale our data.
# Relabeling scaled data as "Z" is common.
sc = StandardScaler()
Z_train = sc.fit_transform(X_train) # standard normal distribution
Z_test = sc.transform(X_test)

## Lasso Model

In [27]:
# Plot Lasso Model

lasso_model = Lasso(alpha =10)
# Fit.
lasso_model.fit(Z_train, y_train)
# Evaluate model using rmse.
y_pred_train = lasso_model.predict(Z_train)
y_pred_test = lasso_model.predict(Z_test)

rmse_train_l = mean_squared_error(y_true =y_train, y_pred=y_pred_train, squared=False)
rmse_test_l = mean_squared_error(y_true =y_test, y_pred=y_pred_test, squared=False)

# if mse, squared=True

print(rmse_train_l, rmse_test_l)

  model = cd_fast.enet_coordinate_descent(


62498.19772530211 62333.0251450897


## Ridge Model

In [29]:
# Instantiate.
ridge_model = Ridge(alpha =10)
# Fit.
ridge_model.fit(Z_train, y_train)
# Evaluate model using rmse.
y_pred_train = ridge_model.predict(Z_train)
y_pred_test = ridge_model.predict(Z_test)

rmse_train_r = mean_squared_error(y_true =y_train, y_pred=y_pred_train, squared=False)
rmse_test_r = mean_squared_error(y_true =y_test, y_pred=y_pred_test, squared=False)

# if mse, squared=True

print(rmse_train_r, rmse_test_r)

62485.6332950189 62320.40061309695


# GridsearchCV Hyperparameter Tuning

## Ridge Regression

### Hyperparameter Tuning

In [61]:
# Instantiate Standard Scaler
ss = StandardScaler()

# Instantiate Model ridge for gridsearch cv
model_ridge = Ridge()


# Instantiate pipe
pipe_gs = Pipeline([
        ('ss', ss),
        ('rg', model_ridge)
        ])


# Instantiate Parameters
pipe_gs_param = {'ss__with_mean': [True, False], 
                 'ss__with_std': [True, False],    
                'rg__alpha':[1,5,10,15,20]                               
                }

In [51]:
# fit pipe
pipe_gs.fit(Z_train, y_train)

Pipeline(steps=[('ss', StandardScaler()), ('rg', Ridge())])

In [53]:
# pipe score
pipe_gs.score(X_test, y_test)



-1792124.304411102

In [56]:
# define the grid search
Ridge_reg= GridSearchCV(pipe_gs, pipe_gs_param,cv=5,verbose=1)

# fit grid search mode
Ridge_reg.fit(Z_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ss', StandardScaler()),
                                       ('rg', Ridge())]),
             param_grid={'rg__alpha': [1, 5, 10, 15, 20],
                         'ss__with_mean': [True, False],
                         'ss__with_std': [True, False]},
             verbose=1)

In [57]:
Ridge_reg.best_score_

0.8558645524870083

In [58]:
Ridge_reg.best_params_

{'rg__alpha': 1, 'ss__with_mean': True, 'ss__with_std': False}

### Modeling with shortlisted Hyperparameter for Ridge

In [63]:
# Instantiate Standard Scaler with new value
ss_ridge_gridcv = StandardScaler(with_mean = True, with_std = False)

# Standardise Predictors using ss with updated parameters
Z_train_ridge_gridcv = ss_ridge_gridcv.fit_transform(X_train) # standard normal distribution
Z_test_ridg_gridcv = ss_ridge_gridcv.transform(X_test)

# # Instantiate.
ridge_model_grid_cv = Ridge(alpha =1)
# Fit
ridge_model_grid_cv.fit(Z_train_ridge_gridcv, y_train)
# Evaluate model using rmse.
y_pred_train_ridge_grid_cv = ridge_model.predict(Z_train_ridge_gridcv)
y_pred_test_ridge_grid_cv = ridge_model.predict(Z_test_ridg_gridcv)

rmse_train_r = mean_squared_error(y_true =y_train, y_pred=y_pred_train_ridge_grid_cv, squared=False)
rmse_test_r = mean_squared_error(y_true =y_test, y_pred=y_pred_test_ridge_grid_cv, squared=False)
# if mse, squared=True

print(rmse_train_r, rmse_test_r)

145404127.35098472 145347167.3577067


- Observed that setting the Standard Scaler with the given parameters resulted in a higher rmse score.
- Next: rerun model where alpha is 1.

In [66]:
# Instantiate.
ridge_model = Ridge(alpha =1)
# Fit.
ridge_model.fit(Z_train, y_train)
# Evaluate model using rmse.
y_pred_train = ridge_model.predict(Z_train)
y_pred_test = ridge_model.predict(Z_test)

rmse_train_r = mean_squared_error(y_true =y_train, y_pred=y_pred_train, squared=False)
rmse_test_r = mean_squared_error(y_true =y_test, y_pred=y_pred_test, squared=False)

# if mse, squared=True

print(rmse_train_r, rmse_test_r)

62485.5270833046 62320.38023604801


- Observed that by setting rmse score to 1, the resulted rmse score is slightly better at 62485.527.

# Lasso Regression

### Hyper Parameter Tuning

In [90]:
lasso_alpha = [1,5,10,15,20]   

for i in lasso_alpha:   

    # Plot Lasso Model

    lasso_model = Lasso(alpha =i)
    # Fit.
    lasso_model.fit(Z_train, y_train)
    # Evaluate model using rmse.
    y_pred_train = lasso_model.predict(Z_train)
    y_pred_test = lasso_model.predict(Z_test)

    rmse_train_l = mean_squared_error(y_true =y_train, y_pred=y_pred_train, squared=False)
    rmse_test_l = mean_squared_error(y_true =y_test, y_pred=y_pred_test, squared=False)

    # if mse, squared=True
    print("alpha: ", i)
    print(rmse_train_l, rmse_test_l)

  model = cd_fast.enet_coordinate_descent(


alpha:  1
62488.60218384262 62322.95344022425


  model = cd_fast.enet_coordinate_descent(


alpha:  5
62492.47655846235 62326.83011113574


  model = cd_fast.enet_coordinate_descent(


alpha:  10
62498.19772530211 62333.0251450897


  model = cd_fast.enet_coordinate_descent(


alpha:  15
62503.37485582632 62338.94439016358


  model = cd_fast.enet_coordinate_descent(


alpha:  20
62510.04698176871 62346.38822760046


- It is observed that the RMSE score remains relatively high at around 62488.602 and 62322.953 for RMSE train and RMSE test data for Lasso Regression where alpha is 1.
- Next: Further analysis will be done on the features to better understand how different features affect RMSE.
    - Features will also be analysed to understand the trends.