In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import preprocessing
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Load the HDB resale price dataset from a CSV file in the working directory.
hdb_resale_prices = pd.read_csv("final_hdb_resale_prices.csv")

In [3]:
# Remove the leading column of the resale price frame (selected by position).
# NOTE(review): not idempotent -- every re-run of this cell removes one more column.
leading_column = hdb_resale_prices.columns[0]
hdb_resale_prices.drop(columns=[leading_column], inplace=True)
hdb_resale_prices.head()

Unnamed: 0,month,town,flat_type,storey_range,floor_area_sqm,flat_model,resale_price,Latitude,Longitude,cpi_multiplier,...,year,no.of bto,resale application,demand ratio,remaining_lease_months,address,Distance to nearest MRT,Distance to CBD,Distance to nearest mall,isMatureEstate
0,2017-01,ANG MO KIO,3 ROOM,2,67.0,New Generation,250000.0,1.370943,103.837975,1.02372,...,2017,14464,20894.0,1.02,727,108 ANG MO KIO AVE 4,166.825954,9696.234644,896.944633,1
1,2017-01,ANG MO KIO,3 ROOM,2,67.0,New Generation,262000.0,1.380709,103.835368,1.02372,...,2017,14464,20894.0,1.02,749,602 ANG MO KIO AVE 5,532.155055,10808.14355,1527.724531,1
2,2017-01,ANG MO KIO,3 ROOM,5,68.0,New Generation,265000.0,1.366201,103.857201,1.02372,...,2017,14464,20894.0,1.02,744,465 ANG MO KIO AVE 10,945.375055,9087.92059,880.143181,1
3,2017-01,ANG MO KIO,3 ROOM,2,67.0,New Generation,265000.0,1.381041,103.835132,1.02372,...,2017,14464,20894.0,1.02,749,601 ANG MO KIO AVE 5,498.419646,10848.69772,1571.4469,1
4,2017-01,ANG MO KIO,3 ROOM,2,68.0,New Generation,275000.0,1.376807,103.842018,1.02372,...,2017,14464,20894.0,1.02,756,150 ANG MO KIO AVE 5,636.982785,10284.07957,695.573226,1


### Feature Selection and Engineering - Support Vector Regression

In [4]:
# Keep only resale flat transactions from 2020 to 2022.
# .copy() gives the filtered frame its own data, so adding columns to it in later
# cells does not trigger pandas' SettingWithCopyWarning.
hdb_resale_prices = hdb_resale_prices.loc[hdb_resale_prices["year"].isin([2020, 2021, 2022])].copy()

In [5]:
# CPI-adjusted price per square metre: adjusted price divided by floor area
price_per_sqm = hdb_resale_prices['cpi_adjusted_price'].div(hdb_resale_prices['floor_area_sqm'])
hdb_resale_prices['cpi_adjusted_price_per_sqm'] = price_per_sqm

In [6]:
# Columns used by the SVR model: nine predictors followed by the target
# (cpi_adjusted_price_per_sqm, kept last).
svr_columns = [
    'storey_range',
    'no.of bto',
    'floor_area_sqm',
    'demand ratio',
    'remaining_lease_months',
    'Distance to nearest MRT',
    'Distance to CBD',
    'Distance to nearest mall',
    'isMatureEstate',
    'cpi_adjusted_price_per_sqm',
]
df_svr = hdb_resale_prices[svr_columns]
df_svr.head()

Unnamed: 0,storey_range,no.of bto,floor_area_sqm,demand ratio,remaining_lease_months,Distance to nearest MRT,Distance to CBD,Distance to nearest mall,isMatureEstate,cpi_adjusted_price_per_sqm
63275,5,7314,73.0,1.13,667,908.970521,9026.295266,775.593122,1,3619.754795
63276,20,7314,70.0,1.13,1100,687.185319,9015.122154,561.028714,1,6695.082857
63277,2,7314,73.0,1.13,676,586.98069,8949.443986,489.97866,1,3141.673973
63278,5,7314,73.0,1.13,663,800.631299,9123.690385,806.304304,1,3824.646575
63279,8,7314,68.0,1.13,708,927.322849,9734.443856,1059.12169,1,3226.041176


In [None]:
# Summarise the modelling frame: columns, dtypes and non-null counts.
df_svr.info()

In [None]:
# plot correlation heatmap of the SVR predictors and target
sns.set(rc={'figure.figsize': (12, 12)})
feature_correlations = df_svr.corr()
sns.heatmap(feature_correlations, annot=True)

### Sampling Data for Kernel Selection

In [None]:
# Draw a 10,000-row random sample of the modelling frame; the fixed random_state
# makes the draw repeatable.
sample_df = df_svr.sample(n=10000, random_state=1)

# Summarise the sampled frame: columns, dtypes and non-null counts.
sample_df.info()

In [None]:
# split the sampled data into independent and dependent variables
# (built from sample_df so the kernel comparison runs on the 10,000-row sample
# rather than on the full df_svr frame)
x_sample = sample_df.drop(columns=['cpi_adjusted_price_per_sqm'])
y_sample = sample_df[['cpi_adjusted_price_per_sqm']]

### Comparing Performance of Linear and RBF Kernels

In [None]:
# hold out 20% of the rows for testing (fixed seed so the split is repeatable)
sample_X_train, sample_X_test, sample_y_train, sample_y_test = train_test_split(
    x_sample,
    y_sample,
    test_size=0.2,
    random_state=0,
)

# one standard scaler for the features (x) and one for the target (y)
sc_x, sc_y = StandardScaler(), StandardScaler()

# fit each scaler on the training split and standardize that split
sample_X_train = sc_x.fit_transform(sample_X_train)
sample_y_train = sc_y.fit_transform(sample_y_train)

In [None]:
# Display the standardized training features produced by sc_x.
sample_X_train

In [None]:
# Display the standardized training target produced by sc_y.
sample_y_train

##### Linear Kernel

In [None]:
# Fit the linear kernel model with training data.
# The scaled target is a single-column 2-D array; ravel() flattens it to the 1-D
# shape SVR expects, which avoids scikit-learn's DataConversionWarning.
regressor = SVR(kernel='linear', C=1, epsilon=0.1)
regressor.fit(sample_X_train, sample_y_train.ravel())

In [None]:
# training data performance, measured after undoing the target scaling
train_actual = sc_y.inverse_transform(sample_y_train)
sample_y_train_pred = sc_y.inverse_transform([regressor.predict(sample_X_train)])
train_predicted = sample_y_train_pred[0]

# mse, r2 score and msle on the training split
mse = mean_squared_error(train_actual, train_predicted)
r2 = r2_score(train_actual, train_predicted)
msle = mean_squared_log_error(train_actual, train_predicted)

# adjusted r-squared: r2 corrected for the number of predictors
n_obs = len(sample_y_train)
n_predictors = sample_X_train.shape[1]
adj_r2 = 1 - (1-r2)*(n_obs-1)/(n_obs-n_predictors-1)

print("MSE: ", mse)
print("RMSE: ", mse**(1/2.0))
print("MSLE: ", msle)
print("R2 score: " + str(r2))
print("Adjusted R2: ", adj_r2)

In [None]:
# predict resale prices using the model with standardized test data

# Reuse sc_x, the feature scaler fitted on the training split. Fitting a fresh
# scaler on the test rows would standardize them with different means/variances
# than the ones the model was trained on.
X_test = sc_x.transform(sample_X_test)

y_sample_pred = regressor.predict(X_test)

In [None]:
# evaluation metrics for SVR with linear kernel

# undo the target scaling on the predictions (result is 2-D with a single row)
sample_pred = sc_y.inverse_transform([y_sample_pred])
test_predicted = sample_pred[0]

# mse, r2 score and msle on the test split
mse = mean_squared_error(sample_y_test, test_predicted)
r2 = r2_score(sample_y_test, test_predicted)
msle = mean_squared_log_error(sample_y_test, test_predicted)

# adjusted r-squared: r2 corrected for the number of predictors
n_obs = len(sample_y_test)
n_predictors = sample_X_test.shape[1]
adj_r2 = 1 - (1-r2)*(n_obs-1)/(n_obs-n_predictors-1)

print("MSE: ", mse)
print("RMSE: ", mse**(1/2.0))
print("MSLE: ", msle)
print("R2 score: " + str(r2))
print("Adjusted R2: ", adj_r2)

##### RBF Kernel

In [None]:
# Fit the RBF kernel model with training data.
# The scaled target is a single-column 2-D array; ravel() flattens it to the 1-D
# shape SVR expects, which avoids scikit-learn's DataConversionWarning.
regressor = SVR(kernel='rbf', C=1, epsilon=0.1)
regressor.fit(sample_X_train, sample_y_train.ravel())

In [None]:
# training data performance, measured after undoing the target scaling
train_actual = sc_y.inverse_transform(sample_y_train)
sample_y_train_pred = sc_y.inverse_transform([regressor.predict(sample_X_train)])
train_predicted = sample_y_train_pred[0]

# mse, r2 score and msle on the training split
mse = mean_squared_error(train_actual, train_predicted)
r2 = r2_score(train_actual, train_predicted)
msle = mean_squared_log_error(train_actual, train_predicted)

# adjusted r-squared: r2 corrected for the number of predictors
n_obs = len(sample_y_train)
n_predictors = sample_X_train.shape[1]
adj_r2 = 1 - (1-r2)*(n_obs-1)/(n_obs-n_predictors-1)

print("MSE: ", mse)
print("RMSE: ", mse**(1/2.0))
print("MSLE: ", msle)
print("R2 score: " + str(r2))
print("Adjusted R2: ", adj_r2)

In [None]:
# predict resale prices using the model with standardized test data

# Reuse sc_x, the feature scaler fitted on the training split. Fitting a fresh
# scaler on the test rows would standardize them with different means/variances
# than the ones the model was trained on.
X_test = sc_x.transform(sample_X_test)

y_sample_pred = regressor.predict(X_test)

In [None]:
# evaluation metrics for SVR with RBF kernel

# inverse standardize predicted values (result is 2-D with a single row)
sample_pred = sc_y.inverse_transform([y_sample_pred])

# return mse, rmse, and r2 score
mse = mean_squared_error(sample_y_test, sample_pred[0])
r2 = r2_score(sample_y_test, sample_pred[0])
msle = mean_squared_log_error(sample_y_test, sample_pred[0])
# adjusted r-squared, reported here as well so the RBF kernel is compared with
# the linear kernel on the same set of test metrics
adj_r2 = 1 - (1-r2)*(len(sample_y_test)-1)/(len(sample_y_test)-sample_X_test.shape[1]-1)

print("MSE: ", mse)
print("RMSE: ", mse**(1/2.0))
print("MSLE: ", msle)
print("R2 score: " + str(r2))
print("Adjusted R2: ", adj_r2)

## Train SVR Model (RBF Kernel) on Full Dataset

### Declare Independent and Target Variables

In [7]:
# separate the predictors (X) from the target column (y) on the full modelling frame
target_column = 'cpi_adjusted_price_per_sqm'
X = df_svr.drop(columns=[target_column])
y = df_svr[[target_column]]

### Split Data into Training and Test Sets

In [8]:
# Hold out 20% of the full dataset for testing (fixed seed so the split is repeatable).
# StandardScaler is already imported in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# one standard scaler for the features (x) and one for the target (y)
sc_x, sc_y = StandardScaler(), StandardScaler()

# fit each scaler on the training split and standardize that split
X_train = sc_x.fit_transform(X_train)
y_train = sc_y.fit_transform(y_train)

In [9]:
# Display the standardized training features produced by sc_x.
X_train

array([[ 0.34644073, -1.31652382,  0.23716048, ...,  0.3730184 ,
        -0.63923168, -0.81134857],
       [-0.63185115,  1.20200159, -1.31584059, ...,  0.99182823,
         1.45452024, -0.81134857],
       [ 0.34644073, -0.04402651,  0.49599399, ...,  1.59029534,
         0.07044722, -0.81134857],
       ...,
       [-0.63185115, -0.04402651,  0.58227183, ...,  0.10222357,
        -1.15574021, -0.81134857],
       [-1.12099709, -0.04402651,  1.05679993, ...,  0.03840014,
        -0.18044264, -0.81134857],
       [ 0.34644073,  1.20200159, -0.23736763, ...,  0.27893022,
         1.18225436, -0.81134857]])

In [10]:
# Display the standardized training target produced by sc_y (single-column 2-D array).
y_train

array([[-1.38027519],
       [ 0.84366218],
       [-0.74534533],
       ...,
       [ 0.57454224],
       [-1.1341246 ],
       [ 0.77895326]])

### Train Support Vector Regression Model

In [11]:
# Fit the model with training data.
# y_train is a single-column 2-D array; ravel() flattens it to the 1-D shape SVR
# expects, which removes the column_or_1d DataConversionWarning.
regressor = SVR(kernel='rbf', C=1, epsilon=0.1)
regressor.fit(X_train, y_train.ravel())

  y = column_or_1d(y, warn=True)


SVR(C=1)

### Evaluate Support Vector Regression Model

In [12]:
# training data performance, measured after undoing the target scaling
train_actual = sc_y.inverse_transform(y_train)
y_train_pred = sc_y.inverse_transform([regressor.predict(X_train)])
train_predicted = y_train_pred[0]

# mse, r2 score and msle on the training split
mse = mean_squared_error(train_actual, train_predicted)
r2 = r2_score(train_actual, train_predicted)
msle = mean_squared_log_error(train_actual, train_predicted)

# adjusted r-squared: r2 corrected for the number of predictors
n_obs = len(y_train)
n_predictors = X_train.shape[1]
adj_r2 = 1 - (1-r2)*(n_obs-1)/(n_obs-n_predictors-1)

print("MSE: ", mse)
print("RMSE: ", mse**(1/2.0))
print("MSLE: ", msle)
print("R2 score: " + str(r2))
print("Adjusted R2: ", adj_r2)

MSE:  195991.3652883681
RMSE:  442.70912040341807
MSLE:  0.006394555304475398
R2 score: 0.9080952747072375
Adjusted R2:  0.9080819887710756


In [None]:
# predict resale prices using the model with standardized test data

# standardize test data with sc_x, the feature scaler fitted on the training
# split. Fitting a fresh scaler on the test rows would standardize them with
# different means/variances than the ones the model was trained on.
X_test_norm = sc_x.transform(X_test)

y_pred = regressor.predict(X_test_norm)

y_pred

In [None]:
# inverse standardize predicted values using the target scaler sc_y
# The predictions are wrapped in a list before the call, so the result is 2-D
# with a single row; the following cells read the values back as y_pred[0].
# NOTE(review): y_pred is overwritten here, so re-running this cell presumably
# feeds already-transformed data back in -- confirm before running it twice.
y_pred = sc_y.inverse_transform([y_pred])
y_pred

In [None]:
# test-set error metrics; y_pred holds the predictions in its first (only) row
test_predicted = y_pred[0]
mse = mean_squared_error(y_test, test_predicted)
msle = mean_squared_log_error(y_test, test_predicted)

print("MSE: ", mse)
print("RMSE: ", mse**(1/2.0))
print("MSLE: ", msle)

In [None]:
# r-squared on the test split
r2 = r2_score(y_test, y_pred[0])
print("R2 score: " + str(r2))

# adjusted r-squared: r2 corrected for the number of predictors
n_obs = len(y_test)
n_predictors = X_test.shape[1]
adj_r2 = 1 - (1-r2)*(n_obs-1)/(n_obs-n_predictors-1)
print("Adjusted R2: ", adj_r2)

In [None]:
# Scatter the test-set predictions against the actual target values.
fig, ax = plt.subplots(figsize=(12,12))
ax.scatter(y_test, y_pred[0], color = 'purple')

# Add labels and title
ax.set_title('Predicted Resale Prices Against Actual Resale Prices (SVR)')
ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')

# Switch the grid on explicitly: a bare grid() call toggles the current state,
# which hides the grid whenever the active style already has it enabled (as the
# seaborn theme set for the heatmap does).
ax.grid(True)
plt.show()

### K-Fold Cross Validation for Support Vector Regression Model

In [None]:
from sklearn.model_selection import cross_val_score, KFold

# Create an unfitted SVR model with the same settings as the full-dataset model
# (RBF kernel, C=1, epsilon=0.1)
svr_model = SVR(kernel='rbf', C=1, epsilon=0.1)

# Create k-fold cross-validation object: 5 folds, rows shuffled with a fixed
# seed so the fold assignment is repeatable
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Perform cross-validation for R2.
# Scaling is part of the estimator, so every fold fits its scalers on that fold's
# training rows only (no leakage from the validation rows):
#   - the pipeline standardizes X before the SVR,
#   - TransformedTargetRegressor standardizes y with its own scaler for fitting
#     and maps predictions back to the original units for scoring.
# sc_x / sc_y fitted in the train/test section are deliberately left untouched.
scaled_svr = TransformedTargetRegressor(
    regressor=make_pipeline(StandardScaler(), svr_model),
    transformer=StandardScaler(),
)

# y is a single-column frame; pass it as a 1-D array, the shape SVR expects
r2_scores = cross_val_score(scaled_svr, X, y.to_numpy().ravel(), cv=kfold, scoring='r2')
print('Cross-validation scores (R2):', r2_scores)

# Calculate average performance metric across all folds
mean_score = r2_scores.mean()

print('Average R2:', mean_score)

In [None]:
# Perform cross-validation for RMSE.
# The SVR is wrapped so that X and y are standardized inside every fold, matching
# how the model is trained elsewhere in this notebook; predictions are mapped back
# to the original target units, so the RMSE is reported in those units.
scaled_svr = TransformedTargetRegressor(
    regressor=make_pipeline(StandardScaler(), svr_model),
    transformer=StandardScaler(),
)

# scikit-learn returns the *negated* RMSE for this scorer; flip the sign so the
# printed per-fold values are actual RMSEs.
rmse_scores = -cross_val_score(
    scaled_svr,
    X,
    y.to_numpy().ravel(),  # 1-D target, the shape SVR expects
    cv=kfold,
    scoring='neg_root_mean_squared_error',
)
print('Cross-validation scores (RMSE):', rmse_scores)

# Calculate average performance metric across all folds
mean_score = rmse_scores.mean()

print('Average RMSE:', mean_score)
