In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import folium 
import requests
import json
from geopy.distance import geodesic
from sklearn import preprocessing

# Load the HDB resale transactions; the relative path is resolved against the
# kernel's working directory.
resale_csv_path = "final_hdb_resale_prices.csv"
hdb_resale_prices = pd.read_csv(resale_csv_path)

In [None]:
# Remove the leading column of the resale table in place, then preview it.
# NOTE: re-running this cell drops one more column each time.
leading_column = hdb_resale_prices.columns[0]
hdb_resale_prices.drop(columns=[leading_column], inplace=True)
hdb_resale_prices.head()

### Feature Selection and Engineering - Support Vector Regression

In [None]:
# Keep only resale transactions from 2020 to 2022.
# .copy() makes the filtered frame independent of the original table, so the
# column added to it in a later cell does not raise SettingWithCopyWarning.
hdb_resale_prices = hdb_resale_prices.loc[hdb_resale_prices["year"].isin([2020, 2021, 2022])].copy()

In [None]:
# CPI-adjusted price per square metre = CPI-adjusted price / floor area (sqm)
adjusted_price = hdb_resale_prices['cpi_adjusted_price']
floor_area = hdb_resale_prices['floor_area_sqm']
hdb_resale_prices['cpi_adjusted_price_per_sqm'] = adjusted_price.div(floor_area)

In [None]:
# Columns used by the SVR model: the candidate features plus the target
# (cpi_adjusted_price_per_sqm) as the last entry.
svr_columns = [
    'storey_range',
    'flat_type',
    'no.of bto',
    'floor_area_sqm',
    'resale application',
    'demand ratio',
    'remaining_lease_months',
    'Distance to nearest MRT',
    'Distance to CBD',
    'isMatureEstate',
    'cpi_adjusted_price_per_sqm',
]
df_svr = hdb_resale_prices.loc[:, svr_columns]
df_svr.head()

In [None]:
# One-hot encode flat_type; the generated columns are named "type_<value>".
# NOTE: this replaces df_svr, so the cell cannot be re-run without re-creating
# df_svr first (flat_type no longer exists afterwards).
df_svr = pd.get_dummies(df_svr, columns=['flat_type'], prefix='type')
df_svr.head()

In [None]:
# Summarise df_svr after one-hot encoding (column dtypes and non-null counts).
df_svr.info()

In [None]:
# Plot a correlation heatmap of the model columns, leaving out the listed
# flat-type dummy columns.
sns.set(rc={'figure.figsize': (12, 12)})
flat_type_dummies = ["type_3 ROOM", "type_4 ROOM", "type_5 ROOM", "type_EXECUTIVE"]
corr_matrix = df_svr.drop(columns=flat_type_dummies).corr()
sns.heatmap(corr_matrix, annot=True)

### Declare Independent and Target Variables

In [None]:
# Split df_svr into the feature matrix X and the target y.
# y is selected with a list so it stays a one-column DataFrame (2-D).
target_column = 'cpi_adjusted_price_per_sqm'
X = df_svr.drop(columns=[target_column])
y = df_svr[[target_column]]

### K-Fold Cross Validation for Support Vector Regression Model

In [None]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Create SVR model.
# An RBF-kernel SVR is sensitive to feature and target scale, and the hold-out
# model trained later in this notebook standardizes both X and y. Wrapping the
# SVR in a pipeline makes cross-validation evaluate the same kind of model:
#   * features are standardized by a scaler fitted on each training fold only
#     (no information from the validation fold leaks into the scaling);
#   * the target is standardized for fitting and predictions are mapped back,
#     so the scores are reported in the original price-per-sqm units.
svr_model = TransformedTargetRegressor(
    regressor=make_pipeline(StandardScaler(), SVR(kernel='rbf', C=1, epsilon=0.1)),
    transformer=StandardScaler(),
)

# Create k-fold cross-validation object (shuffled, fixed seed for repeatability)
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# Perform cross-validation for R2
# np.ravel passes the one-column target as a 1-D array, which is the shape the
# estimator expects (a column vector triggers a DataConversionWarning per fold).
r2_scores = cross_val_score(svr_model, X, np.ravel(y), cv=kfold, scoring='r2')
print('Cross-validation scores (R2):', r2_scores)

# Calculate average performance metric across all folds
mean_score = r2_scores.mean()

print('Average R2:', mean_score)

In [None]:
# Perform cross-validation for MSE
# scikit-learn reports this metric negated ('neg_mean_squared_error') so that
# higher is better; mse_scores keeps the raw (negative) values.
# np.ravel passes the one-column target as the 1-D array the estimator expects.
mse_scores = cross_val_score(svr_model, X, np.ravel(y), cv=kfold, scoring='neg_mean_squared_error')
# Flip the sign when printing so the values shown under "MSE" are actual MSEs.
print('Cross-validation scores (MSE):', -mse_scores)

# Calculate average performance metric across all folds
mean_score = -mse_scores.mean()

print('Average MSE:', mean_score)

In [None]:
# Perform cross-validation for RMSE
# scikit-learn reports this metric negated ('neg_root_mean_squared_error') so
# that higher is better; rmse_scores keeps the raw (negative) values.
# np.ravel passes the one-column target as the 1-D array the estimator expects.
rmse_scores = cross_val_score(svr_model, X, np.ravel(y), cv=kfold, scoring='neg_root_mean_squared_error')
# Flip the sign when printing so the values shown under "RMSE" are actual RMSEs.
print('Cross-validation scores (RMSE):', -rmse_scores)

# Calculate average performance metric across all folds
mean_score = -rmse_scores.mean()

print('Average RMSE:', mean_score)


### Split Data into Training and Test Sets

In [None]:
# Hold out 20% of the rows as a test set, then standardize the training split.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

# One scaler for the features and one for the target; both are fitted on the
# training split only and kept around (sc_x, sc_y) for use on other data.
sc_x, sc_y = StandardScaler(), StandardScaler()

# NOTE: X_train / y_train are overwritten with their standardized arrays.
X_train = sc_x.fit(X_train).transform(X_train)
y_train = sc_y.fit(y_train).transform(y_train)

In [None]:
# Inspect the standardized training features.
X_train

In [None]:
# Inspect the standardized training target.
y_train

### Train Support Vector Regression Model

In [None]:
# Fit the model with training data
from sklearn.svm import SVR

regressor = SVR(kernel = 'rbf')
# y_train is a single-column 2-D array after scaling; SVR expects a 1-D target,
# so flatten it here instead of relying on the implicit conversion (which
# emits a DataConversionWarning).
regressor.fit(X_train, np.ravel(y_train))

In [None]:
# predict resale prices using the model with standardized test data

# Standardize the test features with the scaler that was fitted on the training
# set (sc_x). Fitting a new scaler on the test set would put the test data on a
# different scale from the data the model was trained on and would use
# test-set statistics during evaluation.
X_test_norm = sc_x.transform(X_test)

# predictions are in standardized target units (the model was fitted on scaled y)
y_pred = regressor.predict(X_test_norm)

y_pred

### Evaluate Support Vector Regression Model

In [None]:
from sklearn.metrics import mean_squared_error

# Standardize the actual test targets with the scaler fitted on the training
# targets (sc_y), i.e. the same transformation the model's predictions live in.
# Fitting a new scaler on y_test would compare values on two different scales.
ytest = sc_y.transform(y_test)

# MSE / RMSE below are therefore in standardized target units, not in prices.
mse = mean_squared_error(ytest, y_pred)
print("MSE: ", mse)
print("RMSE: ", mse**(1/2.0))

In [None]:
# inverse standardize predicted values
# sc_y was fitted on a single target column, so hand it the predictions as an
# (n_samples, 1) column rather than as one row with n_samples "features".
# The result is reshaped to (1, n_samples) because the following cells read the
# predictions as y_pred[0].
# NOTE: this overwrites y_pred; run it exactly once after the prediction cell,
# otherwise the inverse transform is applied twice.
y_pred = sc_y.inverse_transform(np.reshape(y_pred, (-1, 1))).reshape(1, -1)
y_pred

In [None]:
# R squared of the predictions against the actual test targets
# (y_pred[0] holds the predictions after the inverse-standardization step)
from sklearn.metrics import r2_score
predicted_values = y_pred[0]
r2 = r2_score(y_true=y_test, y_pred=predicted_values)
print("R2 score: " + str(r2))

In [None]:
# Mean absolute error of the predictions against the actual test targets;
# the value is shown as the cell output.
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_true=y_test, y_pred=y_pred[0])

In [None]:
# Scatter the predictions (y axis) against the actual test targets (x axis).
fig, ax = plt.subplots(figsize=(15, 15))
ax.scatter(y_test, y_pred[0], color='purple')

# Title and axis labels, set on the axes object created above
ax.set_title('Predicted Resale Prices Against Actual Resale Prices (SVR)')
ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')

plt.show()