In [54]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, RobustScaler
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns

housing = pd.read_excel('data/Real_estate_valuation_data_set.xlsx')
housing.head()

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,2012.916667,32.0,84.87882,10,24.98298,121.54024,37.9
1,2012.916667,19.5,306.5947,9,24.98034,121.53951,42.2
2,2013.583333,13.3,561.9845,5,24.98746,121.54391,47.3
3,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,2012.833333,5.0,390.5684,5,24.97937,121.54245,43.1


In [40]:
housing.tail()

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
409,2013.0,13.7,4082.015,0,24.94155,121.50381,15.4
410,2012.666667,5.6,90.45606,9,24.97433,121.5431,50.0
411,2013.25,18.8,390.9696,7,24.97923,121.53986,40.6
412,2013.0,8.1,104.8101,5,24.96674,121.54067,52.5
413,2013.5,6.5,90.45606,9,24.97433,121.5431,63.9


In [41]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 7 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   X1 transaction date                     414 non-null    float64
 1   X2 house age                            414 non-null    float64
 2   X3 distance to the nearest MRT station  414 non-null    float64
 3   X4 number of convenience stores         414 non-null    int64  
 4   X5 latitude                             414 non-null    float64
 5   X6 longitude                            414 non-null    float64
 6   Y house price of unit area              414 non-null    float64
dtypes: float64(6), int64(1)
memory usage: 22.8 KB


In [42]:
housing.describe()

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
count,414.0,414.0,414.0,414.0,414.0,414.0,414.0
mean,2013.148953,17.71256,1083.885689,4.094203,24.96903,121.533361,37.980193
std,0.281995,11.392485,1262.109595,2.945562,0.01241,0.015347,13.606488
min,2012.666667,0.0,23.38284,0.0,24.93207,121.47353,7.6
25%,2012.916667,9.025,289.3248,1.0,24.963,121.528085,27.7
50%,2013.166667,16.1,492.2313,4.0,24.9711,121.53863,38.45
75%,2013.416667,28.15,1454.279,6.0,24.977455,121.543305,46.6
max,2013.583333,43.8,6488.021,10.0,25.01459,121.56627,117.5


In [43]:
histogram_x1 = alt.Chart(housing).mark_bar().encode(
    alt.X("X1 transaction date", bin=alt.Bin(maxbins=20)),
    alt.Y('count()'),
).properties(
    title='Histogram of X1 transaction date',
    width=300,
    height=200
)

histogram_x1.save('screenshots/histogram_x1.html')

In [None]:
histogram_x2 = alt.Chart(housing).mark_bar().encode(
    alt.X("X2 house age", bin=alt.Bin(maxbins=20)),
    alt.Y('count()'),
).properties(
    title='Histogram of X2 house age',
    width=300,
    height=200
)
histogram_x2.save('screenshots/histogram_x2.html')

In [None]:
histogram_x3 = alt.Chart(housing).mark_bar().encode(
    alt.X("X3 distance to the nearest MRT station", bin=alt.Bin(maxbins=20)),
    alt.Y('count()'),
).properties(
    title='Histogram of X3 distance to the nearest MRT station',
    width=300,
    height=200
)

histogram_x3.save('screenshots/histogram_x3.html')

In [None]:
histogram_x4 = alt.Chart(housing).mark_bar().encode(
    alt.X("X4 number of convenience stores", bin=alt.Bin(maxbins=20)),
    alt.Y('count()'),
).properties(
    title='Histogram of X4 number of convenience stores',
    width=300,
    height=200
)
histogram_x4.save('screenshots/histogram_x4.html')

In [None]:
histogram_x5 = alt.Chart(housing).mark_bar().encode(
    alt.X("X5 latitude", bin=alt.Bin(maxbins=20)),
    alt.Y('count()'),
).properties(
    title='Histogram of X5 latitude',
    width=300,
    height=200
)
histogram_x5.save('screenshots/histogram_x5.html')

In [None]:
histogram_x6 = alt.Chart(housing).mark_bar().encode(
    alt.X("X6 longitude", bin=alt.Bin(maxbins=20)),
    alt.Y('count()'),
).properties(
    title='Histogram of X6 longitude',
    width=300,
    height=200
)

histogram_x6.save('screenshots/histogram_x6.html')

In [None]:
histogram_y = alt.Chart(housing).mark_bar().encode(
    alt.X("Y house price of unit area", bin=alt.Bin(maxbins=20)),
    alt.Y('count()'),
).properties(
    title='Histogram of Y house price of unit area',
    width=300,
    height=200
)
histogram_y.save('screenshots/histogram_y.html')

In [44]:
# Calculate correlation matrix
correlation_matrix = housing.corr()

# Reshape the correlation matrix for Altair
correlation_data = correlation_matrix.stack().reset_index(name='correlation').rename(columns={'level_0': 'variable1', 'level_1': 'variable2'})

# Create a heatmap
heatmap = alt.Chart(correlation_data).mark_rect().encode(
    x='variable1:N',
    y='variable2:N',
    color=alt.Color('correlation:Q', scale=alt.Scale(scheme='redblue', domain=[-1, 1]), title='Correlation'),
    tooltip=['variable1:N', 'variable2:N', 'correlation:Q']
).properties(
    title='Correlation Heatmap',
    width=300,
    height=250
)

heatmap

In [None]:
# # X3 feature and target
# X3 = housing[['X3 distance to the nearest MRT station']]
# Y = housing['Y house price of unit area']

# # StandardScaler
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X3)

# # scaled feature and the target df
# df_scaled = pd.DataFrame({'X3_scaled': X_scaled.flatten(), 'Y': Y})

# # debugger
# X_scaled

In [None]:
# Plot the relationship between the scaled feature and the target
# plt.figure(figsize=(10, 6))
# sns.scatterplot(x='X3_scaled',y ='Y', data=df_scaled, alpha=0.7)
# plt.title('Relationship between Scaled X3 and House Price')
# plt.xlabel('X3 distance to the nearest MRT station (Scaled)')
# plt.ylabel('Y house price of unit area')
# plt.show()

In [45]:
# Drop unnecessary features
df_model = housing.loc[:, ['X2 house age', 'X3 distance to the nearest MRT station', 'X4 number of convenience stores', 'Y house price of unit area']].copy()


# Scale X3 using RobustScaler
scaler = RobustScaler()
df_model.loc[:, 'X3_scaled'] = scaler.fit_transform(df_model.loc[:, ['X3 distance to the nearest MRT station']])


# Split the dataset
X = df_model[['X2 house age', 'X3_scaled', 'X4 number of convenience stores']]
y = df_model['Y house price of unit area']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_test.head()

358    45.1
350    42.3
373    52.2
399    37.3
369    22.8
Name: Y house price of unit area, dtype: float64

In [46]:
# train 
model = LinearRegression()

model.fit(X_train, y_train)

In [48]:
# predict
y_pred = model.predict(X_test)

array([49.65029483, 43.63418374, 43.25559711, 40.54504643, 30.02253185,
       42.76115343, 44.05213889, 43.97461726, 28.96883548, 52.53812809,
       32.01140143, 34.41627326, 34.42500748, 28.94187375, 34.10533955,
       32.76928696, 42.76994007, 51.05718419, 31.5997044 , 46.23694741,
        1.60852866, 32.82691283, 46.2087237 , 45.73026011, 16.94793687,
       40.44963146, 16.92441071, 43.25559711, 38.26129413, 35.62242747,
       11.97136777, 37.36257399, 40.39127489, 32.33166396, 44.95567886,
       31.0769121 , 52.71901189, 17.53040182, 45.71069998, 41.24947341,
       33.5998047 , 40.04322085, 47.25263832, 37.83946884, 43.68586483,
       48.15307671, 46.43656126, 25.49063651, 51.73010247, 47.00674875,
       49.65029483, 46.65904428, 39.74781606, 42.97368549, 36.85688572,
       17.50456128, 37.56697735, 34.42301364, 29.94501022, 43.97461726,
       33.33396251, 30.25747554, 17.50456128, 12.10015575,  4.59259481,
       32.85275337, 33.48515031, 46.58254024, 33.10098841, 34.18

In [51]:
# evaluations
r_squared = r2_score(y_test, y_pred)
r_squared

0.6489726933106555

In [53]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

rmse 

7.673868078735506

In [55]:
# KNN regression model
knn_model = KNeighborsRegressor(n_neighbors=5)  
knn_model.fit(X_train, y_train)

In [57]:
# predict
y_knn_pred = knn_model.predict(X_test)

y_knn_pred

array([50.64, 43.64, 46.88, 30.98, 25.68, 42.84, 48.6 , 50.38, 16.92,
       60.36, 32.94, 30.68, 29.4 , 16.92, 41.56, 23.78, 44.78, 53.02,
       35.38, 35.82, 19.34, 24.84, 52.78, 43.14, 18.92, 30.98, 20.58,
       46.88, 47.6 , 40.26, 17.22, 28.82, 40.56, 34.88, 47.36, 35.98,
       54.46, 18.92, 42.96, 48.08, 40.82, 32.38, 42.2 , 23.9 , 39.58,
       47.66, 40.22, 20.04, 48.38, 42.2 , 50.64, 52.66, 43.02, 40.26,
       35.42, 18.92, 35.32, 34.68, 25.68, 50.38, 28.68, 38.04, 18.92,
       14.8 , 18.2 , 24.84, 33.42, 44.32, 34.3 , 27.98, 39.92, 42.66,
       53.02, 46.5 , 39.2 , 33.08, 32.76, 41.8 , 41.94, 29.74, 38.7 ,
       44.92, 27.98])

In [58]:
# Evaluate the model based on R-squared score
r_squared = r2_score(y_test, y_pred)
print(f'R-squared Score (KNN): {r_squared}')

# Evaluate the model based on Root Mean Squared Error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Root Mean Squared Error (RMSE) (KNN): {rmse}')

R-squared Score (KNN): 0.6489726933106555
Root Mean Squared Error (RMSE) (KNN): 7.673868078735506
