In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# Load the real-estate dataset into a DataFrame.
# Point csv_path at the location of the file on your machine.
csv_path = 'real_estate.csv'
data = pd.read_csv(csv_path)
# Summary statistics of the numeric columns (shown as the cell output)
data.describe()

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
count,414.0,414.0,414.0,414.0,414.0,414.0,414.0,414.0
mean,207.5,2013.148971,17.71256,1083.885689,4.094203,24.96903,121.533361,37.980193
std,119.655756,0.281967,11.392485,1262.109595,2.945562,0.01241,0.015347,13.606488
min,1.0,2012.667,0.0,23.38284,0.0,24.93207,121.47353,7.6
25%,104.25,2012.917,9.025,289.3248,1.0,24.963,121.528085,27.7
50%,207.5,2013.167,16.1,492.2313,4.0,24.9711,121.53863,38.45
75%,310.75,2013.417,28.15,1454.279,6.0,24.977455,121.543305,46.6
max,414.0,2013.583,43.8,6488.021,10.0,25.01459,121.56627,117.5


In [3]:
# Set to training data (x, y)
TARGET_COLUMN = 'Y house price of unit area'
FEATURE_COLUMNS = ['X1 transaction date', 'X2 house age',
                   'X3 distance to the nearest MRT station',
                   'X4 number of convenience stores',
                   'X5 latitude', 'X6 longitude']
# Number of leading rows used for training; the remaining rows are held out.
TRAIN_SIZE = 350

y = data[TARGET_COLUMN]
# Take an explicit copy so the edits below modify an independent frame rather
# than a selection of `data` (avoids pandas' SettingWithCopyWarning and leaves
# `data` itself untouched).
X = data[FEATURE_COLUMNS].copy()

# Keep only the integer part of the transaction date and the house age.
# `// 1` floors, which is the same as truncating for non-negative values.
whole_number_columns = ['X1 transaction date', 'X2 house age']
X[whole_number_columns] = X[whole_number_columns] // 1

y_data = np.asarray(y)
x_data = np.asarray(X)
data_len = len(x_data)
assert 0 < TRAIN_SIZE < data_len, 'TRAIN_SIZE must leave rows for both splits'

# Split the training set and the validation set.
# The split is positional (no shuffling): the first TRAIN_SIZE rows train the
# models and every remaining row is used for evaluation.
x_train = x_data[:TRAIN_SIZE]
y_train = y_data[:TRAIN_SIZE]
x_test = x_data[TRAIN_SIZE:]
y_test = y_data[TRAIN_SIZE:]
valid_len = len(y_test)

In [4]:
# Linear Regression method
# Fit the model on the training split (fit() returns the estimator itself)
linear_regression = LinearRegression().fit(x_train, y_train)
# Predict the held-out rows
y_pred_regression = linear_regression.predict(x_test)

# Calculate SSE (Sum Squared Error), MSE, MAE, R-squared
mse_regression = mean_squared_error(y_test, y_pred_regression)
mae_regression = mean_absolute_error(y_test, y_pred_regression)
r2_regression = r2_score(y_test, y_pred_regression)
# SSE is recovered from the mean: MSE times the number of evaluated rows
sse_regression = mse_regression * valid_len

print(f'The sum of squared error (SSE): {sse_regression}')
print(f'Mean Squared Error (MSE): {mse_regression}')
print(f'Mean Absolute Error (MAE): {mae_regression}')
print(f'R-Squared: {r2_regression}')

The sum of squared error (SSE): 4088.3846013956972
Mean Squared Error (MSE): 63.88100939680777
Mean Absolute Error (MAE): 6.039914365628544
R-Squared: 0.5997192851148421


In [5]:
# K-NN method
# k is the number of nearest training samples whose targets are averaged to
# form each prediction. The value is hard-coded; how it was chosen is not
# shown in this notebook.
k = 18

def distance(array, value):
    """Return the Euclidean (L2) distance from ``value`` to each row of ``array``.

    ``value`` is subtracted from ``array`` (NumPy broadcasting) and the norm
    of the difference is taken along axis 1, giving one distance per row.
    """
    offsets = array - value
    # With an integer axis the default ``ord`` is the vector 2-norm.
    return np.linalg.norm(offsets, axis=1)


def find_nearest_index(array, value, k):
    """Return the indices of the ``k`` rows of ``array`` closest to ``value``.

    Rows are ranked by ``distance(array, value)`` in ascending order and the
    first ``k`` positions of that ranking are returned (fewer if ``array``
    has fewer than ``k`` rows).
    """
    ranking = np.argsort(distance(array, value))
    return ranking[:k]


# Predict each test sample as the mean target of its k nearest training samples.
# NOTE(review): distances are computed on the feature values exactly as stored
# in x_train / x_test; if those are unscaled, the feature with the widest
# numeric range dominates the neighbour search - consider standardising.
y_predict_knn = np.zeros(len(x_test))

# Descriptive loop names are deliberate: a top-level loop variable called `id`
# would stay bound in the kernel afterwards and shadow the built-in id().
for i, query_point in enumerate(x_test):
    neighbour_indices = find_nearest_index(x_train, query_point, k)
    y_predict_knn[i] = y_train[neighbour_indices].mean()

# Calculate SSE (Sum Squared Error), MSE, MAE, R-squared
mse_knn = mean_squared_error(y_test, y_predict_knn)
mae_knn = mean_absolute_error(y_test, y_predict_knn)
r2_knn = r2_score(y_test, y_predict_knn)
# SSE is recovered from the mean: MSE times the number of evaluated rows
sse_knn = mse_knn * valid_len

print(f'The sum of squared error (SSE): {sse_knn}')
print(f'Mean Squared Error (MSE): {mse_knn}')
print(f'Mean Absolute Error (MAE): {mae_knn}')
print(f'R-Squared: {r2_knn}')

The sum of squared error (SSE): 2808.326728395061
Mean Squared Error (MSE): 43.880105131172826
Mean Absolute Error (MAE): 4.793749999999999
R-Squared: 0.7250456744971305
