In [1]:
import pandas as pd
import numpy as np
import warnings
# Install a blanket 'ignore' filter so that warnings raised from this point on are not shown.
# NOTE(review): this runs after `import pandas`, so warnings emitted while pandas is being
# imported are still displayed; the filter also hides every other warning category.
warnings.filterwarnings('ignore')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Read the raw transaction records from the CSV file into a DataFrame.
CSV_PATH = 'Real_Estate.csv'
data = pd.read_csv(CSV_PATH)

In [3]:
# Display the freshly loaded frame; the notebook renders a truncated preview of its rows.
data

Unnamed: 0,Transaction date,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area
0,2012-09-02 16:42:30.519336,13.3,4082.01500,8,25.007059,121.561694,6.488673
1,2012-09-04 22:52:29.919544,35.5,274.01440,2,25.012148,121.546990,24.970725
2,2012-09-05 01:10:52.349449,1.1,1978.67100,10,25.003850,121.528336,26.694267
3,2012-09-05 13:26:01.189083,22.2,1055.06700,5,24.962887,121.482178,38.091638
4,2012-09-06 08:29:47.910523,8.5,967.40000,6,25.011037,121.479946,21.654710
...,...,...,...,...,...,...,...
409,2013-07-25 15:30:36.565239,18.3,170.12890,6,24.981186,121.486798,29.096310
410,2013-07-26 17:16:34.019780,11.9,323.69120,2,24.950070,121.483918,33.871347
411,2013-07-28 21:47:23.339050,0.0,451.64190,8,24.963901,121.543387,25.255105
412,2013-07-29 13:33:29.405317,35.9,292.99780,5,24.997863,121.558286,25.285620


In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 7 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Transaction date                     414 non-null    object 
 1   House age                            414 non-null    float64
 2   Distance to the nearest MRT station  414 non-null    float64
 3   Number of convenience stores         414 non-null    int64  
 4   Latitude                             414 non-null    float64
 5   Longitude                            414 non-null    float64
 6   House price of unit area             414 non-null    float64
dtypes: float64(5), int64(1), object(1)
memory usage: 22.8+ KB


In [5]:
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [6]:
# Parse the timestamp column into pandas datetimes so the `.dt` accessor can be used on it.
data['Transaction date'] = data['Transaction date'].pipe(pd.to_datetime)

In [7]:
# Split the transaction timestamp into separate calendar-year and calendar-month features.
txn_dates = data['Transaction date'].dt
data['Transaction year'] = txn_dates.year
data['Transaction month'] = txn_dates.month

In [8]:
# Remove the raw timestamp column; only the derived year/month columns are kept as features.
# Rebind `data` to the returned frame instead of using `inplace=True`, which offers no
# performance benefit and mutates an object that earlier cells have already displayed.
data = data.drop(columns=['Transaction date'])

In [9]:
# Display the frame again to confirm the timestamp column was replaced by year/month columns.
data

Unnamed: 0,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area,Transaction year,Transaction month
0,13.3,4082.01500,8,25.007059,121.561694,6.488673,2012,9
1,35.5,274.01440,2,25.012148,121.546990,24.970725,2012,9
2,1.1,1978.67100,10,25.003850,121.528336,26.694267,2012,9
3,22.2,1055.06700,5,24.962887,121.482178,38.091638,2012,9
4,8.5,967.40000,6,25.011037,121.479946,21.654710,2012,9
...,...,...,...,...,...,...,...,...
409,18.3,170.12890,6,24.981186,121.486798,29.096310,2013,7
410,11.9,323.69120,2,24.950070,121.483918,33.871347,2013,7
411,0.0,451.64190,8,24.963901,121.543387,25.255105,2013,7
412,35.9,292.99780,5,24.997863,121.558286,25.285620,2013,7


In [10]:
# Separate the regression target from the predictor columns.
TARGET = 'House price of unit area'
y = data[TARGET]
X = data.drop(columns=[TARGET])

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=14,
)

In [12]:
# Sanity check: the training features and training target must have the same number of rows.
X_train.shape, y_train.shape

((331, 7), (331,))

In [13]:
# Fit the scaler on the training split only, so no statistics from the test split
# influence the transformation, then apply the same fitted scaler to both splits.
sclr = StandardScaler().fit(X_train)

X_train_scaled = sclr.transform(X_train)
X_test_scaled = sclr.transform(X_test)

In [14]:
# Sanity check: scaling must leave the row and column counts of both splits unchanged.
X_train_scaled.shape, X_test_scaled.shape

((331, 7), (83, 7))

In [15]:
# Wrap the scaled arrays back into DataFrames so the feature names are preserved.
# The original row labels are passed as `index` so the scaled features stay aligned with
# y_train / y_test; without it the frames would get a fresh 0..n-1 index and any
# index-aligned pandas operation against the targets would silently pair the wrong rows.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score

In [17]:
# Seed shared by every estimator that uses randomness (same value as the train/test split),
# so the reported metrics are identical from one run of the notebook to the next.
RANDOM_STATE = 14

# Candidate regressors, keyed by the display name used in the results table.
# Linear Regression, KNN and SVM are deterministic here and take no seed.
models = {
    "Linear Regression" : LinearRegression(),
    "Decision Tree" : DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Random Forest" : RandomForestRegressor(random_state=RANDOM_STATE),
    "Gradient Boost" : GradientBoostingRegressor(random_state=RANDOM_STATE),
    "KNN" : KNeighborsRegressor(),
    "SVM" : SVR()
}

# Per-model metric dictionaries, filled in by the evaluation loop.
results = {}

In [18]:
# Fit every candidate on the scaled training split and score it on the held-out split.
for name, model in models.items():
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    # RMSE is derived from MSE, so MSE is computed first.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results[name] = {
        "MAE" : mae,
        "R2 Score" : r2,
        "MSE" : mse,
        "RMSE" : rmse,
    }

# One row per model and one column per metric, ordered from lowest to highest MAE.
result_df = pd.DataFrame(results).transpose()
result_df.sort_values(by='MAE')

Unnamed: 0,MAE,R2 Score,MSE,RMSE
Linear Regression,9.272874,0.564268,124.968366,11.178925
Random Forest,9.735451,0.525011,136.227189,11.67164
Gradient Boost,10.065298,0.498804,143.743438,11.989305
KNN,11.381484,0.335007,190.720498,13.810159
Decision Tree,11.647134,0.268063,209.920147,14.488621
SVM,11.772802,0.291266,203.265487,14.257121
