In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [4]:
df = pd.read_csv(r"C:\Users\Dickson\AI_ML Engineer Lab\ML System Pipeline\src\data\raw\real_estate.csv")

In [5]:
df.head()

Unnamed: 0,No,X1_transaction_date,X2_house_age,X3_distance_to_MRT_station,X4_number_of_convenience_stores,X5_latitude,X6_longitude,Y_house_price_of_unit_area
0,1,2013.49816,34.908086,966.407998,1,25.007911,121.582027,27.57333
1,2,2015.802857,26.804818,2732.40969,5,25.045589,121.561605,34.488277
2,3,2014.927976,15.476381,4371.081888,4,24.921589,121.44321,30.5928
3,4,2014.394634,40.689751,3674.513188,9,25.03706,121.493322,38.287216
4,5,2012.624075,34.236559,4042.477682,8,24.918374,121.563345,39.912526


In [7]:
X = df.drop(columns=["Y_house_price_of_unit_area"])

In [8]:
y = df["Y_house_price_of_unit_area"]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Linear Regression

In [10]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [12]:
y_pred = lr.predict(X_test)
print("RMSE:", mean_squared_error(y_test, y_pred))

RMSE: 26.95837839067245


In [13]:
from sklearn.metrics import r2_score

In [14]:
r2 = r2_score(y_test, y_pred)

In [15]:
print(f"R2 Score: {r2:.4f}")

R2 Score: 0.7778


### RandomForestRegressor

In [16]:
from sklearn.ensemble import RandomForestRegressor

In [17]:
rf_model = RandomForestRegressor(
    n_estimators=200,   
    max_depth=None,     
    random_state=42,
    n_jobs=-1           
)

In [18]:
rf_model.fit(X_train, y_train)

In [19]:
rf_pred = rf_model.predict(X_test)

In [21]:
rf_rmse = mean_squared_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)

In [22]:
print(f"Random Forest RMSE: {rf_rmse:.2f}")
print(f"Random Forest R2 Score: {rf_r2:.4f}")

Random Forest RMSE: 36.95
Random Forest R2 Score: 0.6955


### Combine as Euclidean Distance to a Reference Point

In [26]:
import numpy as np

In [27]:
ref_lat, ref_lon = 25.0330, 121.5654

In [31]:
df['distance_to_center'] = np.sqrt(
    (df['X5_latitude'] - ref_lat)**2 + 
    (df['X6_longitude'] - ref_lon)**2
)

In [32]:
df.head()

Unnamed: 0,No,X1_transaction_date,X2_house_age,X3_distance_to_MRT_station,X4_number_of_convenience_stores,X5_latitude,X6_longitude,Y_house_price_of_unit_area,distance_to_center
0,1,2013.49816,34.908086,966.407998,1,25.007911,121.582027,27.57333,0.030098
1,2,2015.802857,26.804818,2732.40969,5,25.045589,121.561605,34.488277,0.013149
2,3,2014.927976,15.476381,4371.081888,4,24.921589,121.44321,30.5928,0.165357
3,4,2014.394634,40.689751,3674.513188,9,25.03706,121.493322,38.287216,0.072193
4,5,2012.624075,34.236559,4042.477682,8,24.918374,121.563345,39.912526,0.114644


In [33]:
df = df.drop(columns=['X5_latitude', 'X6_longitude'])

In [34]:
df.head()

Unnamed: 0,No,X1_transaction_date,X2_house_age,X3_distance_to_MRT_station,X4_number_of_convenience_stores,Y_house_price_of_unit_area,distance_to_center
0,1,2013.49816,34.908086,966.407998,1,27.57333,0.030098
1,2,2015.802857,26.804818,2732.40969,5,34.488277,0.013149
2,3,2014.927976,15.476381,4371.081888,4,30.5928,0.165357
3,4,2014.394634,40.689751,3674.513188,9,38.287216,0.072193
4,5,2012.624075,34.236559,4042.477682,8,39.912526,0.114644


In [36]:
X = df.drop(columns=['Y_house_price_of_unit_area'])
y = df['Y_house_price_of_unit_area']

In [37]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Train Model again

In [39]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Linear Regression
linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_pred_lr = linreg.predict(X_test)

rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)

print(f"Linear Regression -> RMSE: {rmse_lr:.2f}, R2: {r2_lr:.4f}")

# Random Forest
rf = RandomForestRegressor(random_state=42, n_estimators=200)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest -> RMSE: {rmse_rf:.2f}, R2: {r2_rf:.4f}")


Linear Regression -> RMSE: 5.17, R2: 0.7798
Random Forest -> RMSE: 6.19, R2: 0.6843
