In [32]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error ,mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor


In [33]:
# Load the housing CSV into a DataFrame and print its first five rows
# as a quick sanity check that the file parsed as expected.
CSV_PATH = "house_clean.csv"
df = pd.read_csv(filepath_or_buffer=CSV_PATH)
print(df.head(n=5))

   Size_sqft  Bedrooms  Bathrooms  YearBuilt     Price  Location_City  \
0   1.030281 -1.463643   0.088986  -1.279342  812100.0              1   
1  -0.482463 -1.463643   1.347506   1.326476  547000.0              1   
2   0.468877  0.007430  -1.169534  -1.339942  693700.0              1   
3   1.079817  0.742966   1.347506  -0.915740  848300.0              1   
4   0.788954  1.478502  -1.169534   0.962873  806000.0              0   

   Location_Rural  Location_Suburb  HouseAge  Rooms_per_1000sqft  \
0               0                0  1.279342           -1.061465   
1               0                0 -1.326476           -0.265637   
2               0                0  1.339942           -0.689547   
3               0                0  0.915740           -0.199111   
4               0                1 -0.962873           -0.311002   

   Size_per_Bedroom  Is_City   LogPrice  
0          3.123085        1  13.607380  
1          1.309520        1  13.212206  
2         -0.163970       

In [34]:
# Build the regression target and the feature matrix.
# The target is the raw Price column; both Price and LogPrice are removed
# from the features so no price-derived column is fed to the models.
# NOTE(review): in the printed preview HouseAge mirrors YearBuilt with the
# sign flipped and Is_City matches Location_City — possibly redundant
# features; TODO confirm on the full dataset.
y = df.loc[:, "Price"]
x = df.drop(["Price", "LogPrice"], axis="columns")

In [35]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore every metric further down) repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [36]:
# Linear-regression baseline: fit on the training split, predict the
# held-out split, and show the first ten predictions.
# fit() returns the estimator itself, so construction and fitting are chained.
Lr = LinearRegression().fit(x_train, y_train)
Lr_pred = Lr.predict(x_test)
print(Lr_pred[:10])


[656754.66720779 822634.7173445  188637.49473079 594040.9378655
 609615.22485083 411139.21882378 444365.96225017 727107.31998415
 718486.32165982 825315.58832645]


In [37]:
# Random-forest model: 100 trees with a fixed seed so the fitted forest is
# reproducible. Fit on the training split, predict the held-out split, and
# show the first ten predictions.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(x_train, y_train)
rf_pred = rf.predict(x_test)
print(rf_pred[:10])


[789031. 821977. 290899. 557028. 538756. 297368. 396774. 724944. 777862.
 832261.]


In [None]:
#  evaluate performance for both models
def print_metrics(name,  y, ypredict):
    """Print R², MAE, MSE and RMSE for one set of predictions.

    Parameters
    ----------
    name : label printed in the report header.
    y : true target values.
    ypredict : predicted values, compared element-wise against ``y``.

    Returns
    -------
    None. The report is written to stdout only.
    """
    # Score the predictions; RMSE is derived from MSE rather than recomputed.
    r_squared = r2_score(y, ypredict)
    abs_err = mean_absolute_error(y, ypredict)
    sq_err = mean_squared_error(y, ypredict)
    root_sq_err = np.sqrt(sq_err)

    # Assemble the whole report first, then emit it in one write.
    report_lines = [
        f"prediction of {name}",
        f"  R²   : {r_squared:.3f}",
        f"  MAE  : {abs_err:,.0f}",
        f"  MSE  : {sq_err:,.0f}",
        f"  RMSE : {root_sq_err:,.0f}",
    ]
    print("\n".join(report_lines))

# Report both models against the same held-out targets so the numbers are
# directly comparable.
print_metrics(name="Lr", y=y_test, ypredict=Lr_pred)
print_metrics(name="rf", y=y_test, ypredict=rf_pred)


prediction of Lr
  R²   : 0.848
  MAE  : 63,086
  MSE  : 5,718,940,941
  RMSE : 75,624
prediction of rf
  R²   : 0.859
  MAE  : 52,524
  MSE  : 5,283,317,455
  RMSE : 72,686


In [42]:
# Spot-check: compare both models' predictions with the true price for one
# held-out row.
i = 10
# Double brackets keep a one-row DataFrame, so predict() receives 2-D input
# with the same column names the models were fitted on.
x_one_df = x_test.iloc[[i]]
y_true = y_test.iloc[i]

# Same extraction for both fitted models: first (only) prediction as a float.
p_lr_one, p_rf_one = (float(model.predict(x_one_df)[0]) for model in (Lr, rf))

print(
    "\nSingle-row sanity check:",
    f"  Actual Price: ${y_true:,.0f}",
    f"  LR Pred     : ${p_lr_one:,.0f}",
    f"  RF Pred     : ${p_rf_one:,.0f}",
    sep="\n",
)


Single-row sanity check:
  Actual Price: $366,000
  LR Pred     : $299,971
  RF Pred     : $311,571
