# House Price Prediction Project

In [72]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score

In [4]:
df = pd.read_csv('./train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
df = pd.read_csv('./train.csv', index_col='Id')
df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
df.columns # See: https://www.kaggle.com/competitions/home-data-for-ml-course/data

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

## Feature Selections

In [7]:
features = [
    'LotArea', # Lot size in square feet
    'YearBuilt', # Original construction date
    '1stFlrSF', # First floor square feet
    '2ndFlrSF', # Second floor square feet
    'FullBath', # Full bathrooms above grade
    'BedroomAbvGr', # Total bed room above grade
    'TotRmsAbvGrd', # Total rooms above grade (does not include bathrooms)
]

## Splitting dataset into training set and test set

In [10]:
X = df[features]
y = df['SalePrice']

In [11]:
X.head(), y.head()

(    LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  \
 Id                                                                   
 1      8450       2003       856       854         2             3   
 2      9600       1976      1262         0         2             3   
 3     11250       2001       920       866         2             3   
 4      9550       1915       961       756         1             3   
 5     14260       2000      1145      1053         2             4   
 
     TotRmsAbvGrd  
 Id                
 1              8  
 2              6  
 3              6  
 4              7  
 5              9  ,
 Id
 1    208500
 2    181500
 3    223500
 4    140000
 5    250000
 Name: SalePrice, dtype: int64)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, test_size=.2, random_state=0)

In [25]:
X_train.shape, y_train.shape

((1168, 7), (1168,))

In [26]:
X_test.shape, y_test.shape

((292, 7), (292,))

## Training ML Model

In [66]:
# DecisionTreeRegressor Model
dt_model = DecisionTreeRegressor(random_state=1)
dt_model.fit(X_train, y_train)
dt_val_preds = dt_model.predict(X_test)

In [54]:
pd.DataFrame({
    'y': y_test,
    'y_preds': dt_val_preds,
})

Unnamed: 0_level_0,y,y_preds
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
530,200624,335000.0
492,133000,140200.0
460,110000,119000.0
280,192000,207500.0
656,88000,112000.0
...,...,...
327,324000,260400.0
441,555000,451950.0
1388,136000,107500.0
1324,82500,72500.0


In [67]:
# RandomForestRegressor Model
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(X_train, y_train)
rf_val_preds = rf_model.predict(X_test)

In [68]:
pd.DataFrame({
    'y': y_test,
    'y_preds': rf_val_preds,
})

Unnamed: 0_level_0,y,y_preds
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
530,200624,271690.00
492,133000,155039.00
460,110000,122024.00
280,192000,188915.00
656,88000,91147.00
...,...,...
327,324000,275931.87
441,555000,478954.15
1388,136000,176038.00
1324,82500,78096.00


### Predict with a new input

In [57]:
X_test.head()

Unnamed: 0_level_0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
530,32668,1957,2515,0,3,4,9
492,9490,1941,958,620,1,3,5
460,7015,1950,979,224,1,3,5
280,10005,1977,1156,866,2,4,8
656,1680,1971,525,567,1,3,6


In [65]:
inp_data = pd.DataFrame([[6969, 2021, 1000, 800, 4, 5, 8]], columns=features) 
prediction = rf_model.predict(inp_data)
prediction 

array([206158.4])

## Model Evalution
- MSE and MAE: Lower values are better, indicating that the predictions are closer to the actual values.
- R² Score: Values closer to 1 indicate a better fit. Negative values suggest that the model performs worse than a horizontal line.

In [70]:
# Evaluate the DecisionTreeRegressor
dt_mse = mean_squared_error(y_test, dt_val_preds)
dt_mae = mean_absolute_error(y_test, dt_val_preds)
dt_r2 = r2_score(y_test, dt_val_preds)

In [76]:
print(f"DecisionTreeRegressor - MSE: {dt_mse}, MAE: {dt_mae}, R²: {dt_r2}")

DecisionTreeRegressor - MSE: 3228225028.8561645, MAE: 33842.32876712329, R²: 0.5325376245236801


In [71]:
# Evaluate the RandomForestRegressor Model
rf_mse = mean_squared_error(y_test, rf_val_preds)
rf_mae = mean_absolute_error(y_test, rf_val_preds)
rf_r2 = r2_score(y_test, rf_val_preds)

In [75]:
print(f"RandomForestRegressor - MSE: {rf_mse}, MAE: {rf_mae}, R²: {rf_r2}")

RandomForestRegressor - MSE: 1489870679.523267, MAE: 24069.385498858446, R²: 0.7842596223072973


### Cross-Validation (Optional)
- Cross-validation to get a more robust estimate of your model’s performance
- Cross-validation helps in understanding how well your model generalizes to unseen data.

In [74]:
# Cross-validation for DecisionTreeRegressor
dt_cv_scores = cross_val_score(dt_model, X_train, y_train, cv=5, scoring='r2')

In [77]:
print(f"RandomForestRegressor - Cross-Validated R²: {rf_cv_scores.mean()}")

RandomForestRegressor - Cross-Validated R²: 0.7895792669516719


In [73]:
# Cross-validation for RandomForestRegressor
rf_cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='r2')

In [78]:
print(f"DecisionTreeRegressor - Cross-Validated R²: {dt_cv_scores.mean()}")

DecisionTreeRegressor - Cross-Validated R²: 0.6516411280419957
