# House price prediction with random forest and gradient-boosted trees

In [2]:
# Optional one-time installs: uncomment the lines for packages missing from the kernel's environment.
# %pip install pipenv
# %pip install scikit-learn
# %pip install seaborn
# %pip install matplotlib
# %pip install numpy
# %pip install tensorflow
# %pip install xgboost
# %pip install ipympl
# Use the interactive widget backend for matplotlib figures (provided by ipympl).
%matplotlib widget


In [3]:
import math
from collections import namedtuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, OrdinalEncoder
from xgboost import XGBRegressor

## Loading data

In [4]:
# Read the training data from the working directory; the path is relative to where the kernel runs.
train_dataset = pd.read_csv('train.csv')
# Trailing expression: preview the first rows as the cell output.
train_dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Split predictors and target by column name rather than by position, so the
# split stays correct even if the column order of the CSV changes.
X_all = train_dataset.drop(columns='SalePrice')
Y_all = train_dataset['SalePrice']  # 1d vector (pandas Series)


## Feature selection

In [6]:
# Numeric columns; build_preprocessor() mean-imputes and standardises them.
selected_numerical_features = [
    'OverallQual',
    'YearBuilt',
    'Fireplaces',
    'GarageArea',
    'LotArea',
    'GrLivArea',
    'YearRemodAdd',
    'TotRmsAbvGrd',
    'WoodDeckSF',
]

# Nominal categorical columns that are one-hot encoded.
# Candidates currently left out: RoofStyle, Exterior1st, Exterior2nd,
# Foundation, CentralAir, Electrical, GarageType, SaleCondition, and
# RoofMatl (missing ['ClyTile', 'Metal'] in train data).
one_hot_features = [
    'MSZoning',
    'LotShape',
    'LandContour',
    'LotConfig',
    'LandSlope',
    'Neighborhood',
    'BldgType',
    'HouseStyle',
    'GarageFinish',
    'SaleType',
]

# Quality grades in the order used for ordinal encoding (index 0 .. 4).
ordinal_values_order = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
# Columns encoded with the grade scale above.
# Candidates currently left out: HeatingQC, KitchenQual.
ordinal_features = ['ExterQual', 'ExterCond']
# One category list per ordinal column, all sharing the same grade order.
ordinal_value_per_feature = [ordinal_values_order for _ in ordinal_features]

## Preprocessing transformer

In [11]:
def build_preprocessor():
    """Build the (unfitted) column-wise preprocessing transformer.

    The column groups are read from the module-level lists
    ``selected_numerical_features``, ``one_hot_features``,
    ``ordinal_features`` and ``ordinal_value_per_feature``.

    Returns:
        ColumnTransformer: an unfitted transformer with four branches:

        * ``numeric1`` - mean imputation, then standardisation;
        * ``numeric2`` - ``MasVnrArea`` with missing values filled with 0,
          then standardisation;
        * ``one_hot``  - most-frequent imputation, then dense one-hot encoding;
        * ``ordinal``  - missing grades filled with ``'TA'``, ordinal encoding
          on the shared grade scale, then standardisation.

        Columns not listed in any branch are dropped (ColumnTransformer's
        default ``remainder``).
    """
    numeric_transformer = make_pipeline(
        SimpleImputer(strategy='mean'),
        StandardScaler(),
    )
    numeric_transformer_2 = make_pipeline(
        SimpleImputer(strategy='constant', fill_value=0),
        StandardScaler(),
    )
    ordinal_encoder = make_pipeline(
        # Missing grades are imputed explicitly: with a fixed `categories` list
        # that contains no NaN, OrdinalEncoder treats NaN as an unknown category
        # and raises, so `encoded_missing_value` alone never takes effect.
        SimpleImputer(strategy='constant', fill_value='TA'),
        OrdinalEncoder(categories=ordinal_value_per_feature),
        StandardScaler(),
    )
    one_hot_encoded = make_pipeline(
        SimpleImputer(strategy='most_frequent'),
        # Categories unseen during fit are encoded as all zeros instead of
        # raising when transforming new data.
        OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
    )
    transformer = ColumnTransformer(transformers=[
           ('numeric1', numeric_transformer, selected_numerical_features),
           ('numeric2', numeric_transformer_2, ['MasVnrArea']),
           ('one_hot', one_hot_encoded, one_hot_features),
           ('ordinal', ordinal_encoder, ordinal_features),
        ])
    return transformer



## Finding the best hyperparameters for the random forest regressor with grid-search cross-validation

In [12]:
# Candidate hyperparameter values explored by the grid search.
n_estimators_list = [10, 50, 100, 500]
max_depth_list = [16, 32, 64, None]
# An integer min_samples_split is an absolute number of samples;
# a float would be interpreted as a fraction of the dataset.
min_samples_split_list = [2, 10, 30, 50]

param_grid = dict(
    min_samples_split=min_samples_split_list,
    n_estimators=n_estimators_list,
    max_depth=max_depth_list,
)

# Fit the preprocessing transformer on the full training set and keep the
# fitted instance in `p` so the same transformation can be reused later.
p = build_preprocessor()
X_train_scaled = p.fit_transform(X_all)


In [36]:

# Result record returned by fit_model: the fitted estimator plus its
# R2 score and the RMSE computed on log-prices.
Model = namedtuple('Model', ['model', 'score', 'rmse_log'])


In [46]:

def fit_model(model, X=None, y=None, X_eval=None, y_eval=None):
    """Fit ``model`` and report its R2 score and RMSE on log-prices.

    Args:
        model: an estimator exposing ``fit``, ``predict`` and ``score``
            (for example a regressor or a ``GridSearchCV`` wrapping one).
        X: training features; defaults to the module-level ``X_train_scaled``.
        y: training targets; defaults to the module-level ``Y_all``.
        X_eval: features used to compute the metrics; defaults to the
            training features.
        y_eval: targets used to compute the metrics; defaults to the
            training targets. Must be given together with ``X_eval``.

    Returns:
        Model: named tuple ``(model, score, rmse_log)`` holding the fitted
        estimator, ``model.score`` on the evaluation data, and the RMSE
        between the logs of the true and predicted targets.

    Raises:
        ValueError: if only one of ``X_eval`` / ``y_eval`` is provided.

    Note:
        With the default arguments the metrics are computed on the same data
        the model was fitted on, so they are in-sample figures. Pass a
        held-out ``X_eval`` / ``y_eval`` to measure generalisation.
    """
    X_fit = X_train_scaled if X is None else X
    y_fit = Y_all if y is None else y
    if (X_eval is None) != (y_eval is None):
        raise ValueError('X_eval and y_eval must be provided together')
    if X_eval is None:
        X_eval, y_eval = X_fit, y_fit

    model.fit(X_fit, y_fit)
    # Search wrappers such as GridSearchCV expose the winning configuration.
    if hasattr(model, 'best_params_'):
        print('Best params:', model.best_params_)
        print('Best score:', model.best_score_)

    y_eval_predict = model.predict(X_eval)
    score = model.score(X_eval, y_eval)
    # The small offset keeps log() defined when a value is exactly zero.
    rmse_log_score = root_mean_squared_error(
        np.log(np.asarray(y_eval) + 0.001),
        np.log(np.asarray(y_eval_predict) + 0.001),
    )
    print(f'R2={score:.2f}, RMSE-LOG={rmse_log_score:.2f}')
    return Model(model=model, score=score, rmse_log=rmse_log_score)

# rf_model_grid = fit_model(model=GridSearchCV(RandomForestRegressor(), param_grid, n_jobs=2, verbose=3))


In [44]:

# Refit the random forest with the chosen hyperparameters. A fixed
# random_state makes the fitted forest and its scores reproducible across runs.
# fit_model already prints the R2 / RMSE-LOG line, so it is not printed again here.
best_rf_model = fit_model(
    RandomForestRegressor(max_depth=64, min_samples_split=2, n_estimators=500, random_state=42)
)


RMSE-log score: 0.06233802800278064
R2=0.98, RMSE-LOG=0.06


### Finding the best parameters for boosted trees

In [45]:
# Grid for the boosted-tree search: reuses the candidate lists defined for the
# random forest (there is no min_samples_split entry in this grid).
param_grid = dict(
    n_estimators=n_estimators_list,
    max_depth=max_depth_list,
)

# xb_model_grid = fit_model(model=GridSearchCV(XGBRegressor(), param_grid, n_jobs=2, verbose=3))


Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 2/5] END .....max_depth=16, n_estimators=10;, score=0.773 total time=   0.1s
[CV 1/5] END .....max_depth=16, n_estimators=10;, score=0.822 total time=   0.1s
[CV 3/5] END .....max_depth=16, n_estimators=10;, score=0.805 total time=   0.1s
[CV 4/5] END .....max_depth=16, n_estimators=10;, score=0.845 total time=   0.1s
[CV 5/5] END .....max_depth=16, n_estimators=10;, score=0.767 total time=   0.1s
[CV 1/5] END .....max_depth=16, n_estimators=50;, score=0.820 total time=   0.4s
[CV 2/5] END .....max_depth=16, n_estimators=50;, score=0.764 total time=   0.4s
[CV 3/5] END .....max_depth=16, n_estimators=50;, score=0.820 total time=   0.3s
[CV 4/5] END .....max_depth=16, n_estimators=50;, score=0.848 total time=   0.4s
[CV 5/5] END .....max_depth=16, n_estimators=50;, score=0.784 total time=   0.5s
[CV 1/5] END ....max_depth=16, n_estimators=100;, score=0.820 total time=   0.7s
[CV 2/5] END ....max_depth=16, n_estimators=100;

In [48]:
# Fit the boosted-tree regressor with the chosen hyperparameters.
best_xb_model = fit_model(
    XGBRegressor(n_estimators=50, max_depth=None)
)


R2=0.99, RMSE-LOG=0.05


## Produce test predictions

In [50]:
# Read the test features from the working directory; the path is relative to where the kernel runs.
X_test = pd.read_csv('test.csv')
# Trailing expression: preview the first rows as the cell output.
X_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [53]:
# Reuse the preprocessor already fitted on the training data: transform only, no refit.
X_test_scaled = p.transform(X_test)
# Predict with the fitted boosted-tree estimator stored in the Model tuple.
Y_test_predict = best_xb_model.model.predict(X_test_scaled)

In [57]:
# Assemble the submission table: one row per test Id with its predicted price.
df_output = pd.DataFrame({
    'Id': X_test['Id'],
    'SalePrice': Y_test_predict,
})
df_output.head()

Unnamed: 0,Id,SalePrice
0,1461,117282.984375
1,1462,142971.578125
2,1463,189735.53125
3,1464,180100.5625
4,1465,179582.296875


In [56]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
df_output.to_csv('./C_submission.csv', index=False)