# House price prediction model with random forest

In [2]:
# %pip install pipenv
# %pip install scikit-learn
# %pip install seaborn
# %pip install matplotlib
# %pip install numpy
# %pip install tensorflow
# %pip install xgboost
# %pip install ipympl
%matplotlib widget


In [3]:
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score
from xgboost import XGBRegressor
import seaborn as sns
import matplotlib.pyplot as plt

## Loading data

In [4]:
train_dataset = pd.read_csv('train.csv')
train_dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
X_all = train_dataset.iloc[:,:-1]
Y_all = train_dataset.iloc[:,-1] # 1d vector


## Features selection

In [6]:
selected_numerical_features = [
    'OverallQual', 'YearBuilt', 'Fireplaces', 'GarageArea',
    'LotArea', 'GrLivArea',
    'YearRemodAdd', 'TotRmsAbvGrd',
    'WoodDeckSF', 
]

one_hot_features = [
'MSZoning',
'LotShape',
'LandContour',
'LotConfig',
'LandSlope',
'Neighborhood',
'BldgType',
'HouseStyle',
# 'RoofStyle',
# 'Exterior1st',
# 'Exterior2nd',
# 'Foundation',
# 'CentralAir',
# 'Electrical',
# 'GarageType',
'GarageFinish',
'SaleType',
# 'SaleCondition',
# 'RoofMatl' #Missing ['ClyTile', 'Metal'] in train data
# 'BldgType'
]

ordinal_values_order = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_features = [
    'ExterQual',
    'ExterCond',
    # 'HeatingQC',
    # 'KitchenQual'
]
ordinal_value_per_feature = len(ordinal_features)*[ordinal_values_order]

## Preprocessing transformer

In [7]:
def build_preprocessor():
    numeric_transformer = make_pipeline(
        SimpleImputer(strategy='mean'),
        StandardScaler(),
    )
    numeric_transformer_2 = make_pipeline(
        SimpleImputer(strategy='constant', fill_value=0),
        StandardScaler() 
    )
    ordinal_encoder = make_pipeline(
        OrdinalEncoder(categories=ordinal_value_per_feature, 
            encoded_missing_value=ordinal_values_order.index('TA')),
        StandardScaler()
    )
    one_hot_encoded = make_pipeline(
        SimpleImputer(strategy='most_frequent'), OneHotEncoder()
    )
    transformer = ColumnTransformer(transformers=[
           ('numeric1', numeric_transformer, selected_numerical_features),
           ('numeric2', numeric_transformer_2, ['MasVnrArea']),
           ('one_hot', one_hot_encoded, one_hot_features),
           ('ordinal', ordinal_encoder, ordinal_features),
        ])
    return transformer



## Finding the best parameters for random forest regressor with grid cross-validation

In [8]:
min_samples_split_list = [2,10, 30, 50]  ## If the number is an integer, then it is the actual quantity of samples,
                                             ## If it is a float, then it is the percentage of the dataset
max_depth_list = [16, 32, 64, None]
n_estimators_list = [10,50,100,500]
param_grid = {
    'min_samples_split': min_samples_split_list,
    'n_estimators': n_estimators_list,
    'max_depth': max_depth_list
}

p = build_preprocessor()
X_train_scaled = p.fit_transform(X_all)


In [10]:

model = GridSearchCV(RandomForestRegressor(), param_grid, n_jobs=2, verbose=3)
model.fit(X_train_scaled, Y_all)

print('Best params:', model.best_params_)
print('Best score:', model.best_score_)

Y_all_predict = model.predict(X_train_scaled)
rmse_log_score = root_mean_squared_error(np.log(X_train_scaled+0.001), np.log(Y_all_predict+0.001))
print(f'RMSE-log score: {rmse_log_score}')

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[CV 1/5] END max_depth=16, min_samples_split=2, n_estimators=10;, score=0.819 total time=   0.2s
[CV 2/5] END max_depth=16, min_samples_split=2, n_estimators=10;, score=0.844 total time=   0.2s
[CV 3/5] END max_depth=16, min_samples_split=2, n_estimators=10;, score=0.846 total time=   0.2s
[CV 4/5] END max_depth=16, min_samples_split=2, n_estimators=10;, score=0.847 total time=   0.2s
[CV 5/5] END max_depth=16, min_samples_split=2, n_estimators=10;, score=0.778 total time=   0.2s
[CV 1/5] END max_depth=16, min_samples_split=2, n_estimators=50;, score=0.854 total time=   1.0s
[CV 2/5] END max_depth=16, min_samples_split=2, n_estimators=50;, score=0.836 total time=   0.9s
[CV 3/5] END max_depth=16, min_samples_split=2, n_estimators=50;, score=0.860 total time=   0.9s
[CV 4/5] END max_depth=16, min_samples_split=2, n_estimators=50;, score=0.867 total time=   0.9s
[CV 5/5] END max_depth=16, min_samples_split=2, n_estimators=50;, score=0.799 total time=   1.0s
[CV 1/5] END max_depth=16, min

NotImplementedError: adding a nonzero scalar to a sparse array is not supported

Best params: {'max_depth': 64, 'min_samples_split': 2, 'n_estimators': 500}

Best score: 0.8460436374790623

### Finding the best parameters for boosted trees

In [56]:
param_grid = {
    'n_estimators': n_estimators_list,
    'max_depth': max_depth_list
}
xmodel = GridSearchCV(XGBRegressor(), param_grid, n_jobs=2, verbose=3)
xmodel.fit(X_train_scaled, Y_all)
print('Best params:', xmodel.best_params_)
print('Best score:', xmodel.best_score_)


Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 2/5] END .....max_depth=16, n_estimators=10;, score=0.773 total time=   0.1s
[CV 1/5] END .....max_depth=16, n_estimators=10;, score=0.822 total time=   0.3s
[CV 3/5] END .....max_depth=16, n_estimators=10;, score=0.805 total time=   0.2s
[CV 4/5] END .....max_depth=16, n_estimators=10;, score=0.845 total time=   0.1s
[CV 5/5] END .....max_depth=16, n_estimators=10;, score=0.767 total time=   0.1s
[CV 2/5] END .....max_depth=16, n_estimators=50;, score=0.764 total time=   0.3s
[CV 1/5] END .....max_depth=16, n_estimators=50;, score=0.820 total time=   0.3s
[CV 3/5] END .....max_depth=16, n_estimators=50;, score=0.820 total time=   0.3s
[CV 4/5] END .....max_depth=16, n_estimators=50;, score=0.848 total time=   0.3s
[CV 5/5] END .....max_depth=16, n_estimators=50;, score=0.784 total time=   0.3s
[CV 1/5] END ....max_depth=16, n_estimators=100;, score=0.820 total time=   0.8s
[CV 3/5] END ....max_depth=16, n_estimators=100;

Best params: {'max_depth': None, 'n_estimators': 50}

Best score: 0.8298275354431931