# House price prediction with deep learning model

In [13]:
# %pip install pipenv
# %pip install scikit-learn
# %pip install seaborn
# %pip install matplotlib
# %pip install numpy
# %pip install tensorflow
# %pip install xgboost
# %pip install ipympl
# %matplotlib widget


In [14]:
import math
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
# from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [23]:
# FILE_NAME='train.csv'
FILE_NAME='train_inliers.csv'
train_dataset = pd.read_csv(FILE_NAME)
train_dataset.head()


Unnamed: 0.1,Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,...,0,,,,0,2,2008,WD,Normal,208500
1,1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,...,0,,,,0,5,2007,WD,Normal,181500
2,2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,...,0,,,,0,9,2008,WD,Normal,223500
3,3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,...,0,,,,0,2,2006,WD,Abnorml,140000
4,4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,...,0,,,,0,12,2008,WD,Normal,250000


In [24]:
X_all = train_dataset.iloc[:,:-1]
# Y_all = np.log(train_dataset.iloc[:,-1]).to_numpy()
# Y_all = Y_all.reshape(-1, 1)
Y_all = (train_dataset.iloc[:,-1]).to_numpy().reshape(-1,1)

print(X_all.shape, Y_all.shape)
# X_train, X_validate, Y_train, Y_validate = train_test_split(X_all, Y_all, test_size=0.2)
# print(X_train.shape, Y_train.shape, X_validate.shape, Y_validate.shape)

(1422, 81) (1422, 1)


### Features selection

In [25]:
selected_numerical_features = [
    'OverallQual', 'YearBuilt', 'Fireplaces', 'GarageArea',
    'LotArea', 'GrLivArea',
    'YearRemodAdd', 'TotRmsAbvGrd',
    'WoodDeckSF', 
]

one_hot_features = [
'MSZoning',
'LotShape',
'LandContour',
'LotConfig',
'LandSlope',
'Neighborhood',
'BldgType',
'HouseStyle',
'GarageFinish',
'SaleType',

'RoofStyle',
'Exterior1st',
'Exterior2nd',
'Foundation',
'CentralAir',
'Electrical',
'GarageType',
'SaleCondition',
'BldgType'
# 'RoofMatl' #Missing ['ClyTile', 'Metal'] in train data
]

ordinal_values_order = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_features = [
    'ExterQual',
    'ExterCond',
    'HeatingQC',
    'KitchenQual'
]
ordinal_value_per_feature = len(ordinal_features)*[ordinal_values_order]

### Preprocessing data

In [26]:
def build_preprocessor(degree=2):
    numeric_transformer = make_pipeline(
        SimpleImputer(strategy='mean'),
        StandardScaler()
        # PolynomialFeatures(degree=degree, include_bias=False),
        # StandardScaler(with_mean=False)
    )
    numeric_transformer_2 = make_pipeline(
        SimpleImputer(strategy='constant', fill_value=0),
        StandardScaler() 
    )
    ordinal_encoder = make_pipeline(
        OrdinalEncoder(categories=ordinal_value_per_feature, 
            encoded_missing_value=ordinal_values_order.index('TA')),
        StandardScaler()
    )
    one_hot_encoded = make_pipeline(
        SimpleImputer(strategy='most_frequent'), 
        OneHotEncoder(sparse_output=False)
    )
    transformer = ColumnTransformer(transformers=[
           ('numeric1', numeric_transformer, selected_numerical_features),
           ('numeric2', numeric_transformer_2, ['MasVnrArea']),
           ('one_hot', one_hot_encoded, one_hot_features),
           ('ordinal', ordinal_encoder, ordinal_features),
        ])
    return transformer



In [27]:
preprocessor = build_preprocessor()


In [28]:
X_train_scaled = preprocessor.fit_transform(X_all)


### Model

In [29]:
INPUT_SIZE = X_train_scaled.shape[1]
OUTPUT_SIZE = 1
HIDDEN_LAYER_SIZE = 2048
BATCH_SIZE= 64
MAX_EPOCHS = 200

def baseline_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(HIDDEN_LAYER_SIZE, input_shape=(INPUT_SIZE,), activation='relu'),
        tf.keras.layers.Dense(HIDDEN_LAYER_SIZE/2, activation='relu'),
        tf.keras.layers.Dense(HIDDEN_LAYER_SIZE/4, activation='relu'),
        tf.keras.layers.Dense(OUTPUT_SIZE)
    ])
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=tf.keras.metrics.RootMeanSquaredError())
    return model

### Training

In [30]:
model = baseline_model()
early_stopping = tf.keras.callbacks.EarlyStopping(patience=5)
model.fit(X_train_scaled, Y_all, batch_size=BATCH_SIZE, epochs=MAX_EPOCHS, validation_split=0.2, 
          callbacks=[early_stopping],
          verbose=2)
Y_predict = model.predict(X_train_scaled)
rmse_score = root_mean_squared_error(np.log(Y_all+0.001), np.log(Y_predict+0.001))
# rmse_score = root_mean_squared_error(Y_all, Y_predict)
r2 = r2_score(Y_all, Y_predict)
print(f'rmse={rmse_score:.2f}, r2={r2:.2f}')

Epoch 1/200
18/18 - 1s - loss: 36333907968.0000 - root_mean_squared_error: 190614.5469 - val_loss: 35068727296.0000 - val_root_mean_squared_error: 187266.4531 - 749ms/epoch - 42ms/step
Epoch 2/200
18/18 - 0s - loss: 32761722880.0000 - root_mean_squared_error: 181002.0000 - val_loss: 25216800768.0000 - val_root_mean_squared_error: 158797.9844 - 338ms/epoch - 19ms/step
Epoch 3/200
18/18 - 0s - loss: 13096873984.0000 - root_mean_squared_error: 114441.5781 - val_loss: 1118121216.0000 - val_root_mean_squared_error: 33438.3203 - 335ms/epoch - 19ms/step
Epoch 4/200
18/18 - 0s - loss: 1841664128.0000 - root_mean_squared_error: 42914.6133 - val_loss: 1048708096.0000 - val_root_mean_squared_error: 32383.7637 - 294ms/epoch - 16ms/step
Epoch 5/200
18/18 - 0s - loss: 1072600128.0000 - root_mean_squared_error: 32750.5742 - val_loss: 940229440.0000 - val_root_mean_squared_error: 30663.1602 - 292ms/epoch - 16ms/step
Epoch 6/200
18/18 - 0s - loss: 828900096.0000 - root_mean_squared_error: 28790.6250 - 

## Test data evaluation

In [31]:
X_test = pd.read_csv('test.csv')
X_test_scaled = preprocessor.transform(X_test)
Y_test_predict = model.predict(X_test_scaled)

ValueError: Found unknown categories [nan] in column 3 during transform