# House prediction model
## Simple solution with just few features, using deep learning with tensorflow

In [4]:
import math
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [5]:
dataset = pd.read_csv('train.csv')
dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
selected_data = dataset[['LotArea', 'OverallQual', 'OverallCond', 'GrLivArea','SalePrice']]
selected_data.describe()

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0
mean,10516.828082,6.099315,5.575342,1515.463699,180921.19589
std,9981.264932,1.382997,1.112799,525.480383,79442.502883
min,1300.0,1.0,1.0,334.0,34900.0
25%,7553.5,5.0,5.0,1129.5,129975.0
50%,9478.5,6.0,5.0,1464.0,163000.0
75%,11601.5,7.0,6.0,1776.75,214000.0
max,215245.0,10.0,9.0,5642.0,755000.0


### Drop outliers

In [7]:
filtered1 = selected_data[selected_data['LotArea'] < 20000]

### Prepare input and outputs

In [9]:
inputs = filtered1.iloc[:,:4]
inputs.head()

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea
0,8450,7,5,1710
1,9600,6,8,1262
2,11250,7,5,1786
3,9550,7,5,1717
4,14260,8,5,2198


In [10]:
targets = filtered1.iloc[:,4:]
targets.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


### Split data to training and test

In [11]:
 X_train, X_test, Y_train, Y_test = train_test_split(inputs, targets)

In [12]:
print(X_train.shape)
print(Y_train.shape)

(1055, 4)
(1055, 1)


In [13]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
(X_train_scaled.shape, X_test_scaled.shape)

((1055, 4), (352, 4))

### Defining model

In [18]:
INPUT_SIZE = X_train_scaled.shape[1]
OUTPUT_SIZE = Y_train.shape[1]
HIDDEN_LAYER_SIZE = 300
BATCH_SIZE= 100
MAX_EPOCHS = 100
(INPUT_SIZE, OUTPUT_SIZE)

(4, 1)

In [19]:
def baseline_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(HIDDEN_LAYER_SIZE, input_shape=(INPUT_SIZE,), activation='relu'),
        tf.keras.layers.Dense(HIDDEN_LAYER_SIZE, activation='relu'),
        tf.keras.layers.Dense(OUTPUT_SIZE)
    ])
    #model.compile(optimizer='adam', loss='mean_squared_error', metrics = ['root_mean_squared_error'])
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

### Training

In [20]:
model = baseline_model()
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)
model.fit(X_train_scaled, Y_train, batch_size=BATCH_SIZE, epochs=MAX_EPOCHS, validation_split=0.1, 
          callbacks=[early_stopping],
          verbose=2)

Epoch 1/100
10/10 - 0s - loss: 38442954752.0000 - val_loss: 31380062208.0000 - 498ms/epoch - 50ms/step
Epoch 2/100
10/10 - 0s - loss: 38441701376.0000 - val_loss: 31378724864.0000 - 32ms/epoch - 3ms/step
Epoch 3/100
10/10 - 0s - loss: 38439563264.0000 - val_loss: 31376406528.0000 - 30ms/epoch - 3ms/step
Epoch 4/100
10/10 - 0s - loss: 38435811328.0000 - val_loss: 31372591104.0000 - 28ms/epoch - 3ms/step
Epoch 5/100
10/10 - 0s - loss: 38429618176.0000 - val_loss: 31366582272.0000 - 29ms/epoch - 3ms/step
Epoch 6/100
10/10 - 0s - loss: 38420111360.0000 - val_loss: 31357548544.0000 - 28ms/epoch - 3ms/step
Epoch 7/100
10/10 - 0s - loss: 38405730304.0000 - val_loss: 31344609280.0000 - 28ms/epoch - 3ms/step
Epoch 8/100
10/10 - 0s - loss: 38385815552.0000 - val_loss: 31326697472.0000 - 28ms/epoch - 3ms/step
Epoch 9/100
10/10 - 0s - loss: 38358724608.0000 - val_loss: 31302862848.0000 - 30ms/epoch - 3ms/step
Epoch 10/100
10/10 - 0s - loss: 38322651136.0000 - val_loss: 31271968768.0000 - 29ms/epoc

Epoch 82/100
10/10 - 0s - loss: 3847257856.0000 - val_loss: 3455460352.0000 - 27ms/epoch - 3ms/step
Epoch 83/100
10/10 - 0s - loss: 3745942272.0000 - val_loss: 3337888256.0000 - 27ms/epoch - 3ms/step
Epoch 84/100
10/10 - 0s - loss: 3657531136.0000 - val_loss: 3220503296.0000 - 29ms/epoch - 3ms/step
Epoch 85/100
10/10 - 0s - loss: 3569925376.0000 - val_loss: 3118962688.0000 - 29ms/epoch - 3ms/step
Epoch 86/100
10/10 - 0s - loss: 3487816960.0000 - val_loss: 3023263232.0000 - 29ms/epoch - 3ms/step
Epoch 87/100
10/10 - 0s - loss: 3412479744.0000 - val_loss: 2931052544.0000 - 28ms/epoch - 3ms/step
Epoch 88/100
10/10 - 0s - loss: 3340826880.0000 - val_loss: 2845893888.0000 - 29ms/epoch - 3ms/step
Epoch 89/100
10/10 - 0s - loss: 3274717440.0000 - val_loss: 2761804544.0000 - 31ms/epoch - 3ms/step
Epoch 90/100
10/10 - 0s - loss: 3208444928.0000 - val_loss: 2686768384.0000 - 31ms/epoch - 3ms/step
Epoch 91/100
10/10 - 0s - loss: 3148600320.0000 - val_loss: 2616251136.0000 - 29ms/epoch - 3ms/step


<keras.src.callbacks.History at 0x7fc3516c7790>

### Model evaluation

In [22]:
# estimator.fit(X_train_scaled, Y_train)
Y_predicted = model.predict(X_train_scaled)
Y_predicted[Y_predicted < 1] = 1.0
Y_log_predicted = np.log(Y_predicted)
Y_log_train = np.log(Y_train)




In [23]:
rmse = root_mean_squared_error(Y_log_predicted, Y_log_train)
print('RMSE:', rmse)

RMSE: 0.3446343771145354


In [24]:
df_results = pd.DataFrame(data=np.column_stack((Y_train, Y_predicted)), columns=["Y_train", "Y_predicted"])
df_results.head()

Unnamed: 0,Y_train,Y_predicted
0,240000.0,209779.359375
1,97000.0,138057.796875
2,116000.0,127561.117188
3,109500.0,157830.515625
4,159000.0,100057.4375
