# Model Validation


In [21]:
import importlib
import pandas as pd
import data_preprocessor as dpp
importlib.reload(dpp)

melbourne_data = dpp.process(pd.read_csv('melb_data.csv'))
melbourne_data.describe()


Unnamed: 0,rooms,price,distance,postcode,bedroom2,bathroom,car,landsize,buildingarea,yearbuilt,lattitude,longtitude,propertycount
count,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1068828.0,9.751097,3101.947708,2.902034,1.57634,1.573596,471.00694,141.568645,1964.081988,-37.807904,144.990201,7435.489509
std,0.971079,675156.4,5.612065,86.421604,0.970055,0.711362,0.929947,897.449881,90.834824,38.105673,0.07585,0.099165,4337.698917
min,1.0,131000.0,0.0,3000.0,0.0,1.0,0.0,0.0,0.0,1196.0,-38.16492,144.54237,389.0
25%,2.0,620000.0,5.9,3044.0,2.0,1.0,1.0,152.0,91.0,1940.0,-37.855438,144.926198,4383.75
50%,3.0,880000.0,9.0,3081.0,3.0,1.0,1.0,373.0,124.0,1970.0,-37.80225,144.9958,6567.0
75%,4.0,1325000.0,12.4,3147.0,3.0,2.0,2.0,628.0,170.0,2000.0,-37.7582,145.0527,10175.0
max,8.0,9000000.0,47.4,3977.0,9.0,8.0,10.0,37000.0,3112.0,2018.0,-37.45709,145.52635,21650.0


Let's confirm there are now NaN values...

In [22]:
melbourne_data.columns

Index(['suburb', 'address', 'rooms', 'type', 'price', 'method', 'sellerg',
       'date', 'distance', 'postcode', 'bedroom2', 'bathroom', 'car',
       'landsize', 'buildingarea', 'yearbuilt', 'councilarea', 'lattitude',
       'longtitude', 'regionname', 'propertycount'],
      dtype='object')

In [23]:

for col in melbourne_data.columns:
    print(f'{col} has NaN') if melbourne_data[col].hasnans  else None

## Selecting the Prediction Target

In [24]:
model_features = ['rooms', 'bathroom', 'landsize', 'lattitude', 'buildingarea', 'yearbuilt', 'lattitude', 'longtitude']

y = melbourne_data.price
X = melbourne_data[model_features]

X.describe()

Unnamed: 0,rooms,bathroom,landsize,lattitude,buildingarea,yearbuilt,lattitude.1,longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,141.568645,1964.081988,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,90.834824,38.105673,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,0.0,1196.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,91.0,1940.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,124.0,1970.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,170.0,2000.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,3112.0,2018.0,-37.45709,145.52635


In [25]:
X.head()

Unnamed: 0,rooms,bathroom,landsize,lattitude,buildingarea,yearbuilt,lattitude.1,longtitude
1,2,1.0,156.0,-37.8079,79.0,1900.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,150.0,1900.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,142.0,2014.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,210.0,1910.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,107.0,1890.0,-37.806,144.9954


## Training with Out-of-Sample Data
We should not test our model with the data we used to train it.  We can split the data into two pieces, so the model is trained and tested on different data.  This is called **splitting** the data.

In [27]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    return mean_absolute_error(val_y, preds_val)

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)
melbourne_model = DecisionTreeRegressor(random_state=1)
melbourne_model.fit(train_X, train_y)

for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print(f'Max leaf nodes: {max_leaf_nodes} \t\t Mean Absolute Error: {my_mae}')

Max leaf nodes: 5 		 Mean Absolute Error: 347380.33833344496
Max leaf nodes: 50 		 Mean Absolute Error: 258171.21202406782
Max leaf nodes: 500 		 Mean Absolute Error: 249434.099896096
Max leaf nodes: 5000 		 Mean Absolute Error: 261836.22401549388


The out of sample mean absolute error (MAE) is the average error we see when we use our model on data that it was not trained on.  This tells us how well our model is likely to do when used in the real world. In this case, we performed horribly and we need to improve the model.