In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.externals import joblib

In [38]:
df = pd.read_csv("./data/melbourne-housing-market/Melbourne_housing_FULL.csv")

In [39]:
# Get 1st 10 items in dataframe
df.head(10)
# Get # 100th (which is really 101...zero indexing...) listing
# df.iloc[100]

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0
5,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra City Council,-37.7969,144.9969,Northern Metropolitan,4019.0
6,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra City Council,-37.8072,144.9941,Northern Metropolitan,4019.0
7,Abbotsford,16 Maugie St,4,h,,SN,Nelson,6/08/2016,2.5,3067.0,...,2.0,2.0,400.0,220.0,2006.0,Yarra City Council,-37.7965,144.9965,Northern Metropolitan,4019.0
8,Abbotsford,53 Turner St,2,h,,S,Biggin,6/08/2016,2.5,3067.0,...,1.0,2.0,201.0,,1900.0,Yarra City Council,-37.7995,144.9974,Northern Metropolitan,4019.0
9,Abbotsford,99 Turner St,2,h,,S,Collins,6/08/2016,2.5,3067.0,...,2.0,1.0,202.0,,1900.0,Yarra City Council,-37.7996,144.9989,Northern Metropolitan,4019.0


In [40]:
# print dtatset's column titles 
df.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [41]:
# Scrubbing process
# delete unneeded columns - minimize non-numberic info + redundancy + irrelevant info
# ignore misspellings for now...
del df['Address']
del df['Method']
del df['SellerG']
del df['Date']
del df['Postcode']
del df['Lattitude']
del df['Longtitude']
del df['Regionname']
del df['Propertycount']

Remaining 11 independent variables = `x`
12th (dependent) variable (price) = `y`
Decision Trees (+ gradient boositng & random forsests) are adept @ managing lg + high dimensional datasets w/ many variables

In [42]:
## Remove missing values
## Ignore for now...for simplicity?
# Pandas method to remove rows w/ missing values 

df.dropna(axis = 0, how = 'any', thresh = None, subset = None,inplace = True)


In [43]:
# One hot encode non-numeric data 
features_df = pd.get_dummies(df, columns = ['Suburb', 'CouncilArea', 'Type'])

In [44]:
# remove `Price` column because = our dependent variable and needs to be separated
# from independent `x` variables
del features_df['Price']

In [45]:
# create `x` & `y` arrays
X = features_df.values 
y = df['Price'].values 

In [46]:
# Split dataset into training + testing 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, shuffle = True)

In [47]:
# Select an algorithm + configure its parameters 
# `n_estimators`: int (default=100) => The number of boosting stages (trees) to perform
# `learning_rate`: float, optional (default=0.1) => shrinks the contribution of each tree by learning_rate. There is a trade-off between learning_rate and n_estimators.
# `max_depth`:  integer, optional (default=3) =limits the number of nodes in the tree.
# `min_samples_split`:  int, float, optional (default=2) =>  minimum number of samples required to split an internal node:
# `min_samples_leaf`: int, float, optional (default=1) => minimum number of samples required to be at a leaf node
# `max_features`: int, float, string or None, optional (default=None) => number of features to consider when looking for the best split
#        => If float, then max_features is a fraction and int(max_features * n_features) features are considered at each split.
# `loss` => calculates error rate => huber protects against outliers and anomolies
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
model = ensemble.GradientBoostingRegressor(
    n_estimators = 150,
    learning_rate = 0.1,
    max_depth = 30,
    min_samples_split = 4,
    min_samples_leaf = 6,
    max_features = 0.6,
    loss = 'huber'
)

In [48]:
# Commence model training process 
model.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='huber', max_depth=30,
             max_features=0.6, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=6, min_samples_split=4,
             min_weight_fraction_leaf=0.0, n_estimators=150,
             n_iter_no_change=None, presort='auto', random_state=None,
             subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [49]:
# Save training model as a file ...enables future use w/out having to rebuild model
joblib.dump(model, 'house_trained_model.pk')

['house_trained_model.pk']

In [50]:
# Evaluate Results + check for accuracy (up to 2 decimal places)
mse = mean_absolute_error(y_train, model.predict(X_train))
print("Training set Mean Absolute Error: %.2f" % mse)

mse = mean_absolute_error(y_test, model.predict(X_test))
print("Test set Mean Absolute Error: %.2f" % mse)

Training set Mean Absolute Error: 28961.66
Test set Mean Absolute Error: 172057.81


- On average the training set miscalulated the actual property value by a mere $28,961.66

        - quite an accurate prediction
        
- The test set miscalulated by $172,057.81

        - results are less accurate
        
        - test data => likely to carry slightly different patterns + new outliers & anomolies
        
- a high discrepancy between the training and test data = sign of `overfitting`


- `max_depth = 30` => tends towards overfitting 

- training and test data are shuffled randomly




#### Model Optimization w/ Supervised Learning

- change `max_depth` from `30` to `5`

        => also reduces processing time

- set `n_estimators` to `250` from `150`

        => smaller gap between training and test 
 
- better to test/modify hyperparemeters individually 

- when selecting/modifying/keeping/dropping features,
  also better to test/modify individually 
  
  
- an alternative to manual trial and error: 


        - `grid search`
        
                - list a range of configurations to test for each parameter
                
                - voting process decides best model 

        
        
        
        