In [None]:
import pandas as pd
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# scikit-learn 0.23 removed the vendored `sklearn.externals.joblib`; prefer the
# standalone `joblib` package and fall back only on very old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
# Load the full Melbourne housing listings from the local CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="./data/melbourne-housing-market/Melbourne_housing_FULL.csv")

In [None]:
# Preview the first ten listings; the value of `head` is the cell's output.
# To inspect a single listing instead, index by position (zero-based, so
# position 100 is really the 101st row):
# df.iloc[100]
df.head(n=10)

In [None]:
# Show the dataset's column titles (displayed as the cell's output).
df.columns

In [None]:
# Scrubbing process:
# drop unneeded columns to minimize non-numeric, redundant, and irrelevant info.
# The misspelled 'Lattitude' / 'Longtitude' are kept exactly as the original
# cell spelled them.
columns_to_drop = [
    'Address',
    'Method',
    'SellerG',
    'Date',
    'Postcode',
    'Lattitude',
    'Longtitude',
    'Regionname',
    'Propertycount',
]

# A single `drop` with `errors="ignore"` keeps this cell idempotent: running it
# again after the columns are already gone is a no-op, whereas repeating
# `del df[...]` raises KeyError on the second execution.
df = df.drop(columns=columns_to_drop, errors="ignore")

Remaining 11 independent variables = `x`
12th (dependent) variable (price) = `y`
Decision trees (+ gradient boosting & random forests) are adept at managing large, high-dimensional datasets w/ many variables

In [None]:
## Remove missing values
# Drop every row (axis=0) that contains at least one missing value.
#
# `how` and `thresh` are mutually exclusive in `DataFrame.dropna`: recent pandas
# raises TypeError when both are passed, even as `thresh=None`, so only `how`
# is given here. `subset=None` was the default and is omitted.
# Reassigning (instead of `inplace=True`) yields the same `df` contents and
# keeps the cell safe to re-run.

df = df.dropna(axis=0, how="any")


In [None]:
# One-hot encode the non-numeric columns so every feature is numeric.
# The dummy dtype is pinned to uint8 (the default before pandas 2.0). Newer
# pandas emits `bool` dummies, and mixing bool with the numeric columns makes
# `features_df.values` fall back to a slow `object` array.
categorical_columns = ['Suburb', 'CouncilArea', 'Type']
features_df = pd.get_dummies(df, columns=categorical_columns, dtype="uint8")

In [None]:
# Remove the `Price` column from the feature matrix: it is our dependent
# variable (`y`) and must be separated from the independent `x` variables.
# `drop(..., errors="ignore")` makes re-running this cell a no-op instead of
# the KeyError a second `del features_df['Price']` would raise.
features_df = features_df.drop(columns=['Price'], errors="ignore")

# Added from the bonus chapter on individual property prediction(s).
# Use this temporarily to print the variables in the correct order, then
# copy and paste the list into the second notebook to configure the
# parameters of the target property. The code used to print this list
# of variables can then be removed.
# cols = features_df.columns.tolist()

# print("house_to_value = [")
# for item in cols:
#     print("\t0, "+"#"+item)
# print("]")

In [None]:
# Build the NumPy arrays the estimator consumes:
# `X` holds the independent variables, `y` the sale price to predict.
X, y = features_df.values, df['Price'].values

In [None]:
# Split the dataset into training (70%) and testing (30%) sets.
# Rows are shuffled before splitting; `random_state` pins that shuffle so a
# Restart & Run All reproduces the same split (and therefore the same scores).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [None]:
# Select an algorithm + configure its parameters
# `n_estimators`: int (default=100) => the number of boosting stages (trees) to perform
# `learning_rate`: float, optional (default=0.1) => shrinks the contribution of each tree by learning_rate.
#        There is a trade-off between learning_rate and n_estimators.
# `max_depth`: integer, optional (default=3) => maximum depth of each tree; limits the number of nodes in the tree.
# `min_samples_split`: int, float, optional (default=2) => minimum number of samples required to split an internal node
# `min_samples_leaf`: int, float, optional (default=1) => minimum number of samples required to be at a leaf node
# `max_features`: int, float, string or None, optional (default=None) => number of features to consider when looking for the best split
#        => If float, then max_features is a fraction and int(max_features * n_features) features are considered at each split.
# `loss` => calculates error rate => huber protects against outliers and anomalies
# `random_state` => seeds the estimator's randomness; with `max_features` < 1.0 each split
#        looks at a random subset of features, so without a seed every run fits a different model.
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
model = ensemble.GradientBoostingRegressor(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=30,
    min_samples_split=4,
    min_samples_leaf=6,
    max_features=0.6,
    loss='huber',
    random_state=42,
)

In [None]:
# Train the gradient-boosting model on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Persist the fitted model to disk so it can be reused later
# without having to rebuild (retrain) it.
joblib.dump(value=model, filename='house_trained_model.pk')

In [None]:
# Evaluate results: mean absolute error (MAE) on both splits,
# printed to two decimal places.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print("Training set Mean Absolute Error: %.2f" % train_mae)

test_mae = mean_absolute_error(y_test, model.predict(X_test))
print("Test set Mean Absolute Error: %.2f" % test_mae)

# The original cell stored both scores in a variable misleadingly named `mse`
# (it never held a mean *squared* error) and overwrote the training score.
# Keep that name bound to its final value so any later cell that still reads
# `mse` keeps working.
mse = test_mae

- On average, predictions on the training set missed the actual property value by a mere $28,961.66

        - quite an accurate prediction
        
- Predictions on the test set missed by $172,057.81 on average

        - results are less accurate
        
        - test data => likely to carry slightly different patterns + new outliers & anomalies
        
- a high discrepancy between the training and test data = sign of `overfitting`


- `max_depth = 30` => tends towards overfitting 

- training and test data are shuffled randomly, so the exact error values vary from run to run unless `random_state` is fixed




#### Model Optimization w/ Supervised Learning

- change `max_depth` from `30` to `5`

        => also reduces processing time

- set `n_estimators` to `250` from `150`

        => smaller gap between training and test 
 
- better to test/modify hyperparameters individually 

- when selecting/modifying/keeping/dropping features,
  also better to test/modify individually 
  
  
- an alternative to manual trial and error: 


        - `grid search`
        
                - list a range of configurations to test for each parameter
                
                - each combination is trained and scored (typically w/ cross-validation); the best-scoring configuration is selected as the best model

        
        
        
        