In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23, so importing it unconditionally breaks this first cell on current
# installs. Prefer the standalone `joblib` package and fall back to the
# vendored copy only when the standalone package is not available.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [2]:
# Read the Melbourne housing CSV (path is relative to the working directory)
# into the dataframe `df`.
df = pd.read_csv(
    filepath_or_buffer="./data/melbourne-housing-market/Melbourne_housing_FULL.csv",
)

In [3]:
# Preview the first 10 rows of the dataframe.
# To inspect a single listing by position instead (position 100 is the 101st
# row, because positions are zero-based), run:
# df.iloc[100]
df.head(n=10)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0
5,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra City Council,-37.7969,144.9969,Northern Metropolitan,4019.0
6,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra City Council,-37.8072,144.9941,Northern Metropolitan,4019.0
7,Abbotsford,16 Maugie St,4,h,,SN,Nelson,6/08/2016,2.5,3067.0,...,2.0,2.0,400.0,220.0,2006.0,Yarra City Council,-37.7965,144.9965,Northern Metropolitan,4019.0
8,Abbotsford,53 Turner St,2,h,,S,Biggin,6/08/2016,2.5,3067.0,...,1.0,2.0,201.0,,1900.0,Yarra City Council,-37.7995,144.9974,Northern Metropolitan,4019.0
9,Abbotsford,99 Turner St,2,h,,S,Collins,6/08/2016,2.5,3067.0,...,2.0,1.0,202.0,,1900.0,Yarra City Council,-37.7996,144.9989,Northern Metropolitan,4019.0


In [4]:
# Display the dataset's column names
df.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [5]:
# Scrubbing process
# Drop the columns that are not needed as model inputs, to minimize
# non-numeric, redundant and irrelevant information.
# The misspelled names ('Lattitude', 'Longtitude') are the dataset's own
# column names and must be written exactly as they appear in `df.columns`.
columns_to_drop = [
    'Address',
    'Method',
    'SellerG',
    'Date',
    'Postcode',
    'Lattitude',
    'Longtitude',
    'Regionname',
    'Propertycount',
]

# A single `drop` call replaces nine separate `del` statements. It returns a
# new frame, so if any listed column is missing a KeyError is raised before
# anything is removed (no half-scrubbed dataframe), and the frame that was
# loaded from the CSV is not mutated in place.
df = df.drop(columns=columns_to_drop)

In [6]:
## Remove missing values
# Drop every row that has a missing value in any of the remaining columns.
#
# Only `how` is passed: pandas 1.5 and later raise
# "TypeError: You cannot set both the how and thresh arguments at the same
# time" when `thresh` is supplied together with `how`, even as `thresh=None`.
# `subset` is left at its default (all columns are checked).
# The result is assigned back to `df` instead of using `inplace=True`, so
# re-running this cell is harmless.
df = df.dropna(axis=0, how='any')


In [7]:
# One-hot encode the non-numeric columns: each listed column is replaced by
# one indicator column per distinct value it contains.
features_df = pd.get_dummies(
    data=df,
    columns=['Suburb', 'CouncilArea', 'Type'],
)

In [8]:
# `Price` is the dependent variable (y), so it must not remain among the
# independent (x) variables: drop it from the feature frame in place.
features_df.drop(columns=['Price'], inplace=True)

In [9]:
# Added from the bonus chapter on individual property prediction(s).
# Temporary helper: prints a `house_to_value` list template containing one
# `0` entry per feature column, in column order, each followed by the column
# name as a comment. Copy the printed list into the second notebook to
# configure the parameters of the target property; this cell can then be
# removed.
cols = features_df.columns.tolist()

template_lines = ["house_to_value = ["]
template_lines.extend("\t0, " + "#" + column_name for column_name in cols)
template_lines.append("]")
print("\n".join(template_lines))

house_to_value = [
	0, #Rooms
	0, #Distance
	0, #Bedroom2
	0, #Bathroom
	0, #Car
	0, #Landsize
	0, #BuildingArea
	0, #YearBuilt
	0, #Suburb_Abbotsford
	0, #Suburb_Aberfeldie
	0, #Suburb_Airport West
	0, #Suburb_Albanvale
	0, #Suburb_Albert Park
	0, #Suburb_Albion
	0, #Suburb_Alphington
	0, #Suburb_Altona
	0, #Suburb_Altona Meadows
	0, #Suburb_Altona North
	0, #Suburb_Ardeer
	0, #Suburb_Armadale
	0, #Suburb_Ascot Vale
	0, #Suburb_Ashburton
	0, #Suburb_Ashwood
	0, #Suburb_Aspendale
	0, #Suburb_Aspendale Gardens
	0, #Suburb_Attwood
	0, #Suburb_Avondale Heights
	0, #Suburb_Bacchus Marsh
	0, #Suburb_Balaclava
	0, #Suburb_Balwyn
	0, #Suburb_Balwyn North
	0, #Suburb_Bayswater
	0, #Suburb_Bayswater North
	0, #Suburb_Beaconsfield
	0, #Suburb_Beaconsfield Upper
	0, #Suburb_Beaumaris
	0, #Suburb_Bellfield
	0, #Suburb_Bentleigh
	0, #Suburb_Bentleigh East
	0, #Suburb_Berwick
	0, #Suburb_Black Rock
	0, #Suburb_Blackburn
	0, #Suburb_Blackburn North
	0, #Suburb_Blackburn South
	0, #Suburb_Bonbeach
	0,

This will print all variables in the correct order (as shown in Figure 1) with a default value of zero. Next, we can copy and paste the list and add it to the second notebook to configure the parameters of the target property. The code used to print this list of variables can then be removed.