In [1]:
import pandas as pd

In [3]:
# save filepath to variable for
file_path = './Housing Price ML Project/melb_data.csv/melb_data.csv'
# read the data and store data in DataFrame
mel_df = pd.read_csv(file_path)
# print summary of data
print(mel_df.describe())

         Unnamed: 0         Rooms         Price      Distance      Postcode  \
count  18396.000000  18396.000000  1.839600e+04  18395.000000  18395.000000   
mean   11826.787073      2.935040  1.056697e+06     10.389986   3107.140147   
std     6800.710448      0.958202  6.419217e+05      6.009050     95.000995   
min        1.000000      1.000000  8.500000e+04      0.000000   3000.000000   
25%     5936.750000      2.000000  6.330000e+05      6.300000   3046.000000   
50%    11820.500000      3.000000  8.800000e+05      9.700000   3085.000000   
75%    17734.250000      3.000000  1.302000e+06     13.300000   3149.000000   
max    23546.000000     12.000000  9.000000e+06     48.100000   3978.000000   

           Bedroom2      Bathroom           Car       Landsize  BuildingArea  \
count  14927.000000  14925.000000  14820.000000   13603.000000   7762.000000   
mean       2.913043      1.538492      1.615520     558.116371    151.220219   
std        0.964641      0.689311      0.955916 

In [4]:
# print data columns
print(mel_df.columns)

Index(['Unnamed: 0', 'Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method',
       'SellerG', 'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom',
       'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea',
       'Lattitude', 'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')


In [5]:
# Get only the price data
mel_price_data = mel_df.Price
print(mel_price_data.head())

0    1480000.0
1    1035000.0
2    1465000.0
3     850000.0
4    1600000.0
Name: Price, dtype: float64


In [6]:
# Select multiple columns
sel_col = ['Landsize', 'BuildingArea']
two_columns_of_data = mel_df[sel_col]
two_columns_of_data.describe()

Unnamed: 0,Landsize,BuildingArea
count,13603.0,7762.0
mean,558.116371,151.220219
std,3987.326586,519.188596
min,0.0,0.0
25%,176.5,93.0
50%,440.0,126.0
75%,651.0,174.0
max,433014.0,44515.0


In [7]:
# drop missing value from mel_df
mel_df = mel_df.dropna(axis=0)

# set prediction target. Common convention, y is use to refer to prediction target
y = mel_df.Price

# Select predictors. Use all/any variables except the target
mel_predictors = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea',
                'YearBuilt', 'Lattitude', 'Longtitude']

# let x be the value of the predictors
x = mel_df[mel_predictors]

In [8]:
# import skilearn library for ML
from sklearn.tree import DecisionTreeRegressor

In [9]:
# define model
mel_model = DecisionTreeRegressor() # This is one of the ML model provided in sklearn lib. The tutorial use this model

# Fit model
mel_model.fit(x, y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [13]:
print("Making predictions for the following 5 houses:")
print(x.head(25))
print("The predictions are...")
print(mel_model.predict(x.head(25)))

Making predictions for the following 5 houses:
    Rooms  Bathroom  Landsize  BuildingArea  YearBuilt  Lattitude  Longtitude
1       2       1.0     156.0          79.0     1900.0   -37.8079    144.9934
2       3       2.0     134.0         150.0     1900.0   -37.8093    144.9944
4       4       1.0     120.0         142.0     2014.0   -37.8072    144.9941
6       3       2.0     245.0         210.0     1910.0   -37.8024    144.9993
7       2       1.0     256.0         107.0     1890.0   -37.8060    144.9954
11      2       1.0     220.0          75.0     1900.0   -37.8010    144.9989
16      3       2.0     214.0         190.0     2005.0   -37.8085    144.9964
17      2       2.0       0.0          94.0     2009.0   -37.8078    144.9965
22      2       1.0     238.0          97.0     1890.0   -37.8090    144.9976
23      3       2.0     113.0         110.0     1880.0   -37.8056    144.9930
25      3       1.0     138.0         105.0     1890.0   -37.8021    144.9965
27      2       1

In [14]:
# Now will calculate Mean Absolute Error (mae)
# Import sklearn library for the calculation
from sklearn.metrics import mean_absolute_error

In [15]:
predicted_home_price = mel_model.predict(x)
mean_absolute_error(y, predicted_home_price)

434.71594577146544

In [16]:
# split data into training and test set. sklearn has library to automatically does that
from sklearn.model_selection import train_test_split

In [17]:
# The split is based on random number generator.
train_X, val_X, train_y, val_y = train_test_split(x, y, random_state = 0)

In [19]:
# do the same thing again, taking the training set and fit it into the DecisionTreeRegressor model
mel_model.fit(train_X, train_y)

# get predicted prices on validation data
val_predict = mel_model.predict(val_X)
print(mean_absolute_error(val_y, val_predict))

259976.2369270497


In [21]:
# Writing a function to return "mae" based on value of max_leaf_modes
def get_mae(max_leaf_node, predictors_train, predictors_val, targ_train, targ_val):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_node, random_state=0)
    model.fit(predictors_train, targ_train)
    preds_val  = model.predict(predictors_val)
    mae = mean_absolute_error(targ_val, preds_val)
    return(mae)

In [22]:
for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d \t\t Mean Absolute Error: %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 5 		 Mean Absolute Error: 347380
Max leaf nodes: 50 		 Mean Absolute Error: 257829
Max leaf nodes: 500 		 Mean Absolute Error: 243176
Max leaf nodes: 5000 		 Mean Absolute Error: 254915


In [23]:
# Instead of using DecisionTreeRegressor which potentially face issue with under- or overfitting
# Here will use another algorithm - random forest; which makes prediction by averaging the prediction of each component tree
# Import RandomForestRegressor from sklearn library
from sklearn.ensemble import RandomForestRegressor

In [25]:
forest_model = RandomForestRegressor()
forest_model.fit(train_X, train_y)
melb_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, melb_preds))

203266.4496449322
