In [67]:
# import library
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [16]:
# Dataset path
path = '../datasets/Melbourne_housing/Melbourne_housing_FULL.csv'

In [19]:
# Read in data from CSV as a Pandas dataframe
df = pd.read_csv(path)

In [20]:
# Looking at the top five rows of the dataframe
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [21]:
# Finding a row item
df.iloc[10]

Suburb                      Abbotsford
Address                 129 Charles St
Rooms                                2
Type                                 h
Price                         941000.0
Method                               S
SellerG                         Jellis
Date                         7/05/2016
Distance                           2.5
Postcode                        3067.0
Bedroom2                           2.0
Bathroom                           1.0
Car                                0.0
Landsize                         181.0
BuildingArea                       NaN
YearBuilt                          NaN
CouncilArea         Yarra City Council
Lattitude                     -37.8041
Longtitude                    144.9953
Regionname       Northern Metropolitan
Propertycount                   4019.0
Name: 10, dtype: object

In [22]:
# Priting the columns
df.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [23]:
# Deleting the columns we are not going to work with

del df['Address']
del df['Method']
del df['SellerG']
del df['Date']
del df['Postcode']
del df['Lattitude']
del df['Longtitude']
del df['Regionname']
del df['Propertycount']

In [24]:
df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
0,Abbotsford,2,h,,2.5,2.0,1.0,1.0,126.0,,,Yarra City Council
1,Abbotsford,2,h,1480000.0,2.5,2.0,1.0,1.0,202.0,,,Yarra City Council
2,Abbotsford,2,h,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council
3,Abbotsford,3,u,,2.5,3.0,2.0,1.0,0.0,,,Yarra City Council
4,Abbotsford,3,h,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council


In [25]:
df.shape

(34857, 12)

In [27]:
# There seems to be many columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34857 entries, 0 to 34856
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Suburb        34857 non-null  object 
 1   Rooms         34857 non-null  int64  
 2   Type          34857 non-null  object 
 3   Price         27247 non-null  float64
 4   Distance      34856 non-null  float64
 5   Bedroom2      26640 non-null  float64
 6   Bathroom      26631 non-null  float64
 7   Car           26129 non-null  float64
 8   Landsize      23047 non-null  float64
 9   BuildingArea  13742 non-null  float64
 10  YearBuilt     15551 non-null  float64
 11  CouncilArea   34854 non-null  object 
dtypes: float64(8), int64(1), object(3)
memory usage: 3.2+ MB


In [28]:
df.describe()

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt
count,34857.0,27247.0,34856.0,26640.0,26631.0,26129.0,23047.0,13742.0,15551.0
mean,3.031012,1050173.0,11.184929,3.084647,1.624798,1.728845,593.598993,160.2564,1965.289885
std,0.969933,641467.1,6.788892,0.98069,0.724212,1.010771,3398.841946,401.26706,37.328178
min,1.0,85000.0,0.0,0.0,0.0,0.0,0.0,0.0,1196.0
25%,2.0,635000.0,6.4,2.0,1.0,1.0,224.0,102.0,1940.0
50%,3.0,870000.0,10.3,3.0,2.0,2.0,521.0,136.0,1970.0
75%,4.0,1295000.0,14.0,4.0,2.0,2.0,670.0,188.0,2000.0
max,16.0,11200000.0,48.1,30.0,12.0,26.0,433014.0,44515.0,2106.0


In [30]:
# For the sake of the simplicity we are going to delete all the rows which have any empty value.
# Note: This approch is not a good way to work with this dataset as about 50% of the rows will be deleted.
# But we are doing it just to keep this analysis simple.

df.dropna(axis = 0, how = 'any', subset = None, inplace = True)

In [31]:
df.shape

(8895, 12)

In [36]:
## Oh my God! What we have done here.
# Almost, 75% data is lost. That is why we say cleaning data takes 80-90% of the time in our journey...
#We should have replaced the empty values with either mean, mode or Median.
# But since, it is a beginner level notebook, let us be a beginner and have faith on dropna command lol.

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8895 entries, 2 to 34856
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Suburb        8895 non-null   object 
 1   Rooms         8895 non-null   int64  
 2   Type          8895 non-null   object 
 3   Price         8895 non-null   float64
 4   Distance      8895 non-null   float64
 5   Bedroom2      8895 non-null   float64
 6   Bathroom      8895 non-null   float64
 7   Car           8895 non-null   float64
 8   Landsize      8895 non-null   float64
 9   BuildingArea  8895 non-null   float64
 10  YearBuilt     8895 non-null   float64
 11  CouncilArea   8895 non-null   object 
dtypes: float64(8), int64(1), object(3)
memory usage: 903.4+ KB


In [33]:
df.describe()

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt
count,8895.0,8895.0,8895.0,8895.0,8895.0,8895.0,8895.0,8895.0,8895.0
mean,3.09882,1092524.0,11.202136,3.078134,1.646655,1.692636,523.395166,149.295708,1965.777403
std,0.96359,679206.6,6.815113,0.96607,0.721388,0.975393,1060.940841,87.898565,37.055054
min,1.0,131000.0,0.0,0.0,1.0,0.0,0.0,0.0,1196.0
25%,2.0,640500.0,6.4,2.0,1.0,1.0,212.0,100.0,1945.0
50%,3.0,900000.0,10.2,3.0,2.0,2.0,478.0,132.0,1970.0
75%,4.0,1345000.0,13.9,4.0,2.0,2.0,652.0,180.0,2000.0
max,12.0,9000000.0,47.4,12.0,9.0,10.0,42800.0,3112.0,2019.0


In [41]:
df

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
2,Abbotsford,2,h,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council
4,Abbotsford,3,h,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council
6,Abbotsford,4,h,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra City Council
11,Abbotsford,3,h,1876000.0,2.5,4.0,2.0,0.0,245.0,210.0,1910.0,Yarra City Council
14,Abbotsford,2,h,1636000.0,2.5,2.0,1.0,2.0,256.0,107.0,1890.0,Yarra City Council
...,...,...,...,...,...,...,...,...,...,...,...,...
34847,Wollert,3,h,500000.0,25.5,3.0,2.0,2.0,383.0,118.0,2016.0,Whittlesea City Council
34849,Wollert,3,h,570000.0,25.5,3.0,2.0,2.0,404.0,158.0,2012.0,Whittlesea City Council
34853,Yarraville,2,h,888000.0,6.3,2.0,2.0,1.0,98.0,104.0,2018.0,Maribyrnong City Council
34854,Yarraville,2,t,705000.0,6.3,2.0,1.0,2.0,220.0,120.0,2000.0,Maribyrnong City Council


In [39]:
## Okay as we can see we three columns as categorical columns (Suburb, CouncilArea, Type) .
## Let us do some one-hot encoding on them and change them to numeric values
# To do that, we will get some dummies from pandas. I mean get_dummies method.

In [42]:
df = pd.get_dummies(df, columns = ['Suburb', 'CouncilArea', 'Type'])

In [43]:
df

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Suburb_Abbotsford,...,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Type_h,Type_t,Type_u
2,2,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,True,...,False,False,False,False,False,True,False,True,False,False
4,3,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,True,...,False,False,False,False,False,True,False,True,False,False
6,4,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,True,...,False,False,False,False,False,True,False,True,False,False
11,3,1876000.0,2.5,4.0,2.0,0.0,245.0,210.0,1910.0,True,...,False,False,False,False,False,True,False,True,False,False
14,2,1636000.0,2.5,2.0,1.0,2.0,256.0,107.0,1890.0,True,...,False,False,False,False,False,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34847,3,500000.0,25.5,3.0,2.0,2.0,383.0,118.0,2016.0,False,...,False,False,False,True,False,False,False,True,False,False
34849,3,570000.0,25.5,3.0,2.0,2.0,404.0,158.0,2012.0,False,...,False,False,False,True,False,False,False,True,False,False
34853,2,888000.0,6.3,2.0,2.0,1.0,98.0,104.0,2018.0,False,...,False,False,False,False,False,False,False,True,False,False
34854,2,705000.0,6.3,2.0,1.0,2.0,220.0,120.0,2000.0,False,...,False,False,False,False,False,False,False,False,True,False


In [45]:
## Oh there you go, now we have 360 columns rather than 12 columns.
## Welcome to one-hot encoding world.

In [46]:
# Lastly let us segregate the independent features with dependent variable (our one and only X and y)

X = df.drop('Price', axis = 1)
y = df['Price']

In [49]:
# Now let us split our world. Sorry, I mean our dataset X (I don't mean Twitter here though)
# We will split it in 70/30 split. We will be using Scikit-Learn train_test_split method. And let us able to replicate our splited datasets.
# For that, we will use random_state parameter. (because random_state is the only constant ;p) 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, shuffle = True, random_state = 0)


In [63]:
# Selecting our algorithm and then fine tune our Hyperparameter. (which make us hyper)

model = ensemble.GradientBoostingRegressor(
        n_estimators = 250, # n_estimators -> states the number of decision tree you want to grow in your model. (I am too much)
        learning_rate = 0.1, # Controls the rate at which additional decision trees influence the overall prediction
        max_depth = 5, # Define the depth of your each decision tree (You don't want too big or too small tree right)
        min_samples_split = 10, # Define the minimum sample requires to make a new split in the tree
        min_samples_leaf = 6, # Define the minimum number of samples that must appear in the life before a new branch can be formed
        max_features = 0.6, # Define the total number of featuress presented to the model when determining the best split
        loss = 'huber'
)


In [64]:
# Training our model's biceps (and ofcourse it will take time)
model.fit(X_train, y_train)

In [65]:
# Let us see our model performance in his own world (train dataset)
mae_train = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.2f" % mae_train)

Training Set Mean Absolute Error: 125148.31


In [66]:
# Let us see our model performance in the outer Universe (test dataset)
mae_test = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.2f" % mae_test)

Test Set Mean Absolute Error: 158883.26


In [68]:
# Input Algorithm
model = ensemble.GradientBoostingRegressor()


In [70]:
# Setting up the configurations that we wish to test. To minimize processing time, limit num. of variables or experiment on each hyperparameter saperately.

hyperparameters = {
    'n_estimators' : [200, 300],
    'max_depth' : [8, 6],
    'min_samples_split' : [8, 10],
    'min_samples_leaf' : [5, 6],
    'learning_rate' : [0.01, 0.02],
    'max_features' : [0.8, 0.9],
    'loss' : ['ls', 'lad', 'huber']
}

In [71]:
# Define grid search and learn in parallel by make n_jobs more than one or wait for a movie to end

grid = GridSearchCV(model, hyperparameters, n_jobs = 6)

In [76]:
# Now let us run grid search on training data

grid.fit(X_train, y_train)

640 fits failed out of a total of 960.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/deep/anaconda3/envs/homl3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/deep/anaconda3/envs/homl3/lib/python3.10/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Users/deep/anaconda3/envs/homl3/lib/python3.10/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Users/deep/anaconda3/envs/homl3/lib/python3.10/site-packages/sklearn/utils/_param_validat

In [77]:
# The return gratitude from grid search side (I am optimal hyperparameters)
grid.best_params_

{'learning_rate': 0.02,
 'loss': 'huber',
 'max_depth': 8,
 'max_features': 0.8,
 'min_samples_leaf': 5,
 'min_samples_split': 8,
 'n_estimators': 300}

In [78]:
# Checking model accuracy using optimal hyperparameters
mae_train = mean_absolute_error(y_train, grid.predict(X_train))
print("Training Set Mean Absolute Error: %.2f" % mae_train)

Training Set Mean Absolute Error: 109830.04


In [79]:
mae_test = mean_absolute_error(y_test, grid.predict(X_test))
print("Testing Set Mean Absolute Error: %.2f" % mae_test)

Testing Set Mean Absolute Error: 163529.81
