In [1]:
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the dataset from the CSV file in the working directory
csv_path = 'melb_data.csv'
data = pd.read_csv(csv_path)

# Preview the first rows as the cell output
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000,S,Biggin,3/12/2016,2.5,3067,...,1,1.0,202,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019
1,Abbotsford,25 Bloomburg St,2,h,1035000,S,Biggin,4/2/2016,2.5,3067,...,1,0.0,156,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019
2,Abbotsford,5 Charles St,3,h,1465000,SP,Biggin,4/3/2017,2.5,3067,...,2,0.0,134,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019
3,Abbotsford,40 Federation La,3,h,850000,PI,Biggin,4/3/2017,2.5,3067,...,2,1.0,94,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019
4,Abbotsford,55a Park St,4,h,1600000,VB,Nelson,4/6/2016,2.5,3067,...,1,2.0,120,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019


In [3]:
# Target: the Price column
y = data['Price']

# Features: every column except Price
X = data.drop(columns=['Price'])

In [4]:
# Count the missing values in each feature column
X.isna().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [5]:
# Names of the columns that have more than 100 missing values

null_counts = X.isna().sum()
them = null_counts[null_counts > 100].index.tolist()
them

['BuildingArea', 'YearBuilt', 'CouncilArea']

In [6]:
# Drop the columns listed in `them`. X is rebound to the returned frame rather
# than mutated with inplace=True, so the step is an ordinary expression and no
# previously displayed object is changed behind the reader's back.
X = X.drop(columns=them)

In [7]:
# Preview X after the columns in `them` have been dropped
X.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,S,Biggin,3/12/2016,2.5,3067,2,1,1.0,202,-37.7996,144.9984,Northern Metropolitan,4019
1,Abbotsford,25 Bloomburg St,2,h,S,Biggin,4/2/2016,2.5,3067,2,1,0.0,156,-37.8079,144.9934,Northern Metropolitan,4019
2,Abbotsford,5 Charles St,3,h,SP,Biggin,4/3/2017,2.5,3067,3,2,0.0,134,-37.8093,144.9944,Northern Metropolitan,4019
3,Abbotsford,40 Federation La,3,h,PI,Biggin,4/3/2017,2.5,3067,3,2,1.0,94,-37.7969,144.9969,Northern Metropolitan,4019
4,Abbotsford,55a Park St,4,h,VB,Nelson,4/6/2016,2.5,3067,3,1,2.0,120,-37.8072,144.9941,Northern Metropolitan,4019


In [8]:
# Keep just the object-dtype (categorical) columns

sub_set = X.select_dtypes(include=['object'])
sub_set.head()

Unnamed: 0,Suburb,Address,Type,Method,SellerG,Date,Regionname
0,Abbotsford,85 Turner St,h,S,Biggin,3/12/2016,Northern Metropolitan
1,Abbotsford,25 Bloomburg St,h,S,Biggin,4/2/2016,Northern Metropolitan
2,Abbotsford,5 Charles St,h,SP,Biggin,4/3/2017,Northern Metropolitan
3,Abbotsford,40 Federation La,h,PI,Biggin,4/3/2017,Northern Metropolitan
4,Abbotsford,55a Park St,h,VB,Nelson,4/6/2016,Northern Metropolitan


In [9]:
# Number of distinct values in each categorical column
sub_set.nunique()

Suburb          314
Address       13378
Type              3
Method            5
SellerG         268
Date             58
Regionname        8
dtype: int64

In [10]:
# Columns with many unique values.
# The list is derived from each column's cardinality instead of being typed by
# hand, so it follows the data: any categorical column with more than
# `max_categories` distinct values is treated as high-cardinality.

max_categories = 10
bad_cols = [col for col in sub_set.columns if sub_set[col].nunique() > max_categories]
bad_cols

['Suburb', 'Address', 'SellerG', 'Date']

In [11]:
# Discard the high-cardinality columns, leaving the low-cardinality ones
sub_set = sub_set.drop(columns=bad_cols)

sub_set.head()

Unnamed: 0,Type,Method,Regionname
0,h,S,Northern Metropolitan
1,h,S,Northern Metropolitan
2,h,SP,Northern Metropolitan
3,h,PI,Northern Metropolitan
4,h,VB,Northern Metropolitan


In [12]:
# Label encode each of the remaining (low-cardinality) categorical columns

encoder = LabelEncoder()

# The single encoder is re-fitted on every column in turn; the encoded arrays
# are gathered into a new frame that shares sub_set's row index.
encoded_sub = pd.DataFrame(
    {name: encoder.fit_transform(sub_set[name]) for name in sub_set.columns},
    index=sub_set.index,
)

encoded_sub.head()

Unnamed: 0,Type,Method,Regionname
0,0,1,2
1,0,1,2
2,0,3,2
3,0,0,2
4,0,4,2


In [13]:
# Keep only the non-object columns of X

object_cols = X.select_dtypes(include='object').columns
new_X = X.drop(columns=object_cols)

new_X.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,Lattitude,Longtitude,Propertycount
0,2,2.5,3067,2,1,1.0,202,-37.7996,144.9984,4019
1,2,2.5,3067,2,1,0.0,156,-37.8079,144.9934,4019
2,3,2.5,3067,3,2,0.0,134,-37.8093,144.9944,4019
3,3,2.5,3067,3,2,1.0,94,-37.7969,144.9969,4019
4,4,2.5,3067,3,1,2.0,120,-37.8072,144.9941,4019


In [14]:
# Combine the non-object columns of X with the encoded categorical columns.
# The left-hand frame is rebuilt from X instead of reusing new_X, so running
# this cell again does not append the encoded columns a second time.

new_X = pd.concat([X.select_dtypes(exclude='object'), encoded_sub], axis=1)

new_X.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,Lattitude,Longtitude,Propertycount,Type,Method,Regionname
0,2,2.5,3067,2,1,1.0,202,-37.7996,144.9984,4019,0,1,2
1,2,2.5,3067,2,1,0.0,156,-37.8079,144.9934,4019,0,1,2
2,3,2.5,3067,3,2,0.0,134,-37.8093,144.9944,4019,0,3,2
3,3,2.5,3067,3,2,1.0,94,-37.7969,144.9969,4019,0,0,2
4,4,2.5,3067,3,1,2.0,120,-37.8072,144.9941,4019,0,4,2


In [15]:
# Count the missing values still present in the combined feature frame
new_X.isna().sum()

Rooms             0
Distance          0
Postcode          0
Bedroom2          0
Bathroom          0
Car              62
Landsize          0
Lattitude         0
Longtitude        0
Propertycount     0
Type              0
Method            0
Regionname        0
dtype: int64

In [16]:
# Impute the columns that still contain NaNs, using SimpleImputer's default strategy

imputer = SimpleImputer()

# fit_transform returns a bare array, so the column labels AND the row index
# are restored explicitly; keeping the index guarantees imp_X stays aligned
# with y even if the frame does not carry a default 0..n-1 index.
imp_X = pd.DataFrame(
    imputer.fit_transform(new_X),
    columns=new_X.columns,
    index=new_X.index,
)

imp_X.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,Lattitude,Longtitude,Propertycount,Type,Method,Regionname
0,2.0,2.5,3067.0,2.0,1.0,1.0,202.0,-37.7996,144.9984,4019.0,0.0,1.0,2.0
1,2.0,2.5,3067.0,2.0,1.0,0.0,156.0,-37.8079,144.9934,4019.0,0.0,1.0,2.0
2,3.0,2.5,3067.0,3.0,2.0,0.0,134.0,-37.8093,144.9944,4019.0,0.0,3.0,2.0
3,3.0,2.5,3067.0,3.0,2.0,1.0,94.0,-37.7969,144.9969,4019.0,0.0,0.0,2.0
4,4.0,2.5,3067.0,3.0,1.0,2.0,120.0,-37.8072,144.9941,4019.0,0.0,4.0,2.0


In [17]:
# Hold out 20% of the rows for validation, with a fixed seed

X_train, X_valid, y_train, y_valid = train_test_split(
    imp_X,
    y,
    random_state=12,
    test_size=0.2,
)

In [18]:
# Preview the training features produced by the split
X_train.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,Lattitude,Longtitude,Propertycount,Type,Method,Regionname
168,3.0,13.8,3018.0,3.0,1.0,2.0,606.0,-37.8608,144.8124,5301.0,0.0,1.0,6.0
217,4.0,11.1,3025.0,4.0,1.0,4.0,633.0,-37.8375,144.8541,5132.0,0.0,3.0,6.0
5434,3.0,2.6,3121.0,3.0,1.0,1.0,167.0,-37.8167,145.0098,14949.0,0.0,1.0,2.0
5941,3.0,12.6,3020.0,3.0,1.0,3.0,534.0,-37.7801,144.8412,3755.0,0.0,3.0,6.0
6730,3.0,9.1,3040.0,3.0,1.0,1.0,321.0,-37.757,144.896,1543.0,0.0,3.0,6.0


In [19]:
# Baseline: fit a decision tree with no leaf-node limit and score it on the
# validation split

model = DecisionTreeRegressor(random_state=0)

model.fit(X_train, y_train)

predictions = model.predict(X_valid)

# scikit-learn metrics take (y_true, y_pred) in that order
mae = mean_absolute_error(y_valid, predictions)

mae

247238.6537800687

In [20]:
# Find the optimum maximum leaf nodes
# Define a function that returns a different mae depending on the maximum leaf nodes used
# Then use a for loop that loops through an array of different values for leaf nodes

# The function

def get_mae(max_leaf_nodes, X_train, X_valid, y_train, y_test):
    """Fit a decision tree with a leaf-node limit and return its MAE.

    Parameters
    ----------
    max_leaf_nodes : value forwarded to DecisionTreeRegressor(max_leaf_nodes=...).
    X_train, y_train : features and targets the tree is fitted on.
    X_valid : features the fitted tree predicts for.
    y_test : true targets for the rows of X_valid, used to score the predictions.

    Returns
    -------
    The mean absolute error between ``y_test`` and the predictions for ``X_valid``.
    """
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=11)
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)

    # Score against the targets that were passed in (not a notebook-level
    # global), in scikit-learn's (y_true, y_pred) argument order.
    return mean_absolute_error(y_test, predictions)

In [21]:
# Try each candidate leaf-node limit and print the MAE it achieves

report_line = 'The Max_leaf_nodes: {} \t\t\t\t  The mae: {}'

for max_leaf_nodes in (170, 280, 350, 500, 750, 1000):
    the_mae = get_mae(max_leaf_nodes, X_train, X_valid, y_train, y_valid)
    print(report_line.format(max_leaf_nodes, the_mae))

The Max_leaf_nodes: 170 				  The mae: 232415.11989543214
The Max_leaf_nodes: 280 				  The mae: 227359.98841071586
The Max_leaf_nodes: 350 				  The mae: 230870.84335908538
The Max_leaf_nodes: 500 				  The mae: 228339.32247648586
The Max_leaf_nodes: 750 				  The mae: 227300.5770724128
The Max_leaf_nodes: 1000 				  The mae: 231528.00998320745
