In [1]:
import pandas as pd

# Load the Melbourne housing dataset from a CSV file that is expected to sit
# in the notebook's working directory.
melbourne_data = pd.read_csv(filepath_or_buffer='melb_data.csv')

# NOTE: a bare expression that is not the last line of a cell is evaluated
# but not rendered, so the column index below does not appear in the output.
melbourne_data.columns

# Last expression of the cell: rendered as (row count, column count).
melbourne_data.shape

(13580, 21)

In [2]:
# Data cleaning
#
# Remove every row that contains a missing value (axis="index" is the
# label-based spelling of axis=0, i.e. rows are dropped, not columns).
# NOTE(review): the check spans all columns of the frame, not only the ones
# modelled later; the recorded shapes show more than half of the rows being
# discarded here -- confirm that this is intended.
melbourne_data = melbourne_data.dropna(axis="index")

# Shape after cleaning, rendered as (row count, column count).
melbourne_data.shape

(6196, 21)

In [3]:
# Prediction target: the 'Price' column of the cleaned frame, as a Series.
target = melbourne_data.loc[:, 'Price']

In [6]:
# Model inputs: the columns the regressors are trained on.
# The spellings 'Lattitude' and 'Longtitude' are the dataset's own column
# names and must be kept exactly as they are.
features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]

# Restrict the cleaned frame to the feature columns, in the order listed.
inputs = melbourne_data.loc[:, features]

# Summary statistics for each feature column.
inputs.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


In [7]:
# Preview the first five feature rows.
inputs.head(n=5)

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


# Building the model

In [8]:
from sklearn.tree import DecisionTreeRegressor

# Decision-tree regressor; the fixed seed keeps the fitted tree reproducible.
melb_model = DecisionTreeRegressor(
    random_state=1,
)

In [9]:
# Train the decision tree on all cleaned rows: feature columns -> price.
melb_model.fit(X=inputs, y=target)

DecisionTreeRegressor(random_state=1)

In [13]:
# Predict prices for the first five houses.
# These rows are part of the data the model was fitted on, so this is a
# sanity check rather than an estimate of accuracy on unseen houses.
print("Making predictions for the following 5 houses:", inputs.head(), sep="\n")
print("Predicted prices are:", melb_model.predict(inputs.head()), sep="\n")

Making predictions for the following 5 houses:
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954
Predicted prices are:
[1035000. 1465000. 1600000. 1876000. 1636000.]


In [14]:
# Actual prices of the same first five houses, for comparison with the
# predictions printed in the previous cell.
target.head(n=5)

1    1035000.0
2    1465000.0
4    1600000.0
6    1876000.0
7    1636000.0
Name: Price, dtype: float64

# Model validation

In [15]:
from sklearn.metrics import mean_absolute_error

In [16]:
# Predict on the very rows the model was trained on (in-sample predictions).
predicted_home_prices = melb_model.predict(X=inputs)

In [17]:
# In-sample mean absolute error: the predictions were made on the training
# rows themselves, so this does not measure performance on unseen houses.
mean_absolute_error(y_true=target, y_pred=predicted_home_prices)

1115.7467183128902

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out part of the data for validation. The split proportions are left at
# the library defaults; the fixed seed makes the partition reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    inputs,
    target,
    random_state=0,
)

In [20]:
# Fresh decision tree, trained only on the training split so that the
# held-out rows can be used for validation.
# The seed is fixed (same value as the other decision tree in this notebook):
# without it, tie-breaking between equally good splits is random and the
# validation error below changes from run to run.
melbourne_model = DecisionTreeRegressor(random_state=1)

melbourne_model.fit(x_train, y_train)

DecisionTreeRegressor()

In [21]:
# Predict prices for the held-out rows, which the model did not see in training.
val_predictions = melbourne_model.predict(X=x_test)

In [22]:
# Out-of-sample mean absolute error of the decision tree on the held-out rows.
# Compare this with the in-sample error computed earlier: a much larger value
# here means the tree does not generalise well ("bad model").
mean_absolute_error(y_true=y_test, y_pred=val_predictions)

273143.3839035937

# Random Forests

In [23]:
from sklearn.ensemble import RandomForestRegressor

In [24]:
# Random-forest regressor with a fixed seed so the fitted ensemble is
# reproducible.
forest_model = RandomForestRegressor(
    random_state=0,
)

In [25]:
# Train the forest on the training split only.
forest_model.fit(X=x_train, y=y_train)

RandomForestRegressor(random_state=0)

In [26]:
# Forest predictions for the held-out rows.
melb_preds = forest_model.predict(X=x_test)

In [28]:
# Out-of-sample mean absolute error of the random forest, measured on the
# same held-out rows as the decision tree so the two are directly comparable.
mean_absolute_error(y_true=y_test, y_pred=melb_preds)

206868.39967967046