# Home Prices in Melbourne

In [1]:
import pandas as pd

In [4]:
import os

# Location of the Melbourne housing CSV. The original local path is kept as the
# default, but it can be overridden by setting the MELB_DATA_FILE environment
# variable so the notebook also runs on machines without a D:/Kaggle folder.
inputFile = os.environ.get("MELB_DATA_FILE", "D:/Kaggle/melb_data.csv")
inputData = pd.read_csv(inputFile)
# Summary statistics of the numeric columns (count, mean, std, quartiles, ...)
inputData.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


## Find the Home with the maximum building area

In [8]:
# Row label of the record whose BuildingArea is the largest
largestAreaLabel = inputData["BuildingArea"].idxmax()
# Full record (all columns) for that home
homeWithLargestArea = inputData.loc[largestAreaLabel]
homeWithLargestArea

Suburb                New Gisborne
Address             71 Hamilton Rd
Rooms                            5
Type                             h
Price                    1.355e+06
Method                           S
SellerG                      Raine
Date                    23/09/2017
Distance                      48.1
Postcode                      3438
Bedroom2                         5
Bathroom                         3
Car                              5
Landsize                     44500
BuildingArea                 44515
YearBuilt                      NaN
CouncilArea                    NaN
Lattitude                 -37.4539
Longtitude                 144.589
Regionname       Northern Victoria
Propertycount                  849
Name: 13245, dtype: object

## Newest Home

In [31]:
# Record with the most recent construction year
newestHomeLabel = inputData["YearBuilt"].idxmax()
newestHome = inputData.loc[newestHomeLabel]
# Cast to int first so the year is formatted without a decimal part
yearAsString = str(int(newestHome["YearBuilt"]))
# A bare year parses to January 1st of that year
builtOn = pd.to_datetime(yearAsString, format="%Y")
# Time elapsed between that date and today
pd.to_datetime("today") - builtOn

Timedelta('299 days 00:00:00')

## Show columns

In [32]:
# List every column label of the loaded dataset (candidates for features/target)
inputData.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

## Drop records with missing values

In [33]:
# Keep only the rows that have a value in every column; inputData itself is left untouched
cleanedData = inputData.dropna(axis="index", how="any")

## Set the prediction target

In [34]:
# The sale price is the value the models will learn to predict
predictionTarget = cleanedData["Price"]

## Set the input features

In [39]:
# Columns used as model inputs (spelled exactly as they appear in the dataset)
inputFeatures = ["Rooms", "Bathroom", "Landsize", "Lattitude", "Longtitude"]
# NOTE(review): the name `input` shadows Python's built-in input() for the rest of
# the notebook; later cells reference it, so renaming must be done everywhere at once.
input = cleanedData[inputFeatures]
# Preview the first rows of the feature table
input.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


## Define model. Specify a number for random_state to ensure same results each run


In [40]:
from sklearn.tree import DecisionTreeRegressor
# Unfitted decision tree; random_state=1 pins the seed so each run gives the same results
melbourne_model = DecisionTreeRegressor(random_state=1)

## Fit the model

In [41]:
# Train on the full cleaned dataset: features -> sale price
melbourne_model.fit(X=input, y=predictionTarget)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

## Make predictions

In [42]:
# Reuse one preview frame for both the printout and the prediction call
firstFiveHomes = input.head()
print("Making predictions for the following 5 houses:")
print(firstFiveHomes)
print("The predictions are")
print(melbourne_model.predict(firstFiveHomes))

Making predictions for the following 5 houses:
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954
The predictions are
[1035000. 1465000. 1600000. 1876000. 1636000.]


## Calculate the mean absolute error

In [43]:
from sklearn.metrics import mean_absolute_error

# Predict on the very same rows the model was fitted on, so this is an
# in-sample error, not an estimate of performance on unseen homes.
predicted_home_prices = melbourne_model.predict(input)
mean_absolute_error(y_true=predictionTarget, y_pred=predicted_home_prices)

1115.7467183128902

## Split data into training and validation data

In [46]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets, for both features and target.
# The split is based on a random number generator; supplying a numeric value to
# random_state guarantees we get the same split every time this cell runs.
train_input, validation_input, train_prediction, validation_prediction = train_test_split(
    input, predictionTarget, random_state=0
)

# Define the model. The tree is seeded too (same seed as the earlier
# melbourne_model) so the validation error printed below is reproducible;
# a fixed split alone does not make an unseeded tree deterministic.
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit on the training portion only
melbourne_model.fit(train_input, train_prediction)

# Mean absolute error of the predictions on the held-out validation data
validated_predictions = melbourne_model.predict(validation_input)
print(mean_absolute_error(validation_prediction, validated_predictions))

273439.7015278674


## Experimenting With Different Models
## Avoid overfitting and underfitting

In [49]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
def get_mean_absolute_error(max_leaf_nodes, train_input, validation_input, train_prediction, validation_prediction):
    """Return the validation mean absolute error of a size-limited decision tree.

    A new DecisionTreeRegressor with the given ``max_leaf_nodes`` (and
    random_state=0) is fitted on ``train_input`` / ``train_prediction``; its
    predictions for ``validation_input`` are then scored against
    ``validation_prediction`` with mean_absolute_error.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_input, train_prediction)
    return mean_absolute_error(validation_prediction, tree.predict(validation_input))

# Compare the validation error of trees limited to different numbers of leaves
leafCountCandidates = [5, 50, 500, 5000]
for max_leaf_nodes in leafCountCandidates:
    calculated_mean_absolute_error = get_mean_absolute_error(
        max_leaf_nodes,
        train_input,
        validation_input,
        train_prediction,
        validation_prediction,
    )
    # %d truncates the error to a whole number for display
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, calculated_mean_absolute_error))

Max leaf nodes: 5  		 Mean Absolute Error:  385696
Max leaf nodes: 50  		 Mean Absolute Error:  279794
Max leaf nodes: 500  		 Mean Absolute Error:  261769
Max leaf nodes: 5000  		 Mean Absolute Error:  272464


## Using many decision trees with a Random Forest

In [50]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Seeded random forest, trained on the same training split as the single trees
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(X=train_input, y=train_prediction)

# Score the forest on the held-out validation data
forrest_validated_predictions = forest_model.predict(validation_input)
forest_validation_mae = mean_absolute_error(validation_prediction, forrest_validated_predictions)
print(forest_validation_mae)

218416.94207015279
