In [70]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
FILE_PATH = "/home/neo/Documents/Projects/melbourne housing/"

In [3]:
df_melbourne = pd.read_csv(FILE_PATH + "Melbourne_housing_FULL.csv")

In [8]:
df_melbourne.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,2.0,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,3.0,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [10]:
print("number of training examples: {}".format(len(df_melbourne)))
df_melbourne = df_melbourne.dropna(axis=0)
print("number of training examples after removing na: {}".format(len(df_melbourne)))

number of training examples: 34857
number of training examples after removing na: 8887


In [14]:
y = df_melbourne.Price

In [64]:
features = ['Rooms','Distance','Bedroom2',
           'Bathroom','Car','Landsize','BuildingArea',
            'YearBuilt','Lattitude','Longtitude']
X = df_melbourne[features]

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33, 
                                                    random_state=42)

## Build a quick model - Linear regression

In [71]:
# Create linear regression object
regr = RandomForestRegressor()

# Train the model using the training sets
regr.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = regr.predict(X_test)

print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % r2_score(y_test, y_pred))



Mean squared error: 80437181617.35
Coefficient of determination: 0.80


## Model Validation

In [72]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [73]:
regr = RandomForestRegressor()
regr.fit(X_train_scaled, y_train)
y_pred = regr.predict(X_test_scaled)

print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f'
      % r2_score(y_test, y_pred))



Mean squared error: 75238807519.77
Coefficient of determination: 0.81


In [75]:
from sklearn.metrics import mean_absolute_error

def get_mae(max_leaf_nodes, X_train, X_test, y_train, y_test):
    model = RandomForestRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(X_train, y_train)
    preds_val = model.predict(X_test)
    mae = mean_absolute_error(y_test, preds_val)
    return(mae)

# compare MAE with differing values of max_leaf_nodes
for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, X_train, X_test, y_train, y_test)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))



Max leaf nodes: 5  		 Mean Absolute Error:  341385
Max leaf nodes: 50  		 Mean Absolute Error:  208175




Max leaf nodes: 500  		 Mean Absolute Error:  173734




Max leaf nodes: 5000  		 Mean Absolute Error:  171641


This is an improvement over the deision tree model. Lets try to use the features now which we ignored earlier because of missing values

In [None]:
df_melbourne = pd.read_csv(FILE_PATH + "Melbourne_housing_FULL.csv")