In [1]:
import pandas as pd, numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math

%load_ext autoreload
%autoreload 2

%matplotlib inline


In [2]:
# Load the raw data set; the 'Date' column is parsed into datetimes while reading.
# NOTE(review): if prices.csv stores dates day-first (e.g. 3/09/2016), parse_dates
# alone may misread them -- confirm the format and consider dayfirst=True.
df_raw = pd.read_csv(
    'prices.csv',
    low_memory=False,
    parse_dates=['Date'],
)

In [3]:
# Landsize clean-up, in three steps:
#   1. a Landsize of 0 is treated as missing,
#   2. missing values are filled with the mean Landsize of the row's Suburb,
#   3. rows that still have no Landsize (no usable value in the suburb) are dropped.
df_raw = (
    df_raw
    .assign(Landsize=lambda d: d['Landsize'].replace(0, np.nan))
    .assign(
        Landsize=lambda d: d['Landsize'].fillna(
            d.groupby('Suburb')['Landsize'].transform('mean')
        )
    )
    .dropna(subset=['Landsize'])
)

In [4]:
# Discard columns that are not used as model features.
# The result is re-bound (no inplace=True) and errors='ignore' skips columns that are
# already gone, so running this cell a second time is a no-op instead of a KeyError.
df_raw = df_raw.drop(
    columns=['Address', 'Method', 'SellerG', 'Propertycount'],
    errors='ignore',
)

In [5]:
# Keep only rows in which every column has a value. The explicit copy makes `df`
# an independent frame, so later edits to it never touch `df_raw`.
df = df_raw.dropna(axis='index', how='any').copy()

In [6]:
# Expand the sale date into numeric year / month / day columns and then
# remove the original datetime column.
df = (
    df
    .assign(
        year=df['Date'].dt.year,
        month=df['Date'].dt.month,
        day=df['Date'].dt.day,
    )
    .drop(columns='Date')
)

In [7]:
# The model only accepts numeric input, so each string column is converted to a
# pandas categorical and replaced by its integer category codes.
for col in ('Type', 'Suburb', 'CouncilArea', 'Regionname'):
    df[col] = df[col].astype('category').cat.codes

In [8]:
# Separate the target from the features.
# Price is removed from `df` and log-transformed; `y` is a one-column frame
# holding log(Price), and `x` is the remaining feature frame (the same object as `df`).
y = np.log(df.pop('Price')).to_frame()
x = df

In [9]:
# Hold out 20% of the rows as a validation set.
# random_state is pinned so the same rows land in each set on every run; without
# it the split (and every score computed from it) changes each time the cell runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
# Flatten the one-column target frames into 1-D arrays, as expected by fit()/score().
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

In [10]:
def rmse(x, y):
    """Return the root-mean-square error between `x` and `y` as a Python float.

    Both arguments must support element-wise subtraction, and the squared
    difference must expose a ``.mean()`` method (e.g. NumPy arrays).
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

def print_score(m):
    """Print a list of fit-quality numbers for the fitted model `m`.

    The printed list contains, in order:
      1. RMSE of the predictions on the training set,
      2. RMSE of the predictions on the validation set,
      3. ``m.score`` on the training set,
      4. ``m.score`` on the validation set (for scikit-learn regressors
         ``score`` is the coefficient of determination, R^2),
      5. ``m.oob_score_``, only when the model has that attribute.

    Reads the notebook-level variables x_train, y_train, x_test and y_test.
    """
    train_rmse = rmse(m.predict(x_train), y_train)
    test_rmse = rmse(m.predict(x_test), y_test)
    train_score = m.score(x_train, y_train)
    test_score = m.score(x_test, y_test)

    res = [train_rmse, test_rmse, train_score, test_score]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [12]:
# Create a RandomForestRegressor model to train
mdl = RandomForestRegressor( n_jobs=-1, min_samples_leaf=3, n_estimators=150, oob_score=True )
# mdl = linear_model.LinearRegression( )
%time mdl.fit( x_train, y_train )
# mdl.score( x_train, y_train )
print_score( mdl )

CPU times: user 7.16 s, sys: 8.62 ms, total: 7.17 s
Wall time: 1.13 s
[0.10697462884387782, 0.17864988594059672, 0.9599445807197918, 0.8863113255558459, 0.8705848633945432]


In [15]:
# Compute the model's predictions (still in log-price units) for the validation set.
y_pred = mdl.predict(x_test)

In [19]:
# The actual sale price of the second validation row, converted back from log scale.
# math.exp(x) is used instead of math.e ** x because it is the more accurate
# inverse of the log transform (it avoids round-off such as 2204999.999999998).
math.exp(y_test[1])

2204999.999999998

In [20]:
# The predicted sale price of the second validation row, converted back from log scale.
# math.exp(x) is used instead of math.e ** x because it is the more accurate
# inverse of the log transform applied to the target.
math.exp(y_pred[1])

2023300.6638451174

In [18]:
# df_raw