In [1]:
import pandas as pd, numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math

%load_ext autoreload
%autoreload 2

%matplotlib inline


In [2]:
# Load the raw data; the Date column is parsed into datetimes on read.
# NOTE(review): if prices.csv stores dates day-first (e.g. 3/09/2016), this
# call needs dayfirst=True -- confirm against the raw file.
df_raw = pd.read_csv(
    'prices.csv',
    low_memory=False,
    parse_dates=[ 'Date' ],
)

In [3]:
# A Landsize of 0 is treated as "missing". Missing values are filled with the
# mean Landsize of the row's Suburb; rows that are still NaN afterwards (no
# usable Landsize anywhere in their suburb) are dropped.
landsize = df_raw[ 'Landsize' ].replace( 0, np.nan )
suburb_mean = landsize.groupby( df_raw[ 'Suburb' ] ).transform( 'mean' )
df_raw[ 'Landsize' ] = landsize.fillna( suburb_mean )
df_raw = df_raw.dropna( subset=[ 'Landsize' ] )

In [4]:
# Discard the columns that are not used as model features. Reassigning (rather
# than dropping in place) together with errors='ignore' keeps this cell safe
# to re-run: a second run finds the columns already gone and is a no-op
# instead of raising KeyError.
df_raw = df_raw.drop(
    columns=[ 'Address', 'Method', 'SellerG', 'Propertycount', 'YearBuilt', 'CouncilArea', 'Regionname', 'Distance', 'Rooms' ],
    errors='ignore',
)
# Preview a few rows rather than rendering the entire frame.
df_raw.head( 10 )

Unnamed: 0,Suburb,Type,Price,Date,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Lattitude,Longtitude
0,Abbotsford,h,,2016-03-09,3067.0,2.0,1.0,1.0,126.000000,,-37.80140,144.99580
1,Abbotsford,h,1480000.0,2016-03-12,3067.0,2.0,1.0,1.0,202.000000,,-37.79960,144.99840
2,Abbotsford,h,1035000.0,2016-04-02,3067.0,2.0,1.0,0.0,156.000000,79.0,-37.80790,144.99340
3,Abbotsford,u,,2016-04-02,3067.0,3.0,2.0,1.0,423.166667,,-37.81140,145.01160
4,Abbotsford,h,1465000.0,2017-04-03,3067.0,3.0,2.0,0.0,134.000000,150.0,-37.80930,144.99440
5,Abbotsford,h,850000.0,2017-04-03,3067.0,3.0,2.0,1.0,94.000000,,-37.79690,144.99690
6,Abbotsford,h,1600000.0,2016-04-06,3067.0,3.0,1.0,2.0,120.000000,142.0,-37.80720,144.99410
7,Abbotsford,h,,2016-06-08,3067.0,3.0,2.0,2.0,400.000000,220.0,-37.79650,144.99650
8,Abbotsford,h,,2016-06-08,3067.0,4.0,1.0,2.0,201.000000,,-37.79950,144.99740
9,Abbotsford,h,,2016-06-08,3067.0,3.0,2.0,1.0,202.000000,,-37.79960,144.99890


In [5]:
# Keep only fully populated rows: dropna's defaults already mean "drop a row
# if any of its columns is missing". The copy detaches df from df_raw.
df = df_raw.dropna( ).copy( )

In [6]:
# Break the Date column into numeric year / month / day columns and remove
# the original datetime column from df.
date_column = df.pop( 'Date' )
df[ 'year' ] = date_column.dt.year
df[ 'month' ] = date_column.dt.month
df[ 'day' ] = date_column.dt.day

In [7]:
# Encode the string columns as integer category codes (the model only accepts
# numbers). Each column is converted to a categorical once, and both the
# label -> code lookup and the codes written back to df come from it.
type_as_category = df[ 'Type' ].astype( 'category' )
suburb_as_category = df[ 'Suburb' ].astype( 'category' )

# label -> integer code, in category order
typeCat = { label: code for code, label in enumerate( type_as_category.cat.categories ) }
suburbCat = { label: code for code, label in enumerate( suburb_as_category.cat.categories ) }

df[ 'Type' ] = type_as_category.cat.codes
df[ 'Suburb' ] = suburb_as_category.cat.codes

(typeCat, suburbCat)

({'h': 0, 't': 1, 'u': 2},
 {'Abbotsford': 0,
  'Aberfeldie': 1,
  'Airport West': 2,
  'Albanvale': 3,
  'Albert Park': 4,
  'Albion': 5,
  'Alphington': 6,
  'Altona': 7,
  'Altona Meadows': 8,
  'Altona North': 9,
  'Ardeer': 10,
  'Armadale': 11,
  'Ascot Vale': 12,
  'Ashburton': 13,
  'Ashwood': 14,
  'Aspendale': 15,
  'Aspendale Gardens': 16,
  'Attwood': 17,
  'Avondale Heights': 18,
  'Bacchus Marsh': 19,
  'Balaclava': 20,
  'Balwyn': 21,
  'Balwyn North': 22,
  'Bayswater': 23,
  'Bayswater North': 24,
  'Beaconsfield': 25,
  'Beaconsfield Upper': 26,
  'Beaumaris': 27,
  'Bellfield': 28,
  'Bentleigh': 29,
  'Bentleigh East': 30,
  'Berwick': 31,
  'Black Rock': 32,
  'Blackburn': 33,
  'Blackburn North': 34,
  'Blackburn South': 35,
  'Bonbeach': 36,
  'Boronia': 37,
  'Botanic Ridge': 38,
  'Box Hill': 39,
  'Braybrook': 40,
  'Briar Hill': 41,
  'Brighton': 42,
  'Brighton East': 43,
  'Broadmeadows': 44,
  'Brookfield': 45,
  'Brooklyn': 46,
  'Brunswick': 47,
  'Bruns

In [8]:
# Build the regression target: the natural log of Price, removed from df so
# that only feature columns remain. x is another name for df, not a copy.
y = np.log( df.pop( 'Price' ) ).to_frame( )
x = df

In [9]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# (and therefore every score computed from it) repeatable across kernel
# restarts; without it each run evaluates on a different random subset.
# Splitting the feature frame directly also means x_train / x_test are
# DataFrames from the start instead of first holding index labels.
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=42 )
# Flatten the single-column target frames into 1-D numpy arrays.
y_train = y_train.values.ravel( )
y_test = y_test.values.ravel( )

In [10]:
def rmse( x, y ):
    """Return the root-mean-squared error between x and y.

    Both arguments may be any array-like (list, tuple, numpy array, pandas
    Series); they are converted with np.asarray and compared element by
    element, by position.

    Returns a Python float: sqrt( mean( ( x - y ) ** 2 ) ).
    """
    # np.asarray lets plain Python sequences through; subtracting two lists
    # directly would raise TypeError.
    diff = np.asarray( x ) - np.asarray( y )
    return math.sqrt( ( diff ** 2 ).mean( ) )

def print_score(m):
    """Print a list of fit metrics for model m on the train and test splits.

    The printed list is, in order: train RMSE, test RMSE, m.score on the
    train split, m.score on the test split (for scikit-learn regressors
    score is R^2), and -- only when m has an oob_score_ attribute -- that
    value last. Reads the notebook-level x_train, y_train, x_test, y_test.
    """
    splits = ((x_train, y_train), (x_test, y_test))
    report = [rmse(m.predict(features), target) for features, target in splits]
    report.extend(m.score(features, target) for features, target in splits)
    if hasattr(m, 'oob_score_'):
        report.append(m.oob_score_)
    print(report)

In [11]:
# Create a RandomForestRegressor model to train
mdl = RandomForestRegressor( n_jobs=-1, min_samples_leaf=3, n_estimators=150, oob_score=True )
# mdl = linear_model.LinearRegression( )
%time mdl.fit( x_train, y_train )
# mdl.score( x_train, y_train )
print_score( mdl )

CPU times: user 6.22 s, sys: 41.2 ms, total: 6.26 s
Wall time: 1.05 s
[0.10686743218101069, 0.19746976447034767, 0.9595935974512536, 0.865094650448287, 0.8709846265950575]


In [12]:
# Predict the (log-scale) target for every row of the test split.
y_pred = mdl.predict(x_test)

In [13]:
# The correct price is: undo the natural-log transform applied to the target.
# math.exp is used instead of math.e ** x because it is the more accurate way
# to evaluate e**x.
math.exp( y_test[ 1 ] )

1649999.9999999981

In [14]:
# The predicted price is: undo the natural-log transform on the prediction.
# math.exp is used instead of math.e ** x because it is the more accurate way
# to evaluate e**x.
math.exp( y_pred[ 1 ] )

1298168.4151669494

In [15]:
# df_raw

In [16]:
# Check which container the test targets are held in.
type(y_test)

numpy.ndarray

In [17]:
# Show the log-scale test targets.
y_test

array([13.71015004, 14.31628585, 14.771022  , ..., 13.39999511,
       13.31298374, 13.96393056])

In [18]:
# Show the first test row, one feature per line.
x_test.iloc[0]

Suburb             9.00000
Type               0.00000
Postcode        3025.00000
Bedroom2           3.00000
Bathroom           1.00000
Car                2.00000
Landsize         672.00000
BuildingArea     110.00000
Lattitude        -37.82650
Longtitude       144.84386
year            2017.00000
month              1.00000
day                7.00000
Name: 14232, dtype: float64

In [22]:
# Feature names, in the same order as the values in zz below.
S_ARGS = [
    'Suburb', 'Type', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize',
    'BuildingArea', 'Lattitude', 'Longtitude', 'year', 'month', 'day',
]

# One hand-entered sample record, one value per S_ARGS entry.
zz = [
    9.0,        # Suburb
    0.0,        # Type
    3025.0,     # Postcode
    3.0,        # Bedroom2
    1.0,        # Bathroom
    2.0,        # Car
    672.0,      # Landsize
    110.0,      # BuildingArea
    -37.8265,   # Lattitude
    144.84386,  # Longtitude
    2017.0,     # year
    1.0,        # month
    7.0,        # day
]


In [24]:
# NOTE(review): data and index are not referenced by any other visible cell;
# they look like leftover scratch values -- confirm before removing.
data = ['one', 'two', 'three', 'three']
index = ['one', 'two', 'three', 'four']

# Label the sample record's values with their feature names.
z = pd.Series(data=zz, index=S_ARGS)
z

Suburb             9.00000
Type               0.00000
Postcode        3025.00000
Bedroom2           3.00000
Bathroom           1.00000
Car                2.00000
Landsize         672.00000
BuildingArea     110.00000
Lattitude        -37.82650
Longtitude       144.84386
year            2017.00000
month              1.00000
day                7.00000
dtype: float64

In [None]:
# Map each positional code to the distinct value it stands for in z.
d = { code: value for code, value in enumerate( z.astype( 'category' ).cat.categories ) }

In [None]:
# Reverse lookup: distinct value in z -> its positional category code.
{ value: code for code, value in enumerate( z.astype( 'category' ).cat.categories ) }

In [28]:
# Look up the integer code assigned to the suburb 'Altona'.
suburbCat['Altona']

7