In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor

import category_encoders as ce

from sklearn.metrics import mean_absolute_error

In [2]:
# Load the housing dataset from the CSV next to the notebook and preview it.
csv_path = 'melb_data.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000,S,Biggin,3/12/2016,2.5,3067,...,1,1.0,202,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019
1,Abbotsford,25 Bloomburg St,2,h,1035000,S,Biggin,4/2/2016,2.5,3067,...,1,0.0,156,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019
2,Abbotsford,5 Charles St,3,h,1465000,SP,Biggin,4/3/2017,2.5,3067,...,2,0.0,134,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019
3,Abbotsford,40 Federation La,3,h,850000,PI,Biggin,4/3/2017,2.5,3067,...,2,1.0,94,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019
4,Abbotsford,55a Park St,4,h,1600000,VB,Nelson,4/6/2016,2.5,3067,...,1,2.0,120,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019


In [3]:
# Print dtypes and non-null counts per column to spot incomplete columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
Suburb           13580 non-null object
Address          13580 non-null object
Rooms            13580 non-null int64
Type             13580 non-null object
Price            13580 non-null int64
Method           13580 non-null object
SellerG          13580 non-null object
Date             13580 non-null object
Distance         13580 non-null float64
Postcode         13580 non-null int64
Bedroom2         13580 non-null int64
Bathroom         13580 non-null int64
Car              13518 non-null float64
Landsize         13580 non-null int64
BuildingArea     7130 non-null float64
YearBuilt        8205 non-null float64
CouncilArea      12211 non-null object
Lattitude        13580 non-null float64
Longtitude       13580 non-null float64
Regionname       13580 non-null object
Propertycount    13580 non-null int64
dtypes: float64(6), int64(7), object(8)
memory usage: 2.2+ MB


In [4]:
# Number of missing values in each column of the raw frame.
data.isna().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [5]:
# Keep only the columns that have fewer than 100 missing values.
null_counts = data.isnull().sum()
cols = [name for name in data.columns if null_counts[name] < 100]
data = data[cols]
data.head(3)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000,S,Biggin,3/12/2016,2.5,3067,2,1,1.0,202,-37.7996,144.9984,Northern Metropolitan,4019
1,Abbotsford,25 Bloomburg St,2,h,1035000,S,Biggin,4/2/2016,2.5,3067,2,1,0.0,156,-37.8079,144.9934,Northern Metropolitan,4019
2,Abbotsford,5 Charles St,3,h,1465000,SP,Biggin,4/3/2017,2.5,3067,3,2,0.0,134,-37.8093,144.9944,Northern Metropolitan,4019


In [6]:
# Collect the object-dtype columns and show how many distinct values each has.
categ_cols = data.select_dtypes(include='object').columns.tolist()
data[categ_cols].nunique()

Suburb          314
Address       13378
Type              3
Method            5
SellerG         268
Date             58
Regionname        8
dtype: int64

In [7]:
# Categorical columns selected for encoding, and the matching slice of the data.
cat_features = ['Type', 'Method', 'Regionname']
cat_data = data.loc[:, cat_features]
cat_data.head()

Unnamed: 0,Type,Method,Regionname
0,h,S,Northern Metropolitan
1,h,S,Northern Metropolitan
2,h,SP,Northern Metropolitan
3,h,PI,Northern Metropolitan
4,h,VB,Northern Metropolitan


In [8]:
# Every column that is not object-dtype is kept as a numeric column.
num_data = data.select_dtypes(exclude=['object'])
num_data.head()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,Lattitude,Longtitude,Propertycount
0,2,1480000,2.5,3067,2,1,1.0,202,-37.7996,144.9984,4019
1,2,1035000,2.5,3067,2,1,0.0,156,-37.8079,144.9934,4019
2,3,1465000,2.5,3067,3,2,0.0,134,-37.8093,144.9944,4019
3,3,850000,2.5,3067,3,2,1.0,94,-37.7969,144.9969,4019
4,4,1600000,2.5,3067,3,1,2.0,120,-37.8072,144.9941,4019


In [9]:
# Re-attach the selected categorical columns to the numeric ones (index-aligned left join).
new_data = num_data.join(cat_data, how='left')
new_data.head()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,Lattitude,Longtitude,Propertycount,Type,Method,Regionname
0,2,1480000,2.5,3067,2,1,1.0,202,-37.7996,144.9984,4019,h,S,Northern Metropolitan
1,2,1035000,2.5,3067,2,1,0.0,156,-37.8079,144.9934,4019,h,S,Northern Metropolitan
2,3,1465000,2.5,3067,3,2,0.0,134,-37.8093,144.9944,4019,h,SP,Northern Metropolitan
3,3,850000,2.5,3067,3,2,1.0,94,-37.7969,144.9969,4019,h,PI,Northern Metropolitan
4,4,1600000,2.5,3067,3,1,2.0,120,-37.8072,144.9941,4019,h,VB,Northern Metropolitan


In [10]:
# Missing values remaining after the column selection.
new_data.isna().sum()

Rooms             0
Price             0
Distance          0
Postcode          0
Bedroom2          0
Bathroom          0
Car              62
Landsize          0
Lattitude         0
Longtitude        0
Propertycount     0
Type              0
Method            0
Regionname        0
dtype: int64

In [11]:
# Replace every remaining missing value with 0, then confirm none are left.
imp_data = new_data.fillna(value=0)
imp_data.isna().sum()

Rooms            0
Price            0
Distance         0
Postcode         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
Lattitude        0
Longtitude       0
Propertycount    0
Type             0
Method           0
Regionname       0
dtype: int64

In [12]:
# Positional hold-out split: the final `valid_size` rows become the test set,
# the `valid_size` rows before them the validation set, and the rest is train.
valid_fraction = 0.1
valid_size = int(len(imp_data) * valid_fraction)

holdout_start = -2 * valid_size
test_start = -valid_size

train = imp_data.iloc[:holdout_start]
valid = imp_data.iloc[holdout_start:test_start]
test = imp_data.iloc[test_start:]
train.shape

(10864, 14)

In [13]:
# Sanity check: the validation split contains no missing values.
valid.isna().sum()

Rooms            0
Price            0
Distance         0
Postcode         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
Lattitude        0
Longtitude       0
Propertycount    0
Type             0
Method           0
Regionname       0
dtype: int64

In [14]:
# (rows, columns) of the held-out test split.
test.shape

(1358, 14)

In [15]:
# Count encoder restricted to the columns listed in `cat_features`.
encoder = ce.CountEncoder(cols=cat_features)

In [16]:
# Fit on the training split only, so validation/test rows do not influence the encoding.
encoder.fit(train[cat_features], train['Price'])

CountEncoder(cols=['Type', 'Method', 'Regionname'], combine_min_nan_groups=True,
             drop_invariant=False, handle_missing='count', handle_unknown=None,
             min_group_name=None, min_group_size=None, normalize=False,
             return_df=True, verbose=0)

In [17]:
def _with_counts(frame):
    """Return `frame` with a `<col>_Count` column appended for each categorical column."""
    counts = encoder.transform(frame[cat_features]).add_suffix('_Count')
    return frame.join(counts)


# Apply the same fitted encoder to all three splits.
enc_train, enc_valid, enc_test = (
    _with_counts(split) for split in (train, valid, test)
)

enc_train.head(3)

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,Lattitude,Longtitude,Propertycount,Type,Method,Regionname,Type_Count,Method_Count,Regionname_Count
0,2,1480000,2.5,3067,2,1,1.0,202,-37.7996,144.9984,4019,h,S,Northern Metropolitan,7246,7223,3202
1,2,1035000,2.5,3067,2,1,0.0,156,-37.8079,144.9934,4019,h,S,Northern Metropolitan,7246,7223,3202
2,3,1465000,2.5,3067,3,2,0.0,134,-37.8093,144.9944,4019,h,SP,Northern Metropolitan,7246,1355,3202


In [18]:
# Confirm that encoding introduced no missing values in the validation split.
enc_valid.isna().sum()

Rooms               0
Price               0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
Lattitude           0
Longtitude          0
Propertycount       0
Type                0
Method              0
Regionname          0
Type_Count          0
Method_Count        0
Regionname_Count    0
dtype: int64

In [19]:
# The validation split is already checked in the previous cell; check the test
# split here instead of repeating the same call.
# NOTE(review): categories unseen during fit may encode as NaN — confirm this stays all zeros.
enc_test.isnull().sum()

Rooms               0
Price               0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
Lattitude           0
Longtitude          0
Propertycount       0
Type                0
Method              0
Regionname          0
Type_Count          0
Method_Count        0
Regionname_Count    0
dtype: int64

In [20]:
# Drop the raw categorical columns from each encoded split, keeping the `_Count` columns.
train, valid, test = (
    encoded.drop(columns=cat_features)
    for encoded in (enc_train, enc_valid, enc_test)
)

train.head()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,Lattitude,Longtitude,Propertycount,Type_Count,Method_Count,Regionname_Count
0,2,1480000,2.5,3067,2,1,1.0,202,-37.7996,144.9984,4019,7246,7223,3202
1,2,1035000,2.5,3067,2,1,0.0,156,-37.8079,144.9934,4019,7246,7223,3202
2,3,1465000,2.5,3067,3,2,0.0,134,-37.8093,144.9944,4019,7246,1355,3202
3,3,850000,2.5,3067,3,2,1.0,94,-37.7969,144.9969,4019,7246,1292,3202
4,4,1600000,2.5,3067,3,1,2.0,120,-37.8072,144.9941,4019,7246,937,3202


In [21]:
# Model inputs: every column except the target `Price`.
features = train.drop(columns='Price').columns

In [22]:
# Baseline random forest; the fixed random_state keeps runs repeatable.
model = RandomForestRegressor(random_state=0)

In [23]:
# Train the forest on the training split, with `Price` as the target.
model.fit(X=train[features], y=train['Price'])

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [24]:
# Random-forest predictions for the validation split.
predictions = model.predict(X=valid[features])

In [25]:
# scikit-learn metrics take (y_true, y_pred): ground truth first, predictions second.
mae = mean_absolute_error(valid['Price'], predictions)
mae

147889.4469103724

In [26]:
import lightgbm as lgb

In [27]:
# LightGBM settings: tree size, evaluation metric, and the cap on boosting rounds.
param = dict(num_leaves=67, metric='mean_absolute_error')
num_rounds = 500

In [28]:
# Wrap the training and validation splits as LightGBM datasets (features + `Price` label).
dtrain = lgb.Dataset(data=train[features], label=train['Price'])
dvalid = lgb.Dataset(data=valid[features], label=valid['Price'])

In [29]:
# Train up to `num_rounds` boosting rounds, stopping once the validation metric
# has not improved for 20 consecutive rounds. Early stopping goes through the
# callback API because the `early_stopping_rounds=` keyword of `lgb.train` is
# deprecated and no longer accepted by LightGBM 4.
model_2 = lgb.train(
    param,
    dtrain,
    num_boost_round=num_rounds,
    valid_sets=[dvalid],
    callbacks=[lgb.early_stopping(stopping_rounds=20)],
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1375
[LightGBM] [Info] Number of data points in the train set: 10864, number of used features: 13
[LightGBM] [Info] Start training from score 1075778.112113
[1]	valid_0's l1: 414308
Training until validation scores don't improve for 20 rounds
[2]	valid_0's l1: 385394
[3]	valid_0's l1: 359585
[4]	valid_0's l1: 337026
[5]	valid_0's l1: 318000
[6]	valid_0's l1: 300533
[7]	valid_0's l1: 283720
[8]	valid_0's l1: 270522
[9]	valid_0's l1: 258930
[10]	valid_0's l1: 247738
[11]	valid_0's l1: 236976
[12]	valid_0's l1: 228747
[13]	valid_0's l1: 220247
[14]	valid_0's l1: 213528
[15]	valid_0's l1: 207283
[16]	valid_0's l1: 201246
[17]	valid_0's l1: 197069
[18]	valid_0's l1: 192966
[19]	valid_0's l1: 189063
[20]	valid_0's l1: 186003
[21]	valid_0's l1: 183464
[22]	valid_0's l1: 179795
[23]	valid_0's l1: 177835
[24]	valid_0's l1: 176040
[25]	valid_0's l

In [30]:
# Score the random forest on the held-out test split.
print('The test scores for RandomForest')
test_predictions = model.predict(test[features])
# scikit-learn metrics take (y_true, y_pred): ground truth first, predictions second.
test_mae = mean_absolute_error(test['Price'], test_predictions)
print('>>: ', test_mae)

The test scores for RandomForest
>>:  173504.38862788415


In [31]:
# Score the LightGBM booster on the held-out test split.
print('The test scores for LightGBM')
test_predictions = model_2.predict(test[features])
# scikit-learn metrics take (y_true, y_pred): ground truth first, predictions second.
test_mae = mean_absolute_error(test['Price'], test_predictions)
print('>>: ', test_mae)

The test scores for LightGBM
>>:  170749.27983049338
