In [1]:
import pandas as pd
import numpy as np

import lightgbm as lgb

import category_encoders as ce
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn import metrics

In [2]:
data = pd.read_csv('melb_data.csv')
data.head(3)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000,S,Biggin,3/12/2016,2.5,3067,...,1,1.0,202,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019
1,Abbotsford,25 Bloomburg St,2,h,1035000,S,Biggin,4/2/2016,2.5,3067,...,1,0.0,156,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019
2,Abbotsford,5 Charles St,3,h,1465000,SP,Biggin,4/3/2017,2.5,3067,...,2,0.0,134,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019


In [3]:
numerical_cols = [col for col in data.columns if data[col].dtype in ['int64', 'float64'] and  data[col].isnull().sum() == 0]
categorical_cols = [col for col in data.columns if data[col].dtype == 'object' and  data[col].isnull().sum() == 0]

In [4]:
my_cols = numerical_cols + categorical_cols
X = data[my_cols]
X.head()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,Suburb,Address,Type,Method,SellerG,Date,Regionname
0,2,1480000,2.5,3067,2,1,202,-37.7996,144.9984,4019,Abbotsford,85 Turner St,h,S,Biggin,3/12/2016,Northern Metropolitan
1,2,1035000,2.5,3067,2,1,156,-37.8079,144.9934,4019,Abbotsford,25 Bloomburg St,h,S,Biggin,4/2/2016,Northern Metropolitan
2,3,1465000,2.5,3067,3,2,134,-37.8093,144.9944,4019,Abbotsford,5 Charles St,h,SP,Biggin,4/3/2017,Northern Metropolitan
3,3,850000,2.5,3067,3,2,94,-37.7969,144.9969,4019,Abbotsford,40 Federation La,h,PI,Biggin,4/3/2017,Northern Metropolitan
4,4,1600000,2.5,3067,3,1,120,-37.8072,144.9941,4019,Abbotsford,55a Park St,h,VB,Nelson,4/6/2016,Northern Metropolitan


In [5]:
X = X.drop(['Address', 'SellerG', 'Date'], axis=1)
X.head(3)

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,Suburb,Type,Method,Regionname
0,2,1480000,2.5,3067,2,1,202,-37.7996,144.9984,4019,Abbotsford,h,S,Northern Metropolitan
1,2,1035000,2.5,3067,2,1,156,-37.8079,144.9934,4019,Abbotsford,h,S,Northern Metropolitan
2,3,1465000,2.5,3067,3,2,134,-37.8093,144.9944,4019,Abbotsford,h,SP,Northern Metropolitan


In [6]:
data.shape

(13580, 21)

In [7]:
cat_cols = [col for col in X.columns if X[col].dtype == 'object']
cat_cols

['Suburb', 'Type', 'Method', 'Regionname']

In [8]:
encoder = LabelEncoder()

In [9]:
enc_X = X[cat_cols].apply(encoder.fit_transform)
enc_X.head(3)

Unnamed: 0,Suburb,Type,Method,Regionname
0,0,0,1,2
1,0,0,1,2
2,0,0,3,2


In [10]:
new_X = X.join(enc_X.add_suffix('_label'))
new_X.head(3)

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,Suburb,Type,Method,Regionname,Suburb_label,Type_label,Method_label,Regionname_label
0,2,1480000,2.5,3067,2,1,202,-37.7996,144.9984,4019,Abbotsford,h,S,Northern Metropolitan,0,0,1,2
1,2,1035000,2.5,3067,2,1,156,-37.8079,144.9934,4019,Abbotsford,h,S,Northern Metropolitan,0,0,1,2
2,3,1465000,2.5,3067,3,2,134,-37.8093,144.9944,4019,Abbotsford,h,SP,Northern Metropolitan,0,0,3,2


In [11]:
valid_fraction = 0.1
valid_size = int(len(new_X) * valid_fraction)

train = new_X[:-2 * valid_size]
valid = new_X[-2 * valid_size: -valid_size]
test = new_X[-valid_size:]

train.shape

(10864, 18)

In [12]:
features = new_X.columns.drop(['Price', 'Suburb', 'Type', 'Method', 'Regionname'])
features

Index(['Rooms', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Landsize',
       'Lattitude', 'Longtitude', 'Propertycount', 'Suburb_label',
       'Type_label', 'Method_label', 'Regionname_label'],
      dtype='object')

In [13]:
param = {'num_leaves': 65, 'metric': 'mean_absolute_error'}
num_rounds = 1200

In [14]:
dtrain = lgb.Dataset(train[features], train['Price'])
dvalid = lgb.Dataset(valid[features], valid['Price'])

In [15]:
model_1 = lgb.train(param, dtrain, num_rounds, valid_sets=[dvalid])
model_1.best_iteration

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1571
[LightGBM] [Info] Number of data points in the train set: 10864, number of used features: 13
[LightGBM] [Info] Start training from score 1075778.112113
[1]	valid_0's l1: 415458
[2]	valid_0's l1: 387365
[3]	valid_0's l1: 361082
[4]	valid_0's l1: 339497
[5]	valid_0's l1: 318183
[6]	valid_0's l1: 300124
[7]	valid_0's l1: 283647
[8]	valid_0's l1: 270435
[9]	valid_0's l1: 257573
[10]	valid_0's l1: 247534
[11]	valid_0's l1: 237206
[12]	valid_0's l1: 228711
[13]	valid_0's l1: 219774
[14]	valid_0's l1: 213346
[15]	valid_0's l1: 207440
[16]	valid_0's l1: 202069
[17]	valid_0's l1: 197593
[18]	valid_0's l1: 193337
[19]	valid_0's l1: 188751
[20]	valid_0's l1: 185957
[21]	valid_0's l1: 183088
[22]	valid_0's l1: 180454
[23]	valid_0's l1: 177721
[24]	valid_0's l1: 175511
[25]	valid_0's l1: 173646
[26]	valid_0's l1: 171844
[27]	valid_0's l1: 169799

[330]	valid_0's l1: 146625
[331]	valid_0's l1: 146685
[332]	valid_0's l1: 146732
[333]	valid_0's l1: 146842
[334]	valid_0's l1: 146781
[335]	valid_0's l1: 146765
[336]	valid_0's l1: 146733
[337]	valid_0's l1: 146659
[338]	valid_0's l1: 146690
[339]	valid_0's l1: 146647
[340]	valid_0's l1: 146674
[341]	valid_0's l1: 146677
[342]	valid_0's l1: 146741
[343]	valid_0's l1: 146711
[344]	valid_0's l1: 146759
[345]	valid_0's l1: 146849
[346]	valid_0's l1: 146900
[347]	valid_0's l1: 146881
[348]	valid_0's l1: 146857
[349]	valid_0's l1: 146821
[350]	valid_0's l1: 146824
[351]	valid_0's l1: 146802
[352]	valid_0's l1: 146757
[353]	valid_0's l1: 146751
[354]	valid_0's l1: 146749
[355]	valid_0's l1: 146707
[356]	valid_0's l1: 146674
[357]	valid_0's l1: 146597
[358]	valid_0's l1: 146637
[359]	valid_0's l1: 146646
[360]	valid_0's l1: 146676
[361]	valid_0's l1: 146723
[362]	valid_0's l1: 146725
[363]	valid_0's l1: 146709
[364]	valid_0's l1: 146746
[365]	valid_0's l1: 146673
[366]	valid_0's l1: 146718
[

[700]	valid_0's l1: 149308
[701]	valid_0's l1: 149306
[702]	valid_0's l1: 149269
[703]	valid_0's l1: 149222
[704]	valid_0's l1: 149190
[705]	valid_0's l1: 149167
[706]	valid_0's l1: 149190
[707]	valid_0's l1: 149188
[708]	valid_0's l1: 149200
[709]	valid_0's l1: 149170
[710]	valid_0's l1: 149171
[711]	valid_0's l1: 149136
[712]	valid_0's l1: 149153
[713]	valid_0's l1: 149171
[714]	valid_0's l1: 149187
[715]	valid_0's l1: 149196
[716]	valid_0's l1: 149208
[717]	valid_0's l1: 149217
[718]	valid_0's l1: 149251
[719]	valid_0's l1: 149229
[720]	valid_0's l1: 149243
[721]	valid_0's l1: 149267
[722]	valid_0's l1: 149284
[723]	valid_0's l1: 149311
[724]	valid_0's l1: 149286
[725]	valid_0's l1: 149270
[726]	valid_0's l1: 149286
[727]	valid_0's l1: 149302
[728]	valid_0's l1: 149316
[729]	valid_0's l1: 149321
[730]	valid_0's l1: 149303
[731]	valid_0's l1: 149323
[732]	valid_0's l1: 149378
[733]	valid_0's l1: 149377
[734]	valid_0's l1: 149387
[735]	valid_0's l1: 149388
[736]	valid_0's l1: 149418
[

[1070]	valid_0's l1: 151475
[1071]	valid_0's l1: 151469
[1072]	valid_0's l1: 151484
[1073]	valid_0's l1: 151508
[1074]	valid_0's l1: 151501
[1075]	valid_0's l1: 151501
[1076]	valid_0's l1: 151487
[1077]	valid_0's l1: 151473
[1078]	valid_0's l1: 151473
[1079]	valid_0's l1: 151475
[1080]	valid_0's l1: 151497
[1081]	valid_0's l1: 151495
[1082]	valid_0's l1: 151507
[1083]	valid_0's l1: 151514
[1084]	valid_0's l1: 151529
[1085]	valid_0's l1: 151509
[1086]	valid_0's l1: 151496
[1087]	valid_0's l1: 151514
[1088]	valid_0's l1: 151531
[1089]	valid_0's l1: 151542
[1090]	valid_0's l1: 151560
[1091]	valid_0's l1: 151549
[1092]	valid_0's l1: 151559
[1093]	valid_0's l1: 151573
[1094]	valid_0's l1: 151587
[1095]	valid_0's l1: 151599
[1096]	valid_0's l1: 151612
[1097]	valid_0's l1: 151653
[1098]	valid_0's l1: 151665
[1099]	valid_0's l1: 151671
[1100]	valid_0's l1: 151665
[1101]	valid_0's l1: 151676
[1102]	valid_0's l1: 151671
[1103]	valid_0's l1: 151666
[1104]	valid_0's l1: 151654
[1105]	valid_0's l1:

0