In [1]:
import pandas as pd
import numpy as np

import lightgbm as lgb

import category_encoders as ce
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn import metrics

In [2]:
# Read the Melbourne housing CSV from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer='melb_data.csv')
data.head(n=3)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000,S,Biggin,3/12/2016,2.5,3067,...,1,1.0,202,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019
1,Abbotsford,25 Bloomburg St,2,h,1035000,S,Biggin,4/2/2016,2.5,3067,...,1,0.0,156,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019
2,Abbotsford,5 Charles St,3,h,1465000,SP,Biggin,4/3/2017,2.5,3067,...,2,0.0,134,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019


In [3]:
# Keep only columns without any missing values, split by dtype:
# int64/float64 columns are treated as numerical, object columns as categorical.
numerical_cols = [
    name
    for name, n_missing in data.isnull().sum().items()
    if n_missing == 0 and data[name].dtype in ('int64', 'float64')
]
categorical_cols = [
    name
    for name, n_missing in data.isnull().sum().items()
    if n_missing == 0 and data[name].dtype == 'object'
]

In [4]:
# Working frame: the complete numerical columns first, then the complete categorical ones.
my_cols = [*numerical_cols, *categorical_cols]
X = data[my_cols]
X.head(n=5)

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,Suburb,Address,Type,Method,SellerG,Date,Regionname
0,2,1480000,2.5,3067,2,1,202,-37.7996,144.9984,4019,Abbotsford,85 Turner St,h,S,Biggin,3/12/2016,Northern Metropolitan
1,2,1035000,2.5,3067,2,1,156,-37.8079,144.9934,4019,Abbotsford,25 Bloomburg St,h,S,Biggin,4/2/2016,Northern Metropolitan
2,3,1465000,2.5,3067,3,2,134,-37.8093,144.9944,4019,Abbotsford,5 Charles St,h,SP,Biggin,4/3/2017,Northern Metropolitan
3,3,850000,2.5,3067,3,2,94,-37.7969,144.9969,4019,Abbotsford,40 Federation La,h,PI,Biggin,4/3/2017,Northern Metropolitan
4,4,1600000,2.5,3067,3,1,120,-37.8072,144.9941,4019,Abbotsford,55a Park St,h,VB,Nelson,4/6/2016,Northern Metropolitan


In [5]:
# Remove the Address, SellerG and Date columns from the working frame.
# X is rebuilt from `data[my_cols]` rather than from X itself so that this cell
# is idempotent: re-running it no longer raises KeyError on already-dropped columns.
X = data[my_cols].drop(columns=['Address', 'SellerG', 'Date'])
X.head(3)

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,Suburb,Type,Method,Regionname
0,2,1480000,2.5,3067,2,1,202,-37.7996,144.9984,4019,Abbotsford,h,S,Northern Metropolitan
1,2,1035000,2.5,3067,2,1,156,-37.8079,144.9934,4019,Abbotsford,h,S,Northern Metropolitan
2,3,1465000,2.5,3067,3,2,134,-37.8093,144.9944,4019,Abbotsford,h,SP,Northern Metropolitan


In [6]:
# Display (number of rows, number of columns) of the raw frame loaded from the CSV.
data.shape

(13580, 21)

In [7]:
# Names of the remaining object-dtype columns in X, in column order;
# these are the ones that get label-encoded below.
cat_cols = [name for name, dtype in X.dtypes.items() if dtype == 'object']
cat_cols

['Suburb', 'Type', 'Method', 'Regionname']

In [8]:
# Single scikit-learn LabelEncoder instance, used below to encode the categorical columns.
encoder = LabelEncoder()

In [9]:
# Label-encode every categorical column into integer codes, one column at a time.
# The same `encoder` object is re-fitted for each column, so after this cell it
# only holds the class mapping of the last column in `cat_cols`.
enc_X = pd.DataFrame(
    {name: encoder.fit_transform(X[name]) for name in cat_cols},
    index=X.index,
)
enc_X.head(3)

Unnamed: 0,Suburb,Type,Method,Regionname
0,0,0,1,2
1,0,0,1,2
2,0,0,3,2


In [10]:
# Append the encoded columns to X (aligned on the index), tagging each with a
# '_label' suffix so they sit next to the original string columns.
encoded_with_suffix = enc_X.rename(columns=lambda name: f'{name}_label')
new_X = X.join(encoded_with_suffix)
new_X.head(3)

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,Suburb,Type,Method,Regionname,Suburb_label,Type_label,Method_label,Regionname_label
0,2,1480000,2.5,3067,2,1,202,-37.7996,144.9984,4019,Abbotsford,h,S,Northern Metropolitan,0,0,1,2
1,2,1035000,2.5,3067,2,1,156,-37.8079,144.9934,4019,Abbotsford,h,S,Northern Metropolitan,0,0,1,2
2,3,1465000,2.5,3067,3,2,134,-37.8093,144.9944,4019,Abbotsford,h,SP,Northern Metropolitan,0,0,3,2


In [11]:
# Split new_X by row position into train / validation / test, where validation
# and test each take `valid_fraction` of the rows from the end of the frame.
valid_fraction = 0.1
valid_size = int(len(new_X) * valid_fraction)

# Explicit non-negative cut points are used instead of negative offsets:
# with fewer than 10 rows valid_size is 0, and `[:-0]` / `[-0:]` would yield an
# empty train set and put every row into test.
n_rows = len(new_X)
train_end = n_rows - 2 * valid_size
valid_end = n_rows - valid_size

# NOTE(review): rows are taken in file order with no shuffling — confirm this
# ordering-based split is intended for this dataset.
train = new_X.iloc[:train_end]
valid = new_X.iloc[train_end:valid_end]
test = new_X.iloc[valid_end:]

train.shape

(10864, 18)

In [12]:
# Model inputs: every column of new_X except the target ('Price') and the raw
# string versions of the categorical columns.
features = new_X.columns.drop(
    labels=[
        'Price',
        'Suburb', 'Type', 'Method', 'Regionname',
    ]
)
features

Index(['Rooms', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Landsize',
       'Lattitude', 'Longtitude', 'Propertycount', 'Suburb_label',
       'Type_label', 'Method_label', 'Regionname_label'],
      dtype='object')

In [13]:
# LightGBM training parameters and the number of boosting rounds passed to lgb.train.
param = dict(num_leaves=65, metric='mean_absolute_error')
num_rounds = 1_200

In [14]:
# Wrap the train and validation splits as LightGBM datasets:
# the feature columns are the data, 'Price' is the label.
dtrain = lgb.Dataset(data=train[features], label=train['Price'])
dvalid = lgb.Dataset(data=valid[features], label=valid['Price'])

In [15]:
# Train a booster on `dtrain` for `num_rounds` iterations, evaluating on `dvalid`.
# The `verbose_eval` argument was deprecated in LightGBM 3.3 and removed in 4.0
# (passing it raises TypeError there); per-iteration evaluation logging is
# silenced with the `log_evaluation` callback instead (period=0 disables it).
# NOTE(review): no early stopping is configured, so all `num_rounds` iterations
# run regardless of the validation metric — confirm that is intended.
model_1 = lgb.train(
    param,
    dtrain,
    num_boost_round=num_rounds,
    valid_sets=[dvalid],
    callbacks=[lgb.log_evaluation(period=0)],
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1571
[LightGBM] [Info] Number of data points in the train set: 10864, number of used features: 13
[LightGBM] [Info] Start training from score 1075778.112113
