<a href="https://colab.research.google.com/github/chaeyeon2367/dataAnalysis-python-addata/blob/main/Ensemble%20Learning/Stacking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stacking

: It is also called "Meta learner", a technique that combines various models

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Import the dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
data = pd.read_csv("/content/drive/MyDrive/Ensemble learning/otto_train.csv") # Product Category
data.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [4]:
'''
id: unique ID
feat_1 to feat_93: explanatory variable
Target: Target Variables (1 to 9)
'''

'\nid: unique ID\nfeat_1 to feat_93: explanatory variable\nTarget: Target Variables (1 to 9)\n'

In [5]:
nCar = data.shape[0] # number of data
nVar = data.shape[1] # number of variable
print('nCar: %d' % nCar, 'nVar: %d' % nVar )

nCar: 61878 nVar: 95


## Remove variables deemed meaningless

In [6]:
data = data.drop(['id'], axis = 1) # remove id

## Convert the string of the target variable to a number

In [7]:
mapping_dict = {"Class_1": 1,
                "Class_2": 2,
                "Class_3": 3,
                "Class_4": 4,
                "Class_5": 5,
                "Class_6": 6,
                "Class_7": 7,
                "Class_8": 8,
                "Class_9": 9}
after_mapping_target = data['target'].apply(lambda x: mapping_dict[x])

## Split explanatory and target variables, split learning and evaluation data

In [8]:
feature_columns = list(data.columns.difference(['target']))
X = data[feature_columns] # explantory variable
y = after_mapping_target # target variable
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 42) # split the ratio of learning data to evaluation data by 8:2
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) # Check the number of data

(49502, 93) (12376, 93) (49502,) (12376,)


## 1. XGBoost

- gradient boosting + regularization
- prevent overfiting

In [9]:
!pip install xgboost



In [10]:
import xgboost as xgb
import time
start = time.time() # set the start time
xgb_dtrain = xgb.DMatrix(data = train_x, label = train_y) # Convert training data to fit XGBoost model
xgb_dtest = xgb.DMatrix(data = test_x) # Convert test data to fit XGBoost model
xgb_param = {'max_depth': 10, # the depth of tree
         'learning_rate': 0.01, # Step Size
         'n_estimators': 100, # Number of trees
         'objective': 'multi:softmax', # objecive function
        'num_class': len(set(train_y)) + 1} # add parameter, Label must be in [0, num_class) -> num_class보다 1 커야한다.
xgb_model = xgb.train(params = xgb_param, dtrain = xgb_dtrain) # learning process
xgb_model_predict = xgb_model.predict(xgb_dtest) # test data prediction
print("Accuracy: %.2f" % (accuracy_score(test_y, xgb_model_predict) * 100), "%") # calculate accuracy
print("Time: %.2f" % (time.time() - start), "seconds") # Calculation of Code Execution Time

Parameters: { "n_estimators" } are not used.



Accuracy: 76.66 %
Time: 15.29 seconds


In [11]:
xgb_model_predict

array([5., 3., 6., ..., 9., 2., 7.], dtype=float32)

## 2. LightGBM

- binary regression : binary logistic or logistic regression

In [12]:
!pip install lightgbm



In [13]:
import lightgbm as lgb
start = time.time()
lgb_dtrain = lgb.Dataset(data = train_x, label = train_y) # Convert training data to fit LightGBM model
lgb_param = {'max_depth': 10, #depth of tree
            'learning_rate': 0.01, # Step Size
            'n_estimators': 100, # Number of trees
            'objective': 'multiclass', # objective function
            'num_class': len(set(train_y)) + 1} # add parameter, Label must be in [0, num_class) -> It must be 1 greater than num_class.
lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # learning process
lgb_model_predict = np.argmax(lgb_model.predict(test_x), axis = 1) # predict test data, Predicts the largest value of the result of Softmax in Label
print("Accuracy: %.2f" % (accuracy_score(test_y, lgb_model_predict) * 100), "%") # calculate accuracy
print("Time: %.2f" % (time.time() - start), "seconds") # Calculation of Code Execution Time



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.205157 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3110
[LightGBM] [Info] Number of data points in the train set: 49502, number of used features: 93
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -3.476745
[LightGBM] [Info] Start training from score -1.341381
[LightGBM] [Info] Start training from score -2.039019
[LightGBM] [Info] Start training from score -3.135151
[LightGBM] [Info] Start training from score -3.125444
[LightGBM] [Info] Start training from score -1.481556
[LightGBM] [Info] Start training from score -3.074772
[LightGBM] [Info] Start training from score -1.986562
[LightGBM] [Info] Start training from score -2.533374
Accuracy: 76.28 %
Time: 37.59 seconds


In [14]:
lgb_model.predict(test_x)

array([[1.01734061e-15, 2.25081693e-02, 3.62193933e-01, ...,
        3.24234521e-02, 5.82126692e-02, 3.67722414e-02],
       [1.14084116e-15, 5.36978636e-02, 1.90687128e-01, ...,
        3.25081119e-01, 9.38028846e-02, 6.50463131e-02],
       [5.94595781e-16, 9.66842220e-03, 5.82817482e-02, ...,
        1.42318289e-02, 3.40230275e-02, 2.14919364e-02],
       ...,
       [7.09105769e-16, 4.63740004e-02, 1.08297559e-01, ...,
        5.46934960e-02, 7.24513712e-02, 5.74635996e-01],
       [9.88127136e-16, 1.54895684e-02, 5.45515599e-01, ...,
        2.45870954e-02, 5.65410617e-02, 3.62344513e-02],
       [7.59617500e-16, 1.49480877e-02, 7.44570300e-02, ...,
        5.76695793e-01, 1.43227106e-01, 2.74567219e-02]])

## 3. Catboost

In [15]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [16]:
import catboost as cb
start = time.time()
cb_dtrain = cb.Pool(data = train_x, label = train_y) # Convert training data to fit Catboost model
cb_param = {'max_depth': 10, # depth of tree
            'learning_rate': 0.01, # Step Size
            'n_estimators': 100, # Number of trees
            'eval_metric': 'Accuracy', # evaluation of metric
            'loss_function': 'MultiClass'} # loss fuction, objective function
cb_model = cb.train(pool = cb_dtrain, params = cb_param) # learning process
cb_model_predict = np.argmax(cb_model.predict(test_x), axis = 1) + 1 #Evaluate test data prediction, the largest label of Softmax's results, +1 to match the order of indexes
print("Accuracy: %.2f" % (accuracy_score(test_y, cb_model_predict) * 100), "%") # caculate accuracy
print("Time: %.2f" % (time.time() - start), "seconds") # calculate execution time

0:	learn: 0.5907034	total: 3.74s	remaining: 6m 10s
1:	learn: 0.6356107	total: 8.1s	remaining: 6m 36s
2:	learn: 0.6411256	total: 10.7s	remaining: 5m 46s
3:	learn: 0.6480344	total: 13s	remaining: 5m 12s
4:	learn: 0.6508222	total: 15.6s	remaining: 4m 56s
5:	learn: 0.6499939	total: 19.2s	remaining: 5m 1s
6:	learn: 0.6507818	total: 26s	remaining: 5m 45s
7:	learn: 0.6548422	total: 29s	remaining: 5m 33s
8:	learn: 0.6559533	total: 31.7s	remaining: 5m 20s
9:	learn: 0.6560947	total: 34.6s	remaining: 5m 11s
10:	learn: 0.6568421	total: 36.6s	remaining: 4m 56s
11:	learn: 0.6588219	total: 40.2s	remaining: 4m 54s
12:	learn: 0.6592259	total: 43.7s	remaining: 4m 52s
13:	learn: 0.6611248	total: 45.7s	remaining: 4m 40s
14:	learn: 0.6625591	total: 46.7s	remaining: 4m 24s
15:	learn: 0.6631853	total: 47.7s	remaining: 4m 10s
16:	learn: 0.6639328	total: 48.7s	remaining: 3m 57s
17:	learn: 0.6668821	total: 49.7s	remaining: 3m 46s
18:	learn: 0.6669630	total: 50.6s	remaining: 3m 35s
19:	learn: 0.6675286	total: 51

In [17]:
cb_model.predict(test_x)

array([[-0.35426047,  1.22109587,  0.44230101, ..., -0.1698448 ,
        -0.02059177, -0.2130643 ],
       [-0.07235138,  0.42535181,  0.20060428, ...,  0.21863604,
         0.2719157 ,  0.25089315],
       [-0.3315885 , -0.31862353, -0.31279765, ..., -0.29798357,
        -0.24018767, -0.32984969],
       ...,
       [ 0.05304325,  0.02500267, -0.14752573, ..., -0.20741963,
         0.12789417,  1.51166757],
       [-0.55093666,  1.7691278 ,  0.99746884, ..., -0.3420542 ,
        -0.49799871, -0.38136323],
       [-0.3033724 ,  0.09352675, -0.11808658, ...,  0.65825036,
         1.05515787, -0.20799899]])

## Import dataset - house price dataset

In [32]:
# import dataset
data = pd.read_csv("/content/drive/MyDrive/Ensemble learning/kc_house_data.csv")
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,floors,waterfront,condition,grade,yr_built,yr_renovated,zipcode,lat,long
0,7129300520,20141013T000000,221900.0,3,1.0,1.0,0,3,7,1955,0,98178,47.5112,-122.257
1,6414100192,20141209T000000,538000.0,3,2.25,2.0,0,3,7,1951,1991,98125,47.721,-122.319
2,5631500400,20150225T000000,180000.0,2,1.0,1.0,0,3,6,1933,0,98028,47.7379,-122.233
3,2487200875,20141209T000000,604000.0,4,3.0,1.0,0,5,7,1965,0,98136,47.5208,-122.393
4,1954400510,20150218T000000,510000.0,3,2.0,1.0,0,3,8,1987,0,98074,47.6168,-122.045


In [33]:
data = data.drop(['id', 'date', 'zipcode', 'lat', 'long'], axis = 1) # id, date, zipcode, lat, long remove

In [34]:
feature_columns = list(data.columns.difference(['price'])) # All rows except Price colum
X = data[feature_columns]
y = data['price']
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.3, random_state = 42) # The ratio of train data to test data is 7:3
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

(15129, 8) (6484, 8) (15129,) (6484,)


## Ensemble of Ensemble

In [35]:
import lightgbm as lgb
start = time.time()
lgb_dtrain = lgb.Dataset(data = train_x, label = train_y) # Convert training data to fit LightGBM model
lgb_param = {'max_depth': 10, #depth of tree
            'learning_rate': 0.01, # Step Size
            'n_estimators': 500, # Number of trees
            'objective': 'regression'} # add parameter, Label must be in [0, num_class) -> It must be 1 greater than num_class
lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # learning process




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000574 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 237
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 537729.263666


In [36]:
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

sqrt(mean_squared_error(lgb_model.predict(test_x),test_y))

210904.17249451784

In [38]:
import random
bagging_predict_result = [] # create bin list
for _ in range(30):
    data_index = [data_index for data_index in range(train_x.shape[0])] # Convert the index of the learning data to a list
    random_data_index = np.random.choice(data_index, train_x.shape[0]) # Random sampling as much as 1/10th the size of the data, // to ignore the decimal point
    print(len(set(random_data_index)))
    lgb_dtrain = lgb.Dataset(data = train_x.iloc[random_data_index,], label = train_y.iloc[random_data_index,]) # Convert training data to fit LightGBM model
    lgb_param = {'max_depth': 10, # depth of tree
            'learning_rate': 0.01, # Step Size
            'n_estimators': 500, # Number of trees
            'objective': 'regression'} # add parameter, Label must be in [0, num_class) -> It must be 1 greater than num_class
    lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # learning process
    predict1 = lgb_model.predict(test_x) # prdict test data
    bagging_predict_result.append(predict1) # Save the result value to an empty list before the iteration runs

9559




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.157996 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 232
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 537509.363210
9534
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000528 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 233
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 535648.812149




9586
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000623 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 235
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 534865.791658




9515
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000602 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 230
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 539142.813603




9487
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000554 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 229
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 537785.704475




9488
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000572 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 232
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 536765.056514




9609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000552 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 236
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 537161.988895




9560
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000613 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 235
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 539179.565074




9594
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000529 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 233
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 541297.128495




9554
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000581 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 232
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 536844.896622




9566




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.146200 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 231
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 534841.135501
9541
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000989 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 232
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 538922.173045




9517
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000614 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 232
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 539048.196180




9522
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000604 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 232
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 538360.215348




9529
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001038 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 233
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 538360.122612




9591
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000521 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 233
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 534996.971313




9613
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000552 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 233
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 538877.429903




9579
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000537 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 231
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 538177.533214




9510
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000520 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 233
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 539213.572212




9567
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000552 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 234
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 544006.210523




9626
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000517 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 234
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 539258.026241




9642
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000537 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 231
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 536164.172582




9524
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000568 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 234
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 539734.599709




9599
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000579 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 230
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 539197.816049




9522
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000581 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 231
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 533376.688281




9567
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000530 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 233
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 536648.774671




9652
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000543 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 232
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 540147.854650




9590




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.115153 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 234
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 538265.597330
9501
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006894 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 234
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 535857.940181




9592
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000564 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 232
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 542746.585432




In [39]:
bagging_predict_result

[array([512599.08758705, 598322.12883382, 946813.82945502, ...,
        337387.75778676, 932415.10447851, 473044.63932076]),
 array([489880.43845209, 682722.87489013, 917371.83201361, ...,
        341750.13115847, 944755.18007486, 471362.41002004]),
 array([518854.32495808, 626583.99340064, 948555.1777359 , ...,
        327559.34181381, 864015.00366211, 463030.40590377]),
 array([500018.77131601, 629177.74673737, 899376.86337203, ...,
        354152.32476082, 910769.51958086, 462423.71342844]),
 array([515918.01324145, 659464.58846516, 934146.91771972, ...,
        335141.08193343, 945775.85008374, 454840.72149468]),
 array([488500.22618356, 585777.09663709, 984543.28273783, ...,
        338790.92190519, 936847.57437988, 469941.14204566]),
 array([504664.36747129, 591910.04362272, 964815.86177932, ...,
        344398.28263172, 893999.01994994, 455135.76263881]),
 array([509083.46615529, 616074.37882989, 953179.2923477 , ...,
        321564.1808725 , 924764.87575067, 455925.67656199]),


In [40]:
# Calculate the average for the predicted result based on Bagging
bagging_predict = []
for lst2_index in range(test_x.shape[0]): # Repeat as many test data
    temp_predict = [] # Create a temporary bin list (save results in repeat statements)
    for lst_index in range(len(bagging_predict_result)): # Repeating Bagging Results List
        temp_predict.append(bagging_predict_result[lst_index][lst2_index]) # Store the same index in the list among the predicted values of each Bagging result
    bagging_predict.append(np.mean(temp_predict)) # Add an average of 30 results for that index to the final list


In [41]:
# Calculate the average of the predicted results and evaluate the performance against the target variables of the actual test data

print("RMSE: {}".format(sqrt(mean_squared_error(bagging_predict, test_y)))) # RMSE

RMSE: 209726.95528929433


In [42]:
bagging_predict

[506504.4062505013,
 629257.4111471666,
 951450.429483833,
 1585280.0524766587,
 639637.5375683043,
 368434.82833934244,
 706204.9623602072,
 431284.0225396362,
 462014.2712649255,
 493459.4907652565,
 630906.8124588352,
 381702.54442719225,
 298452.58168299985,
 359096.21868304454,
 344301.7528210298,
 1304736.710504916,
 367010.0909092653,
 1001925.1771924167,
 314497.3406970689,
 527034.7163261533,
 377881.14684049337,
 1836075.9051210915,
 664169.433947043,
 541059.1964453536,
 510578.24996276456,
 483686.19574299944,
 295860.54114217986,
 249083.10633832504,
 472462.3020630078,
 539056.9944383901,
 490190.62589055835,
 473524.8020202073,
 463413.66200750443,
 581582.8254885883,
 377988.5651226484,
 1032259.1653789232,
 889350.676466804,
 529410.3228323862,
 357424.7447329129,
 1527653.3652813279,
 395235.33133952715,
 277534.6994603167,
 504964.9858223442,
 341412.22861615766,
 253454.26745686668,
 243772.92851489334,
 330641.06265385647,
 333785.35637646355,
 354481.13863910164,
