<a href="https://colab.research.google.com/github/chaeyeon2367/ml-python-studyML/blob/main/Ensemble%20Learning/Gradient_boosting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gradient boosting

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Import the dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
data = pd.read_csv("/content/drive/MyDrive/Ensemble learning/otto_train.csv") # Product Category
data.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [4]:
'''
id: unique ID
feat_1 to feat_93: explanatory variable
Target: Target Variables (1 to 9)
'''

'\nid: unique ID\nfeat_1 to feat_93: explanatory variable\nTarget: Target Variables (1 to 9)\n'

In [5]:
nCar = data.shape[0] # number of data
nVar = data.shape[1] # number of variable
print('nCar: %d' % nCar, 'nVar: %d' % nVar )

nCar: 61878 nVar: 95


## Remove variables deemed meaningless

In [6]:
data = data.drop(['id'], axis = 1) # remove id

## Convert the string of the target variable to a number

In [7]:
mapping_dict = {"Class_1": 1,
                "Class_2": 2,
                "Class_3": 3,
                "Class_4": 4,
                "Class_5": 5,
                "Class_6": 6,
                "Class_7": 7,
                "Class_8": 8,
                "Class_9": 9}
after_mapping_target = data['target'].apply(lambda x: mapping_dict[x])

## Split explanatory and target variables, split learning and evaluation data

In [8]:
feature_columns = list(data.columns.difference(['target']))
X = data[feature_columns] # explantory variable
y = after_mapping_target # target variable
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 42) # split the ratio of learning data to evaluation data by 8:2
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) # Check the number of data

(49502, 93) (12376, 93) (49502,) (12376,)


## 1. XGBoost

- gradient boosting + regularization
- prevent overfiting

In [None]:
!pip install xgboost

In [15]:
import xgboost as xgb
import time
start = time.time() # set the start time
xgb_dtrain = xgb.DMatrix(data = train_x, label = train_y) # Convert training data to fit XGBoost model
xgb_dtest = xgb.DMatrix(data = test_x) # Convert test data to fit XGBoost model
xgb_param = {'max_depth': 10, # the depth of tree
         'learning_rate': 0.01, # Step Size
         'n_estimators': 100, # Number of trees
         'objective': 'multi:softmax', # objecive function
        'num_class': len(set(train_y)) + 1} # add parameter, Label must be in [0, num_class) -> num_class보다 1 커야한다.
xgb_model = xgb.train(params = xgb_param, dtrain = xgb_dtrain) # learning process
xgb_model_predict = xgb_model.predict(xgb_dtest) # test data prediction
print("Accuracy: %.2f" % (accuracy_score(test_y, xgb_model_predict) * 100), "%") # calculate accuracy
print("Time: %.2f" % (time.time() - start), "seconds") # Calculation of Code Execution Time

Parameters: { "n_estimators" } are not used.



Accuracy: 76.66 %
Time: 4.33 seconds


In [12]:
xgb_model_predict

array([5., 3., 6., ..., 9., 2., 7.], dtype=float32)

## 2. LightGBM

- binary regression : binary logistic or logistic regression

In [None]:
!pip install lightgbm

In [16]:
import lightgbm as lgb
start = time.time()
lgb_dtrain = lgb.Dataset(data = train_x, label = train_y) # Convert training data to fit LightGBM model
lgb_param = {'max_depth': 10, #depth of tree
            'learning_rate': 0.01, # Step Size
            'n_estimators': 100, # Number of trees
            'objective': 'multiclass', # objective function
            'num_class': len(set(train_y)) + 1} # add parameter, Label must be in [0, num_class) -> It must be 1 greater than num_class.
lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # learning process
lgb_model_predict = np.argmax(lgb_model.predict(test_x), axis = 1) # predict test data, Predicts the largest value of the result of Softmax in Label
print("Accuracy: %.2f" % (accuracy_score(test_y, lgb_model_predict) * 100), "%") # calculate accuracy
print("Time: %.2f" % (time.time() - start), "seconds") # Calculation of Code Execution Time



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.459754 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3110
[LightGBM] [Info] Number of data points in the train set: 49502, number of used features: 93
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -3.476745
[LightGBM] [Info] Start training from score -1.341381
[LightGBM] [Info] Start training from score -2.039019
[LightGBM] [Info] Start training from score -3.135151
[LightGBM] [Info] Start training from score -3.125444
[LightGBM] [Info] Start training from score -1.481556
[LightGBM] [Info] Start training from score -3.074772
[LightGBM] [Info] Start training from score -1.986562
[LightGBM] [Info] Start training from score -2.533374
Accuracy: 76.28 %
Time: 9.83 seconds


In [17]:
lgb_model.predict(test_x)

array([[1.01734061e-15, 2.25081693e-02, 3.62193933e-01, ...,
        3.24234521e-02, 5.82126692e-02, 3.67722414e-02],
       [1.14084116e-15, 5.36978636e-02, 1.90687128e-01, ...,
        3.25081119e-01, 9.38028846e-02, 6.50463131e-02],
       [5.94595781e-16, 9.66842220e-03, 5.82817482e-02, ...,
        1.42318289e-02, 3.40230275e-02, 2.14919364e-02],
       ...,
       [7.09105769e-16, 4.63740004e-02, 1.08297559e-01, ...,
        5.46934960e-02, 7.24513712e-02, 5.74635996e-01],
       [9.88127136e-16, 1.54895684e-02, 5.45515599e-01, ...,
        2.45870954e-02, 5.65410617e-02, 3.62344513e-02],
       [7.59617500e-16, 1.49480877e-02, 7.44570300e-02, ...,
        5.76695793e-01, 1.43227106e-01, 2.74567219e-02]])

## 3. Catboost

In [None]:
!pip install catboost

In [19]:
import catboost as cb
start = time.time()
cb_dtrain = cb.Pool(data = train_x, label = train_y) # Convert training data to fit Catboost model
cb_param = {'max_depth': 10, # depth of tree
            'learning_rate': 0.01, # Step Size
            'n_estimators': 100, # Number of trees
            'eval_metric': 'Accuracy', # evaluation of metric
            'loss_function': 'MultiClass'} # loss fuction, objective function
cb_model = cb.train(pool = cb_dtrain, params = cb_param) # learning process
cb_model_predict = np.argmax(cb_model.predict(test_x), axis = 1) + 1 #Evaluate test data prediction, the largest label of Softmax's results, +1 to match the order of indexes
print("Accuracy: %.2f" % (accuracy_score(test_y, cb_model_predict) * 100), "%") # caculate accuracy
print("Time: %.2f" % (time.time() - start), "seconds") # calculate execution time

0:	learn: 0.5907034	total: 2.29s	remaining: 3m 46s
1:	learn: 0.6356107	total: 4.71s	remaining: 3m 50s
2:	learn: 0.6411256	total: 6.63s	remaining: 3m 34s
3:	learn: 0.6480344	total: 8.28s	remaining: 3m 18s
4:	learn: 0.6508222	total: 10.4s	remaining: 3m 16s
5:	learn: 0.6499939	total: 12.1s	remaining: 3m 9s
6:	learn: 0.6507818	total: 14.3s	remaining: 3m 9s
7:	learn: 0.6548422	total: 18.1s	remaining: 3m 28s
8:	learn: 0.6559533	total: 21.4s	remaining: 3m 36s
9:	learn: 0.6560947	total: 23.6s	remaining: 3m 32s
10:	learn: 0.6568421	total: 24.6s	remaining: 3m 19s
11:	learn: 0.6588219	total: 25.6s	remaining: 3m 8s
12:	learn: 0.6592259	total: 26.6s	remaining: 2m 58s
13:	learn: 0.6611248	total: 27.7s	remaining: 2m 49s
14:	learn: 0.6625591	total: 28.7s	remaining: 2m 42s
15:	learn: 0.6631853	total: 29.7s	remaining: 2m 35s
16:	learn: 0.6639328	total: 30.7s	remaining: 2m 30s
17:	learn: 0.6668821	total: 31.9s	remaining: 2m 25s
18:	learn: 0.6669630	total: 33.9s	remaining: 2m 24s
19:	learn: 0.6675286	tota

In [20]:
cb_model.predict(test_x)

array([[-0.35426047,  1.22109587,  0.44230101, ..., -0.1698448 ,
        -0.02059177, -0.2130643 ],
       [-0.07235138,  0.42535181,  0.20060428, ...,  0.21863604,
         0.2719157 ,  0.25089315],
       [-0.3315885 , -0.31862353, -0.31279765, ..., -0.29798357,
        -0.24018767, -0.32984969],
       ...,
       [ 0.05304325,  0.02500267, -0.14752573, ..., -0.20741963,
         0.12789417,  1.51166757],
       [-0.55093666,  1.7691278 ,  0.99746884, ..., -0.3420542 ,
        -0.49799871, -0.38136323],
       [-0.3033724 ,  0.09352675, -0.11808658, ...,  0.65825036,
         1.05515787, -0.20799899]])

## Import dataset - house price dataset

In [21]:
# import dataset
data = pd.read_csv("/content/drive/MyDrive/Ensemble learning/kc_house_data.csv")
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,floors,waterfront,condition,grade,yr_built,yr_renovated,zipcode,lat,long
0,7129300520,20141013T000000,221900.0,3,1.0,1.0,0,3,7,1955,0,98178,47.5112,-122.257
1,6414100192,20141209T000000,538000.0,3,2.25,2.0,0,3,7,1951,1991,98125,47.721,-122.319
2,5631500400,20150225T000000,180000.0,2,1.0,1.0,0,3,6,1933,0,98028,47.7379,-122.233
3,2487200875,20141209T000000,604000.0,4,3.0,1.0,0,5,7,1965,0,98136,47.5208,-122.393
4,1954400510,20150218T000000,510000.0,3,2.0,1.0,0,3,8,1987,0,98074,47.6168,-122.045


In [22]:
data = data.drop(['id', 'date', 'zipcode', 'lat', 'long'], axis = 1) # id, date, zipcode, lat, long remove

In [23]:
feature_columns = list(data.columns.difference(['price'])) # All rows except Price colum
X = data[feature_columns]
y = data['price']
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.3, random_state = 42) # The ratio of train data to test data is 7:3
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

(15129, 8) (6484, 8) (15129,) (6484,)


In [24]:
import lightgbm as lgb
start = time.time()
lgb_dtrain = lgb.Dataset(data = train_x, label = train_y) # Convert training data to fit LightGBM model
lgb_param = {'max_depth': 10, #depth of tree
            'learning_rate': 0.01, # Step Size
            'n_estimators': 500, # Number of trees
            'objective': 'regression'} # add parameter, Label must be in [0, num_class) -> It must be 1 greater than num_class
lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # learning process




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000659 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 237
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 537729.263666


In [26]:
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

sqrt(mean_squared_error(lgb_model.predict(test_x),test_y))

210904.17249451784

In [27]:
import random
bagging_predict_result = [] # create bin list
for _ in range(10):
    data_index = [data_index for data_index in range(train_x.shape[0])] # Convert the index of the learning data to a list
    random_data_index = np.random.choice(data_index, train_x.shape[0]) # Random sampling as much as 1/10th the size of the data, // to ignore the decimal point
    print(len(set(random_data_index)))
    lgb_dtrain = lgb.Dataset(data = train_x.iloc[random_data_index,], label = train_y.iloc[random_data_index,]) # Convert training data to fit LightGBM model
    lgb_param = {'max_depth': 14, # depth of tree
            'learning_rate': 0.01, # Step Size
            'n_estimators': 500, # Number of trees
            'objective': 'regression'} # add parameter, Label must be in [0, num_class) -> It must be 1 greater than num_class
    lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # learning process
    predict1 = lgb_model.predict(test_x) # prdict test data
    bagging_predict_result.append(predict1) # Save the result value to an empty list before the iteration runs

9606
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000586 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 232
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 535591.086060




9567




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.156954 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 236
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 537178.682200
9475




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.173171 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 233
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 535214.237094
9575
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000529 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 236
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 536712.849296




9571
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000535 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 238
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 540199.788486




9649
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000571 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 233
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 534242.655562




9556
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000528 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 233
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 538659.945667




9504
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001967 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 232
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 536254.359178




9505
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000548 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 235
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 541283.946262




9521
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000668 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 230
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 539445.164320




In [28]:
bagging_predict_result

[array([481256.09364511, 698622.74097236, 937744.69916367, ...,
        330196.85304482, 806193.9529276 , 462507.0222844 ]),
 array([513680.09103305, 636185.27665716, 933827.13532708, ...,
        332064.48544174, 816905.25964684, 453514.50679987]),
 array([ 518946.71664541,  566856.40325719, 1003213.00457377, ...,
         342450.34826344,  918130.32160994,  464166.28882992]),
 array([502187.94854045, 661397.30200239, 939049.30926945, ...,
        345150.13616672, 911073.89064885, 471055.62376195]),
 array([478787.58834974, 621738.07095459, 902724.20175955, ...,
        321558.29867406, 911777.97027986, 459971.1305667 ]),
 array([528545.59758211, 616552.64854075, 883768.35837554, ...,
        334502.33352969, 908907.41548394, 457933.26553628]),
 array([488636.34015872, 601175.27037056, 969950.23819734, ...,
        337134.87578151, 956082.389374  , 464291.97630294]),
 array([536283.3312011 , 677909.18386165, 891368.90118702, ...,
        341864.31114879, 977265.18202999, 456846.571756

In [29]:
# Calculate the average for the predicted result based on Bagging
bagging_predict = []
for lst2_index in range(test_x.shape[0]): # Repeat as many test data
    temp_predict = [] # Create a temporary blank list (save results in repeat statements)
    for lst_index in range(len(bagging_predict_result)): # Repeating Bagging Results List
        temp_predict.append(bagging_predict_result[lst_index][lst2_index]) # Store the same index in the list among the predicted values of each Bagging result
    bagging_predict.append(np.mean(temp_predict)) # Add an average of 30 results for that index to the final list

In [30]:
# Calculate the average of the predicted results and evaluate the performance against the target variables of the actual test data

print("RMSE: {}".format(sqrt(mean_squared_error(bagging_predict, test_y)))) # RMSE

RMSE: 210910.00987864519


In [31]:
bagging_predict

[502325.7010446523,
 629327.8653541065,
 934610.0491020044,
 1653638.3117346792,
 634890.1673206199,
 367554.48503556155,
 707762.0492452665,
 435932.90562294016,
 461661.6615464487,
 494165.053140305,
 642852.9149692468,
 380817.45477281354,
 298749.96202773956,
 358309.6616497707,
 338103.28435657127,
 1321182.569368287,
 363267.7964505035,
 1048060.2606606453,
 316681.40213170706,
 527124.1229172094,
 372936.84632696735,
 1998282.7747253336,
 658303.8238451525,
 542824.1065522383,
 506655.2329568858,
 487897.6311343868,
 296322.9654284023,
 247793.4758009413,
 471535.44292883726,
 534689.2247367535,
 489682.35607386206,
 467311.6760110544,
 466857.56954575574,
 556511.365153546,
 381724.01973561186,
 1040141.7162874384,
 935510.9063137674,
 528762.2033225338,
 356230.088618002,
 1513816.9750454961,
 399335.4168884832,
 276408.47695840674,
 503482.4209741607,
 338670.75689552433,
 257309.2039823844,
 239577.12146563834,
 331388.6035010704,
 332550.9922995052,
 354353.5689902381,
 557