<a href="https://colab.research.google.com/github/zerotodeeplearning/ztdl-masterclasses/blob/master/solutions_do_not_open/Gradient_Boosting_with_XGBoost_and_LightGBM_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Learn with us: www.zerotodeeplearning.com

Copyright © 2021: Zero to Deep Learning ® Catalit LLC.

In [None]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Gradient Boosting with XGBoost and LightGBM

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Pull the Australian credit CSV from the course repository into a DataFrame.
DATA_URL = "https://raw.githubusercontent.com/zerotodeeplearning/ztdl-masterclasses/master/data/australian_credit.csv"
df = pd.read_csv(DATA_URL)

In [None]:
# Preview the first rows to get a feel for the columns and their values.
df.head()

Unnamed: 0,sex,age,time_at_addr,home_status,occupation,job_status,time_w_empl,investments,bank_account,time_w_bank,liability_ref,account_ref,monthly_housing,savings_balance,class
0,b,30.83,0.0,u,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,w,v,1.71,t,f,0,f,s,120.0,0,1


In [None]:
# Column dtypes and non-null counts: shows which features are numeric vs. object.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 653 entries, 0 to 652
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sex              653 non-null    object 
 1   age              653 non-null    float64
 2   time_at_addr     653 non-null    float64
 3   home_status      653 non-null    object 
 4   occupation       653 non-null    object 
 5   job_status       653 non-null    object 
 6   time_w_empl      653 non-null    float64
 7   investments      653 non-null    object 
 8   bank_account     653 non-null    object 
 9   time_w_bank      653 non-null    int64  
 10  liability_ref    653 non-null    object 
 11  account_ref      653 non-null    object 
 12  monthly_housing  653 non-null    float64
 13  savings_balance  653 non-null    int64  
 14  class            653 non-null    int64  
dtypes: float64(4), int64(3), object(8)
memory usage: 76.6+ KB


In [None]:
# Split the target off the feature frame. `df.pop` removes the column in place,
# so guard it: re-running this cell would otherwise raise a KeyError because
# 'class' is already gone from `df`.
if 'class' in df.columns:
  y = df.pop('class')

# Class balance of the target.
y.value_counts()

0    357
1    296
Name: class, dtype: int64

In [None]:
# Names of every numeric column (ints and floats) in the feature frame.
numerical_features = df.select_dtypes(include='number').columns.tolist()
numerical_features

['age',
 'time_at_addr',
 'time_w_empl',
 'time_w_bank',
 'monthly_housing',
 'savings_balance']

In [None]:
# Names of every non-numeric column; these are the categorical features.
categorical_features = df.select_dtypes(exclude='number').columns.tolist()
categorical_features

['sex',
 'home_status',
 'occupation',
 'job_status',
 'investments',
 'bank_account',
 'liability_ref',
 'account_ref']

## Baselines

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [None]:
# The baselines use only the numeric columns; hold out 20% of the rows for
# testing. random_state=0 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
  df[numerical_features], y, test_size=0.2, random_state=0)

In [None]:
def train_eval(model):
  """Fit `model` on the current training split and score it on both splits.

  The data is read from the notebook-level variables `X_train`, `y_train`,
  `X_test` and `y_test`, so the result depends on whichever train/test split
  was created most recently.

  Args:
    model: an estimator exposing `fit(X, y)` and `score(X, y)`.

  Returns:
    A `(train_score, test_score)` tuple as returned by `model.score`.
  """
  model.fit(X_train, y_train)
  # Tuple elements are evaluated left to right: train score first, then test.
  return model.score(X_train, y_train), model.score(X_test, y_test)

In [None]:
# Baseline models. Estimators with internal randomness get a fixed
# random_state so the scores are reproducible across kernel restarts.
models = [DummyClassifier(strategy='most_frequent'),
          LogisticRegression(solver='liblinear', random_state=0),
          DecisionTreeClassifier(random_state=0)]

# One [model name, train score, test score] row per model.
res = []

for model in models:
  mname = model.__class__.__name__
  tr, te = train_eval(model)
  res.append([mname, tr, te])

df_results = pd.DataFrame(res, columns=['model_name',
                                        'train_accuracy',
                                        'test_accuracy'])

# Show the best test score first.
df_results.sort_values('test_accuracy', ascending=False)

Unnamed: 0,model_name,train_accuracy,test_accuracy
1,LogisticRegression,0.766284,0.801527
2,DecisionTreeClassifier,1.0,0.732824
0,DummyClassifier,0.547893,0.541985


## Exercise 1: Scikit-Learn

Extend the measurements above with the following scikit-learn models:

- Random Forest
- Extra Trees
- AdaBoost


In [None]:
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier

In [None]:
# Ensemble models. All three draw random numbers while fitting, so each gets a
# fixed random_state to keep the scores reproducible across kernel restarts.
models = [RandomForestClassifier(random_state=0),
          ExtraTreesClassifier(random_state=0),
          AdaBoostClassifier(random_state=0)]

# One [model name, train score, test score] row per model.
res = []

for model in models:
  mname = model.__class__.__name__
  tr, te = train_eval(model)
  res.append([mname, tr, te])

# Append the new rows to the baseline results and renumber the index.
df_results = pd.concat([df_results,
                        pd.DataFrame(res,
                                     columns=['model_name',
                                              'train_accuracy',
                                              'test_accuracy'])],
                       axis=0).reset_index(drop=True)

# Show the best test score first.
df_results.sort_values('test_accuracy', ascending=False)

Unnamed: 0,model_name,train_accuracy,test_accuracy
1,LogisticRegression,0.766284,0.801527
3,RandomForestClassifier,1.0,0.78626
4,ExtraTreesClassifier,1.0,0.78626
5,AdaBoostClassifier,0.842912,0.778626
2,DecisionTreeClassifier,1.0,0.732824
0,DummyClassifier,0.547893,0.541985


## Exercise 2: XGBoost with 1-hot encoded variables

Let's use XGBoost to classify our data.

- Import `XGBClassifier` from `xgboost`
- create a new dataset called `df_one_hot` where all categorical variables are one-hot encoded
- perform a train/test split again
- re-train all the models previously trained on the new dataset
- include `XGBClassifier` in the list of models
- compare their scores
- BONUS: use `GridSearchCV` to optimize the hyperparameters of `XGBClassifier`

In [None]:
from xgboost import XGBClassifier

In [None]:
# One-hot encode the categorical columns; numeric columns pass through unchanged.
df_one_hot = pd.get_dummies(df)
# Inspect the resulting (wider) frame and its dtypes.
df_one_hot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 653 entries, 0 to 652
Data columns (total 43 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              653 non-null    float64
 1   time_at_addr     653 non-null    float64
 2   time_w_empl      653 non-null    float64
 3   time_w_bank      653 non-null    int64  
 4   monthly_housing  653 non-null    float64
 5   savings_balance  653 non-null    int64  
 6   sex_a            653 non-null    uint8  
 7   sex_b            653 non-null    uint8  
 8   home_status_l    653 non-null    uint8  
 9   home_status_u    653 non-null    uint8  
 10  home_status_y    653 non-null    uint8  
 11  occupation_aa    653 non-null    uint8  
 12  occupation_c     653 non-null    uint8  
 13  occupation_cc    653 non-null    uint8  
 14  occupation_d     653 non-null    uint8  
 15  occupation_e     653 non-null    uint8  
 16  occupation_ff    653 non-null    uint8  
 17  occupation_i    

In [None]:
# Re-split using the one-hot encoded frame. This overwrites the notebook-level
# X_train/X_test/y_train/y_test that train_eval reads.
X_train, X_test, y_train, y_test = train_test_split(
  df_one_hot, y, test_size=0.2, random_state=0)

In [None]:
# Every model from the previous sections plus XGBoost, now trained on the
# one-hot encoded split. Estimators with internal randomness get a fixed
# random_state so the scores are reproducible across kernel restarts.
models = [DummyClassifier(strategy='most_frequent'),
          LogisticRegression(solver='liblinear', random_state=0),
          DecisionTreeClassifier(random_state=0),
          RandomForestClassifier(random_state=0),
          ExtraTreesClassifier(random_state=0),
          AdaBoostClassifier(random_state=0),
          XGBClassifier(random_state=0)
          ]

# One [model name, train score, test score] row per model.
res = []

for model in models:
  mname = model.__class__.__name__
  tr, te = train_eval(model)
  res.append([mname, tr, te])

df_results_2 = pd.DataFrame(res, columns=['model_name',
                                          'train_accuracy',
                                          'test_accuracy'])

# Show the best test score first.
df_results_2.sort_values('test_accuracy', ascending=False)

Unnamed: 0,model_name,train_accuracy,test_accuracy
1,LogisticRegression,0.877395,0.877863
4,ExtraTreesClassifier,1.0,0.877863
3,RandomForestClassifier,1.0,0.870229
6,XGBClassifier,0.950192,0.862595
5,AdaBoostClassifier,0.921456,0.847328
2,DecisionTreeClassifier,1.0,0.816794
0,DummyClassifier,0.547893,0.541985


In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid: 3 depths x 3 child weights x 3 learning rates with the
# number of trees fixed at 200, i.e. 27 candidates.
param_dist = {
    "max_depth": [10, 30, 50],
    "min_child_weight": [1, 3, 6],
    "n_estimators": [200],
    "learning_rate": [0.05, 0.1, 0.16],
}

model = XGBClassifier()

# Exhaustive search with 3-fold cross-validation, run on all available cores.
grid_search = GridSearchCV(
    model,
    param_grid=param_dist,
    cv=3,
    verbose=10,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   12.6s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'learning_rate': [0.05, 0.1, 0.16],
                         'max_depth': [10, 30, 50],
                         'min_child_weight': [1, 3, 6], 'n_estimat

In [None]:
# Pick the best estimator found by the grid search.
model = grid_search.best_estimator_

In [None]:
# train_eval fits the model on the training split again and returns the
# (train, test) scores.
tr, te = train_eval(model)

In [None]:
# Add the tuned XGBoost scores to the results table. Any earlier
# 'xgboost_optimized' row is dropped first so that re-running this cell
# replaces the entry instead of appending a duplicate.
df_results_2 = df_results_2[
    df_results_2['model_name'] != 'xgboost_optimized'].reset_index(drop=True)
df_results_2.loc[len(df_results_2)] = ['xgboost_optimized', tr, te]

In [None]:
# Results ranked by test accuracy, best first (returns a sorted copy).
df_results_2.sort_values('test_accuracy', ascending=False)

Unnamed: 0,model_name,train_accuracy,test_accuracy
7,xgboost_optimized,0.938697,0.885496
1,LogisticRegression,0.877395,0.877863
4,ExtraTreesClassifier,1.0,0.877863
3,RandomForestClassifier,1.0,0.870229
6,XGBClassifier,0.950192,0.862595
5,AdaBoostClassifier,0.921456,0.847328
2,DecisionTreeClassifier,1.0,0.816794
0,DummyClassifier,0.547893,0.541985


## Exercise 3: LightGBM

Let's use LightGBM to classify our data.

- import `LGBMClassifier` from `lightgbm`
- train your best model on the one-hot encoded features
- compare the results

- BONUS:
- create a new dataset called `df_cat_enc` where all categorical variables are encoded with the `OrdinalEncoder` from `sklearn.preprocessing`, while the numerical features are preserved
- perform a new train/test split
- train a lgbm model on this data. You will need to use the following code:
```python
ds_train = lgb.Dataset(X_train, label=y_train)
model3 = lgb.train(params, ds_train, 
                   categorical_feature = categorical_features)
```
Refer to the [documentation](https://lightgbm.readthedocs.io/en/latest/Python-Intro.html) if you are unsure how to proceed with this step.
- compare their scores


In [None]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score

In [None]:
# NOTE(review): `silent` may not be accepted by newer LightGBM releases;
# confirm against the installed version.
model = lgb.LGBMClassifier(silent=False)

# Hyper-parameter grid: 3 depths x 3 learning rates x 3 leaf counts with the
# number of trees fixed at 200, i.e. 27 candidates.
param_dist = {
    "max_depth": [25, 50, 75],
    "learning_rate": [0.01, 0.05, 0.1],
    "num_leaves": [300, 900, 1200],
    "n_estimators": [200],
}

# Exhaustive search with 3-fold cross-validation, run on all available cores.
grid_search = GridSearchCV(
    model,
    param_grid=param_dist,
    cv=3,
    verbose=10,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:    9.5s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=100,
                                      n_jobs=-1, num_leaves=31, objective=None,
                                      random_state=None, reg_alpha=0.0,
                                      reg_lambda=0.0, silent=False,
                                      subsample=1.0, subsample_for_bin=200000,
                                      subsample_freq=0),
             iid='deprecated', n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.05, 0.1],
                         'max_dept

In [None]:
# Start from the best grid-search parameters, but set the objective explicitly:
# the native `lgb.train` API defaults to a regression objective, whereas this
# is a binary classification task. Building a new dict also leaves
# `grid_search.best_params_` untouched.
params = {**grid_search.best_params_, 'objective': 'binary'}

In [None]:
# Wrap the current training split in LightGBM's native Dataset container.
ds_train = lgb.Dataset(X_train, label=y_train)

# Train a booster through the native API using the selected parameters.
model2 = lgb.train(params, ds_train)



In [None]:
# Turn the booster's scores into hard 0/1 labels with an explicit 0.5
# threshold. Rounding agrees with this for scores inside [0, 1], but would emit
# labels such as -1 or 2 for scores that fall outside that range.
y_pred_train = (model2.predict(X_train) > 0.5).astype(int)
y_pred_test = (model2.predict(X_test) > 0.5).astype(int)

In [None]:
# Accuracy of the hard-label predictions on the training and test splits.
tr = accuracy_score(y_train, y_pred_train)
te = accuracy_score(y_test, y_pred_test)

In [None]:
# Add the LightGBM scores to the results table. Any earlier
# 'lgboost_optimized' row is dropped first so that re-running this cell
# replaces the entry instead of appending a duplicate.
df_results_2 = df_results_2[
    df_results_2['model_name'] != 'lgboost_optimized'].reset_index(drop=True)
df_results_2.loc[len(df_results_2)] = ['lgboost_optimized', tr, te]

In [None]:
# Results ranked by test accuracy, best first (returns a sorted copy).
df_results_2.sort_values('test_accuracy', ascending=False)

Unnamed: 0,model_name,train_accuracy,test_accuracy
8,lgboost_optimized,0.913793,0.89313
7,xgboost_optimized,0.938697,0.885496
1,LogisticRegression,0.877395,0.877863
4,ExtraTreesClassifier,1.0,0.877863
3,RandomForestClassifier,1.0,0.870229
6,XGBClassifier,0.950192,0.862595
5,AdaBoostClassifier,0.921456,0.847328
2,DecisionTreeClassifier,1.0,0.816794
0,DummyClassifier,0.547893,0.541985


In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Map each categorical column to integer codes with an OrdinalEncoder. The
# encoder returns floats, so the codes are cast to int before being stored.
enc = OrdinalEncoder()
encoded = enc.fit_transform(df[categorical_features]).astype(int)

# Work on a copy so the original frame keeps its string categories; the
# numeric columns are carried over unchanged.
df_cat_enc = df.copy()
df_cat_enc[categorical_features] = encoded

df_cat_enc.head()

Unnamed: 0,sex,age,time_at_addr,home_status,occupation,job_status,time_w_empl,investments,bank_account,time_w_bank,liability_ref,account_ref,monthly_housing,savings_balance
0,1,30.83,0.0,1,12,7,1.25,1,1,1,0,0,202.0,0
1,0,58.67,4.46,1,10,3,3.04,1,1,6,0,0,43.0,560
2,0,24.5,0.5,1,10,3,1.5,1,0,0,0,0,280.0,824
3,1,27.83,1.54,1,12,7,3.75,1,1,5,1,0,100.0,3
4,1,20.17,5.625,1,12,7,1.71,1,0,0,0,2,120.0,0


In [None]:
# Re-split using the ordinal-encoded frame, with the same test_size and
# random_state as the earlier splits.
X_train, X_test, y_train, y_test = train_test_split(df_cat_enc, y, test_size=0.2, random_state=0)

In [None]:
# Wrap the ordinal-encoded training split in LightGBM's native Dataset container.
ds_train = lgb.Dataset(X_train, label=y_train)

In [None]:
# Train a booster through the native API, telling LightGBM which columns hold
# category codes so they are treated as categorical features.
model3 = lgb.train(params, ds_train, 
                   categorical_feature = categorical_features)

New categorical_feature is ['account_ref', 'bank_account', 'home_status', 'investments', 'job_status', 'liability_ref', 'occupation', 'sex']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


In [None]:
# Turn the booster's scores into hard 0/1 labels with an explicit 0.5
# threshold. Rounding agrees with this for scores inside [0, 1], but would emit
# labels such as -1 or 2 for scores that fall outside that range.
y_pred_train = (model3.predict(X_train) > 0.5).astype(int)
y_pred_test = (model3.predict(X_test) > 0.5).astype(int)

In [None]:
# Accuracy of the hard-label predictions on the training and test splits.
tr = accuracy_score(y_train, y_pred_train)
te = accuracy_score(y_test, y_pred_test)

In [None]:
# Add the categorical-encoded LightGBM scores to the results table. Any earlier
# 'lgboost_optimized_cat_enc' row is dropped first so that re-running this cell
# replaces the entry instead of appending a duplicate.
df_results_2 = df_results_2[
    df_results_2['model_name'] != 'lgboost_optimized_cat_enc'].reset_index(drop=True)
df_results_2.loc[len(df_results_2)] = ['lgboost_optimized_cat_enc', tr, te]

In [None]:
# Final comparison ranked by test accuracy, best first (returns a sorted copy).
df_results_2.sort_values('test_accuracy', ascending=False)

Unnamed: 0,model_name,train_accuracy,test_accuracy
9,lgboost_optimized_cat_enc,0.917625,0.900763
8,lgboost_optimized,0.913793,0.89313
7,xgboost_optimized,0.938697,0.885496
1,LogisticRegression,0.877395,0.877863
4,ExtraTreesClassifier,1.0,0.877863
3,RandomForestClassifier,1.0,0.870229
6,XGBClassifier,0.950192,0.862595
5,AdaBoostClassifier,0.921456,0.847328
2,DecisionTreeClassifier,1.0,0.816794
0,DummyClassifier,0.547893,0.541985
