# Exercícios - comparando LightGBM e XGBoost

In [1]:
# bibliotecas
import numpy as np 
import pandas as pd 
import lightgbm as lgbm
import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, cross_val_score
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# dataset
df_census = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=None)
df_census.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
                  'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 
                   'income']
df_census.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
df_census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
# EDA
df_census = df_census.drop(['education'], axis=1)
df_census = pd.get_dummies(df_census)
df_census = df_census.drop('income_ <=50K', axis=1)
df_census.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,income_ >50K
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
#features e labels
X = df_census.iloc[:,:-1]
y = df_census.iloc[:,-1]

### ToDo 1
Separe o conjunto de features com suas respectivas labels em conjunto de treino e teste na proporção de 70%-30%. Use random_state=42 para reprodutibilidade.

In [6]:
#resposta
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### ToDo 2

Treine um modelo LightGBM e um modelo XGBoost nos dados acima gerados. Utilize a versão padrão de ambos os modelos. Use a biblioteca datetime para computar o tempo de treinamento de cada modelo. Avalie ambos usando a acurácia. 

In [7]:
#resposta - XGBoost
model = XGBClassifier(random_state=42,verbosity=0)

start = datetime.now() 
model.fit(X_train, y_train)
stop = datetime.now()

y_pred = model.predict(X_test)
score = accuracy_score(y_pred, y_test)
print('Score: ' + str(score))
print('Tempo de treinamento: ' + str(stop-start))

Score: 0.8742169266674855
Tempo de treinamento: 0:00:01.852612


In [8]:
#resposta - LightGBM
model_lgb = LGBMClassifier(random_state=42, verbosity=0)

start = datetime.now() 
model_lgb.fit(X_train, y_train)
stop = datetime.now()

y_pred = model_lgb.predict(X_test)
score = accuracy_score(y_pred, y_test)
print('Score: ' + str(score))
print('Tempo de treinamento: ' + str(stop-start))

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Score: 0.8788846579044344
Tempo de treinamento: 0:00:00.161871


### ToDo 3

Modifique a função grid_search, vista na aula de XGBoost, para otimizar os parametros de ambos os modelos. Agora, ela precisa permitir receber, além do conjunto de treino e respectivas respostas, o conjunto de teste e respectivas respostas.  

In [9]:
# resposta
def grid_search(params, random=False, X_train=None, y_train=None, X_test=None, y_test=None, model=XGBClassifier(random_state=42,verbosity=0)): 
    kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=2)
    
    if random:
        grid = RandomizedSearchCV(model, params, cv=kfold, n_jobs=-1, random_state=2, scoring='accuracy', verbose=10)
    else:
        # Instantiate GridSearchCV as grid_reg
        grid = GridSearchCV(model, params, cv=kfold, n_jobs=-1, scoring='accuracy', verbose=10)
    
    # Fit grid_reg on X_train and y_train
    grid.fit(X, y)

    # Extract best params
    best_params = grid.best_params_

    # Print best params
    print("Best params:", best_params)
    
    # Compute best score
    best_score = grid.best_score_

    # Print best score
    print("Training score: {:.5f}".format(best_score))
    
    y_pred = grid.predict(X_test)
    print(y_pred)
    
    print('Test score: {:.3f}'.format(accuracy_score(y_pred, y_test)))

### ToDo 4
Otimize os hiperparâmetros que achar conveniente. Compare o tempo de treinamento e a acurácia de cada modelo

* Parâmetros XGBoost -> [link](https://xgboost.readthedocs.io/en/stable/parameter.html)

* Parâmetros LightGBM -> [link](https://lightgbm.readthedocs.io/en/latest/Parameters.html)

In [10]:
# checking imbalanced classes
df_census['income_ >50K'].value_counts()

0    24720
1     7841
Name: income_ >50K, dtype: int64

In [11]:
# resposta - XGBoost
weights = int(24720/7841)
params={'max_depth':[3,5,8,12], 'learning_rate':[0.01, 0.1, 0.3], 
        'gamma':[0.025, 0.05, 0.5],'n_estimators':[30,50,100,200,500]}
model=XGBClassifier(random_state=2, scale_pos_weight=weights,verbosity=0, use_label_encoder=False)
grid_search(params=params, X_train=X_train, y_train=y_train, X_test=X_test,y_test=y_test, model=model)

Fitting 2 folds for each of 180 candidates, totalling 360 fits
Best params: {'gamma': 0.5, 'learning_rate': 0.1, 'max_depth': 12, 'n_estimators': 500}
Training score: 0.85139
[0 0 1 ... 0 1 1]
Test score: 0.967


In [12]:
weights = int(24720/7841)
params={'max_depth':[3,5,8,12,15], 'learning_rate':[0.01, 0.1, 0.2, 0.3], 'n_estimators':[30,50,100,200,500],
       'num_leaves':[31,39,45,51,57]}
model_lgb=LGBMClassifier(random_state=2, scale_pos_weight=weights,verbosity=-1, objective='binary')
grid_search(params=params, X_train=X_train, y_train=y_train, X_test=X_test,y_test=y_test, model=model_lgb)

Fitting 2 folds for each of 500 candidates, totalling 1000 fits
[CV 1/2; 4/180] START gamma=0.025, learning_rate=0.01, max_depth=3, n_estimators=200
[CV 1/2; 4/180] END gamma=0.025, learning_rate=0.01, max_depth=3, n_estimators=200;, score=0.781 total time=   9.2s
[CV 1/2; 10/180] START gamma=0.025, learning_rate=0.01, max_depth=5, n_estimators=500
[CV 1/2; 10/180] END gamma=0.025, learning_rate=0.01, max_depth=5, n_estimators=500;, score=0.815 total time=  43.4s
[CV 2/2; 18/180] START gamma=0.025, learning_rate=0.01, max_depth=12, n_estimators=100
[CV 2/2; 18/180] END gamma=0.025, learning_rate=0.01, max_depth=12, n_estimators=100;, score=0.816 total time=  28.0s
[CV 2/2; 21/180] START gamma=0.025, learning_rate=0.1, max_depth=3, n_estimators=30
[CV 2/2; 21/180] END gamma=0.025, learning_rate=0.1, max_depth=3, n_estimators=30;, score=0.800 total time=   3.3s
[CV 2/2; 22/180] START gamma=0.025, learning_rate=0.1, max_depth=3, n_estimators=50
[CV 2/2; 22/180] END gamma=0.025, learning_r

#### ToDo 5

Apenas para LightGBM. Use os parametros encontrados pelo boosting e treine 4 versoes diferentes usando os diferentes tipos de boosting (gbdt, goss, dart e rf).

* Obs: quando boosting_type=rf, use dois parametros adicionais

    -> bagging_freq=1
    
    -> bagging_fraction=0.8

In [13]:
#GBDT
model_gbdt=LGBMClassifier(random_state=2, scale_pos_weight=weights,verbosity=-1, objective='binary', 
                        learning_rate=0.01, max_depth=8, n_estimators=100, num_leaves=51)
start = datetime.now() 
model_gbdt.fit(X_train, y_train)
stop = datetime.now()

y_pred = model_gbdt.predict(X_test)
score = accuracy_score(y_pred, y_test)
print('Score: ' + str(score))
print('Tempo de treinamento: ' + str(stop-start))


Score: 0.8581255374032674
Tempo de treinamento: 0:00:00.889823


In [14]:
#Goss
model_goss=LGBMClassifier(random_state=2, scale_pos_weight=weights,verbosity=-1, objective='binary', 
                        learning_rate=0.01, max_depth=8, n_estimators=100, num_leaves=51, boosting_type='goss')
start = datetime.now() 
model_goss.fit(X_train, y_train)
stop = datetime.now()

y_pred = model_goss.predict(X_test)
score = accuracy_score(y_pred, y_test)
print('Score: ' + str(score))
print('Tempo de treinamento: ' + str(stop-start))

Score: 0.8581255374032674
Tempo de treinamento: 0:00:00.916392


In [15]:
#DART
model_dart=LGBMClassifier(random_state=2, scale_pos_weight=weights,verbosity=-1, objective='binary', 
                        learning_rate=0.01, max_depth=8, n_estimators=100, num_leaves=51, boosting_type='dart')
start = datetime.now() 
model_dart.fit(X_train, y_train)
stop = datetime.now()

y_pred = model_dart.predict(X_test)
score = accuracy_score(y_pred, y_test)
print('Score: ' + str(score))
print('Tempo de treinamento: ' + str(stop-start))

Score: 0.8595995577938829
Tempo de treinamento: 0:00:01.660431


In [16]:
#RF
model_rf=LGBMClassifier(random_state=2, scale_pos_weight=weights,verbosity=-1, objective='binary', 
                        learning_rate=0.01, max_depth=8, n_estimators=100, num_leaves=51, boosting_type='rf', bagging_freq=1,
                       bagging_fraction=0.8)
start = datetime.now() 
model_rf.fit(X_train, y_train)
stop = datetime.now()

y_pred = model_rf.predict(X_test)
score = accuracy_score(y_pred, y_test)
print('Score: ' + str(score))
print('Tempo de treinamento: ' + str(stop-start))

Score: 0.7984277115833436
Tempo de treinamento: 0:00:00.994566
[CV 2/2; 124/500] START learning_rate=0.01, max_depth=15, n_estimators=500, num_leaves=51
[CV 2/2; 124/500] END learning_rate=0.01, max_depth=15, n_estimators=500, num_leaves=51;, score=0.833 total time=   5.9s
[CV 2/2; 137/500] START learning_rate=0.1, max_depth=3, n_estimators=100, num_leaves=39
[CV 2/2; 137/500] END learning_rate=0.1, max_depth=3, n_estimators=100, num_leaves=39;, score=0.823 total time=   0.5s
[CV 2/2; 140/500] START learning_rate=0.1, max_depth=3, n_estimators=100, num_leaves=57
[CV 2/2; 140/500] END learning_rate=0.1, max_depth=3, n_estimators=100, num_leaves=57;, score=0.823 total time=   0.5s
[CV 2/2; 143/500] START learning_rate=0.1, max_depth=3, n_estimators=200, num_leaves=45
[CV 2/2; 143/500] END learning_rate=0.1, max_depth=3, n_estimators=200, num_leaves=45;, score=0.829 total time=   0.8s
[CV 2/2; 146/500] START learning_rate=0.1, max_depth=3, n_estimators=500, num_leaves=31
[CV 2/2; 146/500]