# Gradient Boosting Classifier

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the pre-split feature matrices and label vectors from CSV.
# Paths use forward slashes: the previous '.\data\...' literals relied on
# invalid escape sequences (\d, \c, \X, ...) and only resolved on Windows.
# Forward slashes are accepted on Windows, Linux and macOS alike.
DATA_DIR = './data/csv_data'

X_train = pd.read_csv(f'{DATA_DIR}/X_TR_561_SENZA_OUTLIERS.csv')
X_test = pd.read_csv(f'{DATA_DIR}/X_test.csv')
y_train = pd.read_csv(f'{DATA_DIR}/Y_TR_561_SENZA_OUTLIERS.csv')
y_test = pd.read_csv(f'{DATA_DIR}/y_test.csv')

In [3]:
# Re-type the integer 'Label' column of both target frames as a pandas
# categorical, so the class ids are treated as discrete categories.
y_train['Label'] = y_train['Label'].astype('category')
y_test['Label'] = y_test['Label'].astype('category')

In [4]:
# Sanity check: report (rows, columns) of every split, one line per frame.
print(
    *(
        f'{label} {frame.shape}'
        for label, frame in (
            ('X_train: ', X_train),
            ('y_train: ', y_train),
            ('X_test: ', X_test),
            ('y_test: ', y_test),
        )
    ),
    sep='\n',
)

X_train:  (7278, 561)
y_train:  (7278, 1)
X_test:  (2947, 561)
y_test:  (2947, 1)


## GRADIENT BOOSTING 

In [5]:
from sklearn.ensemble import GradientBoostingClassifier

In [6]:
# Gradient boosting baseline with a full-step learning rate (1.0):
# 100 trees of depth 3, fixed seed for reproducibility.
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=3, random_state=42)

# The label frames have shape (n_samples, 1); scikit-learn expects a 1-D
# target and otherwise emits a DataConversionWarning (currently hidden by the
# global warnings filter). Flatten explicitly instead of relying on that.
clf.fit(X_train, np.ravel(y_train))

y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(np.ravel(y_test), y_pred))

              precision    recall  f1-score   support

           1       0.91      0.93      0.92       496
           2       0.92      0.86      0.89       471
           3       0.90      0.94      0.92       420
           4       0.91      0.83      0.87       491
           5       0.86      0.93      0.89       532
           6       1.00      1.00      1.00       537

    accuracy                           0.91      2947
   macro avg       0.91      0.91      0.91      2947
weighted avg       0.92      0.91      0.91      2947



In [14]:
# Same gradient boosting setup as the previous run, but with a smaller
# learning rate (0.1) so each tree contributes a shrunken update.
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# The label frames have shape (n_samples, 1); scikit-learn expects a 1-D
# target and otherwise emits a DataConversionWarning (currently hidden by the
# global warnings filter). Flatten explicitly instead of relying on that.
clf.fit(X_train, np.ravel(y_train))

y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(np.ravel(y_test), y_pred))

              precision    recall  f1-score   support

           1       0.93      0.98      0.96       496
           2       0.93      0.92      0.92       471
           3       0.97      0.92      0.94       420
           4       0.92      0.84      0.88       491
           5       0.86      0.94      0.90       532
           6       1.00      1.00      1.00       537

    accuracy                           0.93      2947
   macro avg       0.94      0.93      0.93      2947
weighted avg       0.94      0.93      0.93      2947



## HistGradientBoosting

In [8]:
from sklearn.ensemble import HistGradientBoostingClassifier

In [13]:
# Histogram-based gradient boosting with deep trees (max_depth=10).
# The explicit loss='categorical_crossentropy' was dropped: that name was
# deprecated and later removed from scikit-learn, while the default loss
# already resolves to the multinomial (softmax) loss for a multiclass target,
# so the model is unchanged and the cell runs on current releases too.
clf = HistGradientBoostingClassifier(learning_rate=0.1, max_depth=10, random_state=42)

# Flatten the (n_samples, 1) label frame to the 1-D target scikit-learn expects.
clf.fit(X_train, np.ravel(y_train))

y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(np.ravel(y_test), y_pred))

              precision    recall  f1-score   support

           1       0.91      0.99      0.95       496
           2       0.94      0.91      0.92       471
           3       0.98      0.92      0.95       420
           4       0.93      0.84      0.88       491
           5       0.86      0.94      0.90       532
           6       1.00      1.00      1.00       537

    accuracy                           0.93      2947
   macro avg       0.94      0.93      0.93      2947
weighted avg       0.94      0.93      0.93      2947



In [15]:
# Histogram-based gradient boosting with shallow trees (max_depth=3).
# The explicit loss='categorical_crossentropy' was dropped: that name was
# deprecated and later removed from scikit-learn, while the default loss
# already resolves to the multinomial (softmax) loss for a multiclass target,
# so the model is unchanged and the cell runs on current releases too.
clf = HistGradientBoostingClassifier(learning_rate=0.1, max_depth=3, random_state=42)

# Flatten the (n_samples, 1) label frame to the 1-D target scikit-learn expects.
clf.fit(X_train, np.ravel(y_train))

y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(np.ravel(y_test), y_pred))

              precision    recall  f1-score   support

           1       0.95      0.99      0.97       496
           2       0.93      0.94      0.94       471
           3       0.98      0.93      0.95       420
           4       0.93      0.86      0.90       491
           5       0.88      0.95      0.91       532
           6       1.00      1.00      1.00       537

    accuracy                           0.95      2947
   macro avg       0.95      0.94      0.94      2947
weighted avg       0.95      0.95      0.94      2947



## XGBoost

In [16]:
from xgboost import XGBClassifier

In [17]:
# XGBoost multiclass model (softmax probabilities, exact greedy split search).
#
# The target uses labels 1..6, whereas recent XGBoost releases require class
# ids 0..n_classes-1 and no longer honour the deprecated use_label_encoder
# flag. Encode the labels explicitly so the cell behaves the same on old and
# new versions: xgb_classes holds the sorted original labels and
# y_train_encoded the matching 0-based indices.
xgb_classes, y_train_encoded = np.unique(np.ravel(y_train), return_inverse=True)

clf = XGBClassifier(objective='multi:softprob',
                    max_depth=6,
                    learning_rate=0.1,
                    gamma=0.0,
                    tree_method='exact', # 'approx'
                    random_state=42
                   )
clf.fit(X_train, y_train_encoded)

# clf predicts 0-based class indices; map them back to the original labels
# so the report is comparable with the other models.
y_pred = xgb_classes[np.asarray(clf.predict(X_test), dtype=int)]

# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(np.ravel(y_test), y_pred))

              precision    recall  f1-score   support

           1       0.91      0.97      0.94       496
           2       0.93      0.90      0.91       471
           3       0.96      0.91      0.94       420
           4       0.92      0.83      0.87       491
           5       0.86      0.94      0.90       532
           6       1.00      1.00      1.00       537

    accuracy                           0.93      2947
   macro avg       0.93      0.93      0.93      2947
weighted avg       0.93      0.93      0.93      2947



In [18]:
from lightgbm import LGBMClassifier

In [24]:
# LightGBM gradient-boosted trees on the six-class target.
# objective was 'binary', which contradicts a six-class problem (the reported
# results still cover six classes, so the wrapper presumably replaced it
# behind the scenes); state the multiclass objective explicitly.
clf = LGBMClassifier(boosting_type='gbdt',  #'goss', #'dart'
                     max_depth=6, # cap tree depth at 6 (-1 would mean no limit)
                     num_leaves=31,
                     n_estimators=100,
                     subsample_for_bin=200000,
                     objective='multiclass',
                     reg_alpha=0.0, #L1 regularization term on weights
                     reg_lambda=0.0, #L2 regularization term on weights
                     random_state=42
                   )

# Flatten the (n_samples, 1) label frame to a 1-D target vector.
clf.fit(X_train, np.ravel(y_train))

y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(np.ravel(y_test), y_pred))

              precision    recall  f1-score   support

           1       0.93      0.98      0.96       496
           2       0.94      0.93      0.93       471
           3       0.98      0.93      0.95       420
           4       0.93      0.84      0.88       491
           5       0.87      0.94      0.90       532
           6       1.00      1.00      1.00       537

    accuracy                           0.94      2947
   macro avg       0.94      0.94      0.94      2947
weighted avg       0.94      0.94      0.94      2947



In [20]:
# NOTE(review): disabled snippet. When enabled it rebinds X_train / X_test to
# copies whose column names have every run of characters outside
# [A-Za-z0-9_] removed. Presumably needed by an estimator that rejects
# special characters in feature names (LightGBM?) -- confirm whether it must
# run before that cell on a fresh kernel; otherwise delete it.
# import re
# X_train = X_train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
# X_test = X_test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))