# GBM

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, r2_score, accuracy_score
from sklearn.svm import  SVC
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, ElasticNet
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import warnings
warnings.simplefilter('ignore')
from sklearn.ensemble import VotingClassifier, BaggingClassifier, BaggingRegressor, RandomForestClassifier, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingRegressor, GradientBoostingClassifier
import matplotlib.pyplot as plt

In [2]:
concrete = pd.read_csv("./Cases/Concrete Strength/Concrete_Data.csv")
concrete.head()

Unnamed: 0,Cement,Blast,Fly,Water,Superplasticizer,Coarse,Fine,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
X = concrete.drop('Strength', axis=1)
y = concrete['Strength']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)

In [5]:
n_trees = [25,50,100]
learning_rate  = [0.1 , 0.4 , 0.6, 1 ]
max_depth = [3,5,None]
scores = []
for i in n_trees:
    for j in learning_rate:
        for k in max_depth:
            gbm = GradientBoostingRegressor(random_state=24, n_estimators = i, learning_rate=j, max_depth=k)
            gbm.fit(X_train,y_train)
            y_pred = gbm.predict(X_test)
            scores.append([i, j , k, r2_score(y_test, y_pred)])

conc_df = pd.DataFrame(scores, columns=['trees','learning_rate','max_depth','r2_score'])
conc_df.sort_values('r2_score', ascending=False).head()

Unnamed: 0,trees,learning_rate,max_depth,r2_score
28,100,0.4,5.0,0.920467
25,100,0.1,5.0,0.91815
16,50,0.4,5.0,0.917773
27,100,0.4,3.0,0.91278
19,50,0.6,5.0,0.912465


# XGBoost

In [21]:
from xgboost import XGBRegressor, XGBClassifier

In [9]:
n_trees = [25,50,100]
learning_rate  = [0.1 , 0.4 , 0.6, 1 ]
max_depth = [3,5,None]
scores = []
for i in n_trees:
    for j in learning_rate:
        for k in max_depth:
            gbm = XGBRegressor(random_state=24, n_estimators = i, learning_rate=j, max_depth=k)
            gbm.fit(X_train,y_train)
            y_pred = gbm.predict(X_test)
            scores.append([i, j , k, r2_score(y_test, y_pred)])

conc_df = pd.DataFrame(scores, columns=['trees','learning_rate','max_depth','r2_score'])
conc_df.sort_values('r2_score', ascending=False).head()

Unnamed: 0,trees,learning_rate,max_depth,r2_score
28,100,0.4,5.0,0.913853
25,100,0.1,5.0,0.912739
26,100,0.1,,0.9127
16,50,0.4,5.0,0.912285
27,100,0.4,3.0,0.90836


# LightGBM

In [22]:
from lightgbm import LGBMRegressor, LGBMClassifier

In [13]:
n_trees = [25,50,100]
learning_rate  = [0.1 , 0.4 , 0.6, 1 ]
max_depth = [3,5,None]
scores = []
for i in n_trees:
    for j in learning_rate:
        for k in max_depth:
            gbm = LGBMRegressor(random_state=24, n_estimators = i, learning_rate=j, max_depth=k)
            gbm.fit(X_train,y_train)
            y_pred = gbm.predict(X_test)
            scores.append([i, j , k, r2_score(y_test, y_pred)])

conc_df = pd.DataFrame(scores, columns=['trees','learning_rate','max_depth','r2_score'])
conc_df.sort_values('r2_score', ascending=False).head()

Unnamed: 0,trees,learning_rate,max_depth,r2_score
28,100,0.4,5.0,0.923993
31,100,0.6,5.0,0.922578
20,50,0.6,,0.922365
32,100,0.6,,0.921685
19,50,0.6,5.0,0.920273


# CatBoost

In [25]:
from catboost import CatBoostRegressor, CatBoostClassifier

In [15]:
n_trees = [25,50,100]
learning_rate  = [0.1 , 0.4 , 0.6, 1 ]
max_depth = [3,5,None]
scores = []
for i in n_trees:
    for j in learning_rate:
        for k in max_depth:
            gbm = CatBoostRegressor(random_state=24, n_estimators = i, learning_rate=j, max_depth=k)
            gbm.fit(X_train,y_train)
            y_pred = gbm.predict(X_test)
            scores.append([i, j , k, r2_score(y_test, y_pred)])

conc_df = pd.DataFrame(scores, columns=['trees','learning_rate','max_depth','r2_score'])
conc_df.sort_values('r2_score', ascending=False).head()

0:	learn: 16.0801163	total: 134ms	remaining: 3.23s
1:	learn: 15.2422146	total: 135ms	remaining: 1.56s
2:	learn: 14.4893882	total: 136ms	remaining: 998ms
3:	learn: 13.7772427	total: 137ms	remaining: 718ms
4:	learn: 13.2067726	total: 137ms	remaining: 549ms
5:	learn: 12.7034941	total: 138ms	remaining: 437ms
6:	learn: 12.2469625	total: 138ms	remaining: 356ms
7:	learn: 11.9039668	total: 139ms	remaining: 296ms
8:	learn: 11.5322344	total: 140ms	remaining: 248ms
9:	learn: 11.2005270	total: 140ms	remaining: 210ms
10:	learn: 10.8100917	total: 141ms	remaining: 179ms
11:	learn: 10.5586987	total: 142ms	remaining: 153ms
12:	learn: 10.2124124	total: 142ms	remaining: 131ms
13:	learn: 9.9692681	total: 143ms	remaining: 112ms
14:	learn: 9.7089087	total: 144ms	remaining: 95.8ms
15:	learn: 9.5262648	total: 144ms	remaining: 81.2ms
16:	learn: 9.3436115	total: 145ms	remaining: 68.3ms
17:	learn: 9.1540577	total: 146ms	remaining: 56.7ms
18:	learn: 8.9320150	total: 146ms	remaining: 46.2ms
19:	learn: 8.7437402	to

Unnamed: 0,trees,learning_rate,max_depth,r2_score
29,100,0.4,,0.925809
28,100,0.4,5.0,0.925423
32,100,0.6,,0.920198
30,100,0.6,3.0,0.918126
20,50,0.6,,0.915099


# Classifiers

## HR Dataset

In [16]:
hr = pd.read_csv("./Cases/human-resources-analytics/HR_comma_sep.csv")
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.1,0.77,6,247,4,0,1,0,sales,low
3,0.92,0.85,5,259,5,0,1,0,sales,low
4,0.89,1.0,5,224,5,0,1,0,sales,low


In [17]:
X, y = hr.drop('left', axis=1), hr['left']
ohe = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore').set_output(transform='pandas')
ct = make_column_transformer((ohe, make_column_selector(dtype_include=object)),("passthrough", make_column_selector(dtype_exclude=object)),verbose_feature_names_out=False).set_output(transform='pandas')
X_ohe = ct.fit_transform(X)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_ohe, y, test_size=0.3, random_state=24, stratify=y)
X_trn_ohe = ct.fit_transform(X_train)
X_tst_ohe = ct.transform(X_test)

## GBM

In [29]:
n_trees = [25,50,100]
learning_rate  = [0.1 , 0.4 , 0.6, 1 ]
max_depth = [3,5,None]
scores = []
for i in n_trees:
    for j in learning_rate:
        for k in max_depth:
            gbm = GradientBoostingClassifier(random_state=24, n_estimators = i, learning_rate=j, max_depth=k)
            gbm.fit(X_trn_ohe,y_train)
            y_pred = gbm.predict(X_tst_ohe)
            scores.append([i, j , k, f1_score(y_test, y_pred, pos_label=1)])

conc_df = pd.DataFrame(scores, columns=['trees','learning_rate','max_depth','f1_score'])
conc_df.sort_values('f1_score', ascending=False).head()

Unnamed: 0,trees,learning_rate,max_depth,f1_score
28,100,0.4,5.0,0.968045
16,50,0.4,5.0,0.966588
4,25,0.4,5.0,0.962435
10,25,1.0,5.0,0.961502
7,25,0.6,5.0,0.960265


## XGBoost

In [28]:
n_trees = [25,50,100]
learning_rate  = [0.1 , 0.4 , 0.6, 1 ]
max_depth = [3,5,None]
scores = []
for i in n_trees:
    for j in learning_rate:
        for k in max_depth:
            gbm = XGBClassifier(random_state=24, n_estimators = i, learning_rate=j, max_depth=k)
            gbm.fit(X_trn_ohe,y_train)
            y_pred = gbm.predict(X_tst_ohe)
            scores.append([i, j , k, f1_score(y_test, y_pred, pos_label=1)])

conc_df = pd.DataFrame(scores, columns=['trees','learning_rate','max_depth','f1_score'])
conc_df.sort_values('f1_score', ascending=False).head()

Unnamed: 0,trees,learning_rate,max_depth,f1_score
29,100,0.4,,0.970755
23,50,1.0,,0.969925
35,100,1.0,,0.969868
34,100,1.0,5.0,0.969498
31,100,0.6,5.0,0.969412


## LightGBM

In [27]:
n_trees = [25,50,100]
learning_rate  = [0.1 , 0.4 , 0.6, 1 ]
max_depth = [3,5,None]
scores = []
for i in n_trees:
    for j in learning_rate:
        for k in max_depth:
            gbm = LGBMClassifier(random_state=24, n_estimators = i, learning_rate=j, max_depth=k)
            gbm.fit(X_trn_ohe,y_train)
            y_pred = gbm.predict(X_tst_ohe)
            scores.append([i, j , k, f1_score(y_test, y_pred, pos_label=1)])

conc_df = pd.DataFrame(scores, columns=['trees','learning_rate','max_depth','f1_score'])
conc_df.sort_values('f1_score', ascending=False).head()

Unnamed: 0,trees,learning_rate,max_depth,f1_score
29,100,0.4,,0.971213
17,50,0.4,,0.97081
28,100,0.4,5.0,0.970325
31,100,0.6,5.0,0.968985
5,25,0.4,,0.966335


## CatBoost

In [30]:
n_trees = [25,50,100]
learning_rate  = [0.1 , 0.4 , 0.6, 1 ]
max_depth = [3,5,None]
scores = []
for i in n_trees:
    for j in learning_rate:
        for k in max_depth:
            gbm = CatBoostClassifier(random_state=24, n_estimators = i, learning_rate=j, max_depth=k)
            gbm.fit(X_trn_ohe,y_train)
            y_pred = gbm.predict(X_tst_ohe)
            scores.append([i, j , k, f1_score(y_test, y_pred, pos_label=1)])

conc_df = pd.DataFrame(scores, columns=['trees','learning_rate','max_depth','f1_score'])
conc_df.sort_values('f1_score', ascending=False).head()

0:	learn: 0.6314848	total: 2.96ms	remaining: 71ms
1:	learn: 0.5797132	total: 6.34ms	remaining: 72.9ms
2:	learn: 0.5368995	total: 9.58ms	remaining: 70.2ms
3:	learn: 0.4990302	total: 12.8ms	remaining: 67.4ms
4:	learn: 0.4663002	total: 16.3ms	remaining: 65.3ms
5:	learn: 0.4346304	total: 19.7ms	remaining: 62.4ms
6:	learn: 0.4074336	total: 41.9ms	remaining: 108ms
7:	learn: 0.3826554	total: 44ms	remaining: 93.4ms
8:	learn: 0.3624745	total: 46.5ms	remaining: 82.6ms
9:	learn: 0.3437532	total: 48.7ms	remaining: 73ms
10:	learn: 0.3282070	total: 50.4ms	remaining: 64.2ms
11:	learn: 0.3146329	total: 52.2ms	remaining: 56.5ms
12:	learn: 0.3015609	total: 54ms	remaining: 49.9ms
13:	learn: 0.2909773	total: 55.8ms	remaining: 43.8ms
14:	learn: 0.2821174	total: 60.3ms	remaining: 40.2ms
15:	learn: 0.2718657	total: 66ms	remaining: 37.1ms
16:	learn: 0.2599161	total: 69.5ms	remaining: 32.7ms
17:	learn: 0.2490543	total: 73.7ms	remaining: 28.7ms
18:	learn: 0.2439148	total: 75.5ms	remaining: 23.8ms
19:	learn: 0.2

Unnamed: 0,trees,learning_rate,max_depth,f1_score
35,100,1.0,,0.962963
32,100,0.6,,0.960457
29,100,0.4,,0.956023
31,100,0.6,5.0,0.95474
34,100,1.0,5.0,0.954631
