### **XGBoost**

#### Imports

In [11]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
import xgboost as xgb
#import matplotlib.pyplot as plt
#import seaborn as sns

#### Constants

In [12]:
TRAIN_SET = "myBeautifulNBs/data_folder/original.csv"
TEST_SET = "myBeautifulNBs/data_folder/shifted_01.csv"

#### Import data

Import training set

In [13]:
train_set = pd.read_csv(TRAIN_SET)
x_train = train_set.drop(['target'], axis=1)
y_train = train_set['target']
x_train.head()

Unnamed: 0,feature_1,feature_2
0,-6.229601,17.698271
1,4.702409,19.450165
2,27.860303,21.176254
3,2.751841,19.405003
4,13.082308,19.98289


Import shifted test set

In [14]:
test_set = pd.read_csv(TEST_SET)
x_test = test_set.drop(['target'], axis=1)
y_test = test_set['target']
x_test.head()

Unnamed: 0,feature_1,feature_2
0,21.405354,22.923014
1,3.576095,48.079939
2,42.525733,30.251486
3,25.684245,36.618175
4,16.509675,42.748147


Decision tree

In [15]:
model = DTC(max_depth=5)
model.fit(x_train, y_train)
print(model.score(x_train, y_train))

0.999


In [16]:
y_pred = model.predict(x_test)
y_pred_proba = model.predict_proba(x_test)[:,1]

print("score:", model.score(x_test, y_test))
print(classification_report(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_pred_proba))

score: 0.688
              precision    recall  f1-score   support

           0       1.00      0.50      0.66       618
           1       0.55      1.00      0.71       382

    accuracy                           0.69      1000
   macro avg       0.78      0.75      0.69      1000
weighted avg       0.83      0.69      0.68      1000

AUC: 0.7469120113861638


Gradient boosting

In [17]:
model = GBC(learning_rate=0.025, max_depth=5, n_estimators=100, subsample=0.7)
model.fit(x_train, y_train)
print("score:", model.score(x_train, y_train))

score: 1.0


In [18]:
y_pred = model.predict(x_test)
y_pred_proba = model.predict_proba(x_test)[:,1]

print("score:", model.score(x_test, y_test))
print(classification_report(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_pred_proba))

score: 0.688
              precision    recall  f1-score   support

           0       1.00      0.50      0.66       618
           1       0.55      1.00      0.71       382

    accuracy                           0.69      1000
   macro avg       0.78      0.75      0.69      1000
weighted avg       0.83      0.69      0.68      1000

AUC: 0.9067800199935614


In [19]:
model = xgb.XGBClassifier(learning_rate=0.025, max_depth=5, n_estimators=100, subsample=0.7)
model.fit(x_train, y_train)
print("score:", model.score(x_train, y_train))

score: 0.996


In [20]:
y_pred = model.predict(x_test)
y_pred_proba = model.predict_proba(x_test)[:,1]

print("score:", model.score(x_test, y_test))
print(classification_report(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_pred_proba))

score: 0.687
              precision    recall  f1-score   support

           0       1.00      0.49      0.66       618
           1       0.55      1.00      0.71       382

    accuracy                           0.69      1000
   macro avg       0.77      0.75      0.69      1000
weighted avg       0.83      0.69      0.68      1000

AUC: 0.8700799742455819


Grid search over parameters

In [21]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, classification_report

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 5, 6],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [2, 3, 4],
    'max_features': [None, 'sqrt', 'log2'],
    'learning_rate': [0.025, 0.05, 0.075],
    'subsample': [0.6, 0.7, 0.8]
}

def grid_search(x_train, y_train, x_test, y_test):
    model = GradientBoostingClassifier()
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='roc_auc',
        cv=5,
        n_jobs=-1,
        verbose=1
    )

    grid.fit(x_train, y_train)

    best_params = grid.best_params_
    best_cv_score = grid.best_score_

    print("Migliori parametri basati sulla cross-validation:", best_params)
    print("Miglior punteggio di CV:", best_cv_score)

    final_model = GBC(**best_params)
    final_model.fit(x_train, y_train)

    y_pred = final_model.predict(x_test)
    y_pred_proba = final_model.predict_proba(x_test)[:, 1]

    test_score = final_model.score(x_test, y_test)
    test_auc = roc_auc_score(y_test, y_pred_proba)

    print("Punteggio sul set di test:", test_score)
    print("AUC sul set di test:", test_auc)
    print("Classification Report:\n", classification_report(y_test, y_pred))

    return final_model

# Esempio di utilizzo
#final_model = grid_search(x_train, y_train, x_test, y_test)