### **XGBoost**

#### Imports

In [481]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
import xgboost as xgb
#import matplotlib.pyplot as plt
#import seaborn as sns

#### Constants

In [482]:
# CSV files used by this notebook: models are fit on the data read from
# TRAIN_SET and evaluated on the data read from TEST_SET.
TRAIN_SET: str = "df_mix0.0.csv"
TEST_SET: str = "df_mix0.8.csv"

#### Import data

Import training set

In [483]:
# Read the training CSV. Column 't' is the label; 't' and 'z' are both
# excluded from the feature matrix.
train_set = pd.read_csv(TRAIN_SET)
y_train = train_set['t']
x_train = train_set.drop(columns=['t', 'z'])
x_train.head()

Unnamed: 0,x,y
0,0.080293,0.880037
1,1.471276,0.600937
2,-0.108788,0.723767
3,0.945246,0.616437
4,-0.57343,-1.358901


Import shifted test set

In [484]:
# Read the shifted test CSV with the same layout as the training data:
# 't' is the label, and 't' and 'z' are excluded from the features.
test_set = pd.read_csv(TEST_SET)
y_test = test_set['t']
x_test = test_set.drop(columns=['t', 'z'])
x_test.head()

Unnamed: 0,x,y
0,5.533062,-0.601499
1,4.08983,1.535552
2,3.843192,1.443486
3,4.69779,0.927367
4,6.115125,0.271682


Decision tree

In [485]:
# Single decision tree limited to depth 5.
# random_state is fixed so a fresh "Restart & Run All" rebuilds the same tree:
# scikit-learn permutes the features at every split, so ties between equally
# good splits can otherwise be broken differently from run to run.
model = DTC(max_depth=5, random_state=42)
model.fit(x_train, y_train)
# Accuracy on the training data, labelled like the other model cells.
print("score:", model.score(x_train, y_train))

0.831


In [486]:
# Evaluate the fitted model on the shifted test set.
y_pred = model.predict(x_test)
# Column 1 of predict_proba is the probability assigned to the second class
# (label 1 here); it is the score used for the ROC AUC.
y_pred_proba = model.predict_proba(x_test)[:, 1]

test_accuracy = model.score(x_test, y_test)
test_report = classification_report(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_pred_proba)

print("score:", test_accuracy)
print(test_report)
print("AUC:", test_auc)

score: 0.33
              precision    recall  f1-score   support

           0       0.17      0.76      0.28       172
           1       0.83      0.24      0.37       828

    accuracy                           0.33      1000
   macro avg       0.50      0.50      0.33      1000
weighted avg       0.71      0.33      0.36      1000

AUC: 0.5076185260083137


Gradient boosting

In [487]:
# scikit-learn gradient boosting.
# subsample=0.7 fits each tree on a random 70% of the training rows, so the
# result is stochastic; random_state is fixed to make the run reproducible.
model = GBC(
    learning_rate=0.025,
    max_depth=5,
    n_estimators=100,
    subsample=0.7,
    random_state=42,
)
model.fit(x_train, y_train)
# Accuracy on the training data.
print("score:", model.score(x_train, y_train))

score: 0.863


In [488]:
# Evaluate the fitted model on the shifted test set.
y_pred = model.predict(x_test)
# Column 1 of predict_proba is the probability assigned to the second class
# (label 1 here); it is the score used for the ROC AUC.
y_pred_proba = model.predict_proba(x_test)[:, 1]

test_accuracy = model.score(x_test, y_test)
test_report = classification_report(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_pred_proba)

print("score:", test_accuracy)
print(test_report)
print("AUC:", test_auc)

score: 0.457
              precision    recall  f1-score   support

           0       0.17      0.55      0.26       172
           1       0.82      0.44      0.57       828

    accuracy                           0.46      1000
   macro avg       0.50      0.49      0.42      1000
weighted avg       0.71      0.46      0.52      1000

AUC: 0.5037250028086732


In [489]:
# XGBoost classifier, configured with the same four hyper-parameter values
# (learning rate, depth, number of trees, row subsampling) as the
# scikit-learn gradient boosting cell.
model = xgb.XGBClassifier(
    learning_rate=0.025,
    max_depth=5,
    n_estimators=100,
    subsample=0.7,
)
model.fit(x_train, y_train)

# Accuracy on the training data.
train_accuracy = model.score(x_train, y_train)
print("score:", train_accuracy)

score: 0.828


In [490]:
# Evaluate the fitted model on the shifted test set.
y_pred = model.predict(x_test)
# Column 1 of predict_proba is the probability assigned to the second class
# (label 1 here); it is the score used for the ROC AUC.
y_pred_proba = model.predict_proba(x_test)[:, 1]

test_accuracy = model.score(x_test, y_test)
test_report = classification_report(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_pred_proba)

print("score:", test_accuracy)
print(test_report)
print("AUC:", test_auc)

score: 0.797
              precision    recall  f1-score   support

           0       0.20      0.06      0.09       172
           1       0.83      0.95      0.89       828

    accuracy                           0.80      1000
   macro avg       0.51      0.50      0.49      1000
weighted avg       0.72      0.80      0.75      1000

AUC: 0.5096372598584429


Grid search over parameters

In [491]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, classification_report

# Hyper-parameter grid searched by grid_search(): three candidate values for
# each of the seven parameters, i.e. 3**7 = 2187 combinations in total.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[4, 5, 6],
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[2, 3, 4],
    max_features=[None, 'sqrt', 'log2'],
    learning_rate=[0.025, 0.05, 0.075],
    subsample=[0.6, 0.7, 0.8],
)

def grid_search(x_train, y_train, x_test, y_test, random_state=None):
    """Tune a GradientBoostingClassifier by grid search and report test metrics.

    Every combination in the module-level ``param_grid`` is scored with
    5-fold cross-validation on the training data, using ROC AUC as the
    selection metric. The best parameters and their cross-validation score
    are printed, then the selected model is evaluated once on the test data
    (accuracy, ROC AUC and a classification report are printed).

    Parameters
    ----------
    x_train, y_train
        Training features and labels, passed to ``GridSearchCV.fit``.
    x_test, y_test
        Test features and labels, used only for the final report.
    random_state : int or None, default None
        Seed forwarded to the gradient boosting estimator. The default
        ``None`` leaves the estimator unseeded.

    Returns
    -------
    GradientBoostingClassifier
        The estimator refit on all of ``x_train`` with the best parameters.
    """
    search = GridSearchCV(
        estimator=GradientBoostingClassifier(random_state=random_state),
        param_grid=param_grid,
        scoring='roc_auc',
        cv=5,
        n_jobs=-1,
        verbose=1
    )
    search.fit(x_train, y_train)

    print("Migliori parametri basati sulla cross-validation:", search.best_params_)
    print("Miglior punteggio di CV:", search.best_score_)

    # GridSearchCV defaults to refit=True, so best_estimator_ is already a
    # model with the best parameters fit on the whole training set; building
    # and fitting a second identical model would only repeat that work.
    final_model = search.best_estimator_

    y_pred = final_model.predict(x_test)
    # Column 1 of predict_proba is the score used for the ROC AUC.
    y_pred_proba = final_model.predict_proba(x_test)[:, 1]

    test_score = final_model.score(x_test, y_test)
    test_auc = roc_auc_score(y_test, y_pred_proba)

    print("Punteggio sul set di test:", test_score)
    print("AUC sul set di test:", test_auc)
    print("Classification Report:\n", classification_report(y_test, y_pred))

    return final_model

# Example usage (slow: the grid has 3**7 = 2187 candidates, each scored with 5-fold CV)
#final_model = grid_search(x_train, y_train, x_test, y_test)