In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import joblib

In [2]:
# Load the persisted training table; the 'isFraud' column of this object is
# used as the target further down.
# NOTE(review): joblib.load unpickles arbitrary objects - only point this at
# artifacts produced by a trusted pipeline.
train_table_path = '../joblib/tr_tr_encoded.joblib'
tr_tr_new = joblib.load(train_table_path)

In [3]:
# Load the persisted table that is scored at the end of the notebook to build
# the submission file.
# NOTE(review): joblib.load unpickles arbitrary objects - only point this at
# artifacts produced by a trusted pipeline.
scoring_table_path = '../joblib/te_tr_encoded.joblib'
te_tr_new = joblib.load(scoring_table_path)

In [4]:
from sklearn.model_selection import train_test_split
# Separate the target from the predictors: `y` is the 'isFraud' column and
# `X` is every other column of the training table.
y = tr_tr_new['isFraud']
X = tr_tr_new.drop(columns='isFraud')

In [5]:
# Two-stage hold-out split driven by one fixed seed so the partitions are
# reproducible across runs.
# NOTE(review): the split is not stratified on `y`; if the positive class is
# rare, consider passing stratify= - confirm against the class balance.
split_seed = 42

# Stage 1: 80% of the rows go to training, the remaining 20% are set aside.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=split_seed,
)

# Stage 2: the set-aside rows are halved into validation and test partitions.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=split_seed,
)

# Report how many rows ended up in each partition.
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")


Training set size: 472432
Validation set size: 59054
Test set size: 59054


In [6]:
from xgboost import XGBClassifier as model_constructor
from sklearn.metrics import roc_auc_score as metric

In [7]:
# Candidate values for each hyper-parameter explored by the grid search.
# Single-element lists pin that parameter to one value.
n_estimators_values = [1000, 2000]
learning_rate_values = [0.02, 0.05]
max_depth_values = [12, 18]
min_child_weight_values = [15]
subsample_values = [0.8]
colsample_bytree_values = [0.2, 0.4]

# Map each parameter name to its list of candidates. Insertion order is the
# order in which the search loops nest (outermost first).
params_grid = dict(
    n_estimators=n_estimators_values,
    learning_rate=learning_rate_values,
    max_depth=max_depth_values,
    min_child_weight=min_child_weight_values,
    subsample=subsample_values,
    colsample_bytree=colsample_bytree_values,
)

In [8]:
import itertools
import warnings

# Silence library warnings for the rest of the session (kept from the original
# cell so the printed output stays unchanged).
warnings.filterwarnings('ignore')

# Column layout of the results table: the six searched hyper-parameters
# followed by the early-stopping round and the two AUC scores.
result_columns = ['n_estimators',
                  'learning_rate',
                  'max_depth',
                  'min_child_weight',
                  'subsample',
                  'colsample_bytree',
                  'best_iteration',
                  'metric_train',
                  'metric_val']

# One list per evaluated configuration; the DataFrame is built once after the
# loop instead of being enlarged row by row.
grid_records = []
num_iter = 1

# itertools.product visits the combinations in the same order as six nested
# for-loops over the same lists (last parameter varies fastest).
for (n_estimators,
     learning_rate,
     max_depth,
     min_child_weight,
     subsample,
     colsample_bytree) in itertools.product(params_grid['n_estimators'],
                                            params_grid['learning_rate'],
                                            params_grid['max_depth'],
                                            params_grid['min_child_weight'],
                                            params_grid['subsample'],
                                            params_grid['colsample_bytree']):
    # Print trace
    print('Iteration = ' + str(num_iter))

    # [3] Define model.
    # tree_method='hist' combined with device='cuda' is the supported way to
    # request GPU training; the former 'gpu_hist' alias is deprecated since
    # XGBoost 2.0 and this now matches the final model defined later on.
    model = model_constructor(n_estimators=n_estimators,
                              learning_rate=learning_rate,
                              max_depth=max_depth,
                              min_child_weight=min_child_weight,
                              subsample=subsample,
                              colsample_bytree=colsample_bytree,
                              early_stopping_rounds=20,
                              eval_metric="auc",
                              tree_method='hist',
                              device='cuda',
                              random_state=0)

    # [4] Train model; early stopping watches the AUC on the validation set.
    model.fit(X_train,
              y_train,
              eval_set=[(X_val, y_val)],
              verbose=True)
    best_iteration = model.best_iteration

    # [5] Predict class probabilities for both partitions.
    pred_train = model.predict_proba(X_train)
    pred_val = model.predict_proba(X_val)

    # [6] Compute the metric from the column-1 probabilities.
    metric_train = metric(y_train, pred_train[:, 1])
    metric_val = metric(y_val, pred_val[:, 1])

    # Print scores
    print('AUC train = %.2f - AUC validation = %.2f.'
          % (metric_train, metric_val))

    # Save iteration results
    grid_records.append([n_estimators,
                         learning_rate,
                         max_depth,
                         min_child_weight,
                         subsample,
                         colsample_bytree,
                         best_iteration,
                         metric_train,
                         metric_val])
    num_iter += 1

# Keep the 1-based row labels the table had before (row k == k-th iteration).
grid_results = pd.DataFrame(grid_records,
                            columns=result_columns,
                            index=range(1, len(grid_records) + 1))

Iteration = 1
[0]	validation_0-auc:0.81979
[1]	validation_0-auc:0.84682
[2]	validation_0-auc:0.85523
[3]	validation_0-auc:0.85743
[4]	validation_0-auc:0.86549
[5]	validation_0-auc:0.86612
[6]	validation_0-auc:0.86717
[7]	validation_0-auc:0.87004
[8]	validation_0-auc:0.87154
[9]	validation_0-auc:0.87115
[10]	validation_0-auc:0.87386
[11]	validation_0-auc:0.87571
[12]	validation_0-auc:0.87809
[13]	validation_0-auc:0.87929
[14]	validation_0-auc:0.88016
[15]	validation_0-auc:0.88058
[16]	validation_0-auc:0.88141
[17]	validation_0-auc:0.88192
[18]	validation_0-auc:0.88197
[19]	validation_0-auc:0.88165
[20]	validation_0-auc:0.88267
[21]	validation_0-auc:0.88247
[22]	validation_0-auc:0.88256
[23]	validation_0-auc:0.88377
[24]	validation_0-auc:0.88435
[25]	validation_0-auc:0.88436
[26]	validation_0-auc:0.88476
[27]	validation_0-auc:0.88524
[28]	validation_0-auc:0.88616
[29]	validation_0-auc:0.88621
[30]	validation_0-auc:0.88588
[31]	validation_0-auc:0.88597
[32]	validation_0-auc:0.88625
[33]	v

In [9]:
# Rank every grid point by validation score, breaking ties with the training
# score (both descending), and keep the top-ranked row as the winner.
ranking_keys = ['metric_val', 'metric_train']
grid_results_xgb = grid_results.sort_values(by=ranking_keys, ascending=False)
best_model_xgb = grid_results_xgb.iloc[0]
best_model_xgb

n_estimators        2000.000000
learning_rate          0.020000
max_depth             18.000000
min_child_weight      15.000000
subsample              0.800000
colsample_bytree       0.400000
best_iteration      1950.000000
metric_train           0.996345
metric_val             0.968443
Name: 12, dtype: float64

In [10]:
# [3] Define the final model from the best grid-search row.
# `best_iteration` is a 0-based boosting-round index, so the number of trees
# needed to reach that round is best_iteration + 1. Passing best_iteration
# itself would stop one round short of the best model found by the search.
# The row is a float Series, hence the int() casts for the integer parameters.
model = model_constructor(n_estimators = int(best_model_xgb.best_iteration) + 1,
                          learning_rate = best_model_xgb.learning_rate,
                          max_depth = int(best_model_xgb.max_depth),
                          min_child_weight = best_model_xgb.min_child_weight,
                          subsample = best_model_xgb.subsample,
                          colsample_bytree = best_model_xgb.colsample_bytree,
                          device='cuda',
                          eval_metric="auc",
                          tree_method='hist',
                          early_stopping_rounds=20,
                          random_state = 0)

In [11]:
# Fit the final model on the training partition. The validation partition is
# supplied as the evaluation set, and per-round scores are printed.
validation_pairs = [(X_val, y_val)]
model.fit(X_train, y_train, eval_set=validation_pairs, verbose=True)

[0]	validation_0-auc:0.83690
[1]	validation_0-auc:0.87729
[2]	validation_0-auc:0.88315
[3]	validation_0-auc:0.88701
[4]	validation_0-auc:0.89094
[5]	validation_0-auc:0.89331
[6]	validation_0-auc:0.89543
[7]	validation_0-auc:0.89667
[8]	validation_0-auc:0.89803
[9]	validation_0-auc:0.89984
[10]	validation_0-auc:0.89997
[11]	validation_0-auc:0.90170
[12]	validation_0-auc:0.90252
[13]	validation_0-auc:0.90318
[14]	validation_0-auc:0.90349
[15]	validation_0-auc:0.90334
[16]	validation_0-auc:0.90358
[17]	validation_0-auc:0.90479
[18]	validation_0-auc:0.90574
[19]	validation_0-auc:0.90579
[20]	validation_0-auc:0.90581
[21]	validation_0-auc:0.90634
[22]	validation_0-auc:0.90657
[23]	validation_0-auc:0.90670
[24]	validation_0-auc:0.90747
[25]	validation_0-auc:0.90723
[26]	validation_0-auc:0.90740
[27]	validation_0-auc:0.90797
[28]	validation_0-auc:0.90819
[29]	validation_0-auc:0.90832
[30]	validation_0-auc:0.90834
[31]	validation_0-auc:0.90813
[32]	validation_0-auc:0.90818
[33]	validation_0-au

In [12]:
# Score each partition and compute its evaluation metric. Column 1 of the
# predict_proba output is used as the score (presumably the isFraud == 1
# class - TODO confirm against model.classes_).
pred_train_p = model.predict_proba(X_train)
auc_train = metric(y_train, pred_train_p[:, 1])

pred_val_p = model.predict_proba(X_val)
auc_val = metric(y_val, pred_val_p[:, 1])

pred_test_p = model.predict_proba(X_test)
auc_test = metric(y_test, pred_test_p[:, 1])

# Print the three scores side by side.
split_scores = (auc_train, auc_val, auc_test)
print('Metric train = %.4f - Metric val = %.4f - Metric test = %.4f'
      % split_scores)

Metric train = 0.9963 - Metric val = 0.9684 - Metric test = 0.9728


In [13]:
# Score the submission table and keep only column 1 of the probability matrix
# as the value written out later.
te_tr_new_proba = model.predict_proba(te_tr_new)
pred_te_tr_new_p = te_tr_new_proba[:, 1]

In [14]:
# Submission table: the TransactionID recovered via reset_index() paired with
# the predicted score for the same row position.
output_df = pd.DataFrame({
    'TransactionID': te_tr_new.reset_index()['TransactionID'],
    'isFraud': pred_te_tr_new_p
})

# Save to CSV. The file name carries the hold-out score measured in this run
# (same 4-decimal formatting as the printed metric) instead of a hand-typed
# number, so the label cannot go stale when the notebook is re-run.
output_df.to_csv(f'xgboost[{auc_test:.4f}].csv', index=False)

In [15]:
# Persist the fitted model. The file name is tagged with the hold-out score
# measured in this run (4 decimals, matching the printed metric) rather than a
# hard-coded number that would mislabel the artifact after a re-run.
filename = f'xgboost_model[{auc_test:.4f}].joblib'
joblib.dump(model, filename)

['xgboost_model[0.9728].joblib']