In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import joblib

In [22]:
tr_tr_new = joblib.load('joblib/tr_tr_encoded.joblib')

In [23]:
te_tr_new = joblib.load('joblib/te_tr_encoded.joblib')

In [24]:
from sklearn.model_selection import train_test_split
X = tr_tr_new.drop('isFraud', axis=1)
y = tr_tr_new['isFraud']

In [25]:
# Split the data into training and temp sets (80% train, 20% temp)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the temp data into validation and test sets (50% validation, 50% test)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")


Training set size: 472432
Validation set size: 59054
Test set size: 59054


In [26]:
from xgboost import XGBClassifier as model_constructor
from sklearn.metrics import roc_auc_score as metric

In [27]:
# Xgboost
n_estimators_values = [2000]
learning_rate_values = [1]
gamma_values = [1, 10]
max_depth_values = [18, 20,30]
min_child_weight_values = [15, 20, 30]
subsample_values = [0.1, 1]
colsample_bytree_values = [0.1, 1]
num_parallel_tree_values = [10]

params_grid = {'n_estimators': n_estimators_values,
                  'learning_rate': learning_rate_values,
                 'gamma': gamma_values,
                 'max_depth': max_depth_values,
                 'min_child_weight': min_child_weight_values,
                 'subsample': subsample_values,
                 'colsample_bytree': colsample_bytree_values,
                 'num_parallel_tree': num_parallel_tree_values}

In [28]:
import warnings

warnings.filterwarnings('ignore')
num_iter = 1
grid_results = pd.DataFrame(columns=('n_estimators',
                                     'learning_rate',
                                     'gamma',
                                     'max_depth',
                                     'min_child_weight',
                                     'subsample',
                                     'colsample_bytree',
                                     'num_parallel_tree',
                                     'best_iteration',
                                     'metric_train',
                                     'metric_val'))

for n_estimators in params_grid['n_estimators']:
    for learning_rate in params_grid['learning_rate']:
        for gamma in params_grid['gamma']:
            for max_depth in params_grid['max_depth']:
                for min_child_weight in params_grid['min_child_weight']:
                    for subsample in params_grid['subsample']:
                        for colsample_bytree in params_grid['colsample_bytree']:
                            for num_parallel_tree in params_grid['num_parallel_tree']:
                                # Print trace
                                print('Iteration = ' + str(num_iter))

                                # [3] Define model
                                model = model_constructor(n_estimators=n_estimators,
                                                          learning_rate=learning_rate,
                                                          gamma=gamma,
                                                          max_depth=max_depth,
                                                          min_child_weight=min_child_weight,
                                                          subsample=subsample,
                                                          colsample_bytree=colsample_bytree,
                                                          num_parallel_tree=num_parallel_tree,
                                                          early_stopping_rounds=10,
                                                          eval_metric="auc",
                                                          device='cuda',
                                                          random_state=0)  # nthread!!!

                                # [4] Train model
                                model.fit(X_train,
                                          y_train,
                                          eval_set=[(X_val, y_val)],
                                          verbose=True)
                                best_iteration = model.best_iteration

                                # [5] Predict
                                pred_train = model.predict_proba(X_train)  # predict_proba!
                                pred_val = model.predict_proba(X_val)  # predict_proba!

                                # [6] Compute metric
                                metric_train = metric(y_train, pred_train[:, 1])
                                metric_val = metric(y_val, pred_val[:, 1])

                                # print error
                                print('AUC train = %.2f - AUC validation = %.2f.'
                                      % (metric_train, metric_val))

                                # Save iteration results
                                grid_results.loc[num_iter] = [n_estimators,
                                                              learning_rate,
                                                              gamma,
                                                              max_depth,
                                                              min_child_weight,
                                                              subsample,
                                                              colsample_bytree,
                                                              num_parallel_tree,
                                                              best_iteration,
                                                              metric_train,
                                                              metric_val]
                                num_iter += 1

Iteration = 1
[0]	validation_0-auc:0.83852
[1]	validation_0-auc:0.86109
[2]	validation_0-auc:0.87272
[3]	validation_0-auc:0.87850
[4]	validation_0-auc:0.88582
[5]	validation_0-auc:0.88956
[6]	validation_0-auc:0.89296
[7]	validation_0-auc:0.89519
[8]	validation_0-auc:0.89698
[9]	validation_0-auc:0.89873
[10]	validation_0-auc:0.90137
[11]	validation_0-auc:0.90234
[12]	validation_0-auc:0.90386
[13]	validation_0-auc:0.90563
[14]	validation_0-auc:0.90625
[15]	validation_0-auc:0.90713
[16]	validation_0-auc:0.90821
[17]	validation_0-auc:0.90880
[18]	validation_0-auc:0.90918
[19]	validation_0-auc:0.90944
[20]	validation_0-auc:0.90964
[21]	validation_0-auc:0.91009
[22]	validation_0-auc:0.91084
[23]	validation_0-auc:0.91152
[24]	validation_0-auc:0.91214
[25]	validation_0-auc:0.91261
[26]	validation_0-auc:0.91314
[27]	validation_0-auc:0.91302
[28]	validation_0-auc:0.91391
[29]	validation_0-auc:0.91379
[30]	validation_0-auc:0.91449
[31]	validation_0-auc:0.91524
[32]	validation_0-auc:0.91565
[33]	v

In [29]:
grid_results_xgb = grid_results.sort_values(by = ['metric_val', 'metric_train'], ascending = [False, False])
best_model_xgb = grid_results_xgb.iloc[0]
best_model_xgb

n_estimators         2000.000000
learning_rate           1.000000
gamma                   1.000000
max_depth              30.000000
min_child_weight       15.000000
subsample               1.000000
colsample_bytree        0.100000
num_parallel_tree      10.000000
best_iteration         45.000000
metric_train            0.989194
metric_val              0.959039
Name: 27, dtype: float64

In [30]:
# [3] define model
model = model_constructor(n_estimators = int(best_model_xgb.best_iteration),
                          learning_rate = best_model_xgb.learning_rate,
                          gamma = best_model_xgb.gamma,
                          max_depth = int(best_model_xgb.max_depth),
                          min_child_weight = best_model_xgb.min_child_weight,
                          subsample = best_model_xgb.subsample,
                          colsample_bytree = best_model_xgb.colsample_bytree,
                          num_parallel_tree = int(best_model_xgb.num_parallel_tree),
                          device='cuda',
                          eval_metric="auc",
                          random_state = 0)

In [31]:
model.fit(X_train,
          y_train,
          eval_set=[(X_val, y_val)],
          verbose=True)

[0]	validation_0-auc:0.86781
[1]	validation_0-auc:0.89429
[2]	validation_0-auc:0.90530
[3]	validation_0-auc:0.91235
[4]	validation_0-auc:0.92134
[5]	validation_0-auc:0.92710
[6]	validation_0-auc:0.93152
[7]	validation_0-auc:0.93525
[8]	validation_0-auc:0.93816
[9]	validation_0-auc:0.94049
[10]	validation_0-auc:0.94345
[11]	validation_0-auc:0.94576
[12]	validation_0-auc:0.94820
[13]	validation_0-auc:0.94964
[14]	validation_0-auc:0.95020
[15]	validation_0-auc:0.95122
[16]	validation_0-auc:0.95231
[17]	validation_0-auc:0.95328
[18]	validation_0-auc:0.95383
[19]	validation_0-auc:0.95431
[20]	validation_0-auc:0.95460
[21]	validation_0-auc:0.95518
[22]	validation_0-auc:0.95566
[23]	validation_0-auc:0.95594
[24]	validation_0-auc:0.95610
[25]	validation_0-auc:0.95622
[26]	validation_0-auc:0.95635
[27]	validation_0-auc:0.95679
[28]	validation_0-auc:0.95708
[29]	validation_0-auc:0.95721
[30]	validation_0-auc:0.95750
[31]	validation_0-auc:0.95782
[32]	validation_0-auc:0.95813
[33]	validation_0-au

In [32]:
pred_train_p = model.predict_proba(X_train)
pred_val_p = model.predict_proba(X_val)
pred_test_p = model.predict_proba(X_test)
# Calcular métricas de evaluación
auc_train = metric(y_train, pred_train_p[:,1])
auc_val = metric(y_val, pred_val_p[:,1])
auc_test = metric(y_test, pred_test_p[:,1])
# print error
print('Metric train = %.4f - Metric val = %.4f - Metric test = %.4f'
      % (auc_train, auc_val, auc_test))

Metric train = 0.9890 - Metric val = 0.9589 - Metric test = 0.9641


In [33]:
pred_te_tr_new_p = model.predict_proba(te_tr_new)[:, 1]

In [34]:
output_df = pd.DataFrame({
    'TransactionID': te_tr_new.reset_index()['TransactionID'],
    'isFraud': pred_te_tr_new_p
})

# Save to CSV
output_df.to_csv('predicted_fraud[0.9641].csv', index=False)

In [35]:
# [3] define model
model = model_constructor(n_estimators = 5000,
                          learning_rate = 0.02,
                          max_depth = 12,
                          subsample = 0.8,
                          colsample_bytree = 0.4,
                          device='cuda',
                          tree_method='gpu_hist',
                          eval_metric="auc",
                          random_state = 0)

In [36]:
model.fit(X_train,
          y_train,
          eval_set=[(X_val, y_val)],
          verbose=True,
          early_stopping_rounds=100)

[0]	validation_0-auc:0.81914
[1]	validation_0-auc:0.86789
[2]	validation_0-auc:0.87047
[3]	validation_0-auc:0.87335
[4]	validation_0-auc:0.87685
[5]	validation_0-auc:0.87911
[6]	validation_0-auc:0.88312
[7]	validation_0-auc:0.88450
[8]	validation_0-auc:0.88650
[9]	validation_0-auc:0.88754
[10]	validation_0-auc:0.88741
[11]	validation_0-auc:0.88979
[12]	validation_0-auc:0.89107
[13]	validation_0-auc:0.89156
[14]	validation_0-auc:0.89270
[15]	validation_0-auc:0.89261
[16]	validation_0-auc:0.89356
[17]	validation_0-auc:0.89520
[18]	validation_0-auc:0.89604
[19]	validation_0-auc:0.89596
[20]	validation_0-auc:0.89595
[21]	validation_0-auc:0.89816
[22]	validation_0-auc:0.89840
[23]	validation_0-auc:0.89832
[24]	validation_0-auc:0.89864
[25]	validation_0-auc:0.89914
[26]	validation_0-auc:0.89926
[27]	validation_0-auc:0.89978
[28]	validation_0-auc:0.90006
[29]	validation_0-auc:0.90029
[30]	validation_0-auc:0.90105
[31]	validation_0-auc:0.90091
[32]	validation_0-auc:0.90136
[33]	validation_0-au

In [37]:
pred_train_p = model.predict_proba(X_train)
pred_val_p = model.predict_proba(X_val)
pred_test_p = model.predict_proba(X_test)
# Calcular métricas de evaluación
auc_train = metric(y_train, pred_train_p[:,1])
auc_val = metric(y_val, pred_val_p[:,1])
auc_test = metric(y_test, pred_test_p[:,1])
# print error
print('Metric train = %.4f - Metric val = %.4f - Metric test = %.4f'
      % (auc_train, auc_val, auc_test))

Metric train = 1.0000 - Metric val = 0.9728 - Metric test = 0.9780


In [38]:
pred_te_tr_new_p = model.predict_proba(te_tr_new)[:, 1]

In [39]:
output_df = pd.DataFrame({
    'TransactionID': te_tr_new.reset_index()['TransactionID'],
    'isFraud': pred_te_tr_new_p
})

# Save to CSV
output_df.to_csv('predicted_fraud[0.9780].csv', index=False)