In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import joblib

In [2]:
tr_tr_new = joblib.load('joblib/tr_tr_encoded.joblib')

In [3]:
te_tr_new = joblib.load('joblib/te_tr_encoded.joblib')

In [4]:
from sklearn.model_selection import train_test_split
X = tr_tr_new.drop('isFraud', axis=1)
y = tr_tr_new['isFraud']

In [5]:
# Split the data into training and temp sets (80% train, 20% temp)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the temp data into validation and test sets (50% validation, 50% test)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")


Training set size: 472432
Validation set size: 59054
Test set size: 59054


In [8]:
from xgboost import XGBClassifier as model_constructor
from sklearn.metrics import roc_auc_score as metric

In [9]:
# Xgboost
n_estimators_values = [1000]
learning_rate_values = [1]
gamma_values = [1, 10]
max_depth_values = [6, 20]
min_child_weight_values = [20, 100]
subsample_values = [0.1, 1]
colsample_bytree_values = [0.1, 1]
num_parallel_tree_values = [10]

params_grid = {'n_estimators': n_estimators_values,
                  'learning_rate': learning_rate_values,
                 'gamma': gamma_values,
                 'max_depth': max_depth_values,
                 'min_child_weight': min_child_weight_values,
                 'subsample': subsample_values,
                 'colsample_bytree': colsample_bytree_values,
                 'num_parallel_tree': num_parallel_tree_values}

In [11]:
import warnings

warnings.filterwarnings('ignore')
num_iter = 1
grid_results = pd.DataFrame(columns=('n_estimators',
                                     'learning_rate',
                                     'gamma',
                                     'max_depth',
                                     'min_child_weight',
                                     'subsample',
                                     'colsample_bytree',
                                     'num_parallel_tree',
                                     'best_iteration',
                                     'metric_train',
                                     'metric_val'))

for n_estimators in params_grid['n_estimators']:
    for learning_rate in params_grid['learning_rate']:
        for gamma in params_grid['gamma']:
            for max_depth in params_grid['max_depth']:
                for min_child_weight in params_grid['min_child_weight']:
                    for subsample in params_grid['subsample']:
                        for colsample_bytree in params_grid['colsample_bytree']:
                            for num_parallel_tree in params_grid['num_parallel_tree']:
                                # Print trace
                                print('Iteration = ' + str(num_iter))

                                # [3] Define model
                                model = model_constructor(n_estimators=n_estimators,
                                                          learning_rate=learning_rate,
                                                          gamma=gamma,
                                                          max_depth=max_depth,
                                                          min_child_weight=min_child_weight,
                                                          subsample=subsample,
                                                          colsample_bytree=colsample_bytree,
                                                          num_parallel_tree=num_parallel_tree,
                                                          early_stopping_rounds=10,
                                                          eval_metric="auc",
                                                          device='cuda',
                                                          random_state=0)  # nthread!!!

                                # [4] Train model
                                model.fit(X_train,
                                          y_train,
                                          eval_set=[(X_val, y_val)],
                                          verbose=True)
                                best_iteration = model.best_iteration

                                # [5] Predict
                                pred_train = model.predict_proba(X_train)  # predict_proba!
                                pred_val = model.predict_proba(X_val)  # predict_proba!

                                # [6] Compute metric
                                metric_train = metric(y_train, pred_train[:, 1])
                                metric_val = metric(y_val, pred_val[:, 1])

                                # print error
                                print('AUC train = %.2f - AUC validation = %.2f.'
                                      % (metric_train, metric_val))

                                # Save iteration results
                                grid_results.loc[num_iter] = [n_estimators,
                                                              learning_rate,
                                                              gamma,
                                                              max_depth,
                                                              min_child_weight,
                                                              subsample,
                                                              colsample_bytree,
                                                              num_parallel_tree,
                                                              best_iteration,
                                                              metric_train,
                                                              metric_val]
                                num_iter += 1

Iteration = 1
[0]	validation_0-auc:0.82731
[1]	validation_0-auc:0.85276
[2]	validation_0-auc:0.86284
[3]	validation_0-auc:0.86929
[4]	validation_0-auc:0.87767
[5]	validation_0-auc:0.88175
[6]	validation_0-auc:0.88456
[7]	validation_0-auc:0.88684
[8]	validation_0-auc:0.88915
[9]	validation_0-auc:0.89037
[10]	validation_0-auc:0.89304
[11]	validation_0-auc:0.89429
[12]	validation_0-auc:0.89587
[13]	validation_0-auc:0.89685
[14]	validation_0-auc:0.89803
[15]	validation_0-auc:0.89876
[16]	validation_0-auc:0.89959
[17]	validation_0-auc:0.90029
[18]	validation_0-auc:0.90058
[19]	validation_0-auc:0.90090
[20]	validation_0-auc:0.90090
[21]	validation_0-auc:0.90100
[22]	validation_0-auc:0.90154
[23]	validation_0-auc:0.90230
[24]	validation_0-auc:0.90257
[25]	validation_0-auc:0.90290
[26]	validation_0-auc:0.90380
[27]	validation_0-auc:0.90389
[28]	validation_0-auc:0.90459
[29]	validation_0-auc:0.90471
[30]	validation_0-auc:0.90520
[31]	validation_0-auc:0.90579
[32]	validation_0-auc:0.90591
[33]	v

In [12]:
grid_results_xgb = grid_results.sort_values(by = ['metric_val', 'metric_train'], ascending = [False, False])
best_model_xgb = grid_results_xgb.iloc[0]
best_model_xgb

n_estimators         1000.000000
learning_rate           1.000000
gamma                   1.000000
max_depth              20.000000
min_child_weight       20.000000
subsample               1.000000
colsample_bytree        0.100000
num_parallel_tree      10.000000
best_iteration         48.000000
metric_train            0.984762
metric_val              0.956512
Name: 11, dtype: float64

In [15]:
# [3] define model
model = model_constructor(n_estimators = int(best_model_xgb.best_iteration),
                          learning_rate = best_model_xgb.learning_rate,
                          gamma = best_model_xgb.gamma,
                          max_depth = int(best_model_xgb.max_depth),
                          min_child_weight = best_model_xgb.min_child_weight,
                          subsample = best_model_xgb.subsample,
                          colsample_bytree = best_model_xgb.colsample_bytree,
                          num_parallel_tree = int(best_model_xgb.num_parallel_tree),
                          device='cuda',
                          eval_metric="auc",
                          random_state = 0)

In [16]:
model.fit(X_train,
          y_train,
          eval_set=[(X_val, y_val)],
          verbose=True)

[0]	validation_0-auc:0.86343
[1]	validation_0-auc:0.88873
[2]	validation_0-auc:0.90018
[3]	validation_0-auc:0.90752
[4]	validation_0-auc:0.91635
[5]	validation_0-auc:0.92173
[6]	validation_0-auc:0.92629
[7]	validation_0-auc:0.92983
[8]	validation_0-auc:0.93275
[9]	validation_0-auc:0.93493
[10]	validation_0-auc:0.93784
[11]	validation_0-auc:0.94050
[12]	validation_0-auc:0.94323
[13]	validation_0-auc:0.94469
[14]	validation_0-auc:0.94552
[15]	validation_0-auc:0.94674
[16]	validation_0-auc:0.94786
[17]	validation_0-auc:0.94886
[18]	validation_0-auc:0.94950
[19]	validation_0-auc:0.94993
[20]	validation_0-auc:0.95054
[21]	validation_0-auc:0.95115
[22]	validation_0-auc:0.95193
[23]	validation_0-auc:0.95238
[24]	validation_0-auc:0.95282
[25]	validation_0-auc:0.95315
[26]	validation_0-auc:0.95362
[27]	validation_0-auc:0.95380
[28]	validation_0-auc:0.95409
[29]	validation_0-auc:0.95413
[30]	validation_0-auc:0.95440
[31]	validation_0-auc:0.95470
[32]	validation_0-auc:0.95495
[33]	validation_0-au

In [18]:
pred_train_p = model.predict_proba(X_train)
pred_val_p = model.predict_proba(X_val)
pred_test_p = model.predict_proba(X_test)
# Calcular métricas de evaluación
auc_train = metric(y_train, pred_train_p[:,1])
auc_val = metric(y_val, pred_val_p[:,1])
auc_test = metric(y_test, pred_test_p[:,1])
# print error
print('Metric train = %.4f - Metric val = %.4f - Metric test = %.4f'
      % (auc_train, auc_val, auc_test))

Metric train = 0.9846 - Metric val = 0.9564 - Metric test = 0.9618


In [19]:
pred_te_tr_new_p = model.predict_proba(te_tr_new)[:, 1]

In [20]:
output_df = pd.DataFrame({
    'TransactionID': te_tr_new.reset_index()['TransactionID'],
    'isFraud': pred_te_tr_new_p
})

# Save to CSV
output_df.to_csv('predicted_fraud[0.9618].csv', index=False)