In [111]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import joblib

In [112]:
tr_tr_new = joblib.load('joblib/tr_tr_encoded.joblib')

In [113]:
te_tr_new = joblib.load('joblib/te_tr_encoded.joblib')

In [114]:
print(tr_tr_new.shape, te_tr_new.shape)

(590540, 215) (506691, 214)


In [115]:
tr_tr_new.columns[tr_tr_new.isnull().any()].tolist()

['D5',
 'D9',
 'D8',
 'dist1',
 'D3',
 'D12',
 'D7',
 'D15',
 'D1',
 'D11',
 'D14',
 'dist2',
 'D10',
 'D2',
 'D13',
 'D4',
 'D6',
 'V281',
 'V283',
 'V289',
 'V296',
 'V301',
 'V314',
 'V1',
 'V3',
 'V4',
 'V6',
 'V8',
 'V11',
 'V13',
 'V14',
 'V17',
 'V20',
 'V23',
 'V26',
 'V27',
 'V30',
 'V36',
 'V37',
 'V40',
 'V41',
 'V44',
 'V47',
 'V48',
 'V54',
 'V56',
 'V59',
 'V62',
 'V65',
 'V67',
 'V68',
 'V70',
 'V76',
 'V78',
 'V80',
 'V82',
 'V86',
 'V88',
 'V89',
 'V91',
 'V96',
 'V98',
 'V99',
 'V104',
 'V107',
 'V108',
 'V109',
 'V111',
 'V115',
 'V117',
 'V120',
 'V121',
 'V123',
 'V127',
 'V129',
 'V130',
 'V136',
 'V138',
 'V139',
 'V142',
 'V147',
 'V156',
 'V162',
 'V165',
 'V160',
 'V166',
 'V178',
 'V187',
 'V173',
 'V182',
 'V199',
 'V203',
 'V205',
 'V207',
 'V215',
 'V218',
 'V223',
 'V224',
 'V226',
 'V228',
 'V229',
 'V235',
 'V237',
 'V240',
 'V258',
 'V257',
 'V253',
 'V252',
 'V260',
 'V261',
 'V264',
 'V266',
 'V267',
 'V274',
 'V277',
 'V220',
 'V221',
 'V234',
 'V23

In [116]:
# Fill missing values with median for training data
tr_tr_new.fillna(tr_tr_new.median(), inplace=True)

# Fill missing values with median for testing data
te_tr_new.fillna(tr_tr_new.median(), inplace=True)  # Note: Use training data median for test data to avoid data leakage


In [94]:
tr_tr_new.columns[tr_tr_new.isnull().any()].tolist()

[]

In [95]:
tr_tr_new['isFraud'].value_counts()

isFraud
0    569877
1     20663
Name: count, dtype: int64

In [117]:
from xgboost import XGBClassifier as model_constructor
from sklearn.metrics import roc_auc_score as metric

In [118]:
from sklearn.model_selection import train_test_split
X = tr_tr_new.drop('isFraud', axis=1)
y = tr_tr_new['isFraud']

In [119]:
# Split the data into training and temp sets (80% train, 20% temp)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the temp data into validation and test sets (50% validation, 50% test)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")


Training set size: 472432
Validation set size: 59054
Test set size: 59054


In [120]:
# [3] Define model
model = model_constructor(early_stopping_rounds = 10,
                      n_estimators = 1000,
                      eval_metric = "auc",
                          device='cuda',
                      random_state = 42)

In [121]:
import warnings
warnings.filterwarnings('ignore')

model.fit(X_train,
          y_train,
          eval_set=[(X_val, y_val)],
          verbose=True)

[0]	validation_0-auc:0.78129
[1]	validation_0-auc:0.82027
[2]	validation_0-auc:0.84753
[3]	validation_0-auc:0.85690
[4]	validation_0-auc:0.86068
[5]	validation_0-auc:0.86778
[6]	validation_0-auc:0.87640
[7]	validation_0-auc:0.87999
[8]	validation_0-auc:0.88406
[9]	validation_0-auc:0.88790
[10]	validation_0-auc:0.89203
[11]	validation_0-auc:0.89294
[12]	validation_0-auc:0.89504
[13]	validation_0-auc:0.89832
[14]	validation_0-auc:0.90140
[15]	validation_0-auc:0.90509
[16]	validation_0-auc:0.90627
[17]	validation_0-auc:0.90793
[18]	validation_0-auc:0.90991
[19]	validation_0-auc:0.91282
[20]	validation_0-auc:0.91359
[21]	validation_0-auc:0.91383
[22]	validation_0-auc:0.91467
[23]	validation_0-auc:0.91507
[24]	validation_0-auc:0.91619
[25]	validation_0-auc:0.91635
[26]	validation_0-auc:0.91684
[27]	validation_0-auc:0.91795
[28]	validation_0-auc:0.91884
[29]	validation_0-auc:0.91969
[30]	validation_0-auc:0.92058
[31]	validation_0-auc:0.92109
[32]	validation_0-auc:0.92150
[33]	validation_0-au

In [122]:
pred_train_p = model.predict_proba(X_train)
pred_val_p = model.predict_proba(X_val)
pred_test_p = model.predict_proba(X_test)
# Calcular métricas de evaluación
auc_train = metric(y_train, pred_train_p[:,1])
auc_val = metric(y_val, pred_val_p[:,1])
auc_test = metric(y_test, pred_test_p[:,1])
# print error
print('Metric train = %.2f - Metric val = %.2f - Metric test = %.2f'
      % (auc_train, auc_val, auc_test))

Metric train = 0.99 - Metric val = 0.96 - Metric test = 0.97


In [136]:
pred_te_tr_new_p = model.predict_proba(te_tr_new)[:, 1]

In [138]:
output_df = pd.DataFrame({
    'TransactionID': te_tr_new.reset_index()['TransactionID'],
    'isFraud': pred_te_tr_new_p
})

# Save to CSV
output_df.to_csv('predicted_fraud.csv', index=False)

In [139]:
output_df.shape

(506691, 2)