In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import joblib

In [2]:
tr_tr_new = joblib.load('../joblib/tr_tr_new.joblib')

In [3]:
te_tr_new = joblib.load('../joblib/te_tr_new.joblib')

In [4]:
import numpy as np
from catboost import CatBoostClassifier, Pool

In [None]:
def optimize_dtypes(df):
    for col in df.columns:
        col_dtype = df[col].dtype
        if col_dtype == 'float64':
            df[col] = pd.to_numeric(df[col], downcast='float')
        elif col_dtype == 'int64' or col_dtype == 'int32':
            df[col] = pd.to_numeric(df[col], downcast='integer')
    return df

# Apply the optimization function on your datasets
tr_tr_new = optimize_dtypes(tr_tr_new)
te_tr_new = optimize_dtypes(te_tr_new)

In [5]:
# Identify categorical features. CatBoost can handle them natively.
cat_var = ['ProductCD',
           'card6', 'card2', 'card5', 'card1', 'card4', 'card3',
           'addr2', 'addr1',
           'P_emaildomain',
           'R_emaildomain',
           'M6', 'M7', 'M8', 'M4', 'M9', 'M1', 'M2', 'M5', 'M3',
           'DeviceType',
           'DeviceInfo',
           'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19',
           'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27',
           'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35',
           'id_36', 'id_37', 'id_38']

In [6]:
te_tr_new.rename(columns=lambda col: col.replace('id-', 'id_'), inplace=True)

In [7]:
for col in cat_var:
    tr_tr_new[col] = tr_tr_new[col].astype(str)
    te_tr_new[col] = te_tr_new[col].astype(str)

In [8]:
from sklearn.model_selection import train_test_split
X = tr_tr_new.drop('isFraud', axis=1)
y = tr_tr_new['isFraud']

In [9]:
# Split the data into training and temp sets (80% train, 20% temp)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the temp data into validation and test sets (50% validation, 50% test)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

Training set size: 472432
Validation set size: 59054
Test set size: 59054


In [66]:
# Initialize CatBoostClassifier
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.07,
    depth=7,
    cat_features=cat_var,
    eval_metric='AUC',  # Changed metric to AUC
    random_seed=42,
    loss_function='Logloss',
    metric_period=500,
    task_type='GPU'
)

In [68]:
import warnings

warnings.filterwarnings('ignore')
model.fit(
    X_train, y_train,
    cat_features=cat_var,
    eval_set=(X_val, y_val),
    logging_level='Verbose',  # Output to stdout
    plot=True  # Plotting training and validation error (if you run this notebook locally)
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.7390227	best: 0.7390227 (0)	total: 298ms	remaining: 2m 28s
499:	test: 0.9583119	best: 0.9583119 (499)	total: 2m 3s	remaining: 0us
bestTest = 0.9583119154
bestIteration = 499


<catboost.core.CatBoostClassifier at 0x230ec449950>

In [69]:
from sklearn.metrics import roc_auc_score

# Predict probabilities for the positive class
preds_proba = model.predict_proba(X_test)[:, 1]

# Evaluate model using AUC
auc_score = roc_auc_score(y_test, preds_proba)
print("AUC:", auc_score)

AUC: 0.9628169734061687


In [72]:
pred_te_tr_new_p = model.predict_proba(te_tr_new)[:, 1]

# Create the output DataFrame
output_df = pd.DataFrame({
    'TransactionID': te_tr_new.reset_index()['TransactionID'],
    'isFraud': pred_te_tr_new_p
})

# Save the DataFrame to a CSV file
output_df.to_csv('predicted_fraud_catboost.csv', index=False)


In [74]:
joblib.dump(model, '../models_libjob/cb[0.9316][0.9087].joblib')

['../models_libjob/cb[0.9316][0.9087].joblib']

In [10]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import itertools

params_grid = {
    'iterations': [2000],
    'learning_rate': [0.07],
    'depth': [7],
    'loss_function': ['Logloss'],
    'metric_period': [500]
}

# Convert all categorical features to string type
for col in cat_var:
    X[col] = X[col].astype(str)

# Initialize a DataFrame to store results
cv_results = pd.DataFrame(columns=['iterations', 'learning_rate', 'depth','loss_function', 'metric_period','mean_auc'])

kf = KFold(n_splits=5, shuffle=True, random_state=42)
num_iter = 0

for params in itertools.product(*params_grid.values()):
    # Unpack the current parameter set
    iterations, learning_rate, depth, loss_function, metric_period = params
    
    aucs = []
    for train_index, val_index in kf.split(X):
        X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]
        
        model = CatBoostClassifier(
            iterations=iterations,
            learning_rate=learning_rate,
            depth=depth,
            eval_metric='AUC',
            task_type='GPU',
            cat_features=cat_var,
            random_seed=42,
            verbose=True
        )
        
        model.fit(X_train_fold, y_train_fold)
        preds_proba = model.predict_proba(X_val_fold)[:, 1]
        fold_auc = roc_auc_score(y_val_fold, preds_proba)
        aucs.append(fold_auc)
    
    mean_auc = sum(aucs) / len(aucs)
    cv_results.loc[num_iter] = [iterations, learning_rate, depth, loss_function, metric_period, mean_auc]
    num_iter += 1

print(cv_results)


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


   iterations  learning_rate  depth loss_function  metric_period  mean_auc
0        2000           0.07      7       Logloss            500  0.970661


In [11]:
best_params = cv_results.sort_values(by='mean_auc', ascending=False).iloc[0]

print(best_params)

iterations           2000
learning_rate        0.07
depth                   7
loss_function     Logloss
metric_period         500
mean_auc         0.970661
Name: 0, dtype: object


In [12]:
best_params['depth']

7

In [13]:
best_model = CatBoostClassifier(
    iterations=int(best_params['iterations']),
    learning_rate=float(best_params['learning_rate']),
    depth=int(best_params['depth']),
    cat_features=cat_var,
    eval_metric='AUC',  # Changed metric to AUC
    random_seed=42,
    loss_function=best_params['loss_function'],
    metric_period=int(best_params['metric_period']),
    task_type='GPU'
)

In [14]:
best_model.fit(
    X_train, y_train,
    cat_features=cat_var,
    eval_set=(X_val, y_val),
    logging_level='Verbose',  # Output to stdout
    plot=True  # Plotting training and validation error (if you run this notebook locally)
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.7390227	best: 0.7390227 (0)	total: 370ms	remaining: 12m 19s
500:	test: 0.9589244	best: 0.9589244 (500)	total: 2m 13s	remaining: 6m 38s
1000:	test: 0.9638178	best: 0.9638178 (1000)	total: 4m 26s	remaining: 4m 26s
1500:	test: 0.9658615	best: 0.9658615 (1500)	total: 6m 38s	remaining: 2m 12s
1999:	test: 0.9676477	best: 0.9676477 (1999)	total: 8m 48s	remaining: 0us
bestTest = 0.9676477313
bestIteration = 1999


<catboost.core.CatBoostClassifier at 0x23d9ffe8210>

In [17]:
joblib.dump(best_model, '../models_libjob/cb[0.9727].joblib')

['../models_libjob/cb[0.9727].joblib']

In [18]:
pred_train_p = model.predict_proba(X_train)
pred_val_p = model.predict_proba(X_val)
pred_test_p = model.predict_proba(X_test)
# Calcular métricas de evaluación
auc_train = roc_auc_score(y_train, pred_train_p[:,1])
auc_val = roc_auc_score(y_val, pred_val_p[:,1])
auc_test = roc_auc_score(y_test, pred_test_p[:,1])
# print error
print('Metric train = %.4f - Metric val = %.4f - Metric test = %.4f'
      % (auc_train, auc_val, auc_test))

Metric train = 0.9841 - Metric val = 0.9891 - Metric test = 0.9900


In [None]:
joblib.dump(best_model, '../models_libjob/cb[0.9900].joblib')

In [20]:
pred_te_tr_new_p = model.predict_proba(te_tr_new)[:, 1]

# Create the output DataFrame
output_df = pd.DataFrame({
    'TransactionID': te_tr_new.reset_index()['TransactionID'],
    'isFraud': pred_te_tr_new_p
})

# Save the DataFrame to a CSV file
output_df.to_csv('predicted_fraud_catboost[5].csv', index=False)


In [None]:
joblib.dump(best_model, '../models_libjob/cb[0.9316][0.9087].joblib')