# IEEE- FRAUD DETECTION USING AN ENSEMBLE OF CATBOOS AND XGBOOST

## Import packages

In [1]:
import pandas as pd
import numpy as np
from utils.data import get_data, export, balanceSample
import json
import pickle

## Load training, test datasets & artifacts

In [2]:
X_train, y_train, _, _,_ = get_data('data/train-balance.csv', hp=False)

In [3]:
df_test = pd.read_csv('data/test.csv', low_memory=False)

In [4]:
with open('artifacts/features.txt') as json_file:
        features = json.load(json_file)

In [5]:
with open('artifacts/data-proc.pkl', 'rb') as preproc_file:
    preproc = pickle.load(preproc_file)

    data_proc = preproc.train.new(df_test)
    data_proc.process()

## Train with Catboost

Since this is for final submission, we train for the entire dataset, training + validation combined

In [7]:
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

In [8]:
train_data = Pool(
        data=X_train,
        label=y_train,
        cat_features=features['cat'],
    )
    
test_data = Pool(
    data=data_proc.train.xs,
    label=None,
    cat_features=features['cat'],
)

In [9]:
with open('artifacts/catboost-params.txt') as json_file:
        catboost_params = json.load(json_file)

In [10]:
def trainCatboost(params,train_data, plot_status=False):
    
    model = CatBoostClassifier(**params)

    model.fit(train_data,plot=plot_status, verbose=False)

    predictions = model.predict(train_data)


    auc = roc_auc_score(train_data.get_label(), predictions)

    accuracy = accuracy_score(train_data.get_label(), predictions)

    print(f'Average AUC: {auc}, Average Accuracy: {accuracy}')
    return model

In [11]:
catboost_model = trainCatboost(catboost_params,train_data, plot_status=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Average AUC: 0.8554527143651255, Average Accuracy: 0.8929196851554867


In [12]:
catboost_test_predictions = catboost_model.predict_proba(test_data)[:,1]

## Train with Xgboost

Since this is for final submission, we train for the entire dataset, training + validation combined

In [13]:
from xgboost import XGBClassifier

Training with only important features showed no degradation in performance for Xgboost. let's use only important features to train for all the data for quick training

In [14]:
with open('artifacts/xgboost-feature-importance.txt') as json_file:
        features = json.load(json_file)

In [15]:
with open('artifacts/xgboost-params.txt') as json_file:
        xgboost_params = json.load(json_file)

In [16]:
def trainXgboost(params,X_train, y_train):
    
    model = XGBClassifier(**params)

    model.fit(X_train, y_train, verbose=False, eval_metric='auc')

    predictions = model.predict(X_train)


    auc = roc_auc_score(y_train, predictions)

    accuracy = accuracy_score(y_train, predictions)

    print(f'Average AUC: {auc}, Average Accuracy: {accuracy}')
    return model

In [18]:
xgboost_model = trainXgboost(xgboost_params, X_train[features['important']],y_train)



Average AUC: 0.9090151040329996, Average Accuracy: 0.9239985990525171


In [19]:
xgboost_test_predictions = xgboost_model.predict_proba(data_proc.train.xs[features['important']])[:,1]

## Ensemble & submission

In [20]:
df_test['isFraud'] = (catboost_test_predictions + xgboost_test_predictions) / 2

In [21]:
df_test[['TransactionID']].duplicated().any()

False

In [22]:
df_test[['TransactionID', 'isFraud']].to_csv('data/submission_ensemble_v1.csv', index=False)

In [23]:
!kaggle competitions submit -c ieee-fraud-detection -f data/submission_ensemble_v1.csv -m "Submission without any deep feature engineering. just using an ensemble of \
xgboost and catboost"

100%|██████████████████████████████████████| 13.3M/13.3M [00:09<00:00, 1.48MB/s]
Successfully submitted to IEEE-CIS Fraud Detection

In [24]:
import gc
gc.collect()

762