# Packages

In [1]:
! pip install pandas==1.1.5
! pip install matplotlib==3.3.3
! pip install scikit-learn==0.23.2
! pip install xgboost==1.3.1
! pip install catboost==0.24.4
! pip install scikit-optimize==0.8.1



# Data

Import libraries:

In [2]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
import xgboost as xgb
import gc

Import data & combine transaction and idendity columns:

In [3]:
train_transaction = pd.read_csv('/home/fatihakyon/dev/inzva/ieee-fraud-detection/data/train_transaction.csv')
train_identity = pd.read_csv('/home/fatihakyon/dev/inzva/ieee-fraud-detection/data/train_identity.csv')

train = pd.merge(train_transaction, train_identity, on="TransactionID", how="left")
train = train.set_index("TransactionID", drop="True")
del train_transaction, train_identity
gc.collect()

test_transaction = pd.read_csv('/home/fatihakyon/dev/inzva/ieee-fraud-detection/data/test_transaction.csv')
test_identity = pd.read_csv('/home/fatihakyon/dev/inzva/ieee-fraud-detection/data/test_identity.csv')

test = pd.merge(test_transaction, test_identity, on="TransactionID", how="left")
test = test.set_index("TransactionID", drop="True")
del test_transaction, test_identity
gc.collect()

0

In [4]:
train.shape

(590540, 433)

In [5]:
test.shape

(506691, 432)

In [6]:
train.head()

Unnamed: 0_level_0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


Rename test data columns:

In [7]:
mapping = {}
for column_name in test.columns:
    mapping[column_name] = column_name.replace("-", "_")
test.rename(columns=mapping, inplace=True)

In [8]:
test.columns

Index(['TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2',
       'card3', 'card4', 'card5', 'card6', 'addr1',
       ...
       'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
       'DeviceType', 'DeviceInfo'],
      dtype='object', length=432)

Reduce memory usage:

In [9]:
import numpy as np


def reduce_memory_usage(df):
    """
    From kernel https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

    Iterate through all the columns of a dataframe and modify the data type
    to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            pass
            # df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

train = reduce_memory_usage(train)
test = reduce_memory_usage(test)

Memory usage of dataframe is 1955.37 MB
Memory usage after optimization is: 648.22 MB
Decreased by 66.8%
Memory usage of dataframe is 1673.87 MB
Memory usage after optimization is: 563.43 MB
Decreased by 66.3%


Split train into train/val & fill nans:

In [10]:
VAL_SPLIT = 0.2
# Split train into train/val
y = train["isFraud"].copy()
X = train.drop("isFraud", axis=1)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VAL_SPLIT, random_state=13
)

X_test = test.copy()
del train, test, X, y


X_train = X_train.fillna(-999)
X_val = X_val.fillna(-999)
X_test = X_test.fillna(-999)

Label encoding:

In [12]:
for f in X_train.columns:
    if X_train[f].dtype=='object' or X_test[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(X_train[f].values) + list(X_test[f].values) + list(X_val[f].values))
        X_train[f] = lbl.transform(list(X_train[f].values))
        X_val[f] = lbl.transform(list(X_val[f].values))
        X_test[f] = lbl.transform(list(X_test[f].values)) 

# XGBoost

In [13]:
clf = xgb.XGBClassifier(n_estimators=200,
                        n_jobs=4,
                        max_depth=9,
                        learning_rate=0.05,
                        subsample=0.9,
                        colsample_bytree=0.9,
                        tree_method='gpu_hist',
                        missing=-999,
                        use_label_encoder=False)

In [14]:
clf.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9, gamma=0, gpu_id=0,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=9,
              min_child_weight=1, missing=-999, monotone_constraints='()',
              n_estimators=200, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.9,
              tree_method='gpu_hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

Results:

In [15]:
def calculate_scores(estimator, X_val, y_val):
    y_val_prediction = estimator.predict(X_val)
    y_val_proba = estimator.predict_proba(X_val)[:, 1]

    conf_matrix = metrics.confusion_matrix(y_val, y_val_prediction)
    accuracy_score = metrics.accuracy_score(y_val, y_val_prediction)
    roc_auc_score = metrics.roc_auc_score(y_val, y_val_proba)
    f1_score = metrics.f1_score(y_val, y_val_prediction)
    classification_report = metrics.classification_report(y_val, y_val_prediction)

    print("Confusion Matrix: \n%s" % str(conf_matrix))
    print("\nAccuracy: %.4f" % accuracy_score)
    print("\nAUC: %.4f" % roc_auc_score)
    print("\nF1 Score: %.4f" % f1_score)
    print("\nClassification Report: \n",classification_report)

    return {
        "conf_matrix": conf_matrix,
        "accuracy_score": accuracy_score,
        "roc_auc_score": roc_auc_score,
        "f1_score": f1_score,
        "classification_report": classification_report
    }

In [16]:
_ = calculate_scores(clf, X_val, y_val)

Confusion Matrix: 
[[113852    120]
 [  2106   2030]]

Accuracy: 0.9812

AUC: 0.9505

F1 Score: 0.6459

Classification Report: 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99    113972
           1       0.94      0.49      0.65      4136

    accuracy                           0.98    118108
   macro avg       0.96      0.74      0.82    118108
weighted avg       0.98      0.98      0.98    118108



# Hyperarameter Optimization

Randomized Search:

In [17]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt  

param_grid = {
    'silent': [False],
    'max_depth': sp_randInt(6, 20),
    'learning_rate': sp_randFloat(),
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
    'gamma': [0, 0.25, 0.5, 1.0],
    'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
    'n_estimators': [200]
}

def perform_random_search(
    estimator, X_train, X_val, y_train, y_val, param_grid, scoring=None
):
    hyperparam_optimizer = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_grid,
        scoring=scoring,
        cv=2,
        n_iter=20,
        n_jobs=1,
        refit=True,
        random_state=13,
    )
    hyperparam_optimizer.fit(X_train, y_train, eval_set=[[X_val, y_val]])

    return hyperparam_optimizer.best_estimator_

In [18]:
# accuracy score
best_estimator = perform_random_search(clf, X_train, X_val, y_train, y_val, param_grid, scoring='accuracy')
_ = calculate_scores(best_estimator, X_val, y_val)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-logloss:0.22140
[1]	validation_0-logloss:0.13731
[2]	validation_0-logloss:0.10988
[3]	validation_0-logloss:0.09943
[4]	validation_0-logloss:0.09583
[5]	validation_0-logloss:0.09323
[6]	validation_0-logloss:0.09126
[7]	validation_0-logloss:0.09065
[8]	validation_0-logloss:0.08934
[9]	validation_0-logloss:0.08810
[10]	validation_0-logloss:0.08764
[11]	validation_0-logloss:0.08688
[12]	validation_0-logloss:0.08687
[13]	validation_0-logloss:0.08640
[14]	validation_0-logloss:0.08616
[15]	validation_0-logloss:0.08567
[16]	validation_0-logloss:0.08560
[17]	validation_0-logloss:0.08449
[18]	validation_0-logloss:0.08461
[19]	validation_0-logloss:0.08397
[20]	validation_0-logloss:0.08396
[21]	validatio

In [19]:
# roc auc score
best_estimator = perform_random_search(clf, X_train, X_val, y_train, y_val, param_grid, scoring='roc_auc')
_ = calculate_scores(best_estimator, X_val, y_val)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-logloss:0.22140
[1]	validation_0-logloss:0.13731
[2]	validation_0-logloss:0.10988
[3]	validation_0-logloss:0.09943
[4]	validation_0-logloss:0.09583
[5]	validation_0-logloss:0.09323
[6]	validation_0-logloss:0.09126
[7]	validation_0-logloss:0.09065
[8]	validation_0-logloss:0.08934
[9]	validation_0-logloss:0.08810
[10]	validation_0-logloss:0.08764
[11]	validation_0-logloss:0.08688
[12]	validation_0-logloss:0.08687
[13]	validation_0-logloss:0.08640
[14]	validation_0-logloss:0.08616
[15]	validation_0-logloss:0.08567
[16]	validation_0-logloss:0.08560
[17]	validation_0-logloss:0.08449
[18]	validation_0-logloss:0.08461
[19]	validation_0-logloss:0.08397
[20]	validation_0-logloss:0.08396
[21]	validatio

In [20]:
# f1 score
best_estimator = perform_random_search(clf, X_train, X_val, y_train, y_val, param_grid, scoring='f1')
_ = calculate_scores(best_estimator, X_val, y_val)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-logloss:0.22140
[1]	validation_0-logloss:0.13731
[2]	validation_0-logloss:0.10988
[3]	validation_0-logloss:0.09943
[4]	validation_0-logloss:0.09583
[5]	validation_0-logloss:0.09323
[6]	validation_0-logloss:0.09126
[7]	validation_0-logloss:0.09065
[8]	validation_0-logloss:0.08934
[9]	validation_0-logloss:0.08810
[10]	validation_0-logloss:0.08764
[11]	validation_0-logloss:0.08688
[12]	validation_0-logloss:0.08687
[13]	validation_0-logloss:0.08640
[14]	validation_0-logloss:0.08616
[15]	validation_0-logloss:0.08567
[16]	validation_0-logloss:0.08560
[17]	validation_0-logloss:0.08449
[18]	validation_0-logloss:0.08461
[19]	validation_0-logloss:0.08397
[20]	validation_0-logloss:0.08396
[21]	validatio

Bayesian Search:

In [26]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

param_grid = {
    'silent': [False],
    'max_depth': Integer(6, 20),
    'learning_rate': Real(0.01, 0.3),
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
    'gamma': [0, 0.25, 0.5, 1.0],
    'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
    'n_estimators': [200]
}

def perform_bayes_search(
    estimator, X_train, X_val, y_train, y_val, param_grid, scoring=None
):
    hyperparam_optimizer = BayesSearchCV(
        estimator=estimator,
        search_spaces=param_grid,
        scoring=scoring,
        cv=2,
        n_iter=20,
        n_jobs=1,
        refit=True,
        return_train_score=False,
        optimizer_kwargs={"base_estimator": "GP"},
        random_state=13,
        fit_params={
                'eval_set': [[X_val, y_val]],
        }
    )
    hyperparam_optimizer.fit(X_train, y_train)

    return hyperparam_optimizer.best_estimator_

In [27]:
# accuracy score
best_estimator = perform_bayes_search(clf, X_train, X_val, y_train, y_val, param_grid, scoring='accuracy')
_ = calculate_scores(best_estimator, X_val, y_val)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-logloss:0.67952
[1]	validation_0-logloss:0.66629
[2]	validation_0-logloss:0.65346
[3]	validation_0-logloss:0.64102
[4]	validation_0-logloss:0.62893
[5]	validation_0-logloss:0.61721
[6]	validation_0-logloss:0.60581
[7]	validation_0-logloss:0.59472
[8]	validation_0-logloss:0.58395
[9]	validation_0-logloss:0.57348
[10]	validation_0-logloss:0.56329
[11]	validation_0-logloss:0.55337
[12]	validation_0-logloss:0.54372
[13]	validation_0-logloss:0.53429
[14]	validation_0-logloss:0.52512
[15]	validation_0-logloss:0.51618
[16]	validation_0-logloss:0.50746
[17]	validation_0-logloss:0.49897
[18]	validation_0-logloss:0.49069
[19]	validation_0-logloss:0.48260
[20]	validation_0-logloss:0.47472
[21]	validatio

In [28]:
# roc auc score
best_estimator = perform_bayes_search(clf, X_train, X_val, y_train, y_val, param_grid, scoring='roc_auc')
_ = calculate_scores(best_estimator, X_val, y_val)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-logloss:0.67952
[1]	validation_0-logloss:0.66629
[2]	validation_0-logloss:0.65346
[3]	validation_0-logloss:0.64102
[4]	validation_0-logloss:0.62893
[5]	validation_0-logloss:0.61721
[6]	validation_0-logloss:0.60581
[7]	validation_0-logloss:0.59472
[8]	validation_0-logloss:0.58395
[9]	validation_0-logloss:0.57348
[10]	validation_0-logloss:0.56329
[11]	validation_0-logloss:0.55337
[12]	validation_0-logloss:0.54372
[13]	validation_0-logloss:0.53429
[14]	validation_0-logloss:0.52512
[15]	validation_0-logloss:0.51618
[16]	validation_0-logloss:0.50746
[17]	validation_0-logloss:0.49897
[18]	validation_0-logloss:0.49069
[19]	validation_0-logloss:0.48260
[20]	validation_0-logloss:0.47472
[21]	validatio

In [29]:
# f1 score
best_estimator = perform_bayes_search(clf, X_train, X_val, y_train, y_val, param_grid, scoring='f1')
_ = calculate_scores(best_estimator, X_val, y_val)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-logloss:0.67952
[1]	validation_0-logloss:0.66629
[2]	validation_0-logloss:0.65346
[3]	validation_0-logloss:0.64102
[4]	validation_0-logloss:0.62893
[5]	validation_0-logloss:0.61721
[6]	validation_0-logloss:0.60581
[7]	validation_0-logloss:0.59472
[8]	validation_0-logloss:0.58395
[9]	validation_0-logloss:0.57348
[10]	validation_0-logloss:0.56329
[11]	validation_0-logloss:0.55337
[12]	validation_0-logloss:0.54372
[13]	validation_0-logloss:0.53429
[14]	validation_0-logloss:0.52512
[15]	validation_0-logloss:0.51618
[16]	validation_0-logloss:0.50746
[17]	validation_0-logloss:0.49897
[18]	validation_0-logloss:0.49069
[19]	validation_0-logloss:0.48260
[20]	validation_0-logloss:0.47472
[21]	validatio