### Required Imports

In [1]:
import numpy as np
import pandas as pd 
import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns               
%matplotlib inline
sns.set()

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error, f1_score
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split

### Initial Data Cleaning & Splitting

In [111]:

train_data = pd.read_csv('../scotia_data/train_ScotiaDSD.csv')

l_e = LabelEncoder()

# translating time to a continuous variable
train_data['EVENT_TIME'] = train_data['EVENT_TIME'].apply(lambda x: (int(x.split(':')[0]) * 60) + int(x.split(':')[1]))

# allowing for the use of sparse variables in cases where it is provided
train_data['USER_AGENT'].fillna("unk", inplace=True)
train_data['USER_AGENT'] = l_e.fit_transform(train_data['USER_AGENT'])
train_data['CITY'].fillna("unk", inplace=True)
train_data['CITY'] = l_e.fit_transform(train_data['CITY'])

# want the label to be int, not float
train_data['FRAUD_FLAG'] = train_data['FRAUD_FLAG'].astype(int)
train_data.drop(['TRANSACTION_ID'], axis=1, inplace=True)

In [112]:
# trying to set all categorical variables
categorical_vars = ['CITY', 'USER_AGENT']
for col in train_data.columns:
    # all flags should be considered categorical
    if 'FLAG' in col:
        categorical_vars.append(col)

    # all one-hot features should be considered categorical
    elif len(train_data[col].unique()) == 2:
        categorical_vars.append(col)
    
train_data = train_data.loc[:, categorical_vars].astype("category")
data_X = train_data.loc[:, train_data.columns != 'FRAUD_FLAG']
data_y = train_data['FRAUD_FLAG']

In [113]:
# stratified train-test split
train_X, test_X, train_y, test_y = train_test_split(data_X, data_y, test_size=0.2, random_state=42, stratify=data_y)

### Testing Data Segmentation

In [87]:
# segment on card present flag (separate physical & online/phone purchases)
online_purchases = train_data[train_data['CARD_NOT_PRESENT'] == 1]
online_purchases_X = online_purchases.loc[:, online_purchases.columns != 'FRAUD_FLAG']
online_purchases_y = online_purchases['FRAUD_FLAG']

# stratified train-test split
online_train_X, online_test_X, online_train_y, online_test_y = train_test_split(online_purchases_X, online_purchases_y, test_size=0.2, random_state=42, stratify=online_purchases_y)

### Training XGBoost on the Entire Dataset

In [116]:
xgb_model = xgb.XGBClassifier(tree_method="hist", max_depth=3, n_estimators=800, random_state=42, enable_categorical=True)

xgb_model.fit(train_X, train_y)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=True, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=3, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=800, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)

In [117]:
y_pred = xgb_model.predict(test_X)
y_probs = xgb_model.predict_proba(test_X)
print(f1_score(test_y, y_pred))

0.16153846153846155
category


In [None]:
import warnings

pd.set_option('display.max_rows', 1000)
# pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore")