In [None]:
# Re-import edited modules (e.g. the local fin_crime package) before each cell executes
%load_ext autoreload
%autoreload 2

# Fraud Detection Modelling

Create a model that predicts whether a transaction is fraudulent or not.

In [None]:
from multiprocessing import Pool, TimeoutError
import os
import yaml

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xgboost as xgb

from fin_crime.data_processor import DataProcessor
from fin_crime.model import fit_xgb_classifier

In [None]:
# Show up to 100 columns when a DataFrame is displayed
pd.set_option("display.max_columns", 100)

# Defines

In [None]:
# os.cpu_count() returns None when the core count cannot be determined;
# fall back to 1 so later arithmetic on num_cores never receives None.
num_cores = os.cpu_count() or 1

# Directory containing the processed parquet files
input_data_dir = os.path.join("data", "processed")

In [None]:
# Data processor configured with the processed-data directory
dp = DataProcessor(input_data_dir)

In [None]:
# Model inputs: columns treated as categorical vs. continuous
x_cols_categorical = ["category", "gender", "trans_hour", "job_short", "merchant"]
x_cols_cont = ["amt", "age_at_transaction", "city_pop"]
x_cols = [*x_cols_categorical, *x_cols_cont]

# Prediction target
y_col = "is_fraud"

# Data

In [None]:
# Mapping applied to the raw `job` column to build the `job_short` feature.
# The encoding is pinned so that non-ASCII job titles are decoded identically on
# every platform (assumes the file is stored as UTF-8 - TODO confirm).
# NOTE(review): yaml.load with FullLoader resolves more tags than SafeLoader; if this
# file could ever come from an untrusted source, switch to yaml.safe_load.
with open(os.path.join("data", "compressed_job_titles.yml"), encoding="utf-8") as f:
    job_map = yaml.load(f, Loader=yaml.FullLoader)['compressed_job_titles']

In [None]:
# Read the train and test parquet files from the processed-data directory
df_train, df_eval = (
    pd.read_parquet(os.path.join(input_data_dir, file_name))
    for file_name in ("tr_fincrime_train.parquet", "tr_fincrime_test.parquet")
)

In [None]:
# Run both splits through the same processor (train first, then eval)
df_train, df_eval = dp.process(df_train), dp.process(df_eval)

In [None]:
# Derive `job_short` by replacing job titles via job_map;
# values that are not keys of job_map are left as they are
for frame in (df_train, df_eval):
    frame['job_short'] = frame['job'].replace(job_map)

In [None]:
# Cast the categorical features using ONE dtype per column, shared by both frames.
# Casting each frame on its own derives the categories - and therefore the integer
# codes - from that frame alone, so the same value could receive different codes in
# train and eval. Building the dtype from both frames keeps every value and makes
# the codes consistent.
# NOTE(review): this matters when the model consumes category codes (e.g. XGBoost's
# native categorical support) - confirm against fit_xgb_classifier.
for cur_col in x_cols_categorical:
    shared_dtype = pd.concat(
        [df_train[cur_col], df_eval[cur_col]], ignore_index=True
    ).astype("category").dtype

    for cur_df in [df_train, df_eval]:
        cur_df[cur_col] = cur_df[cur_col].astype(shared_dtype)

# Models

Recall of fraudulent transactions is the key metric used to validate model performance:

- Missing a fraudulent transaction is more expensive for the business than doing a check

## Baseline Model

- XGBoost classifier with class-scaled loss
- Stratified K-Fold with 5 folds and 3 sets of hyperparameters

In [None]:
# Stratified K-fold splitter; shuffling uses a fixed seed so the folds are reproducible
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

In [None]:
# Training budget shared by every hyperparameter set
num_boost_round = 1000
early_stopping_rounds = 50
fit_timeout = 600

# Settings common to all sets; scale_pos_weight is the ratio of
# negative (0) to positive (1) labels in the training data
params_base = {
    "objective": "binary:logistic",
    "scale_pos_weight": (df_train[y_col] == 0).sum() / (df_train[y_col] == 1).sum(),
    "learning_rate": 0.01,
}

# Three regularisation levels layered on top of the common settings
params_low_reg = dict(
    params_base,
    min_child_weight=1,
    gamma=0.01,
    max_depth=15,
    subsample=1,
    reg_alpha=0.0,
    reg_lambda=0.0,
)

params_med_reg = dict(
    params_base,
    min_child_weight=10,
    gamma=0.1,
    max_depth=10,
    subsample=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
)

params_high_reg = dict(
    params_base,
    min_child_weight=25,
    gamma=0.5,
    max_depth=8,
    subsample=0.5,
    reg_alpha=0.5,
    reg_lambda=0.5,
)

# Name -> hyperparameter set, in the order they are evaluated
params = dict(
    params_low_reg=params_low_reg,
    params_med_reg=params_med_reg,
    params_high_reg=params_high_reg,
)

In [None]:
# Iterating over hyperparameters
# The folds of one hyperparameter set are fitted in parallel worker processes;
# the sets themselves are processed one after another.
out_dict = {}

# Leave one core free, but always request at least one worker:
# Pool(processes=0) raises ValueError (single-core machine or unknown core count).
n_workers = max(1, min((num_cores or 1) - 1, n_folds))

for cur_name, cur_params in tqdm(params.items()):
    # One argument tuple per fold for fit_xgb_classifier
    fold_args = [
        (cur_params, df_train, x_cols, y_col, train_idx, test_idx, num_boost_round, early_stopping_rounds)
        for train_idx, test_idx in skf.split(df_train[x_cols], df_train[y_col])
    ]

    with Pool(processes=n_workers) as pool:
        out = pool.starmap_async(fit_xgb_classifier, fold_args)

        try:
            result = out.get(timeout=fit_timeout)
        except TimeoutError:
            # Leaving the `with` block terminates the pool, abandoning unfinished fits
            print(f"Timeout: {cur_name} did not finish within {fit_timeout}s")
            result = None

    # None marks a hyperparameter set whose fits timed out
    out_dict[cur_name] = result

In [None]:
# Preparing Metrics
# Builds one long-format frame per (hyperparameter set, fold) holding the train and
# eval metric histories, tagged with step, type, fold and param.

metric_list = []

for cur_name, cur_out in out_dict.items():
    if cur_out is None:
        # A set whose fits timed out is stored as None; skip it instead of
        # failing with a TypeError when iterating over it
        print(f"Skipping {cur_name}: no results available")
        continue

    for cur_fold, fold_out in enumerate(cur_out):
        # NOTE(review): the first element of each fold result is read as a mapping
        # with 'train' and 'eval' metric histories - confirm against fit_xgb_classifier
        fold_history = fold_out[0]

        df_metrics_train = pd.DataFrame(fold_history['train'])
        df_metrics_eval = pd.DataFrame(fold_history['eval'])

        # The row position within each history is used as the step number
        df_metrics_train['step'] = df_metrics_train.index.values
        df_metrics_eval['step'] = df_metrics_eval.index.values

        df_metrics_train['type'] = 'train'
        df_metrics_eval['type'] = 'eval'

        df_metrics = pd.concat([df_metrics_train, df_metrics_eval], ignore_index=True)
        df_metrics['fold'] = cur_fold
        df_metrics['param'] = cur_name

        metric_list.append(df_metrics)

In [None]:
# Stack the per-fold histories of every hyperparameter set
df_metrics_all = pd.concat(metric_list)

# Mean of each metric across folds per (param, type, step),
# plus the number of rows (folds) that contributed to each mean
by_param_type_step = df_metrics_all.groupby(['param', 'type', 'step'])
df_metrics_agg = by_param_type_step[['f1', 'precision_class_1', 'recall_class_1']].mean()
df_metrics_agg = df_metrics_agg.join(by_param_type_step.size().to_frame('count'))

# Keep only the steps for which all folds have a value
# (presumably folds stop at different steps because of early stopping)
df_metrics_agg = df_metrics_agg[df_metrics_agg['count'] == n_folds]

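Different levels of regularisation lead to different trade-offs between precision and recall, and we have to choose a point on that trade-off. The `params_med_reg` set of hyperparameters (a middle-of-the-road amount of regularisation) could be a reasonable compromise (TODO: confirm against the plots below).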

In [None]:
# Long format: one row per (param, type, step, metric)
df_plot = pd.melt(
    df_metrics_agg.reset_index(),
    id_vars=['param', 'type', 'step'],
    value_vars=['f1', 'precision_class_1', 'recall_class_1'],
    var_name='metric',
)

# One panel per metric, one line per hyperparameter set; training rows are excluded
sns.relplot(
    data=df_plot.loc[df_plot['type'] != 'train'],
    kind='line',
    x='step',
    y='value',
    hue='param',
    col='metric',
    facet_kws={"sharey": False}
)

In [None]:
# Long format: one row per (param, type, step, metric)
df_plot = pd.melt(
    df_metrics_agg.reset_index(),
    id_vars=['param', 'type', 'step'],
    value_vars=['f1', 'precision_class_1', 'recall_class_1'],
    var_name='metric',
)

# Train vs. eval recall, one panel per hyperparameter set on a shared y-axis
sns.relplot(
    data=df_plot.loc[df_plot['metric'] == 'recall_class_1'],
    kind='line',
    x='step',
    y='value',
    hue='type',
    col='param',
    facet_kws={"sharey": True}
)

plt.suptitle('Fraud Recall Bias', y=1.02)

plt.suptitle('Fraud Recall Bias', y=1.02)