In [8]:
from itertools import product
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

from lib.creditcard_fraud_dataset import get_train_test_dfs, X_COLS
from lib.cs_eval import evaluate_pred

# 6. Cross-validation

In the previous notebooks, we have seen multiple approaches to make a classifier cost-sensitive. Which one should we choose? To make an informed decision, we can employ **K-fold cross-validation**. The idea is to split up the training dataset into $K$ equally-sized partitions (*folds*) and to train each model $K$ times, each time with a different combination of $K - 1$ folds for training and $1$ fold for validation (evaluation). This is shown in the following figure:

<img style="width: 50%; margin: auto" src="https://scikit-learn.org/stable/_images/grid_search_cross_validation.png"/>

The method that achieves the highest average performance can then be seen as the *best* one for the given data.

## 6.1 Define function for each training method

### 6.1.1 Baseline

Logistic regression on the unmodified training data, without any resampling, weighting, or threshold adjustment.

In [1]:
def train_baseline(df_train):
    """Fit a plain logistic regression on the given training rows.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows. The feature columns named in ``X_COLS`` and the
        ``'Class'`` label column are read.

    Returns
    -------
    LogisticRegression
        The fitted classifier.
    """
    features = df_train[X_COLS]
    labels = df_train['Class']

    # `fit` hands back the estimator itself, so the fitted model is returned directly.
    return LogisticRegression(max_iter=500).fit(features, labels)

### 6.1.2 Sample weighting

Samples with a **higher misclassification cost contribute more** to the loss than samples with a lower misclassification cost.

See [1_sample_weighting.ipynb](1_sample_weighting.ipynb).

In [2]:
def train_sample_weighted(df_train):
    """Fit a logistic regression with per-row misclassification costs as weights.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows. Reads the ``X_COLS`` feature columns, the ``'Class'``
        label column and the ``'C_misclf'`` column, which is passed to the
        classifier as ``sample_weight``.

    Returns
    -------
    LogisticRegression
        The fitted classifier.
    """
    features = df_train[X_COLS]
    labels = df_train['Class']
    row_costs = df_train['C_misclf']

    # `fit` hands back the estimator itself, so the fitted model is returned directly.
    return LogisticRegression(max_iter=500).fit(
        features, labels, sample_weight=row_costs
    )

### 6.1.3 Subsampling

See [2_subsampling.ipynb](2_subsampling.ipynb).

In [3]:
def train_subsampled(df_train, random_state=None):
    """Fit a logistic regression on a class-balanced subsample of the training rows.

    All rows with ``Class == 1`` are kept, and the same number of rows with
    ``Class == 0`` is drawn without replacement.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows. Reads the ``X_COLS`` feature columns and the
        ``'Class'`` label column (expected to hold 0/1 labels).
    random_state : int, numpy.random.Generator or None, optional
        Forwarded to ``DataFrame.sample`` so the draw of non-fraudulent rows
        can be reproduced. The default ``None`` keeps the previous,
        unseeded behaviour.

    Returns
    -------
    LogisticRegression
        The classifier fitted on the balanced subsample.
    """
    X_train = df_train[X_COLS]
    y_train = df_train['Class']

    X_fraud = X_train[y_train == 1]
    X_no_fraud_all = X_train[y_train == 0]

    # Sampling is without replacement, so asking for more rows than exist
    # would raise; cap the request at the number of available negatives.
    num_sample = min(len(X_fraud), len(X_no_fraud_all))
    X_no_fraud = X_no_fraud_all.sample(n=num_sample, random_state=random_state)

    # Labels are rebuilt to match the concatenation order: frauds first.
    X_balanced = pd.concat([X_fraud, X_no_fraud])
    y_balanced = np.concatenate([np.ones(len(X_fraud)),
                                 np.zeros(len(X_no_fraud))])

    clf_balanced = LogisticRegression(max_iter=500)
    clf_balanced.fit(X_balanced, y_balanced)
    return clf_balanced

### 6.1.4 Cost-sensitive sampling

See [3_cost_sensitive_sampling.ipynb](3_cost_sensitive_sampling.ipynb).

In [4]:
def get_cost_sensitive_sampling_ratios(df_train):
    """Return the per-row cost-sensitive positive/negative sampling ratio.

    For every row the value is ``(C_FN / C_FP) * (n_pos / n_neg)``, where
    ``n_pos`` is the sum of the ``'Class'`` column and ``n_neg`` is the
    remaining number of rows.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain the columns ``'Class'``, ``'C_FN'`` and ``'C_FP'``.

    Returns
    -------
    pandas.Series
        One ratio per row, indexed like ``df_train``.
    """
    positives = df_train['Class'].sum()
    negatives = len(df_train) - positives

    class_ratio = positives / negatives
    cost_ratio = df_train['C_FN'] / df_train['C_FP']

    return cost_ratio * class_ratio


def train_cs_sampling(df_train, random_state=None):
    """Fit a logistic regression on a cost-sensitively subsampled training set.

    All rows with ``Class == 1`` are kept. The number of ``Class == 0`` rows
    drawn (without replacement) is ``num_fraud / r``, where ``r`` is the mean
    of the per-row ratios from ``get_cost_sensitive_sampling_ratios``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows. Reads the ``X_COLS`` feature columns, ``'Class'``, and
        (via ``get_cost_sensitive_sampling_ratios``) ``'C_FN'`` and ``'C_FP'``.
    random_state : int, numpy.random.Generator or None, optional
        Forwarded to ``DataFrame.sample`` so the draw of non-fraudulent rows
        can be reproduced. The default ``None`` keeps the previous,
        unseeded behaviour.

    Returns
    -------
    LogisticRegression
        The classifier fitted on the resampled data.

    Raises
    ------
    ValueError
        If the mean cost-sensitive ratio is not a finite positive number, in
        which case no meaningful sample size can be derived from it.
    """
    X_train = df_train[X_COLS]
    y_train = df_train['Class']

    X_fraud = X_train[y_train == 1]
    X_no_fraud_all = X_train[y_train == 0]

    # Compute cost-sensitive positive-negative ratio
    record_spec_r_cs = get_cost_sensitive_sampling_ratios(df_train)
    global_r_cs = record_spec_r_cs.mean()

    # A zero, negative, NaN or infinite ratio would otherwise surface later as
    # an obscure conversion/sampling error or a single-class training set.
    if not np.isfinite(global_r_cs) or global_r_cs <= 0:
        raise ValueError(
            'Cost-sensitive sampling ratio must be a finite positive number, '
            f'got {global_r_cs!r}'
        )

    # Compute number of non-fraudulent transactions to sample. Sampling is
    # without replacement, so the request is capped at the rows available.
    num_fraud = len(X_fraud)
    num_no_fraud_sample = min(int(num_fraud / global_r_cs), len(X_no_fraud_all))

    X_no_fraud = X_no_fraud_all.sample(n=num_no_fraud_sample,
                                       random_state=random_state)

    # Labels are rebuilt to match the concatenation order: frauds first.
    X_cs = pd.concat([X_fraud, X_no_fraud])
    y_cs = np.concatenate([np.ones(len(X_fraud)),
                           np.zeros(len(X_no_fraud))])

    # Train the classifier
    clf_cs = LogisticRegression(max_iter=500)
    clf_cs.fit(X_cs, y_cs)

    return clf_cs

### 6.1.5 Cost-sensitive threshold

See [4_cost_sensitive_threshold.ipynb](4_cost_sensitive_threshold.ipynb).

In [5]:
def get_y_pred_cs_threshold(clf, df_val):
    """Classify validation rows with a per-row cost-sensitive threshold.

    A row is predicted positive when the classifier's probability estimate in
    column 1 of ``predict_proba`` exceeds ``C_FP / (C_FP + C_FN)`` for that row.

    Parameters
    ----------
    clf : estimator
        Fitted classifier exposing ``predict_proba``.
    df_val : pandas.DataFrame
        Validation rows. Reads the ``X_COLS`` feature columns plus ``'C_FP'``
        and ``'C_FN'``.

    Returns
    -------
    Boolean predictions, one per row of ``df_val``.
    """
    # Second probability column; presumably the fraud class when labels are 0/1.
    fraud_proba = clf.predict_proba(df_val[X_COLS])[:, 1]

    # Row-specific decision threshold derived from the two error costs.
    thresholds = df_val['C_FP'] / (df_val['C_FP'] + df_val['C_FN'])

    return fraud_proba > thresholds

### 6.1.6 AdaBoost with cost-sensitive weight initialization

See [5_adaboost_weight_init.ipynb](5_adaboost_weight_init.ipynb).

In [6]:
def train_unweighted_adaboost(df_train):
    """Fit a 10-estimator AdaBoost classifier without sample weights.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows. Reads the ``X_COLS`` feature columns and the
        ``'Class'`` label column.

    Returns
    -------
    AdaBoostClassifier
        The fitted classifier.
    """
    features = df_train[X_COLS]
    labels = df_train['Class']

    # `fit` hands back the estimator itself, so the fitted model is returned directly.
    return AdaBoostClassifier(n_estimators=10).fit(features, labels)


def train_weighted_adaboost(df_train):
    """Fit a 10-estimator AdaBoost classifier with cost-based initial weights.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows. Reads the ``X_COLS`` feature columns, the ``'Class'``
        label column and the ``'C_misclf'`` column, which is passed to the
        classifier as ``sample_weight``.

    Returns
    -------
    AdaBoostClassifier
        The fitted classifier.
    """
    features = df_train[X_COLS]
    labels = df_train['Class']
    row_costs = df_train['C_misclf']

    # `fit` hands back the estimator itself, so the fitted model is returned directly.
    return AdaBoostClassifier(n_estimators=10).fit(
        features, labels, sample_weight=row_costs
    )

## 6.2 Run cross-validation

### 6.2.1 Create folds

We will run a 5-fold cross-validation. To create the folds, we can use [`StratifiedKFold` from scikit-learn](https://scikit-learn.org/stable/modules/cross_validation.html#stratified-k-fold). With `StratifiedKFold`, each fold preserves approximately the same proportion of fraudulent and non-fraudulent transactions as the full dataset.

In [9]:
# 5-fold stratified splitter. No `shuffle` or `random_state` argument is passed,
# so scikit-learn's defaults for those apply.
skf = StratifiedKFold(n_splits=5)

On a `StratifiedKFold` object, you can call [the method `split()`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold.split) to generate the splits for a given dataset.

In [10]:
# Load the train/validation pool and the held-out test set.
df_trainval, df_test = get_train_test_dfs()

X_trainval = df_trainval[X_COLS]
y_trainval = df_trainval['Class']

# `split()` yields (train_index, val_index) pairs lazily and can only be
# consumed once. Materialise the pairs in a list so that the cell looping over
# `split_idxs` can be re-run without silently iterating over nothing.
split_idxs = list(skf.split(X_trainval, y_trainval))

### 6.2.2  Loop over splits

In each iteration, we use $4$ folds for training with each of the above training methods, and the other fold for validation. For each validation result, we append a new Python dictionary to the list `results`.

Note that `split()` returns the *indices* of the splits, not the samples themselves. In the `for`-loop below, you can indeed see that we use the `train_index` and `val_index` to select rows from `df_trainval`.

In [11]:
# One entry per evaluated method, in execution order:
# (progress message, label stored in the 'Method' column, training function).
# A training function of None marks the cost-sensitive threshold step, which
# trains nothing and instead re-thresholds the baseline model's probabilities.
# The 'threhold' spelling is kept unchanged because the label is the group key
# of the results table.
CV_STEPS = [
    ('0. Baseline...', 'Baseline', train_baseline),
    ('1. Sample weighting...', 'Sample weighting', train_sample_weighted),
    ('2. Subsampling...', 'Subsampling', train_subsampled),
    ('3. Cost-sensitive sampling', 'Cost-sensitive sampling', train_cs_sampling),
    ('4. Cost-sensitive threhold', 'Cost-sensitive threhold', None),
    ('5a. AdaBoost, unweighted...', 'AdaBoost (unweighted)', train_unweighted_adaboost),
    ('5b. AdaBoost, weighted...', 'AdaBoost (weighted)', train_weighted_adaboost),
]

# Collects one dict per (fold, method) pair.
results = []

for train_index, val_index in split_idxs:
    # Create train and val set from given indices
    df_train = df_trainval.iloc[train_index]
    df_val = df_trainval.iloc[val_index]

    X_val = df_val[X_COLS]
    y_val = df_val['Class']
    amounts = df_val['Amount']

    # Baseline model of the current fold, kept for the threshold step.
    clf_baseline = None

    for message, method, train_fn in CV_STEPS:
        print(message)

        if train_fn is None:
            # Cost-sensitive threshold: reuse the baseline fitted on this fold.
            y_pred = get_y_pred_cs_threshold(clf_baseline, df_val)
        else:
            clf = train_fn(df_train)
            if train_fn is train_baseline:
                clf_baseline = clf
            y_pred = clf.predict(X_val)

        results.append({
            'Method': method,
            **evaluate_pred(y_val, y_pred, amounts)
        })

    print()

0. Baseline...
1. Sample weighting...
2. Subsampling...
3. Cost-sensitive sampling
4. Cost-sensitive threhold
5a. AdaBoost, unweighted...
5b. AdaBoost, weighted...

0. Baseline...
1. Sample weighting...
2. Subsampling...
3. Cost-sensitive sampling
4. Cost-sensitive threhold
5a. AdaBoost, unweighted...
5b. AdaBoost, weighted...

0. Baseline...
1. Sample weighting...
2. Subsampling...
3. Cost-sensitive sampling
4. Cost-sensitive threhold
5a. AdaBoost, unweighted...
5b. AdaBoost, weighted...

0. Baseline...
1. Sample weighting...
2. Subsampling...
3. Cost-sensitive sampling
4. Cost-sensitive threhold
5a. AdaBoost, unweighted...
5b. AdaBoost, weighted...

0. Baseline...
1. Sample weighting...
2. Subsampling...
3. Cost-sensitive sampling
4. Cost-sensitive threhold
5a. AdaBoost, unweighted...
5b. AdaBoost, weighted...



## 6.3 Inspect the results

We start by converting the list of dictionaries `results` into a pandas DataFrame.

In [12]:
# Each dict appended to `results` in the loop (one per fold and method) becomes one row.
df_results = pd.DataFrame(results)

To obtain a single metric, we can compute the *F1-score*, the harmonic mean of precision and recall: $F_1 = \frac{2 \cdot P \cdot R}{P + R}$. Here it is computed from the cost-based precision and recall columns.

In [13]:
# Harmonic mean of the cost-based precision and recall, computed row by row.
cost_precision = df_results['Cost Precision']
cost_recall = df_results['Cost Recall']

df_results['Cost F1'] = (
    2 * cost_precision * cost_recall / (cost_precision + cost_recall)
)

df_results

Unnamed: 0,Method,Cost Precision,Cost Recall,TP Amount,FP Amount,FN Amount,Net Recovered Amount,Cost F1
0,Baseline,0.996606,0.535089,2935.97,10.0,2550.91,375.06,0.696317
1,Sample weighting,0.953153,0.734204,4028.49,198.0,1458.39,2372.1,0.829473
2,Subsampling,0.738263,0.880083,4828.91,1712.0,657.97,2458.94,0.802959
3,Cost-sensitive sampling,0.978147,0.734204,4028.49,90.0,1458.39,2480.1,0.8388
4,Cost-sensitive threhold,0.96116,0.72162,3959.44,160.0,1527.44,2272.0,0.824341
5,AdaBoost (unweighted),0.99507,0.441464,2422.26,12.0,3064.62,-654.36,0.611594
6,AdaBoost (weighted),0.948552,0.732529,4019.3,218.0,1467.58,2333.72,0.826661
7,Baseline,0.997314,0.18685,1485.3,4.0,6463.87,-4982.57,0.314733
8,Sample weighting,0.966376,0.838821,6667.93,232.0,1281.24,5154.69,0.898092
9,Subsampling,0.71476,0.645593,5131.93,2048.0,2817.24,266.69,0.678418


To obtain a single F1-score per training method, we can [group by](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html) the `'Method'` column and compute the mean per group:

In [14]:
# Average every metric column over the rows (one per fold) that share a method label.
df_agg = df_results.groupby('Method').mean()
df_agg

Unnamed: 0_level_0,Cost Precision,Cost Recall,TP Amount,FP Amount,FN Amount,Net Recovered Amount,Cost F1
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AdaBoost (unweighted),0.995531,0.612668,3474.124,15.2,2300.156,1158.768,0.752591
AdaBoost (weighted),0.958543,0.756626,4279.7,182.0,1494.58,2603.12,0.842966
Baseline,0.996419,0.460141,2501.66,9.2,3272.62,-780.16,0.615133
Cost-sensitive sampling,0.962248,0.739424,4177.598,160.4,1596.682,2420.516,0.832819
Cost-sensitive threhold,0.961824,0.761995,4331.288,166.0,1442.992,2722.296,0.843481
Sample weighting,0.94984,0.792877,4602.572,232.4,1171.708,3198.464,0.863752
Subsampling,0.7072,0.869234,4901.638,2011.6,872.642,2017.396,0.774793


Finally, we can sort the aggregated DataFrame by the `'Cost F1'` column to easily see the methods that work best on our dataset.

In [15]:
# Highest mean Cost F1 first. The sorted frame is only displayed; `df_agg` is not reassigned.
df_agg.sort_values(by='Cost F1', ascending=False)

Unnamed: 0_level_0,Cost Precision,Cost Recall,TP Amount,FP Amount,FN Amount,Net Recovered Amount,Cost F1
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Sample weighting,0.94984,0.792877,4602.572,232.4,1171.708,3198.464,0.863752
Cost-sensitive threhold,0.961824,0.761995,4331.288,166.0,1442.992,2722.296,0.843481
AdaBoost (weighted),0.958543,0.756626,4279.7,182.0,1494.58,2603.12,0.842966
Cost-sensitive sampling,0.962248,0.739424,4177.598,160.4,1596.682,2420.516,0.832819
Subsampling,0.7072,0.869234,4901.638,2011.6,872.642,2017.396,0.774793
AdaBoost (unweighted),0.995531,0.612668,3474.124,15.2,2300.156,1158.768,0.752591
Baseline,0.996419,0.460141,2501.66,9.2,3272.62,-780.16,0.615133
