In [1]:
from itertools import product
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from lib.creditcard_fraud_dataset import get_train_test_dfs
from lib.cs_train import train_clf
from lib.cs_eval import evaluate_clf

In [2]:
# Load both frames once. `df_trainval` is the frame split into folds by the
# cross-validation cell; `df_test` is not referenced by the cells shown here
# (presumably the held-out test split -- confirm in lib.creditcard_fraud_dataset).
df_trainval, df_test = get_train_test_dfs()

In [3]:
# Registry of experiment configurations, filled in by the sections below.
# Maps a display name to {'train': callable, 'test': callable}; each callable
# receives a DataFrame and returns extra keyword arguments.
methods = dict()

## Baseline

In [4]:
# Reference configurations: no per-row cost information is passed at train
# or evaluation time.  The returned dicts are forwarded as keyword arguments
# to `train_clf` / `evaluate_clf` by the cross-validation loop.
methods['Baseline'] = {
    'train': lambda df_train: {},
    'test': lambda df_test: {},
}
# NOTE(review): `class_weight` is presumably passed through to the underlying
# scikit-learn estimator by `train_clf` -- confirm in lib.cs_train.
methods['Baseline (class balanced)'] = {
    'train': lambda df_train: {'class_weight': 'balanced'},
    'test': lambda df_test: {},
}

## Sample weighting

In [5]:
# Cost-weighted training: the `C_misclf` column of the training frame is
# handed over as `sample_weight`; evaluation takes no extra arguments.
methods['Sample Weighted'] = {
    'train': lambda df_train: {'sample_weight': df_train['C_misclf']},
    'test': lambda df_test: {},
}
# Same weighting, combined with balanced class weights.
methods['Sample Weighted (class balanced)'] = {
    'train': lambda df_train: {
        'class_weight': 'balanced',
        'sample_weight': df_train['C_misclf'],
    },
    'test': lambda df_test: {},
}

## Subsampling

In [6]:
# `n_neg` is set to the sum of the `Class` column, i.e. the number of
# positive rows if `Class` is a 0/1 label (assumed -- TODO confirm).
# NOTE(review): presumably `train_clf` subsamples the negatives down to
# `n_neg` rows -- confirm in lib.cs_train.
methods['Subsampled'] = {
    'train': lambda df_train: {'n_neg': df_train['Class'].sum()},
    'test': lambda df_test: {},
}

## Cost-sensitive sampling

In [7]:
def get_cost_sensitive_sampling_ratios(df_train):
    """Return the per-row cost ratio scaled by the class ratio.

    For every row of ``df_train`` the result is
    ``(C_FN / C_FP) * (n_pos / n_neg)``, where ``n_pos`` is the sum of the
    ``Class`` column and ``n_neg`` is the remaining number of rows.

    Assumes ``Class`` is a 0/1 label with 1 marking positives -- TODO confirm.

    Parameters
    ----------
    df_train : DataFrame with ``Class``, ``C_FN`` and ``C_FP`` columns.

    Returns
    -------
    Series aligned with ``df_train``, one ratio per row.
    """
    positives = df_train['Class'].sum()
    negatives = len(df_train) - positives

    cost_ratio = df_train['C_FN'] / df_train['C_FP']
    class_ratio = positives / negatives

    return cost_ratio * class_ratio


# `n_neg` is the sum of the `Class` column divided by the mean per-row
# sampling ratio, truncated to an int.
methods['Cost-sensitive sampling'] = {
    'train': lambda df_train: {
        'n_neg': int(
            df_train['Class'].sum()
            / get_cost_sensitive_sampling_ratios(df_train).mean()
        ),
    },
    'test': lambda df_test: {},
}

## Cost-sensitive threshold

In [8]:
def get_cs_threshs(df):
    """Return the per-row threshold ``C_FP / (C_FP + C_FN)``.

    Parameters
    ----------
    df : DataFrame with ``C_FP`` and ``C_FN`` columns.

    Returns
    -------
    Series aligned with ``df``, one threshold per row.
    """
    fp_cost = df['C_FP']
    total_cost = fp_cost + df['C_FN']
    return fp_cost / total_cost


# Training is unchanged; at evaluation time a per-row threshold computed from
# the evaluated frame's own cost columns is passed as `thresh`.
methods["CS thresh"] = {
    'train': lambda df_train: {},
    'test': lambda df_test: {'thresh': get_cs_threshs(df_test)},
}
# Same thresholds, on top of balanced class weights during training.
methods["CS thresh (class balanced)"] = {
    'train': lambda df_train: {'class_weight': 'balanced'},
    'test': lambda df_test: {'thresh': get_cs_threshs(df_test)},
}

## AdaBoost with cost-sensitive weight initialization

In [9]:
from sklearn.ensemble import AdaBoostClassifier


# AdaBoost variants: the estimator class is passed via `Classifier`.
# The "CS weight" variant additionally supplies the `C_misclf` column as
# `sample_weight`; the baseline passes no weights.
methods["AdaBoost CS weight"] = {
    'train': lambda df_train: {
        'Classifier': AdaBoostClassifier,
        'sample_weight': df_train['C_misclf'],
    },
    'test': lambda df_test: {},
}
methods["AdaBoost baseline"] = {
    'train': lambda df_train: {'Classifier': AdaBoostClassifier},
    'test': lambda df_test: {},
}

# Cross-validation

In [10]:
from sklearn.model_selection import StratifiedKFold
from lib.creditcard_fraud_dataset import get_X_from_df, get_Y_from_df

# Stratified K-fold without shuffling: the folds are deterministic.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=False, random_state=None)

# One dict per (fold, method) pair.  This is a plain list despite the `df_`
# prefix; a later cell shows how to turn it into a DataFrame.
df_results = []

fold_indices = skf.split(get_X_from_df(df_trainval),
                         get_Y_from_df(df_trainval))

for train_index, val_index in tqdm(fold_indices, total=n_splits):
    df_train = df_trainval.iloc[train_index]
    df_val = df_trainval.iloc[val_index]

    for method_name, method_dict in tqdm(methods.items(), leave=False):
        # Method-specific keyword arguments are derived from the fold's own
        # training frame, then from its validation frame.
        train_kwargs = method_dict['train'](df_train)
        clf = train_clf(df_train, **train_kwargs)

        test_kwargs = method_dict['test'](df_val)
        eval_metrics = evaluate_clf(clf, df_val, **test_kwargs)

        record = {'method': method_name}
        record.update(eval_metrics)
        df_results.append(record)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  0%|          | 0/10 [00:00<?, ?it/s]

In [11]:
import pandas as pd

# If you ran the cross-validation yourself, uncomment the next line...
# df_results = pd.DataFrame(df_results)

# ...and remove the following one, which loads precomputed results instead.
# The target must be `df_results` (not `df_result`): the cells below index
# `df_results` by column name, which fails on the raw list of dicts.
# NOTE(review): unpickling can execute arbitrary code -- only load a
# `results.pkl` that comes from a trusted source.
df_results = pd.read_pickle('results.pkl')

In [12]:
# Harmonic mean of the cost precision and cost recall columns, stored as a
# new `cost_f1` column (NaN where both are zero).
cost_precision = df_results['cost_precision']
cost_recall = df_results['cost_recall']

df_results['cost_f1'] = (
    2 * cost_precision * cost_recall / (cost_precision + cost_recall)
)

In [13]:
# Average every metric per method and show the methods ordered by their
# mean `cost_f1`, best first.
gb = df_results.groupby('method')

method_means = gb.agg('mean')
method_means.sort_values(by='cost_f1', ascending=False)

Unnamed: 0_level_0,cost_precision,cost_recall,tp_amount,fp_amount,fn_amount,net,cost_f1
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Sample Weighted,0.94984,0.792877,4602.572,232.4,1171.708,3198.464,0.863752
AdaBoost CS weight,0.985046,0.756829,4316.12,64.8,1458.16,2793.16,0.852575
CS thresh,0.961824,0.761995,4331.288,166.0,1442.992,2722.296,0.843481
Cost-sensitive sampling,0.965602,0.739424,4177.598,147.2,1596.682,2433.716,0.834276
Baseline (class balanced),0.790545,0.864755,4876.886,1279.2,897.394,2700.292,0.821186
AdaBoost baseline,0.995302,0.626485,3545.582,16.8,2228.698,1300.084,0.764525
Subsampled,0.672037,0.838415,4734.536,2328.0,1039.744,1366.792,0.739977
Baseline,0.996419,0.460141,2501.66,9.2,3272.62,-780.16,0.615133
Sample Weighted (class balanced),0.447209,0.950304,5475.594,7148.8,298.686,-1971.892,0.597756
CS thresh (class balanced),0.287251,0.984924,5674.518,14339.6,99.762,-8764.844,0.441279
