In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from tqdm import tqdm

In [2]:
# https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
df = pd.read_csv('creditcard.csv')
print(df.columns)

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')


In [3]:
all_x = df[df.columns[:-1]]
all_y = df['Class']

In [4]:
# https://www.kdnuggets.com/2023/03/bootstrapping.html
def bootstrap(X_df, y_df, n_samples, train_size):
    seed = []
    pr = []
    re = []
    f1 = []
    indices = []

    for i in tqdm(range(n_samples)):
        np.random.seed(i)
        index = np.random.choice(X_df.shape[0], size=X_df.shape[0], replace=True)
        indices.append(index)
        X_sample = X_df.values[index, :]
        y_sample = y_df.values[index]

        # Train on this sample using just a little bit of the data
        X_train, X_test, y_train, y_test = train_test_split(
            X_sample, y_sample,
            train_size=train_size,
            stratify=y_sample,
            random_state=i
        )

        model = MLPClassifier().fit(X_train, y_train)
        y_pred = model.predict(X_test)

        seed.append(i)
        pr.append(precision_score(y_test, y_pred))
        re.append(recall_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))

    return pd.DataFrame({
        "Seed": seed,
        "Precision": pr,
        "Recall": re,
        "F1": f1,
        "Indices": indices
    })

In [5]:
pred_df = bootstrap(all_x, all_y, 10, 0.1)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:19<00:00,  1.91s/it]


In [6]:
pred_df.sort_values(by="F1", ascending=False).head()

Unnamed: 0,Seed,Precision,Recall,F1,Indices
9,9,0.414219,0.642686,0.503759,"[187006, 196534, 173819, 267158, 91261, 70209,..."
3,3,0.296552,0.668889,0.410922,"[71530, 198296, 77049, 48056, 11261, 215699, 2..."
8,8,0.441489,0.198565,0.273927,"[70083, 25940, 149489, 256361, 231557, 236371,..."
5,5,0.402778,0.130631,0.197279,"[18638, 20463, 124605, 232422, 136592, 33800, ..."
4,4,0.178571,0.034404,0.057692,"[120705, 129384, 115144, 94601, 107578, 79981,..."


In [7]:
# Get the indices of the data that yielded the best results from the sampling
best_idx = pred_df.sort_values(by="F1", ascending=False).index[0]
bootstrapped_idx = pred_df.iloc[best_idx]["Indices"]
bootstrapped_seed = pred_df.iloc[best_idx]["Seed"]

In [8]:
X_sample = all_x.values[bootstrapped_idx, :]
y_sample = all_y.values[bootstrapped_idx]

# Re-train a model, but with the (hopefully) optimized data
np.random.seed(bootstrapped_seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_sample, y_sample,
    train_size=0.4,
    stratify=y_sample,
    random_state=bootstrapped_seed
)

print("Fitting model...", end=' ')
model = MLPClassifier().fit(X_train, y_train)
print("done.")

Fitting model... done.


In [19]:
# Now test back on the original data to see metrics
X_test = all_x.values
y_test = all_y.values

y_pred = model.predict(X_test)

print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1       :", f1_score(y_test, y_pred))

Precision: 0.5863247863247864
Recall   : 0.6971544715447154
F1       : 0.6369545032497679
