Fourth sandbox. In sandbox 3, the plot of F1 against `train_size` seemed to have an elbow at around 0.6, which happened to be the `train_size` used for the original bootstrap training. This sandbox tests the following hypothesis:
- if the bootstrap training was done with `train_size=0.1`, then the metrics computed from the model's predictions on the test split produced by `train_test_split(train_size=?)`, plotted against that `train_size`, will have an elbow around `train_size=0.1`
- if the bootstrap training was done with `train_size=0.2`, then the metrics computed from the model's predictions on the test split produced by `train_test_split(train_size=?)`, plotted against that `train_size`, will have an elbow around `train_size=0.2`
- and so on for the other bootstrap `train_size` values.

# Imports and setup

In [5]:
import pandas as pd
import numpy as np
from time import time
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from tqdm import tqdm
import matplotlib.pyplot as plt

In [6]:
# Credit-card fraud dataset, downloaded from Kaggle:
# https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
df = pd.read_csv('creditcard.csv')
# Show the column names, then how many rows fall in each class.
print(df.columns, df['Class'].value_counts(), sep='\n')

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')
Class
0    284315
1       492
Name: count, dtype: int64


In [7]:
# Features: every column except the last one; labels: the 'Class' column.
X_all = df.loc[:, df.columns[:-1]].to_numpy()
y_all = df.loc[:, 'Class'].to_numpy()

In [8]:
def fit_with_size(X_sample, y_sample, seed, train_size, verbose=False):
    """Split the sample and fit a decision tree on the training part.

    Parameters
    ----------
    X_sample, y_sample
        Features and labels, passed straight to ``train_test_split``;
        ``y_sample`` is also used as the stratification target.
    seed
        Seed used for both the split and the tree.
    train_size
        Forwarded to ``train_test_split(train_size=...)``.
    verbose : bool, default False
        If true, print the training size and the rounded fit time in seconds.

    Returns
    -------
    tuple
        ``(model, X_test, y_test)``: the fitted ``DecisionTreeClassifier`` and
        the held-out part of the split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X_sample, y_sample,
        train_size=train_size,
        stratify=y_sample,
        random_state=seed,  # make the split reproducible
    )

    if verbose:
        print("Fitting model with", train_size, "of the data...", end=' ')
        start_time = time()
    # Seed the tree through its own random_state rather than np.random.seed():
    # the fit stays deterministic for a given seed, and the process-wide NumPy
    # RNG is no longer reset as a hidden side effect of calling this helper.
    model = DecisionTreeClassifier(random_state=seed).fit(X_train, y_train)
    if verbose:
        end_time = time()
        print("done in", round(end_time-start_time), "seconds.")

    return model, X_test, y_test

In [9]:
def get_metrics(model, X_test, y_test):
    """Score ``model`` on a held-out split.

    Calls ``model.predict(X_test)`` once and returns a
    ``(precision, recall, f1)`` tuple computed against ``y_test``.
    """
    predictions = model.predict(X_test)
    scorers = (precision_score, recall_score, f1_score)
    return tuple(score(y_test, predictions) for score in scorers)

In [10]:
def plot_metrics(df):
    """Draw horizontal bar charts of Precision, Recall and F1 side by side.

    ``df`` must provide "Precision", "Recall" and "F1" columns; each one is
    drawn with ``.plot.barh()`` in its own cell of a 1x3 pyplot subplot grid
    and titled with the column name.
    """
    for position, metric in enumerate(("Precision", "Recall", "F1"), start=1):
        plt.subplot(1, 3, position)
        df[metric].plot.barh()
        plt.title(metric)

# Bootstrapping a Decision Tree

In [12]:
# https://www.kdnuggets.com/2023/03/bootstrapping.html
def bootstrap(X, y, n_samples, train_size):
    """Fit and score one decision tree per bootstrap resample of ``(X, y)``.

    For each seed ``i`` in ``range(n_samples)``, ``X.shape[0]`` row positions
    are drawn with replacement, the resample is split and fitted by
    ``fit_with_size(..., i, train_size)``, and the model is scored on the
    held-out part by ``get_metrics``.

    Parameters
    ----------
    X, y
        Arrays indexed as ``X[indices, :]`` and ``y[indices]``.
    n_samples
        Number of bootstrap runs; the run number doubles as the seed.
    train_size
        Forwarded to ``fit_with_size``.

    Returns
    -------
    pandas.DataFrame
        One row per run with columns "Seed", "Precision", "Recall", "F1",
        "Indices" (the resampled row positions) and "Model" (the fitted model).
        With ``n_samples=0`` the frame is empty but keeps these columns.

    NOTE(review): the resample is drawn before the train/test split, so a row
    drawn more than once can land in both parts of the split -- confirm that
    this overlap is intended for the experiment.
    """
    columns = ["Seed", "Precision", "Recall", "F1", "Indices", "Model"]
    results = []
    for i in tqdm(range(n_samples)):
        # A per-run RandomState(i) yields the same draws as np.random.seed(i)
        # followed by np.random.choice, without resetting the global NumPy RNG.
        rng = np.random.RandomState(i)
        indices = rng.choice(X.shape[0], size=X.shape[0], replace=True)
        X_sample = X[indices, :]
        y_sample = y[indices]

        model, X_test, y_test = fit_with_size(X_sample, y_sample, i, train_size)
        precision, recall, f1 = get_metrics(model, X_test, y_test)

        results.append({
            "Seed": i,
            "Precision": precision,
            "Recall": recall,
            "F1": f1,
            "Indices": indices,
            "Model": model,
        })
    return pd.DataFrame(results, columns=columns)

In [None]:
# Experiment grid: each value is used both as the bootstrap train_size and as
# the train_size of the refits below.
sizes = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8)
N_BOOTSTRAPS = 5  # bootstrap resamples drawn per bootstrap train_size
data = {}  # bootstrap train_size -> list of F1 scores, one per refit train_size
start_time = time()
for s_boot in sizes:
    print("Bootstrapping with size", s_boot)
    pred_df = bootstrap(X_all, y_all, N_BOOTSTRAPS, s_boot)

    # Keep the run with the highest F1. idxmax() returns an index *label*, so it
    # is paired with .loc; the previous label-into-.iloc lookup only worked
    # because the frame has a default 0..n-1 index.
    best_run = pred_df.loc[pred_df["F1"].idxmax()]
    bootstrapped_idx = best_run["Indices"]
    bootstrapped_seed = best_run["Seed"]

    X_sample = X_all[bootstrapped_idx, :]
    y_sample = y_all[bootstrapped_idx]

    # Refit on that same resample, with the same seed, at every train_size and
    # record only F1.
    fit = []
    for s_fit in sizes:
        model, X_test, y_test = fit_with_size(X_sample, y_sample, bootstrapped_seed, s_fit, verbose=True)
        _precision, _recall, f1 = get_metrics(model, X_test, y_test)
        fit.append(f1)
    data[s_boot] = fit
    print("Done with size", s_boot)
end_time = time()
print("That took", round(end_time-start_time), "seconds.")

Bootstrapping with size 0.1


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:13<00:00,  2.77s/it]


Fitting model with 0.1 of the data... done in 1 seconds.
Fitting model with 0.2 of the data... done in 6 seconds.
Fitting model with 0.3 of the data... done in 9 seconds.
Fitting model with 0.4 of the data... done in 9 seconds.
Fitting model with 0.5 of the data... done in 11 seconds.
Fitting model with 0.6 of the data... done in 18 seconds.
Fitting model with 0.7 of the data... done in 21 seconds.
Fitting model with 0.8 of the data... done in 23 seconds.
Done with size 0.1
Bootstrapping with size 0.2


 20%|████████████████▊                                                                   | 1/5 [00:05<00:21,  5.36s/it]

In [None]:
# Wide table: one column of F1 scores per bootstrap train_size, one row per
# refit train_size (stored in the "Size" column).
df_data = pd.DataFrame(data).assign(Size=sizes)
df_data

In [None]:
# One line per bootstrap train_size (the non-"Size" columns of df_data),
# coloured along the viridis colormap in column order.
colors = [plt.cm.viridis(s) for s in np.linspace(0, 1, num=len(sizes))]
ax = df_data.plot(x="Size", color=colors)
# Label the figure so it can be read without the surrounding cells.
ax.set(
    xlabel="train_size used for the refit",
    ylabel="F1",
    title="F1 vs. refit train_size, per bootstrap train_size",
)
ax.legend(title="bootstrap train_size")
plt.show()