# In [None]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from sklearn.datasets import make_classification
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from tqdm.notebook import tqdm  # progress bars
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from scipy.stats import linregress

# For reproducibility: seed NumPy's global random generator and TensorFlow's
# global seed before any data is generated or any model is built.
np.random.seed(42)
tf.random.set_seed(42)

#########################################
# Custom split function
#########################################
def custom_split(X, y, train_ratio, train_min, test_min, random_state=None):
    """Split ``X``/``y`` into train and test parts with minimum-size guarantees.

    The training size is ``round(train_ratio * n)``, raised to ``train_min``
    if it is smaller and lowered so that at least ``test_min`` samples remain
    for the test set.

    * If the resulting training size is smaller than the number of classes,
      the training set is instead built from one randomly chosen sample per
      class and every other sample goes to the test set.
    * Otherwise, with two or more classes a ``StratifiedShuffleSplit`` is
      used; with a single class a plain random permutation is used.

    Args:
        X: Array of samples, indexed along axis 0.
        y: Array of class labels, one per row of ``X``.
        train_ratio: Desired fraction of samples in the training set.
        train_min: Minimum number of training samples.
        test_min: Minimum number of test samples.
        random_state: Optional seed. When given, the split is deterministic.
            The seed feeds a private generator, so NumPy's global random
            state is left untouched.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If there are fewer than ``train_min + test_min`` samples,
            or if forcing one sample per class leaves fewer than ``test_min``
            test samples. ``StratifiedShuffleSplit`` may raise its own
            ``ValueError`` for infeasible stratified splits.
    """
    n = X.shape[0]
    classes = np.unique(y)
    n_classes = len(classes)
    if n < train_min + test_min:
        raise ValueError(f"Not enough samples: n={n}, train_min={train_min}, test_min={test_min}")

    # A private RandomState seeded with ``random_state`` yields the same
    # stream that ``np.random.seed(random_state)`` would, but without
    # reseeding the global generator behind the caller's back. Without a
    # seed, fall back to the module-level (global) generator as before.
    if random_state is not None:
        rng = np.random.RandomState(random_state)
    else:
        rng = np.random

    T = int(round(train_ratio * n))
    if T < train_min:
        T = train_min
    if n - T < test_min:
        T = n - test_min

    # If T is less than number of classes, force training set to have one sample per class.
    if T < n_classes:
        train_idx = np.array([
            rng.choice(np.where(y == cls)[0], size=1, replace=False)[0]
            for cls in classes
        ])
        test_idx = np.setdiff1d(np.arange(n), train_idx)
        if len(test_idx) < test_min:
            raise ValueError("Test set too small after forcing one sample per class.")
    elif n_classes >= 2:
        sss = StratifiedShuffleSplit(n_splits=1, train_size=T, test_size=n-T, random_state=random_state)
        train_idx, test_idx = next(sss.split(X, y))
    else:
        # Single class: stratification is meaningless, so shuffle and cut.
        indices = rng.permutation(n)
        train_idx = indices[:T]
        test_idx = indices[T:]

    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

#########################################
# Cross Validation Accuracy Functions
#########################################
def get_cv_folds(y, k=5):
    """Return how many stratified CV folds the labels ``y`` can support.

    The fold count is capped both by ``k`` and by the size of the smallest
    class that actually occurs in ``y``. Only labels present in ``y`` are
    counted, so label sets that do not start at 0 (e.g. ``{1, 2}``) or that
    contain negative values are handled correctly.

    Args:
        y: Array-like of class labels.
        k: Maximum number of folds wanted.

    Returns:
        ``min(k, smallest_class_count)`` when the smallest class has at least
        two samples; otherwise ``1``, meaning cross-validation is not
        possible. Empty input also yields ``1``.
    """
    _, counts = np.unique(y, return_counts=True)
    if counts.size == 0:
        return 1
    n_possible = int(counts.min())
    return min(k, n_possible) if n_possible >= 2 else 1

def compute_cv_accuracy_nn(X_train, y_train, num_epochs, input_dim):
    """Estimate neural-network accuracy on the training data via stratified CV.

    The number of folds comes from ``get_cv_folds(y_train, k=5)``. When fewer
    than two folds are possible, a single network is trained on all of
    ``X_train`` and its accuracy on that same data is returned instead.

    Args:
        X_train: Training samples.
        y_train: Training labels.
        num_epochs: Epochs used for every fit.
        input_dim: Number of input features passed to ``create_nn_model``.

    Returns:
        Mean validation accuracy over the folds, or the training accuracy in
        the fallback case.
    """
    def _train_and_evaluate(X_fit, y_fit, X_eval, y_eval):
        # A fresh network per call so no weights carry over between folds.
        net = create_nn_model(input_dim)
        net.fit(X_fit, y_fit, epochs=num_epochs, batch_size=32, verbose=0)
        _, accuracy = net.evaluate(X_eval, y_eval, verbose=0)
        return accuracy

    n_folds = get_cv_folds(y_train, k=5)
    if n_folds < 2:
        return _train_and_evaluate(X_train, y_train, X_train, y_train)

    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    fold_scores = [
        _train_and_evaluate(X_train[fit_idx], y_train[fit_idx],
                            X_train[held_idx], y_train[held_idx])
        for fit_idx, held_idx in splitter.split(X_train, y_train)
    ]
    return np.mean(fold_scores)

def compute_cv_accuracy_rf(X_train, y_train):
    """Estimate random-forest accuracy on the training data via stratified CV.

    Uses up to five folds, as decided by ``get_cv_folds``. If fewer than two
    folds are possible, the forest is fitted on all of ``X_train`` and scored
    on that same data.

    Args:
        X_train: Training samples.
        y_train: Training labels.

    Returns:
        Mean validation accuracy over the folds, or the training accuracy in
        the fallback case.
    """
    n_folds = get_cv_folds(y_train, k=5)

    if n_folds >= 2:
        splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
        fold_scores = []
        for fit_idx, held_idx in splitter.split(X_train, y_train):
            forest = RandomForestClassifier(n_estimators=100, random_state=42)
            forest.fit(X_train[fit_idx], y_train[fit_idx])
            fold_scores.append(forest.score(X_train[held_idx], y_train[held_idx]))
        return np.mean(fold_scores)

    # Too few samples per class for CV: report accuracy on the training data.
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X_train, y_train)
    return forest.score(X_train, y_train)

def compute_cv_accuracy_sv(X_train, y_train):
    """Estimate support-vector-classifier accuracy via stratified CV.

    Uses up to five folds, as decided by ``get_cv_folds``. If fewer than two
    folds are possible, the classifier is fitted on all of ``X_train`` and
    scored on that same data.

    Args:
        X_train: Training samples.
        y_train: Training labels.

    Returns:
        Mean validation accuracy over the folds, or the training accuracy in
        the fallback case.
    """
    def _fit_score(X_fit, y_fit, X_eval, y_eval):
        # Same SVC configuration for the fallback and for every fold.
        svm = SVC(probability=True, random_state=42)
        svm.fit(X_fit, y_fit)
        return svm.score(X_eval, y_eval)

    n_folds = get_cv_folds(y_train, k=5)
    if n_folds < 2:
        return _fit_score(X_train, y_train, X_train, y_train)

    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    fold_scores = []
    for fit_idx, held_idx in splitter.split(X_train, y_train):
        fold_scores.append(
            _fit_score(X_train[fit_idx], y_train[fit_idx],
                       X_train[held_idx], y_train[held_idx])
        )
    return np.mean(fold_scores)

#########################################
# Settings
#########################################
# Sample counts of the synthetic datasets to generate (smallest is 100).
dataset_sizes = [100, 250, 500, 1000]  # minimum dataset size = 100
# Fractions of each dataset requested for the training set.
split_ratios = [0.05, 0.1, 0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
# Number of training epochs used for every neural-network fit.
num_epochs = 50
# Collected result records; one dict is appended per (size, ratio, model) run.
experiments = []
# Model identifiers: "NN" = Keras network, "RF" = random forest, "SV" = SVC.
model_types = ["NN", "RF", "SV"]

#########################################
# NN Model Creation Function
#########################################
def create_nn_model(input_dim):
    """Build and compile a small feed-forward binary classifier.

    The network has two ReLU hidden layers (32 and 16 units) followed by a
    single sigmoid output unit. It is compiled with the Adam optimizer,
    binary cross-entropy loss and an accuracy metric.

    Args:
        input_dim: Number of input features.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    layer_stack = [keras.Input(shape=(input_dim,))]
    layer_stack += [keras.layers.Dense(units, activation='relu') for units in (32, 16)]
    layer_stack.append(keras.layers.Dense(1, activation='sigmoid'))

    net = keras.Sequential(layer_stack)
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

#########################################
# Experiment Loop
#########################################
# For every dataset size, training ratio and model type: split the data,
# compute a cross-validated accuracy on the training part, fit a final model
# on the whole training part, and record train/test accuracies in
# ``experiments``.
for X_size in tqdm(dataset_sizes, desc="Dataset sizes"):
    # One synthetic 20-feature classification dataset per size (fixed seed).
    X, y = make_classification(n_samples=X_size, n_features=20,
                               n_informative=15, n_redundant=5, random_state=42)
    for ratio in tqdm(split_ratios, desc="Split ratios", leave=False):
        try:
            # Full dataset split: require training set min = 5 and test set min = 1.
            # The split seed is derived from the ratio (its integer percentage).
            X_train_full, X_test, y_train_full, y_test = custom_split(
                X, y, train_ratio=ratio, train_min=5, test_min=1, random_state=int(ratio * 100))
        except ValueError as e:
            # An infeasible split is reported and skipped rather than aborting the run.
            tqdm.write(f"Skipping ratio {ratio} for dataset size {X_size}: {e}")
            continue
        for m_type in tqdm(model_types, desc="Model types", leave=False):
            if m_type == "NN":
                cv_acc = compute_cv_accuracy_nn(X_train_full, y_train_full, num_epochs, X.shape[1])
                # Final model: a fresh network trained on the full training part.
                model = create_nn_model(X.shape[1])
                history = model.fit(X_train_full, y_train_full, epochs=num_epochs, batch_size=32, verbose=0)
                _, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
                # Accuracy of the last epoch vs. the mean over all epochs.
                final_train_acc = history.history['accuracy'][-1]
                avg_train_acc = np.mean(history.history['accuracy'])
            elif m_type == "RF":
                cv_acc = compute_cv_accuracy_rf(X_train_full, y_train_full)
                model = RandomForestClassifier(n_estimators=100, random_state=42)
                model.fit(X_train_full, y_train_full)
                test_accuracy = model.score(X_test, y_test)
                final_train_acc = model.score(X_train_full, y_train_full)
                # No per-epoch history here, so the "average" is the final value.
                avg_train_acc = final_train_acc  # RF is constant.
            elif m_type == "SV":
                cv_acc = compute_cv_accuracy_sv(X_train_full, y_train_full)
                model = SVC(probability=True, random_state=42)
                model.fit(X_train_full, y_train_full)
                test_accuracy = model.score(X_test, y_test)
                final_train_acc = model.score(X_train_full, y_train_full)
                # No per-epoch history here, so the "average" is the final value.
                avg_train_acc = final_train_acc  # SV is constant.

            # Human-readable label, e.g. "DS 100, Split 20/80, NN".
            exp_label = f"DS {X_size}, Split {int(ratio*100)}/{100-int(ratio*100)}, {m_type}"
            experiments.append({
                'dataset_size': X_size,
                'ratio': ratio,
                'model_type': m_type,
                'cv_acc': cv_acc,
                'final_train_acc': final_train_acc,
                'avg_train_acc': avg_train_acc,
                'test_acc': test_accuracy,
                'label': exp_label
            })

#########################################
# Plotting: Five Figures
#########################################
colors = {'NN': 'blue', 'RF': 'green', 'SV': 'orange'}


def _plot_test_acc_vs(metric_key, axis_label):
    """Draw one figure of test accuracy against a recorded experiment metric.

    For each model type, the matching records in ``experiments`` are plotted
    as a scatter of ``metric_key`` (x) against ``'test_acc'`` (y). When the x
    values are not all identical, a least-squares line is fitted with
    ``linregress`` and drawn, and its p-value and R² are annotated.

    Keeping ``x``/``y`` local to this helper also avoids overwriting
    module-level names with plotting data.

    Args:
        metric_key: Key of the experiment record used for the x axis.
        axis_label: Text for the x-axis label; also used in the figure title.
    """
    plt.figure(figsize=(8,6))
    for row, m in enumerate(model_types):
        filtered = [exp for exp in experiments if exp['model_type'] == m]
        x = np.array([exp[metric_key] for exp in filtered])
        y = np.array([exp['test_acc'] for exp in filtered])
        plt.scatter(x, y, color=colors[m], alpha=0.7, label=f"{m} Data")
        # A regression line is only defined when x takes at least two values.
        if len(np.unique(x)) > 1:
            slope, intercept, r_value, p_value, std_err = linregress(x, y)
            R2 = r_value**2
            x_vals = np.linspace(min(x), max(x), 100)
            y_vals = slope * x_vals + intercept
            plt.plot(x_vals, y_vals, color=colors[m], linestyle='-', label=f"{m} Fit")
            # Stack the per-model annotations downwards from the top-left corner.
            plt.annotate(f"{m}: p = {p_value:.3g}, R² = {R2:.3g}",
                         xy=(0.05, 0.9 - 0.1*row), xycoords='axes fraction',
                         bbox=dict(boxstyle="round", fc="w"))
    plt.xlabel(axis_label)
    plt.ylabel("Test Accuracy")
    plt.title(f"Test Accuracy vs {axis_label}")
    plt.legend()
    plt.grid(True)
    plt.show()


# Figure A: Test Accuracy vs Average Training Accuracy
_plot_test_acc_vs('avg_train_acc', "Average Training Accuracy")

# Figure B: Test Accuracy vs Training Proportion
_plot_test_acc_vs('ratio', "Training Proportion")

# Figure C: Test Accuracy vs Mean CV Accuracy
_plot_test_acc_vs('cv_acc', "Mean CV Accuracy")

# Figure D: Test Accuracy vs Dataset Size
_plot_test_acc_vs('dataset_size', "Dataset Size")

# Figure E: Test Accuracy vs Final Training Accuracy
_plot_test_acc_vs('final_train_acc', "Final Training Accuracy")

# Summary Tables: one listing in run order, one ranked by test accuracy.
def _summary_row(exp):
    """Format the metric columns shared by both summary tables for one record."""
    ratio_percent = f"{int(exp['ratio']*100)}/{100-int(exp['ratio']*100)}"
    return f"{exp['dataset_size']:12d} | {ratio_percent:15s} | {exp['model_type']:5s} | {exp['cv_acc']:.3f}   | {exp['avg_train_acc']:.3f}   | {exp['final_train_acc']:.3f}   | {exp['test_acc']:.3f}"


print("\nSummary of Final Metrics (Unsorted):")
print("Dataset Size | Train/Test Ratio | Model | Mean CV Accuracy | Avg Train Accuracy | Final Train Accuracy | Test Accuracy")
for exp in experiments:
    print(_summary_row(exp))

# Highest test accuracy first; ranks are numbered from 1.
sorted_experiments = sorted(experiments, key=lambda exp: exp['test_acc'], reverse=True)
print("\nRanked by Test Accuracy (Highest to Lowest):")
print("Rank | Dataset Size | Train/Test Ratio | Model | Mean CV Accuracy | Avg Train Accuracy | Final Train Accuracy | Test Accuracy")
for idx, exp in enumerate(sorted_experiments, start=1):
    print(f"{idx:4d} | {_summary_row(exp)}")
