In [18]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, balanced_accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Generate an imbalanced dataset
X, y = make_classification(n_samples=100, weights=[0.9, 0.1], 
                           n_features=2, n_classes=2, n_informative=2, n_redundant=0)

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Define predictive distribution (simple Gaussian predictive inference)
def predictive_resampling(X_minority, n_new_samples):
    mean = X_minority.mean(axis=0)
    cov = np.cov(X_minority, rowvar=False)
    return np.random.multivariate_normal(mean, cov, n_new_samples)

# Original data
X_minority = X_train[y_train == 1]
n_new_samples = len(X_train[y_train == 0]) - len(X_minority)

# Generate synthetic minority samples
X_synthetic = predictive_resampling(X_minority, n_new_samples)
X_train_augmented = np.vstack([X_train, X_synthetic])
y_train_augmented = np.hstack([y_train, np.ones(n_new_samples)])

# Classifier without oversampling
clf = LogisticRegression().fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Without Predictive Oversampling:\n", classification_report(y_test, y_pred))
print("Balanced Accuracy (without oversampling):", balanced_accuracy_score(y_test, y_pred))
print("F1 Score (without oversampling):", f1_score(y_test, y_pred))

# Classifier with predictive oversampling
clf_aug = LogisticRegression().fit(X_train_augmented, y_train_augmented)
y_pred_aug = clf_aug.predict(X_test)
print("With Predictive Oversampling:\n", classification_report(y_test, y_pred_aug))
print("Balanced Accuracy (with oversampling):", balanced_accuracy_score(y_test, y_pred_aug))
print("F1 Score (with oversampling):", f1_score(y_test, y_pred_aug))


Without Predictive Oversampling:
               precision    recall  f1-score   support

           0       0.87      1.00      0.93        26
           1       0.00      0.00      0.00         4

    accuracy                           0.87        30
   macro avg       0.43      0.50      0.46        30
weighted avg       0.75      0.87      0.80        30

Balanced Accuracy (without oversampling): 0.5
F1 Score (without oversampling): 0.0
With Predictive Oversampling:
               precision    recall  f1-score   support

           0       1.00      0.92      0.96        26
           1       0.67      1.00      0.80         4

    accuracy                           0.93        30
   macro avg       0.83      0.96      0.88        30
weighted avg       0.96      0.93      0.94        30

Balanced Accuracy (with oversampling): 0.9615384615384616
F1 Score (with oversampling): 0.8


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [3]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, balanced_accuracy_score, f1_score, matthews_corrcoef
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Generate an imbalanced dataset
X, y = make_classification(
    n_samples=1000, weights=[0.9, 0.1], 
    n_features=2, n_classes=2, n_informative=2, n_redundant=0, random_state=42
)

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Metrics reporting function
def report_scores(y_true, y_pred, method):
    bal_acc = balanced_accuracy_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average='macro')
    f1_per_class = f1_score(y_true, y_pred, average=None)
    mcc = matthews_corrcoef(y_true, y_pred)
    print(f"\n--- {method} ---")
    print(classification_report(y_true, y_pred, digits=3))
    print(f"Balanced Accuracy: {bal_acc:.3f}")
    print(f"F1-score (macro): {f1_macro:.3f}")
    print(f"F1-score (per class): {f1_per_class}")
    print(f"Matthews Corrcoef: {mcc:.3f}")
    return {
        "method": method,
        "balanced_accuracy": bal_acc,
        "f1_macro": f1_macro,
        "f1_per_class": f1_per_class,
        "mcc": mcc
    }

# 1. No resampling (Vanilla)
clf = LogisticRegression().fit(X_train, y_train)
y_pred = clf.predict(X_test)
scores_vanilla = report_scores(y_test, y_pred, "Vanilla (No Resampling)")

# 2. SMOTE oversampling
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
clf_sm = LogisticRegression().fit(X_train_sm, y_train_sm)
y_pred_sm = clf_sm.predict(X_test)
scores_smote = report_scores(y_test, y_pred_sm, "SMOTE Oversampling")

# 3. Bayesian Bootstrap oversampling (Dirichlet weights)
def bayesian_bootstrap_resample(X_minority, n_new_samples):
    n = X_minority.shape[0]
    # Each synthetic point is a convex combination of the observed points with Dirichlet weights
    weights = np.random.dirichlet(np.ones(n), n_new_samples)
    X_synth = weights @ X_minority  # Matrix multiply (n_new_samples x n) @ (n x d) = (n_new_samples x d)
    return X_synth

X_minority = X_train[y_train == 1]
n_new_samples = len(X_train[y_train == 0]) - len(X_minority)
X_synth_bayes = bayesian_bootstrap_resample(X_minority, n_new_samples)
X_train_bayes = np.vstack([X_train, X_synth_bayes])
y_train_bayes = np.hstack([y_train, np.ones(n_new_samples)])
clf_bayes = LogisticRegression().fit(X_train_bayes, y_train_bayes)
y_pred_bayes = clf_bayes.predict(X_test)
scores_bayes = report_scores(y_test, y_pred_bayes, "Bayesian Bootstrap Oversampling")

# 4. Random undersampling (majority class)
rus = RandomUnderSampler(random_state=42)
X_train_us, y_train_us = rus.fit_resample(X_train, y_train)
clf_us = LogisticRegression().fit(X_train_us, y_train_us)
y_pred_us = clf_us.predict(X_test)
scores_us = report_scores(y_test, y_pred_us, "Random Undersampling")

# 5. Predictive Oversampling (already done above)
# Define predictive distribution (simple Gaussian predictive inference)
def predictive_resampling(X_minority, n_new_samples):
    mean = X_minority.mean(axis=0)
    cov = np.cov(X_minority, rowvar=False)
    return np.random.multivariate_normal(mean, cov, n_new_samples)

# Original data
X_minority = X_train[y_train == 1]
n_new_samples = len(X_train[y_train == 0]) - len(X_minority)

# Generate synthetic minority samples
X_synthetic = predictive_resampling(X_minority, n_new_samples)
X_train_augmented = np.vstack([X_train, X_synthetic])
y_train_augmented = np.hstack([y_train, np.ones(n_new_samples)])
# Classifier with predictive oversampling
clf_aug = LogisticRegression().fit(X_train_augmented, y_train_augmented)
y_pred_aug = clf_aug.predict(X_test)
scores_predictive = report_scores(y_test, y_pred_aug, "Predictive Oversampling")


# Summary Table
import pandas as pd

summary = pd.DataFrame([
    scores_vanilla,
    scores_smote,
    scores_bayes,
    scores_us,
    scores_predictive
])

# Only show summary scores, not per-class F1 (for table neatness)
summary_short = summary[["method", "balanced_accuracy", "f1_macro", "mcc"]]
print("\n=== Summary Table ===")
print(summary_short.to_string(index=False))



--- Vanilla (No Resampling) ---
              precision    recall  f1-score   support

           0      0.950     0.989     0.969       268
           1      0.857     0.562     0.679        32

    accuracy                          0.943       300
   macro avg      0.903     0.776     0.824       300
weighted avg      0.940     0.943     0.938       300

Balanced Accuracy: 0.776
F1-score (macro): 0.824
F1-score (per class): [0.96892139 0.67924528]
Matthews Corrcoef: 0.667

--- SMOTE Oversampling ---
              precision    recall  f1-score   support

           0      0.983     0.881     0.929       268
           1      0.467     0.875     0.609        32

    accuracy                          0.880       300
   macro avg      0.725     0.878     0.769       300
weighted avg      0.928     0.880     0.895       300

Balanced Accuracy: 0.878
F1-score (macro): 0.769
F1-score (per class): [0.92913386 0.60869565]
Matthews Corrcoef: 0.583

--- Bayesian Bootstrap Oversampling ---
    

In [4]:
import numpy as np
from scipy.stats import multivariate_normal

def sample_multivariate_t(mu, Sigma, df, n_samples):
    """
    Draws samples from a multivariate Student-t distribution.
    
    Parameters
    ----------
    mu : np.ndarray
        Mean vector (d,)
    Sigma : np.ndarray
        Scale matrix (d, d)
    df : float
        Degrees of freedom
    n_samples : int
        Number of samples
        
    Returns
    -------
    np.ndarray
        Samples (n_samples, d)
    """
    d = len(mu)
    g = np.tile(np.random.gamma(df/2., 2./df, n_samples), (d,1)).T  # shape (n_samples, d)
    Z = np.random.multivariate_normal(np.zeros(d), Sigma, n_samples)
    return mu + Z / np.sqrt(g)


In [5]:
def bayesian_predictive_parameters(X):
    """
    Returns parameters for posterior predictive multivariate t:
        mean, scale, dof
    Uses conjugate normal-inverse-Wishart prior (weak, uninformative)
    """
    n, d = X.shape
    mu = X.mean(axis=0)
    S = np.cov(X, rowvar=False)
    # Weak prior: prior mean = sample mean, prior kappa = 1e-6, prior dof = d+2
    kappa_0 = 1e-6
    nu_0 = d + 2
    # Posterior parameters
    kappa_n = kappa_0 + n
    nu_n = nu_0 + n
    mu_n = (kappa_0 * mu + n * mu) / kappa_n
    S_n = S + (kappa_0 * n) / kappa_n * np.outer(mu - mu, mu - mu)
    # Predictive mean, scale, dof
    mean_pred = mu_n
    scale_pred = S_n * (kappa_n + 1) / (kappa_n * (nu_n - d + 1))
    df_pred = nu_n - d + 1
    return mean_pred, scale_pred, df_pred


In [6]:
# Assume X_train, y_train are already defined and minority class is 1
X_minority = X_train[y_train == 1]
n_majority = np.sum(y_train == 0)
n_minority = np.sum(y_train == 1)
n_new_samples = n_majority - n_minority

# Get posterior predictive parameters
mu_pred, Sigma_pred, df_pred = bayesian_predictive_parameters(X_minority)

# Sample synthetic minority points
X_synth = sample_multivariate_t(mu_pred, Sigma_pred, df_pred, n_new_samples)
y_synth = np.ones(n_new_samples)  # class label

# Augment training data
X_train_aug = np.vstack([X_train, X_synth])
y_train_aug = np.hstack([y_train, y_synth])


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, balanced_accuracy_score

clf = LogisticRegression().fit(X_train, y_train)
clf_bayes = LogisticRegression().fit(X_train_aug, y_train_aug)

y_pred = clf.predict(X_test)
y_pred_bayes = clf_bayes.predict(X_test)

print("Original Data:\n", classification_report(y_test, y_pred))
print("Bayesian Predictive Resampling:\n", classification_report(y_test, y_pred_bayes))


Original Data:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97       268
           1       0.86      0.56      0.68        32

    accuracy                           0.94       300
   macro avg       0.90      0.78      0.82       300
weighted avg       0.94      0.94      0.94       300

Bayesian Predictive Resampling:
               precision    recall  f1-score   support

           0       0.98      0.94      0.96       268
           1       0.65      0.88      0.75        32

    accuracy                           0.94       300
   macro avg       0.82      0.91      0.86       300
weighted avg       0.95      0.94      0.94       300

