# Section 5.2 - Reproduction using SGLD

# Bayesian Logistic Regression with SGLD
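This notebook reproduces the Bayesian logistic regression experiment from Section 5.2 of Welling & Teh (2011), "Bayesian Learning via Stochastic Gradient Langevin Dynamics", using stochastic gradient Langevin dynamics (SGLD) on the a9a dataset.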

In [None]:
#1. Dataset download & preparation
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
import urllib.request
import os
import scipy.special

# Download the a9a dataset if it is not already present.
url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a"
filename = "a9a"
if not os.path.exists(filename):
    print("Downloading dataset...")
    # Fetch into a temporary name and rename only on success, so an
    # interrupted download never leaves a truncated "a9a" file that later
    # runs would accept because it passes the existence check above.
    partial_filename = filename + ".part"
    urllib.request.urlretrieve(url, partial_filename)
    os.replace(partial_filename, filename)

# Load the dataset (sparse matrix + labels) and densify the features.
X, y = load_svmlight_file(filename)
X = X.toarray()
y = (y > 0).astype(int) * 2 - 1  # Convert to {-1, +1}

# Hold out 20% for testing. The scaler is fitted on the training split only,
# so no test-set statistics leak into the features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

N, D = X_train.shape  # number of training examples, number of features
def sigmoid(z):
    """Return the logistic function of ``z``, applied elementwise.

    Thin wrapper around ``scipy.special.expit``; accepts scalars or arrays.
    """
    logistic = scipy.special.expit
    return logistic(z)

In [None]:
#2. Define the log joint probability function
def log_joint(beta, X_batch, y_batch):
    """Unnormalised log joint of the weights and the given labelled examples.

    Args:
        beta: weight vector.
        X_batch: feature matrix, one example per row.
        y_batch: labels multiplied elementwise against ``X_batch @ beta``
            (the notebook encodes them as -1 / +1).

    Returns:
        ``-sum(|beta|)`` (Laplace / L1 log-prior without its normalising
        constant) plus the logistic log-likelihood
        ``-sum(log(1 + exp(-y * (X @ beta))))``.
    """
    margins = y_batch * (X_batch @ beta)
    # logaddexp(0, -m) evaluates log(1 + exp(-m)) without overflowing.
    neg_log_lik = np.logaddexp(0, -margins).sum()
    l1_penalty = np.abs(beta).sum()
    return -l1_penalty + -neg_log_lik

In [None]:
#3. SGLD training on a9a dataset
# Seed the global NumPy RNG so the minibatch draws and the injected Langevin
# noise are reproducible under Restart & Run All.
np.random.seed(42)

# SGLD parameters
beta = np.zeros(D)
steps = 10000
batch_size = 10
samples = []

for t in range(steps):
    idx = np.random.choice(N, batch_size, replace=False)
    X_batch = X_train[idx]
    y_batch = y_train[idx]

    # Per-example gradient of log sigmoid(y_i * x_i . beta):
    # (1 - sigmoid(y_i * x_i . beta)) * y_i * x_i, one row per example.
    pred = sigmoid(y_batch * X_batch.dot(beta))
    grad_lik = ((1 - pred) * y_batch)[:, None] * X_batch
    grad_prior = -np.sign(beta)  # (sub)gradient of the Laplace log-prior -sum|beta|
    # Unbiased estimate of the full-data likelihood gradient: rescale the
    # minibatch SUM by N / batch_size. Using the minibatch mean here would
    # divide by batch_size twice and shrink the likelihood term by that factor
    # relative to the prior and the injected noise.
    grad = grad_prior + N / batch_size * grad_lik.sum(axis=0)

    # Polynomially decaying step size; the injected Gaussian noise has
    # variance eps. NOTE(review): the schedule is unchanged from before the
    # gradient-scaling fix and may need re-tuning.
    eps = 0.01 * (1 + t) ** -0.55
    beta += 0.5 * eps * grad + np.sqrt(eps) * np.random.randn(D)
    samples.append(beta.copy())

samples = np.array(samples[-1000:]) #keep last 1000
beta_mean = samples.mean(axis=0)

In [None]:
#4. Test accuracy evaluation
# Classify each test point by the sign of its linear score under the
# posterior-mean weights, then compare against the -1/+1 labels.
test_scores = X_test @ beta_mean
pred_test = np.sign(test_scores)
acc = accuracy_score(y_test, pred_test)
print(f"Test Accuracy: {acc:.4f}")

In [None]:
#5. Posterior mean & std summaries
# Per-coefficient moments over the retained SGLD samples (axis 0 indexes samples).
posterior_mean = np.mean(samples, axis=0)
posterior_std = np.std(samples, axis=0)
first_five = slice(None, 5)
print("Posterior mean (first 5):", posterior_mean[first_five])
print("Posterior std (first 5):", posterior_std[first_five])

In [None]:
#6. Trace plots for 5 weights
fig, ax = plt.subplots(figsize=(12, 6))
# Column i of `samples` is the trace of coefficient i across retained samples.
for i in range(5):
    ax.plot(samples[:, i], label=f"$\\beta_{i}$")
ax.set_title("Trace Plot of First 5 Coefficients")
ax.set_xlabel("Iteration")
ax.set_ylabel("Coefficient Value")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
#7. Accuracy curve across iterations
# Score every retained sample on the test set individually (one accuracy per sample).
accs = [
    accuracy_score(y_test, np.sign(X_test.dot(beta_i)))
    for beta_i in samples
]

fig, ax = plt.subplots()
ax.plot(accs)
ax.set_title("Test Accuracy over SGLD Iterations")
ax.set_xlabel("Iteration")
ax.set_ylabel("Accuracy")
ax.grid(True)
plt.show()

In [None]:
#8. Posterior histograms of most uncertain β
# Indices of the five coefficients with the largest posterior standard deviation.
top5 = np.argsort(-posterior_std)[:5]
fig = plt.figure(figsize=(12, 6))
for position, idx in enumerate(top5, start=1):
    ax = fig.add_subplot(1, 5, position)
    sns.histplot(samples[:, idx], bins=30, kde=True, ax=ax)
    ax.set_title(f"$\\beta_{{{idx}}}$")
fig.tight_layout()
# Added after tight_layout and nudged above the axes so it does not overlap the panel titles.
fig.suptitle("Posterior Distributions of Most Uncertain Coefficients", y=1.02)
plt.show()

In [None]:
#9. AUC and Confusion Matrix for final predictions
# Predicted probability of the +1 class under the posterior-mean weights.
probs = sigmoid(X_test.dot(beta_mean))
# Threshold explicitly instead of np.sign(probs - 0.5): sign() yields 0 for a
# probability of exactly 0.5, which is not a valid label and would add a
# spurious third class to the confusion matrix.
preds = np.where(probs >= 0.5, 1, -1)
# roc_auc_score is given 0/1 targets, so map the -1/+1 labels accordingly.
auc = roc_auc_score((y_test + 1) // 2, probs)
# Pin the label order so the matrix is always 2x2 with rows/columns (-1, +1).
cm = confusion_matrix(y_test, preds, labels=[-1, 1])

print(f"AUC: {auc:.4f}")
print("Confusion Matrix:")
print(cm)

In [None]:
#10. Perform multiple runs (e.g., 10), track log joint probability and test accuracy
num_runs = 10
steps = 1000
batch_size = 10

# One row per run, one column per SGLD step.
acc_runs = np.zeros((num_runs, steps))
logp_runs = np.zeros((num_runs, steps))

for run in range(num_runs):
    np.random.seed(run)  # distinct, reproducible seed for each chain
    beta = np.zeros(D)
    for t in range(steps):
        idx = np.random.choice(N, batch_size, replace=False)
        X_batch = X_train[idx]
        y_batch = y_train[idx]
        # Per-example gradient of log sigmoid(y_i * x_i . beta).
        pred = sigmoid(y_batch * X_batch.dot(beta))
        grad_lik = ((1 - pred) * y_batch)[:, None] * X_batch
        grad_prior = -np.sign(beta)  # (sub)gradient of the Laplace log-prior
        # Rescale the minibatch SUM by N / batch_size for an unbiased estimate
        # of the full-data gradient; the mean would divide by batch_size twice.
        grad = grad_prior + N / batch_size * grad_lik.sum(axis=0)
        eps = 0.01 * (1 + t) ** -0.55
        beta += 0.5 * eps * grad + np.sqrt(eps) * np.random.randn(D)
        # Diagnostics are evaluated after the update, on the full training set
        # (log joint) and the held-out test set (accuracy).
        logp_runs[run, t] = log_joint(beta, X_train, y_train)
        acc_runs[run, t] = accuracy_score(y_test, np.sign(X_test @ beta))

In [None]:
#11. Plot the results
# Take the x-axis length from the recorded curves rather than the global
# `steps`, which more than one cell rebinds to a different value; this keeps
# the plot valid regardless of which training cell ran last.
x = np.arange(logp_runs.shape[1])
fig, (ax_logp, ax_acc) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: mean log joint across runs with a +/- one-standard-deviation band.
mean_logp = logp_runs.mean(axis=0)
std_logp = logp_runs.std(axis=0)
ax_logp.plot(x, mean_logp, label='Mean log joint')
ax_logp.fill_between(x, mean_logp - std_logp, mean_logp + std_logp, alpha=0.2)
ax_logp.set_title("Log Joint Probability")
ax_logp.set_xlabel("Iteration")
ax_logp.set_ylabel("Log Joint")
ax_logp.legend()  # the line label was set but never displayed

# Right panel: mean test accuracy across runs with the same kind of band.
mean_acc = acc_runs.mean(axis=0)
std_acc = acc_runs.std(axis=0)
ax_acc.plot(x, mean_acc, label='Mean accuracy')
ax_acc.fill_between(x, mean_acc - std_acc, mean_acc + std_acc, alpha=0.2)
ax_acc.set_title("Accuracy on Test Set")
ax_acc.set_xlabel("Iteration")
ax_acc.set_ylabel("Accuracy")
ax_acc.legend()

fig.tight_layout()
plt.show()