# On the properties of variational approximations of Gibbs posteriors

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from scipy.stats import norm
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo 
import torch.distributions as dist
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from tqdm import tqdm
from time import sleep

import warnings
warnings.filterwarnings("ignore")

torch.pi = torch.acos(torch.zeros(1))*2 # 3.1415927410125732

torch.manual_seed(0)
np.random.seed(0)

## Loss 0-1 

In [2]:
class Loss01(nn.Module):
    def __init__(self, p, v, lambda_val, B, n, m = None, si=None):
        super(Loss01, self).__init__()
        self.v = v
        self.lambda_val = lambda_val
        self.B = B
        self.m = nn.Parameter(m) if m is not None else nn.Parameter(torch.rand(p))
        self.si = nn.Parameter(si) if si is not None else nn.Parameter(torch.rand(1))
        torch.manual_seed(0)
        self.normal_dist = dist.Normal(torch.zeros(n), torch.ones(n))

    def forward(self, X, Y):
        n, p = X.shape
        # Get random indices
        torch.manual_seed(0)
        indices = torch.randint(0, n, (self.B,)) if n != self.B else torch.arange(self.B)
        X_selected = X[indices]
        Y_selected = Y[indices]
        
        G = Y_selected * X_selected
        nG = torch.linalg.norm(G, dim=1)
        mG = torch.sum(G * self.m, dim=1)
        denom = torch.sqrt(torch.linalg.norm(X_selected, dim=1)) * self.si 
        term = mG / denom
        
        loss = self.normal_dist.cdf(-term)
        reg_term = 0.5 * torch.dot(self.m.flatten(), self.m.flatten()) / self.v**2
        
        loss = (self.lambda_val / self.B) * loss + reg_term - 0.5 * p * (torch.log(self.si ** 2) - self.si ** 2 / self.v**2)
        return loss.mean()
    
    def bound(self, X, Y, c_x, eps=0.01):
        n, p = X.shape
        eps = torch.tensor(eps)
        f = self.lambda_val/(2*n)
        KL = 0.5*(p*self.si**2/self.v**2 - p + torch.dot(self.m.flatten(), self.m.flatten()) / self.v**2 + p * torch.log(self.v**2 / self.si**2))
        return (f + KL + torch.log(1/eps))/self.lambda_val
    

## Hinge Loss

In [3]:
class Hinge(nn.Module):
    def __init__(self, p, v, lambda_, B, n, m = None, si = None):
        super(Hinge, self).__init__()
        self.v = v
        self.lambda_ = lambda_
        self.B = B
        self.m = nn.Parameter(m) if m is not None else nn.Parameter(torch.rand(p))
        self.si = nn.Parameter(si) if si is not None else nn.Parameter(torch.rand(1))
        torch.manual_seed(0)
        self.normal_dist = dist.Normal(torch.zeros(n), torch.ones(n))
        
    def forward(self, X, Y):
        n, p = X.shape
        # Get random indices
        indices = torch.randint(0, n, (self.B,)) if n != self.B else torch.arange(self.B)
        X_selected = X[indices]
        Y_selected = Y[indices]
        torch.manual_seed(0)
        
        G = Y_selected * X_selected
        G = G.view(self.B, p)
        nG = torch.linalg.norm(G, dim=1)  # |Y_iX_i|
        mG = torch.sum(G * self.m, dim=1) # <Y_iX_i, m>
        ratio = (1 - mG) / (self.si * nG) # (1 - <Y_iX_i, m>) / (si * |Y_iX_i|)
        term1 = (1 - mG) *  self.normal_dist.cdf(ratio) # (1 - <Y_iX_i, m>) * Phi(ratio)
        term2 = self.si * nG * torch.exp(-0.5 * ratio ** 2) / torch.sqrt(2 * torch.tensor(np.pi)) # si * |Y_iX_i| * phi(ratio)
        sum_val = torch.sum(term1 + term2)
        
        loss = (self.lambda_ / self.B) * sum_val + 0.5 * torch.dot(self.m.view(-1), self.m.view(-1)) / self.v**2 - 0.5 * p * (torch.log(self.si ** 2) - self.si ** 2 / self.v**2)
        return loss
    
    def bound(self, X, Y, c_x = None, eps=0.01):
        n, p = X.shape
        c_x = torch.tensor(torch.max(X) + 0.1) if c_x is None else torch.tensor(c_x)
        eps = torch.tensor(eps)
        if (1-self.v**2 * self.lambda_**2 * c_x**2/2*n).item() > 0:   
            f = self.lambda_**2/(4*n) - 0.5*torch.log(1-self.v**2 * self.lambda_**2 * c_x**2/2*n)
        else:
            return torch.tensor(-1)
        KL = 0.5*(p*self.si**2/self.v**2 - p + torch.dot(self.m.flatten(), self.m.flatten()) / self.v**2 + p * torch.log(self.v**2 / self.si**2))
        return (f + KL + torch.log(1/eps))/self.lambda_
    

## Exponential Loss

In [4]:
class Exponential(nn.Module):
    def __init__(self, p, v, lambda_, B, n, m= None, si = None):
        super(Exponential, self).__init__()
        self.v = v
        self.lambda_ = lambda_
        self.B = B
        self.m = nn.Parameter(m) if m is not None else nn.Parameter(torch.rand(p))
        self.si = nn.Parameter(si) if si is not None else nn.Parameter(torch.rand(1))
        
    def forward(self, X, Y):
        n, p = X.shape
        # Get random indices
        indices = torch.randint(0, n, (self.B,)) if n != self.B else torch.arange(self.B)
        X_selected = X[indices]
        Y_selected = Y[indices]
        
        G = Y_selected * X_selected
        G = G.view(self.B, p)
        nG = torch.linalg.norm(G, dim=1) #|Yi Xi|
        mG = torch.sum(G * self.m, dim=1) #|Yi Xi| * m	
        term = torch.exp(-mG + 0.5 * (self.si ** 2) * (nG ))
        sum_val = torch.sum(term)
        
        loss = (self.lambda_ / self.B) * sum_val + 0.5 * torch.dot(self.m.view(-1), self.m.view(-1)) / self.v**2 - 0.5 * p * (torch.log(self.si ** 2) - self.si ** 2 / self.v**2)
        return loss


## Score functions

In [5]:
def estimator(X, theta):
    return np.sign(np.dot(X, theta.T))

# misclassification rate
def misclassification_rate(X, Y, m, si):
    n = X.shape[0]
    X = X.detach().numpy()
    Y = Y.detach().numpy()
    
    # on va générer des theta, calculer les estimateurs associés, puis la moyenne des estimateurs:
    np.random.seed(0)
    theta = np.random.normal(m, si, (1000, len(m)))
    estimators = np.array([estimator(X, theta_) for theta_ in theta])
    mean_estimates = np.sign(np.mean(estimators, axis=0)).reshape((-1,1))
    error = np.mean(mean_estimates != Y)
    return error

# hinge loss
def hinge_misclassification_rate(X, Y, m, si):
    n = X.shape[0]
    X = X.detach().numpy()
    Y = Y.detach().numpy()
    np.random.seed(0)
    theta = np.random.normal(m, si, (1000, len(m)))
    estimators = np.array([estimator(X, theta_) for theta_ in theta])
    mean_estimates = np.sign(np.mean(estimators, axis=0)).reshape((-1,1))
    error = np.mean(np.maximum(0, 1 - mean_estimates * Y))
    return error

# exponential loss
def exponential_loss(Y_hat, Y):
    return np.exp(-Y_hat * Y)

def exp_loss_misclassification_rate(X, Y, m, si):
    n = X.shape[0]
    X = X.detach().numpy()
    Y = Y.detach().numpy()
    np.random.seed(0)
    theta = np.random.normal(m, si, (1000, len(m)))
    estimators = np.array([estimator(X, theta_) for theta_ in theta])
    mean_estimates = np.sign(np.mean(estimators, axis=0)).reshape((-1,1))
    error = np.mean(exponential_loss(mean_estimates,Y))
    return error

## Comparaison with logisitc regression and SVM 

In [6]:
# Functions to compare VB with Logistic Regressor & SVM :
def compare_logisitic(Xtrain, Ytrain, Xtest, Ytest):
    # change the tensors into np: 
    X_train, Y_train, X_test, Y_test = map(torch.Tensor.numpy, (Xtrain, Ytrain, Xtest, Ytest))
    print('Logistic regression : ')
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, Y_train)
    y_pred_clf = clf.predict(X_test)
    error = np.mean(y_pred_clf != Y_test)
    print('error logistic regression : ', error)
    print('error hinge loss : ', np.maximum(0, 1 - y_pred_clf * Y_test).mean())
    print('exponential loss : ', np.exp(-y_pred_clf * Y_test).mean())

def compare_SVM(Xtrain, Ytrain, Xtest, Ytest):
    # change the tensors into np: 
    X_train, Y_train, X_test, Y_test = map(torch.Tensor.numpy, (Xtrain, Ytrain, Xtest, Ytest))
    print('\nSVM : ')
    svm = SVC(kernel='sigmoid', random_state=0)
    svm.fit(X_train, Y_train)
    y_pred_svm = svm.predict(X_test)
    error = np.mean(y_pred_svm != Y_test)
    print('error SVM : ', error)
    print('error hinge loss : ', np.maximum(0, 1 - y_pred_svm * Y_test).mean())
    print('exponential loss : ', np.exp(-y_pred_svm * Y_test).mean())


## Optimization Process

In [7]:
## Optimization process for any loss function type (loss_type in {'01', 'hinge', 'exp'})
model_classes = {
    'hinge': Hinge,
    'exp': Exponential,
    '01': Loss01
}

def training(Xtrain, Ytrain, Xtest, Ytest, loss_type = 'hinge', n_epochs = 1000, lr=0.01, m = None, si = None, divide_by_c_x = True, verbose = True):
    n, p = Xtrain.shape
    c_x = torch.max(Xtrain) + 0.01
    v = 1/np.sqrt(p)
    lambda_ = np.sqrt(n*p)
    if divide_by_c_x:
        lambda_ /= c_x
    B = n

    model_class = model_classes.get(loss_type)
    if model_class is not None:
        model = model_class(p, v, lambda_, B, n, m, si)
    else:
        raise ValueError("Unknown loss_type: {}".format(loss_type))
    
    torch.manual_seed(0)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    print(f"\n Training with model {loss_type} ...")
    for epoch in tqdm(range(n_epochs)):
        optimizer.zero_grad()
        loss = model(Xtrain, Ytrain)
        loss.backward()
        optimizer.step()
        if verbose and epoch % 100 == 0:
            print(f"  Epoch {epoch}, loss: {loss.item()}")

    final_m = np.copy(model.m.detach().numpy())
    final_si = np.copy(model.si.detach().numpy())

    print("\n  misclassification rate : ", misclassification_rate(Xtest, Ytest, final_m, final_si))

    if loss_type == 'hinge':
        print("  hinge misclassification rate : ", hinge_misclassification_rate(Xtest, Ytest, final_m, final_si))
    elif loss_type == 'exp':
        print("  exp misclassification rate : ", exp_loss_misclassification_rate(Xtest, Ytest, final_m, final_si))

    if loss_type != 'exp':
        bound = model.bound(Xtest, Ytest, c_x)
        print('  bound : ', bound.item() if type(bound) != int else bound)

## Tests on Data

### On Breast Cancer Dataset

In [8]:
# fetch dataset 
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
  
X = breast_cancer_wisconsin_diagnostic.data.features 
y = breast_cancer_wisconsin_diagnostic.data.targets 
y = y.replace({'M': 1, 'B': -1})
Y = torch.tensor(y.values, dtype=torch.float32)

# normalizing the data
X = X.values
m = np.mean(X, axis=0)
v = np.std(X, axis=0)
X = (X - m) / v
X = np.c_[np.ones(X.shape[0]), X]
X = torch.tensor(X, dtype=torch.float32)

N = X.shape[0]
n = int(0.8 * N)

Xtrain, Xtest = X[:n], X[n:]
Ytrain, Ytest = Y[:n], Y[n:]
p = Xtrain.shape[1]

training(Xtrain, Ytrain, Xtest, Ytest, loss_type = '01', n_epochs = 1000, lr=0.01, m = None, si = None, divide_by_c_x=False, verbose = False)
training(Xtrain, Ytrain, Xtest, Ytest, loss_type = 'hinge', n_epochs = 1000, lr=0.01, m = None, si = None, divide_by_c_x=False,  verbose = False)
training(Xtrain, Ytrain, Xtest, Ytest, loss_type = 'exp', n_epochs = 10_000, lr=0.01, m = None, si = None, divide_by_c_x=False,verbose = False)

print("\nComparaison with logistice regression and SVM : ")
compare_logisitic(Xtrain, Ytrain, Xtest, Ytest)
compare_SVM(Xtrain, Ytrain, Xtest, Ytest)


 Training with model 01 ...


100%|██████████| 1000/1000 [00:04<00:00, 212.69it/s]



  misclassification rate :  0.18421052631578946
  bound :  0.04322889819741249

 Training with model hinge ...


100%|██████████| 1000/1000 [00:04<00:00, 221.14it/s]



  misclassification rate :  0.017543859649122806
  hinge misclassification rate :  0.03508771929824561
  bound :  -1

 Training with model exp ...


100%|██████████| 10000/10000 [00:18<00:00, 533.07it/s]



  misclassification rate :  0.08771929824561403
  exp misclassification rate :  0.5740550891791268

Comparaison with logistice regression and SVM : 
Logistic regression : 
error logistic regression :  0.3616497383810403
error hinge loss :  0.7232995
exponential loss :  1.2179021

SVM : 
error SVM :  0.35687903970452445
error hinge loss :  0.71375805
exponential loss :  1.2066889


### On Scept Heart Dataset

In [9]:
spect_heart = fetch_ucirepo(id=95) 
 
X = spect_heart.data.features 
y = spect_heart.data.targets 
y = y.replace({0: -1})
Y = torch.tensor(y.values, dtype=torch.float32)

# normalizing the data
X = X.values
m = np.mean(X, axis=0)
v = np.std(X, axis=0)
X = (X - m) / v
X = np.c_[np.ones(X.shape[0]), X]
X = torch.tensor(X, dtype=torch.float32)

N = X.shape[0]
n = int(0.8 * N)

Xtrain, Xtest = X[:n], X[n:]
Ytrain, Ytest = Y[:n], Y[n:]
p = Xtrain.shape[1]

training(Xtrain, Ytrain, Xtest, Ytest, loss_type = '01', n_epochs = 1000, lr=0.01, m = None, si = None, divide_by_c_x=False, verbose = False)
training(Xtrain, Ytrain, Xtest, Ytest, loss_type = 'hinge', n_epochs = 1000, lr=0.01, m = None, si = None, divide_by_c_x=False, verbose = False)
training(Xtrain, Ytrain, Xtest, Ytest, loss_type = 'exp', n_epochs = 10_000, lr=0.01, m = None, si = None, divide_by_c_x=False, verbose = False)

print("\nComparaison with logistice regression and SVM : ")
compare_logisitic(Xtrain, Ytrain, Xtest, Ytest)
compare_SVM(Xtrain, Ytrain, Xtest, Ytest)


 Training with model 01 ...


100%|██████████| 1000/1000 [00:03<00:00, 253.87it/s]



  misclassification rate :  0.3888888888888889
  bound :  0.0750739648938179

 Training with model hinge ...


100%|██████████| 1000/1000 [00:04<00:00, 219.44it/s]



  misclassification rate :  0.14814814814814814
  hinge misclassification rate :  0.2962962962962963
  bound :  -1

 Training with model exp ...


100%|██████████| 10000/10000 [00:18<00:00, 550.28it/s]


  misclassification rate :  0.12962962962962962
  exp misclassification rate :  0.6725612321161316

Comparaison with logistice regression and SVM : 
Logistic regression : 
error logistic regression :  0.3683127572016461
error hinge loss :  0.7366255
exponential loss :  1.2335627

SVM : 
error SVM :  0.2777777777777778
error hinge loss :  0.5555556
exponential loss :  1.0207692





### On Students Dataset

In [10]:
# fetch dataset 
predict_students_dropout_and_academic_success = fetch_ucirepo(id=697) 
 
X = predict_students_dropout_and_academic_success.data.features 
y = predict_students_dropout_and_academic_success.data.targets 
y.replace({'Graduate': 1, 'Dropout': -1, 'Enrolled':1}, inplace=True)
Y = torch.tensor(y.values, dtype=torch.float32)

# normalizing the data
X = X.values
m = np.mean(X, axis=0)
v = np.std(X, axis=0)
X = (X - m) / v
X = np.c_[np.ones(X.shape[0]), X]
X = torch.tensor(X, dtype=torch.float32)

N = X.shape[0]
n = int(0.8 * N)

Xtrain, Xtest = X[:n], X[n:]
Ytrain, Ytest = Y[:n], Y[n:]
p = Xtrain.shape[1]

training(Xtrain, Ytrain, Xtest, Ytest, loss_type = '01', n_epochs = 5000, lr=0.005, m = None, si = None, divide_by_c_x=False, verbose = False)
training(Xtrain, Ytrain, Xtest, Ytest, loss_type = 'hinge', n_epochs = 1000, lr=0.01, m = None, si = None, divide_by_c_x=False, verbose = False)
training(Xtrain, Ytrain, Xtest, Ytest, loss_type = 'exp', n_epochs = 10_000, lr=0.01, m = None, si = None, divide_by_c_x=False, verbose = False)

print("\nComparaison with logistice regression and SVM : ")
compare_logisitic(Xtrain, Ytrain, Xtest, Ytest)
compare_SVM(Xtrain, Ytrain, Xtest, Ytest)


 Training with model 01 ...


100%|██████████| 5000/5000 [00:32<00:00, 154.54it/s]



  misclassification rate :  0.3446327683615819
  bound :  0.013292035087943077

 Training with model hinge ...


100%|██████████| 1000/1000 [00:07<00:00, 132.18it/s]



  misclassification rate :  0.15254237288135594
  hinge misclassification rate :  0.3050847457627119
  bound :  -1

 Training with model exp ...


100%|██████████| 10000/10000 [00:38<00:00, 258.32it/s]



  misclassification rate :  0.327683615819209
  exp misclassification rate :  1.1380677940679451

Comparaison with logistice regression and SVM : 
Logistic regression : 
error logistic regression :  0.43170864055667274
error hinge loss :  0.86341727
exponential loss :  1.3825687

SVM : 
error SVM :  0.4415397874174088
error hinge loss :  0.8830796
exponential loss :  1.4056758
