### IMPORTS

In [1]:
from Inference import BBVI 

In [2]:
from Inference.Variational import MeanFieldVariationalDistribution

In [3]:
import torch
from torch import nn
from torch import functional as F
import matplotlib.pyplot as plt
import numpy as np
from livelossplot import PlotLosses

In [4]:
def _log_norm(x, mu, std):
        return -0.5 * torch.log(2*np.pi*std**2) -(0.5 * (1/(std**2))* (x-mu)**2)

In [5]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda', index=0)

In [6]:
# Load the data set and move both halves onto the working device.
# map_location makes a file that was saved from a GPU session loadable on a
# CPU-only machine; without it torch.load tries to restore each tensor onto
# the device it was saved from.
data = torch.load('data/foong_data.pt', map_location=device)
x_data = data[0].to(device)
y_data = data[1].to(device).unsqueeze(-1)  # append a trailing singleton axis

In [7]:
import GPUtil

In [8]:
# Scalar-in, scalar-out network (nb_layers=4, layer_width=25) created on `device`.
Net = BBVI.VariationalNetwork(
    input_size=1,
    output_size=1,
    layer_width=25,
    nb_layers=4,
    device=device,
)

In [9]:
# Exploratory check: display the GPU class exposed by GPUtil.
GPUtil.GPU

GPUtil.GPUtil.GPU

In [10]:
# IDs of the GPUs that GPUtil currently reports as available.
deviceIDs = GPUtil.getAvailable()

In [11]:
# Show the GPU IDs returned by GPUtil.getAvailable().
deviceIDs

[0]

In [27]:
# Print a per-GPU table of load and memory utilisation.
GPUtil.showUtilization()

| ID | GPU | MEM |
------------------
|  0 |  2% | 24% |


In [29]:
# (major, minor) CUDA compute capability of the working device.
# NOTE(review): the attribute name was missing from this call; it is
# reconstructed from the recorded tuple output — confirm.
torch.cuda.get_device_capability(device=device)

(6, 1)

In [30]:
# Bytes of GPU memory currently occupied by tensors on `device`.
torch.cuda.memory_allocated(device=device)

57344

In [40]:
# Bytes of GPU memory currently occupied by tensors on `device`.
torch.cuda.memory_allocated(device=device)

79872

In [31]:
# Both distributions live in a space of half the network's reported parameter count.
param_count = int(Net.count_parameters() / 2)

# Variational family q (sigma=0.0001) and prior (sigma=10.0) of the same dimension.
q = MeanFieldVariationalDistribution(param_count, sigma=0.0001, device=device)
prior = MeanFieldVariationalDistribution(param_count, sigma=10.0, device=device)

# The prior is fixed, and q's rho is excluded from gradient updates for now.
prior.mu.requires_grad_(False)
prior.rho.requires_grad_(False)
q.rho.requires_grad_(False)

# Initialise q's mean from the first row of a single prior draw.
q.mu = nn.Parameter(
    prior.sample()[0, :].detach().clone().to(device),
    requires_grad=True,
)

In [33]:
def logprior(x):
    """Log-density of the prior at ``x``, as computed by ``prior.log_prob``."""
    log_p = prior.log_prob(x)
    return log_p

In [34]:
def loglikelihood(x, noise_std=0.1):
    """Gaussian log-likelihood of the training data under parameters ``x``.

    Args:
        x: parameter sample(s) handed to ``Net.set_parameters``.
        noise_std: standard deviation of the Gaussian observation noise.
            Defaults to 0.1, the value that used to be hard-coded here.

    Returns:
        The point-wise log-densities summed over axes 1 and 2, with a
        trailing singleton axis appended.
    """
    Net.set_parameters(x)
    y_pred = Net(x_data)
    # assumes y_pred is (n_samples, n_points, 1) — TODO confirm against Net
    sigma = torch.tensor([noise_std], device=device)
    pointwise = _log_norm(y_pred, y_data, sigma)
    return torch.sum(pointwise, dim=[1, 2]).unsqueeze(-1)

In [36]:
def logposterior(x):
    """Unnormalised log-posterior at ``x``: log-prior plus log-likelihood."""
    log_p = logprior(x)
    log_lik = loglikelihood(x)
    return log_p + log_lik

In [37]:
# Number of draws from q used for each Monte Carlo estimate of the loss.
# The training loop multiplies this by 10 when it restarts the optimizer.
n_samples_ELBO = 1000

In [39]:
# Adam over q's parameters. The scheduler halves the learning rate once the
# monitored loss has failed to improve for more than `patience` scheduler steps.
optimizer = torch.optim.Adam(q.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    factor=0.5,
    patience=200,
)

In [None]:
%matplotlib inline
liveloss = PlotLosses()
for t in range(10000):
    logs = {}
    optimizer.zero_grad()

    # Monte Carlo estimate of E_q[log q(z) - logposterior(z)], averaged over
    # n_samples_ELBO draws from q. Minimising it maximises the ELBO.
    z = q.sample(n_samples_ELBO)
    LQ = q.log_prob(z)
    LP = logposterior(z)
    L = (LQ - LP).sum()/n_samples_ELBO

    L.backward()

    # Learning rate used for this update, read before the scheduler can lower it.
    learning_rate = optimizer.param_groups[0]['lr']
    # Single device-to-host copy shared by the scheduler and the logger.
    loss_value = L.item()

    # Apply the parameter update first, then let the scheduler react to the loss.
    optimizer.step()
    scheduler.step(loss_value)

    # The key stays 'ELBO' so the plot label is unchanged, although the value
    # logged is the minimised loss (the negative of the ELBO).
    logs['ELBO'] = loss_value
    logs['learning rate'] = learning_rate
    liveloss.update(logs)

    if t % 10 == 0:
        liveloss.draw()

    # Once the learning rate has decayed below 1e-4: stop if the sample count
    # has already been enlarged, otherwise restart the optimizer and
    # scheduler with ten times more samples per estimate.
    if learning_rate < 0.0001:
        if n_samples_ELBO > 1000:
            break
        else:
            optimizer = torch.optim.Adam(q.parameters(), lr=0.001)
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=200, factor=0.8)
            n_samples_ELBO = n_samples_ELBO*10
        

In [None]:
# Flip which of q's parameters receive gradients: rho becomes trainable, mu is frozen.
q.rho.requires_grad = True
q.mu.requires_grad = False

In [None]:
# Evenly spaced test inputs on [-2, 2] as a column vector.
# `steps` is passed explicitly: current PyTorch requires it, and 100 is the
# value the old implicit default used.
x_test = torch.linspace(-2.0, 2.0, steps=100).unsqueeze(1).to(device)

In [None]:
# Draw 1000 parameter samples and overlay the resulting network outputs with a
# low alpha so the individual curves blend into a smooth-looking band.
# NOTE(review): the samples come from `prior`, not from the fitted `q` —
# confirm this is the distribution meant to be visualised.
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
ax.scatter(x_data.cpu(), y_data.cpu())

# The test inputs are the same for every draw, so convert them to NumPy once.
x_plot = x_test.detach().cpu().numpy()
for _ in range(1000):
    z = prior.sample()
    Net.set_parameters(z)

    # Call the network object itself, the same way loglikelihood does.
    y_test = Net(x_test)
    ax.plot(x_plot, y_test.squeeze(0).detach().cpu().numpy(), alpha=0.05, linewidth=1, color='lightblue')