In [1]:
import numpy as np
import math
import torch
from torch import nn
from torch import functional as F
import scipy.stats as stats
import matplotlib
import matplotlib.pyplot as plt
from livelossplot import PlotLosses
from Inference.Variational import MeanFieldVariationalDistribution
from Inference.VariationalBoosting import MeanFieldVariationalMixtureDistribution
from Tools.NNtools import *

In [2]:
# Pick the first CUDA device when one is available, otherwise the CPU ...
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# ... but this second assignment unconditionally overrides the choice above, so the
# rest of the notebook runs on the CPU. `device` ends up as the plain string 'cpu'.
device = 'cpu'

In [3]:
# Read the saved dataset; element 0 holds the inputs and element 1 the targets.
# NOTE: torch.load unpickles the file, so only point it at trusted data.
data = torch.load('Data/foong_data.pt')
x_data = data[0].to(device)
# Targets get an extra trailing singleton dimension after being moved to the device.
y_data = data[1].to(device).unsqueeze(-1)

In [4]:
# Regression network: 1 input -> 50 tanh hidden units -> 1 output.
model = nn.Sequential(
    nn.Linear(1, 50),
    nn.Tanh(),
    nn.Linear(50, 1),
)
model = model.to(device)

In [5]:
# Length of the first dimension of get_param(model); used below as the dimension of the
# prior and of every variational component. Presumably the total number of network
# parameters -- confirm against Tools.NNtools.
param_count = get_param(model).shape[0]

In [6]:
# Helper from Tools.NNtools, applied to the model once before any set_all_parameters call.
# NOTE(review): presumably swaps the model's nn.Parameter attributes for plain tensors so
# that sampled parameter vectors can be written into it -- confirm in Tools.NNtools.
flip_parameters_to_tensors(model)

In [7]:
# Mean-field prior over the parameter vector, constructed with sigma=1.0.
prior = MeanFieldVariationalDistribution(param_count, sigma=1.0, device=device)
# The prior is fixed: exclude its mu and rho from gradient tracking.
prior.mu.requires_grad_(False)
prior.rho.requires_grad_(False)

In [8]:
def logprior(x):
    """Return the log-density of ``x`` under the global ``prior`` (delegates to ``prior.log_prob``)."""
    return prior.log_prob(x)

In [9]:
def loglikelihood(theta, model, x, y, sigma_noise):
    """Gaussian log-likelihood of the targets ``y`` for the network with parameters ``theta``.

    Writes ``theta`` into ``model`` with ``set_all_parameters``, evaluates the model on
    ``x`` and sums the normal log-density of the predictions around ``y`` using the
    standard deviation ``sigma_noise``.

    Returns:
        The summed log-density with one trailing dimension added by ``unsqueeze(-1)``.
    """
    set_all_parameters(model, theta)
    prediction = model(x)

    noise_std = torch.tensor([sigma_noise], device=device)
    log_normaliser = -0.5 * torch.log(2*np.pi*noise_std**2)
    quadratic = 0.5 * (1/(noise_std**2)) * (prediction-y)**2
    pointwise = log_normaliser - quadratic

    return torch.sum(pointwise).unsqueeze(-1)

In [10]:
def logposterior(theta, model, x, y, sigma_noise):
    """Unnormalised log-posterior of ``theta``: ``logprior`` plus ``loglikelihood``."""
    log_prior_term = logprior(theta)
    log_likelihood_term = loglikelihood(theta, model, x, y, sigma_noise)
    return log_prior_term + log_likelihood_term

In [11]:
def show_variational_posterior_predictive(q, n_samples=1000):
    """Plot the training data together with predictions of networks sampled from ``q``.

    Each sample from ``q`` is written into the global ``model`` with
    ``set_all_parameters`` and evaluated on an evenly spaced grid over [-2, 2]. The
    curves are drawn with low opacity so that many of them together read as a density.

    Args:
        q: distribution over parameter vectors exposing ``sample()``.
        n_samples: number of networks to draw and plot (defaults to 1000).
    """
    # `steps` is given explicitly: current PyTorch no longer accepts linspace without
    # it, and 100 is the value older versions used as the implicit default.
    x_test = torch.linspace(-2.0, 2.0, steps=100).unsqueeze(1).to(device)
    x_plot = x_test.cpu().numpy()

    fig, ax = plt.subplots()
    fig.set_size_inches(11.7, 8.27)
    ax.set_xlim(-2, 2)
    ax.set_ylim(-4, 4)
    ax.scatter(x_data.cpu(), y_data.cpu())

    # Only forward passes are needed for plotting, so skip autograd bookkeeping.
    with torch.no_grad():
        for _ in range(n_samples):
            z = q.sample()
            set_all_parameters(model, z)
            y_test = model(x_test)
            ax.plot(x_plot, y_test.squeeze(0).cpu().numpy(), alpha=0.05, linewidth=1, color='lightblue')

In [12]:
def MAP(n_iter=1000, lr=0.01, min_lr=0.001, sigma_noise=0.1):
    """Estimate a maximum-a-posteriori parameter vector with Adam.

    Starts from a draw of the global ``prior`` and minimises the negative
    ``logposterior`` evaluated on the global ``model``, ``x_data`` and ``y_data``.
    The learning rate is halved by ``ReduceLROnPlateau`` when the loss stalls, and
    optimisation stops once it falls below ``min_lr``.

    Args:
        n_iter: maximum number of optimisation steps.
        lr: initial Adam learning rate.
        min_lr: learning-rate threshold below which optimisation stops.
        sigma_noise: noise standard deviation forwarded to ``logposterior``.

    Returns:
        A detached copy of the optimised parameter tensor.
    """
    theta = torch.nn.Parameter(prior.sample(), requires_grad=True)

    optimizer = torch.optim.Adam([theta], lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=50, factor=0.5)

    for _ in range(n_iter):
        optimizer.zero_grad()

        loss = -torch.mean(logposterior(theta, model, x_data, y_data, sigma_noise=sigma_noise))
        loss.backward()

        # Apply the update first, then let the scheduler react to this step's loss
        # (PyTorch's documented order: optimizer.step() before scheduler.step()).
        optimizer.step()
        scheduler.step(loss.item())

        # Read the learning rate after the scheduler ran so the stop test is not one
        # iteration behind.
        if optimizer.param_groups[0]['lr'] < min_lr:
            break
    return theta.detach().clone()

In [None]:
# Sequential variant: run 10 MAP optimisations one after another and keep each result.
eMAP = []
for k in range(10):
    print(k)
    eMAP.append(MAP())

In [13]:
import ray
import time
import setproctitle

# Start Ray with its default settings (the log and the returned address dictionary
# are shown in the cell output).
# NOTE(review): re-running this cell while Ray is already running is expected to
# raise; consider ray.init(ignore_reinit_error=True) -- confirm against the installed
# Ray version.
ray.init()

2020-01-09 22:03:00,157	INFO resource_spec.py:216 -- Starting Ray with 93.51 GiB memory available for workers and up to 18.63 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).


{'node_ip_address': '192.168.1.197',
 'redis_address': '192.168.1.197:32072',
 'object_store_address': '/tmp/ray/session_2020-01-09_22-03-00_155783_22568/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-01-09_22-03-00_155783_22568/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2020-01-09_22-03-00_155783_22568'}

In [14]:
@ray.remote
def f(x):
    """Ray task: run one MAP optimisation, seeded with the task index ``x``."""
    # A fresh worker process starts from PyTorch's default RNG seed, so without
    # explicit seeding different tasks can draw the same initial point and return
    # identical estimates. Seeding with the task index makes restarts distinct and
    # reproducible (assumes prior.sample() uses torch's global RNG -- TODO confirm).
    torch.manual_seed(x)
    return MAP()

# Launch 10 MAP restarts as parallel Ray tasks.
result_ids = [f.remote(i) for i in range(10)]

# Block until all tasks have finished; one result per restart, in submission order.
results = ray.get(result_ids)

In [None]:
# Display the list returned by ray.get (prints every result in full).
results

In [None]:
# The parallel results have been collected above; stop Ray.
ray.shutdown()

In [None]:
# Use the Ray results as the MAP estimates; this replaces whatever an earlier cell
# stored in eMAP.
eMAP = results

In [None]:
# Mixture components, filled by the next cell. Re-run this cell before re-running the
# next one, otherwise components are appended a second time.
components = []

In [None]:
# One narrow mean-field component (constructed with sigma=0.001) per MAP estimate,
# with its mean replaced by that estimate.
for theta_map in eMAP:
    q_new = MeanFieldVariationalDistribution(param_count, sigma=0.001, device=device)
    q_new.mu = nn.Parameter(theta_map.squeeze(0).to(device), requires_grad=True)
    components.append(q_new)

In [None]:
# Uniform mixture weights, one per MAP estimate. The result does not track gradients:
# creating the ones with requires_grad=True and dividing inside torch.no_grad() also
# yields a tensor with requires_grad=False, so the plain form is equivalent.
proportions = torch.ones([len(eMAP)], device=device) / len(eMAP)

In [None]:
# Variational family: a mixture of the mean-field components built above, with the
# uniform weights in `proportions`.
q = MeanFieldVariationalMixtureDistribution(proportions, components, device=device)

In [None]:
# Detach every component's rho in place and re-enable gradient tracking on it.
# Written as a loop: the former list comprehension was used only for its side
# effects and printed every tensor as the cell output.
for c in q.components:
    c.rho.detach_().requires_grad_(True)

In [None]:
# Detach every component's mu in place and re-enable gradient tracking on it.
# Written as a loop: the former list comprehension was used only for its side
# effects and printed every tensor as the cell output.
for c in q.components:
    c.mu.detach_().requires_grad_(True)

In [None]:
# Inspect the mixture weights before optimisation.
q.proportions

# Variational Distribution #

In [None]:
# Number of samples drawn from q for each Monte Carlo estimate of the loss.
n_samples_ELBO = 200

In [None]:
# Optimise the mu and rho of every component jointly. q.proportions is not in the
# parameter list, so this optimizer never updates the mixture weights.
# NOTE(review): betas=(0.999, 0.999) differs from Adam's default (0.9, 0.999) --
# confirm that the first-moment decay is intentional.
optimizer = torch.optim.Adam( [c.mu for c in q.components]+[c.rho for c in q.components], lr=0.01, betas=(0.999,0.999))
# Multiply the learning rate by 0.8 after 500 steps without improvement of the loss.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=500, factor=0.8)

In [None]:
#optimizer = torch.optim.Adam( [c.mu for c in q.components]+[c.rho for c in q.components]+[q.proportions], lr=0.1)
#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=200, factor=0.5)

In [None]:
%matplotlib inline
liveloss = PlotLosses()
for t in range(10000):
    logs = {}
    optimizer.zero_grad()

    # Monte Carlo estimate of E_q[log q(z) - logposterior(z)], i.e. the negative ELBO
    # up to the usual constant; this is the quantity being minimised.
    Ln = []
    Z = q.sample(n_samples_ELBO)
    MU = torch.stack([c.mu for c in q.components])
    SIGMA = torch.stack([c.sigma for c in q.components])
    A_ = -0.5*torch.log(2*np.pi*SIGMA**2)
    B_ = (SIGMA**2)
    # Does not depend on the sample, so compute it once per iteration.
    log_proportions = torch.log(q.proportions)
    for j in range(n_samples_ELBO):
        z = Z[j,:].unsqueeze(0)

        # Log-density of z under the mixture, written out explicitly: Gaussian
        # log-density terms summed over dim 1 for each stacked component, then a
        # log-sum-exp over components weighted by the proportions.
        P = A_ - ( 0.5*(MU-z)**2)/B_
        LQ = torch.logsumexp(log_proportions + P.sum(dim=1), dim=0)

        LP = logposterior(z, model, x_data, y_data, sigma_noise=0.1)
        Ln.append(LQ - LP)

    L = torch.stack(Ln).mean()
    L.backward()

    # Learning rate used for this step (logged below).
    learning_rate = optimizer.param_groups[0]['lr']

    # Apply the update first, then let the scheduler react to this step's loss
    # (PyTorch's documented order: optimizer.step() before scheduler.step()).
    optimizer.step()
    loss_value = L.item()
    scheduler.step(loss_value)

    # The value stored under 'ELBO' is the minimised loss, i.e. the negative ELBO.
    logs['ELBO'] = loss_value
    logs['learning rate'] = learning_rate
    liveloss.update(logs)

    if t % 20 == 0:
        liveloss.draw()

    # Keep the mixture weights non-negative and summing to one, then make them a
    # gradient-tracking leaf again.
    with torch.no_grad():
        torch.abs_(q.proportions)
        q.proportions = q.proportions / q.proportions.sum()
    q.proportions.detach_().requires_grad_(True)

    # Test the learning rate after the scheduler ran so the stop is not delayed by
    # an iteration.
    if optimizer.param_groups[0]['lr'] < 0.001:
        break

In [None]:
def _log_norm(x, mu, std):
        return -0.5 * torch.log(2*np.pi*std**2) -(0.5 * (1/(std**2))* (x-mu)**2)

In [None]:
# Inspect the mixture weights after the optimisation loop.
q.proportions

In [None]:
# Plot the data with predictions of networks sampled from the fitted mixture q.
show_variational_posterior_predictive(q)

In [None]:
# Display the mixture weights again (same expression as two cells above).
q.proportions

In [None]:
# Debugging probe: a scalar that depends only on q.proportions.
# Note that this rebinds L, the name the training loop uses for its loss.
L = q.proportions.sum()

In [None]:
# Back-propagate through the current L. This requires L to be attached to an
# autograd graph; otherwise the call raises.
L.backward()

In [None]:
# NOTE(review): debugging probe. Whether the mixture class exposes a `requires_grad`
# attribute is not visible in this notebook -- confirm this does not raise
# AttributeError, or remove the cell.
q.requires_grad

In [None]:
# Stand-alone copy of one loss evaluation from the training loop, without the
# optimizer step: draw samples from q, estimate mean(log q(z) - logposterior(z)) and
# back-propagate it. Gradients accumulate on top of any existing .grad values because
# nothing is zeroed first.
Ln = []
Z = q.sample(n_samples_ELBO)
MU = torch.stack([c.mu for c in q.components])
SIGMA = torch.stack([c.sigma for c in q.components])
# Gaussian log-normaliser and variance for every stacked component.
A_ = -0.5*torch.log(2*np.pi*SIGMA**2)
B_ = (SIGMA**2)
for j in range(n_samples_ELBO):
    z = Z[j,:].unsqueeze(0)
    # Explicit replacement for: LQ = q.log_prob(z)

    # Gaussian log-density terms, summed over dim 1, then log-sum-exp over the
    # components weighted by the mixture proportions.
    P = A_ - ( 0.5*(MU-z)**2)/B_ 
    LQ = torch.logsumexp(torch.log(q.proportions) + P.sum(dim=1), dim=0)

    LP = logposterior(z, model, x_data, y_data, sigma_noise=0.1)
    Ln.append(LQ - LP)

# Average over the samples; this rebinds L once more.
L = torch.stack(Ln).mean()
L.backward()