vgd 1.0
Some MATLAB code and Python code for the Bayesian neural network example
DartML committed Jul 18, 2016
0 parents commit b6dc08b
Showing 10 changed files with 605 additions and 0 deletions.
Empty file added .Rapp.history
Empty file.
16 changes: 16 additions & 0 deletions README.md
@@ -0,0 +1,16 @@
# Variational Gradient Descent (VGD)
Implementation of the Variational Gradient Descent (VGD) method for scalable Bayesian logistic regression and Bayesian neural networks.

The repository contains two folders, "matlab" and "python", with the corresponding implementations.
--The Matlab code contains an example of Bayesian logistic regression (a minimal usage sketch is included at the end of this README).
--The Python code reproduces the results table for the Bayesian neural network example.


Installation
--Our Python code is based on Theano 0.8.2.

Citation
--[TO DO]

% Copyright (c) 2016, Qiang Liu & Dilin Wang
% All rights reserved.
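
For orientation, here is a minimal usage sketch (not part of the committed files; it mirrors matlab/demo_bayeslr.m and assumes covtype.mat and the matlab folder are on the MATLAB path):

% Sketch: VGD for Bayesian logistic regression on the Covertype dataset.
load covtype.mat;
X = covtype(:,2:end); y = covtype(:,1); y(y==2) = -1;    % labels in {-1, +1}
X = [X, ones(size(X,1),1)];                              % absorb the bias term into X
M = 100;                                                  % number of particles
theta0 = randn(M, size(X,2)+1);                           % rough init of [w, log(alpha)]; the demo samples from the prior instead
dlog_p = @(theta) dlog_p_lr(theta, X, y, 100, 1, 0.01);   % minibatch derivative of the log-posterior
theta = vgd(theta0, dlog_p, 6000, 0.05);                  % run VGD (AdaGrad, master stepsize 0.05)
[acc, llh] = bayeslr_evaluation(theta, X, y);             % use a held-out split in practice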
52 changes: 52 additions & 0 deletions matlab/KSD_KL_gradxy.m
@@ -0,0 +1,52 @@
function [Akxy, info] = KSD_KL_gradxy(x, dlog_p, h)
%%%%%%%%%%%%%%%%%%%%%%
% Input:
% -- x: particles, n*d matrix, where n is the number of particles and d is the dimension of x
% -- dlog_p: a function handle, which returns the first order derivative of log p(x), n*d matrix
% -- h: bandwidth. If h == -1, h is selected by the median trick

% Output:
% -- Akxy: n*d matrix, the perturbation direction \Phi(x) computed by our algorithm,
%    a smooth function that characterizes the update direction for each particle
% -- info: struct; info.bandwidth is the kernel bandwidth that was used

% Copyright (c) 2016, Qiang Liu & Dilin Wang
% All rights reserved.
%%%%%%%%%%%%%%%%%%%%%%

if nargin < 3; h = -1; end % median trick as default

[n, d] = size(x);

%%%%%%%%%%%%%% Main part %%%%%%%%%%
Sqy = dlog_p(x);         % gradient of log p(x) evaluated at each particle, n*d

% Using rbf kernel as default
XY = x*x';
x2 = sum(x.^2, 2);
X2e = repmat(x2, 1, n);

H = (X2e + X2e' - 2*XY); % pairwise squared distances

% median trick for bandwidth
if h == -1
h = sqrt(0.5*median(H(:)) / log(n+1)); %rbf_dot has factor two in kernel
end

Kxy = exp(-H/(2*h^2)); % calculate rbf kernel


dxKxy = -Kxy*x;             % -sum_j k(x_j, x_i) x_j
sumKxy = sum(Kxy,2);        % sum_j k(x_j, x_i)
for i = 1:d
    dxKxy(:,i) = dxKxy(:,i) + x(:,i).*sumKxy;
end
dxKxy = dxKxy/h^2;          % = sum_j grad_{x_j} k(x_j, x_i) for the rbf kernel
Akxy = (Kxy*Sqy + dxKxy)/n; % \Phi(x_i) = 1/n sum_j [ k(x_j,x_i) dlog_p(x_j) + grad_{x_j} k(x_j,x_i) ]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

info.bandwidth = h;

return;
end
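
A quick sanity check of the perturbation direction, not part of the commit (assumption: KSD_KL_gradxy.m is on the path). For a standard normal target the score is dlog_p(x) = -x, and repeatedly moving the particles along Akxy should drive their mean toward 0 and their variance toward 1:

% Toy check: transport particles toward p(x) = N(0, 1).
rng(0);
x = 5 + randn(200, 1);                        % particles initialized far from the target
dlog_p = @(x) -x;                             % score of the standard normal
for iter = 1:500
    [phi, ~] = KSD_KL_gradxy(x, dlog_p, -1);  % h = -1: median-trick bandwidth
    x = x + 0.1 * phi;                        % plain (non-adaptive) step along \Phi(x)
end
fprintf('mean %.3f (target 0), var %.3f (target 1)\n', mean(x), var(x));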

21 changes: 21 additions & 0 deletions matlab/bayeslr_evaluation.m
@@ -0,0 +1,21 @@
function [ acc, llh] = bayeslr_evaluation(theta, X_test, y_test )
% calculate the prediction error and log-likelihood
% theta: particles, M * (d+1) matrix [w, log(alpha)]; only the weights w are used here
% X_test: N0 * d, input data
% y_test: N0 * 1, contains the label (+1/-1)

theta = theta(:,1:end-1); % only need w to evaluate accuracy and likelihood

M = size(theta, 1); % number of particles
n_test = length(y_test); % number of evaluation data points

prob = zeros(n_test, M);
for t = 1:M
prob(:, t) = ones(n_test,1) ./ (1 + exp( y_test.* sum(-repmat(theta(t,:), n_test, 1) .* X_test, 2)));
end
prob = mean(prob, 2);
acc = mean(prob > 0.5);
llh = mean(log(prob));

end

37 changes: 37 additions & 0 deletions matlab/cv_search_stepsize.m
@@ -0,0 +1,37 @@
function master_stepsize_star = cv_search_stepsize(X, y, theta0, dlog_p)
%%%%%%%%%%%%%
% In practice, we need to tune the master learning rate for AdaGrad.
% We run an exhaustive grid search over the specified parameter values for VGD.
%%%%%%%%%%%%%

% adagrad master stepsize
master_stepsize_grid = [1e0, 1e-1, 5e-2, 1e-2, 5e-3, 1e-3, 5e-4, 1e-4, 5e-5, 1e-5, 5e-6, 1e-6];

max_iter = 1000;

% randomly partition 20% dataset for validation
validation_ratio = 0.2; N = size(X,1);
validation_idx = randperm(N, round(validation_ratio*N)); train_idx = setdiff(1:N, validation_idx);

X_train = X(train_idx, :); y_train = y(train_idx);
X_validation = X(validation_idx, :); y_validation = y(validation_idx);

best_acc = 0; master_stepsize_star = 0.1;

dlog_p_cross_validation = @(theta) dlog_p(theta, X_train, y_train);

% grid parameters search strategy
for master_stepsize = master_stepsize_grid
theta = vgd(theta0, dlog_p_cross_validation, max_iter, master_stepsize);
[acc, ~] = bayeslr_evaluation(theta, X_validation, y_validation);
if acc > best_acc
best_acc = acc;
master_stepsize_star = master_stepsize;
end
fprintf('master_stepsize = %f, current acc = %f, best acc = %f, best master_stepsize %f \n', master_stepsize, acc, best_acc, master_stepsize_star);
end

end



52 changes: 52 additions & 0 deletions matlab/demo_bayeslr.m
@@ -0,0 +1,52 @@
clear

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Sample code to reproduce our results for Bayesian logistic regression
% Covertype dataset is available at: https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

M = 100; % number of particles

% we partition the data into 80% for training and 20% for testing
train_ratio = 0.8;
max_iter = 6000;

% build up training and testing dataset
load covtype.mat;
X = covtype(:,2:end); y = covtype(:,1); y(y==2) = -1;

X = [X, ones(size(X,1),1)]; % the bias parameter is absorbed by including 1 as an entry in x
[N, d] = size(X); D = d+1; % w and alpha

% building training and testing dataset
train_idx = randperm(N, round(train_ratio*N)); test_idx = setdiff(1:N, train_idx);
X_train = X(train_idx, :); y_train = y(train_idx);
X_test = X(test_idx, :); y_test = y(test_idx);

n_train = length(train_idx); n_test = length(test_idx);

% example of Bayesian logistic regression
batchsize = 100; % subsampled mini-batch size
a0 = 1; b0 = .01; % hyper-parameters
dlog_p = @(theta, X, y) dlog_p_lr(theta, X, y, batchsize, a0, b0); % returns the first-order derivative of the log-posterior

% initialization of particles by sampling from the prior distribution
alpha0 = gamrnd(a0, b0, M, 1); theta0 = zeros(M, D);
for i = 1:M
theta0(i,:) = [normrnd(0, sqrt((1/alpha0(i))), 1, d), log(alpha0(i))]; % w and log(alpha)
end

% our variational gradient descent algorithm

% Searching best master_stepsize using a development set
% master_stepsize = cv_search_stepsize(X_train, y_train, theta0, dlog_p);
master_stepsize = 0.05;

tic
dlog_p = @(theta)dlog_p(theta, X_train, y_train); % fix training set
theta_vgd = vgd(theta0, dlog_p, max_iter, master_stepsize);
time = toc;

% evaluation
[acc_vgd, llh_vgd] = bayeslr_evaluation(theta_vgd, X_test, y_test);
fprintf('Result of VGD: testing accuracy: %f; testing loglikelihood: %f\n', acc_vgd, llh_vgd);
48 changes: 48 additions & 0 deletions matlab/dlog_p_lr.m
@@ -0,0 +1,48 @@
function dlog_p = dlog_p_lr(theta, X, Y, batchsize, a0, b0)
%%%%%%%
% Output: first-order derivative of the log-posterior of Bayesian logistic regression.

% Inference is performed on the posterior p(theta|X, Y) with theta = [w, log(alpha)],
% where p(theta|X, Y) is the Bayesian logistic regression posterior.
% We use the same settings as http://icml.cc/2012/papers/360.pdf

% When the number of observations is large, computing the derivative of
% log p(x) over the full dataset can become the main computational bottleneck. We
% address this by approximating the derivative with subsampled mini-batches.

% Input:
% -- theta: a set of particles, M*d matrix (M is the number of particles)
% -- X, Y: observations, where X is the feature matrix and Y contains
% target label
% -- batchsize: sub-sampling size of each mini-batch; batchsize = -1 computes the derivative exactly on the full dataset
% -- a0, b0: hyper-parameters
%%%%%%%

[N, ~] = size(X); % N is the number of total observations

if nargin < 4; batchsize = min(N, 100); end % default batch size 100
if nargin < 5; a0 = 1; end
if nargin < 6; b0 = 1; end

if batchsize > 0
ridx = randperm(N, batchsize);
X = X(ridx,:); Y = Y(ridx,:); % stochastic version
end

w = theta(:, 1:end-1); %logistic weights
alpha = exp(theta(:,end)); % the last column is logalpha
D = size(w, 2);

wt = (alpha/2).*(sum(w.*w, 2)); % (alpha/2) * ||w||^2 for each particle
y_hat = 1./(1+exp(-X*w'));

dw_data = ((repmat(Y,1,size(theta,1))+1)/2 - y_hat)' * X; % Y \in {-1,1}
dw_prior = - repmat(alpha,1,D) .* w;
dw = dw_data * N / size(X,1) + dw_prior; % re-scale the mini-batch gradient to the full dataset size

dalpha = D/2 - wt + (a0-1) - b0.*alpha + 1; %the last term is the jacobian term

dlog_p = [dw, dalpha]; % first order derivative

end
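
A hedged finite-difference check of the returned derivative, not part of the commit (assumptions: dlog_p_lr.m is on the path; batchsize = -1 so there is no subsampling, making the derivative exact and directly comparable to numerical differentiation of the log-posterior written out below):

% Finite-difference check on a tiny synthetic problem.
rng(1);
n = 20; d = 3; a0 = 1; b0 = 1;
X = randn(n, d); Y = sign(randn(n, 1));
theta = randn(1, d+1);                                           % one particle: [w, log(alpha)]
logpost = @(t) sum(log(1 ./ (1 + exp(-Y .* (X * t(1:d)'))))) ... % log-likelihood, Y in {-1,+1}
    + (d/2 + a0 - 1 + 1) * t(end) ...                            % log-prior + Jacobian terms linear in log(alpha)
    - exp(t(end))/2 * sum(t(1:d).^2) - b0 * exp(t(end));         % remaining prior terms
g_analytic = dlog_p_lr(theta, X, Y, -1, a0, b0);
g_numeric = zeros(1, d+1);
eps_fd = 1e-5;
for k = 1:d+1
    e = zeros(1, d+1); e(k) = eps_fd;
    g_numeric(k) = (logpost(theta + e) - logpost(theta - e)) / (2*eps_fd);
end
disp(max(abs(g_analytic - g_numeric)));                          % should be close to 0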

51 changes: 51 additions & 0 deletions matlab/vgd.m
@@ -0,0 +1,51 @@
function theta = vgd(theta0, dlog_p, max_iter, master_stepsize, h, auto_corr, method)
%%%%%%%%
% Bayesian Inference via Variational Gradient Descent

% input:
% -- theta0: initialization of particles, m * d matrix (m is the number of particles, d is the dimension)
% -- dlog_p: function handle of first order derivative of log p(x)
% -- max_iter: maximum iterations
% -- master_stepsize: the master (base) learning rate for AdaGrad
% -- h: bandwidth for the rbf kernel; h = -1 (default) selects it with the median trick
% -- auto_corr: momentum (decay) term for the squared-gradient accumulator
% -- method: optimization method; 'adagrad' (the default) uses AdaGrad with momentum to adapt the stepsize

% output:
% -- theta: a set of particles that approximates p(x)

% Copyright (c) 2016, Qiang Liu & Dilin Wang
% All rights reserved.
%%%%%%%%

if nargin < 4; master_stepsize = 0.1; end;

% for the following parameters, we always use the default settings
if nargin < 5; h = -1; end;
if nargin < 6; auto_corr = 0.9; end;
if nargin < 7; method = 'adagrad'; end;

switch lower(method)

case 'adagrad'
%% AdaGrad with momentum
theta = theta0;

fudge_factor = 1e-6;
historical_grad = 0;

for iter = 1:max_iter
    grad = KSD_KL_gradxy(theta, dlog_p, h); % perturbation direction \Phi(theta)
    if historical_grad == 0                 % first iteration: initialize the accumulator
        historical_grad = historical_grad + grad.^2;
    else
        historical_grad = auto_corr * historical_grad + (1 - auto_corr) * grad.^2;
    end
    adj_grad = grad ./ (fudge_factor + sqrt(historical_grad));
    theta = theta + master_stepsize * adj_grad; % update
end

otherwise
error('wrong method');
end
end
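
A hedged toy example of the full loop, not part of the commit (assumption: vgd.m and KSD_KL_gradxy.m are on the path). The target is a 2-D Gaussian, whose score for row-vector particles is -(x - mu) * inv(Sigma):

% Toy example: sample from a correlated 2-D Gaussian with VGD.
rng(0);
mu = [1, -1];
Sigma = [1.0, 0.5; 0.5, 1.0];
A = inv(Sigma);                                                 % precision matrix (symmetric)
dlog_p = @(theta) -(theta - repmat(mu, size(theta,1), 1)) * A;  % n*2 matrix of scores
theta0 = randn(100, 2);                                         % 100 particles from N(0, I)
theta = vgd(theta0, dlog_p, 2000, 0.1);                         % AdaGrad with master stepsize 0.1
disp(mean(theta));                                              % should be close to mu
disp(cov(theta));                                               % should be close to Sigma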
13 changes: 13 additions & 0 deletions python/LICENSE.txt
@@ -0,0 +1,13 @@
Copyright 2016 Qiang Liu & Dilin Wang

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.