Commit b6dc08b (0 parents)
Some matlab code and python code for bayesian nn example
Showing 10 changed files with 605 additions and 0 deletions.
@@ -0,0 +1,16 @@
# Variational Gradient Descent (VGD)
Implementation of the Variational Gradient Descent method for scalable Bayesian logistic regression and Bayesian neural networks.

The folders "matlab" and "python" contain the MATLAB and Python implementations, respectively.
--The MATLAB code contains an example of Bayesian logistic regression.
--The Python code reproduces our table for the Bayesian neural network example.


Installation
--Our Python code is based on Theano 0.8.2

Citation
--[TO DO]

% Copyright (c) 2016, Qiang Liu & Dilin Wang
% All rights reserved.
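To give a feel for the MATLAB interface, here is a minimal, hypothetical quick-start sketch (not part of the repository) that runs VGD on a 1D standard Gaussian; it assumes the "matlab" folder is on the MATLAB path:

```matlab
% Hypothetical quick-start: approximate N(0, 1) with 100 particles.
theta0 = randn(100, 1);                 % initial particles, n = 100, d = 1
dlog_p = @(theta) -theta;               % gradient of log N(theta; 0, 1)
theta = vgd(theta0, dlog_p, 1000, 0.1); % returns particles approximating N(0, 1)
```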
@@ -0,0 +1,52 @@
function [Akxy, info] = KSD_KL_gradxy(x, dlog_p, h)
%%%%%%%%%%%%%%%%%%%%%%
% Input:
%   -- x: particles, n*d matrix, where n is the number of particles and d is the dimension of x
%   -- dlog_p: a function handle that returns the first-order derivative of log p(x), as an n*d matrix
%   -- h: bandwidth. If h == -1, h is selected by the median trick
%
% Output:
%   -- Akxy: n*d matrix, the smooth function \Phi(x) that characterizes the perturbation direction
%   -- info: kernel bandwidth
%
% Copyright (c) 2016, Qiang Liu & Dilin Wang
% All rights reserved.
%%%%%%%%%%%%%%%%%%%%%%

if nargin < 3; h = -1; end % median trick as default

[n, d] = size(x);

%%%%%%%%%%%%%% Main part %%%%%%%%%%
Sqy = dlog_p(x);

% Using the RBF kernel as default
XY = x*x';
x2 = sum(x.^2, 2);
X2e = repmat(x2, 1, n);

H = (X2e + X2e' - 2*XY); % calculate pairwise squared distances

% median trick for bandwidth
if h == -1
    h = sqrt(0.5*median(H(:)) / log(n+1)); % rbf_dot has factor two in kernel
end

Kxy = exp(-H/(2*h^2)); % calculate RBF kernel

dxKxy = -Kxy*x;
sumKxy = sum(Kxy, 2);
for i = 1:d
    dxKxy(:,i) = dxKxy(:,i) + x(:,i).*sumKxy;
end
dxKxy = dxKxy/h^2; % sum over y of grad_y k(y, x)
Akxy = (Kxy*Sqy + dxKxy)/n; % \Phi(x) = E_y[ k(y,x) dlog_p(y) + grad_y k(y,x) ]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

info.bandwidth = h;

return;
end
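As a hypothetical sanity check (not from the original file), particles already drawn from the target sit near a fixed point of the update, so the returned direction should be comparatively small on average:

```matlab
% Hypothetical sanity check: for particles from N(0, I), the perturbation
% direction \Phi(x) should be comparatively small on average.
x = randn(50, 2);                            % 50 particles in 2D
dlog_p = @(x) -x;                            % gradient of log N(x; 0, I)
[Akxy, info] = KSD_KL_gradxy(x, dlog_p, -1); % h = -1 triggers the median trick
fprintf('mean |Phi| = %f, bandwidth = %f\n', mean(abs(Akxy(:))), info.bandwidth);
```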
@@ -0,0 +1,21 @@
function [acc, llh] = bayeslr_evaluation(theta, X_test, y_test)
% calculate the prediction accuracy and log-likelihood
%   -- theta: M * d, logistic regression weights
%   -- X_test: N0 * d, input data
%   -- y_test: N0 * 1, labels (+1/-1)

theta = theta(:, 1:end-1); % only w is needed to evaluate accuracy and likelihood

M = size(theta, 1);      % number of particles
n_test = length(y_test); % number of evaluation data points

prob = zeros(n_test, M);
for t = 1:M
    % p(y | x, w) = 1 / (1 + exp(-y * w'x)), evaluated for particle t
    prob(:, t) = ones(n_test, 1) ./ (1 + exp(y_test .* sum(-repmat(theta(t,:), n_test, 1) .* X_test, 2)));
end
prob = mean(prob, 2);    % average the predictive probability over particles
acc = mean(prob > 0.5);  % a prediction is correct when the true label gets probability > 0.5
llh = mean(log(prob));   % average test log-likelihood

end
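A hypothetical usage sketch on random data, purely to illustrate the expected input shapes (the accuracy here is meaningless):

```matlab
% Hypothetical shapes: M particles, d+1 parameters each, n test points.
M = 20; d = 5; n = 100;
theta = randn(M, d+1);                % last column is log(alpha), dropped internally
X_test = [randn(n, d-1), ones(n, 1)]; % bias absorbed as a constant feature
y_test = sign(randn(n, 1));           % labels in {-1, +1}
[acc, llh] = bayeslr_evaluation(theta, X_test, y_test);
```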
@@ -0,0 +1,37 @@
function master_stepsize_star = cv_search_stepsize(X, y, theta0, dlog_p)
%%%%%%%%%%%%%
% In practice, we need to tune the general learning rate for AdaGrad.
% We exhaustively search over a grid of specified values for VGD.
%%%%%%%%%%%%%

% AdaGrad master stepsize grid
master_stepsize_grid = [1e0, 1e-1, 5e-2, 1e-2, 5e-3, 1e-3, 5e-4, 1e-4, 5e-5, 1e-5, 5e-6, 1e-6];

max_iter = 1000;

% randomly partition 20% of the dataset for validation
validation_ratio = 0.2; N = size(X, 1);
validation_idx = randperm(N, round(validation_ratio*N)); train_idx = setdiff(1:N, validation_idx);

X_train = X(train_idx, :); y_train = y(train_idx);
X_validation = X(validation_idx, :); y_validation = y(validation_idx);

best_acc = 0; master_stepsize_star = 0.1;

dlog_p_cross_validation = @(theta) dlog_p(theta, X_train, y_train);

% grid-search strategy
for master_stepsize = master_stepsize_grid
    theta = vgd(theta0, dlog_p_cross_validation, max_iter, master_stepsize);
    [acc, ~] = bayeslr_evaluation(theta, X_validation, y_validation);
    if acc > best_acc
        best_acc = acc;
        master_stepsize_star = master_stepsize;
    end
    fprintf('master_stepsize = %f, current acc = %f, best acc = %f, best master_stepsize = %f\n', master_stepsize, acc, best_acc, master_stepsize_star);
end

end
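A hypothetical call pattern, mirroring the commented-out line in the demo script below; it assumes X_train, y_train, and theta0 have already been built as in that demo:

```matlab
% Hypothetical usage: tune the AdaGrad master stepsize on a held-out split,
% then run the final VGD pass with the selected value.
dlog_p = @(theta, X, y) dlog_p_lr(theta, X, y, 100, 1, 0.01);
master_stepsize = cv_search_stepsize(X_train, y_train, theta0, dlog_p);
theta = vgd(theta0, @(theta) dlog_p(theta, X_train, y_train), 6000, master_stepsize);
```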
@@ -0,0 +1,52 @@
clear

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Sample code to reproduce our results for Bayesian logistic regression.
% The Covertype dataset is available at: https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

M = 100; % number of particles

% we partition the data into 80% for training and 20% for testing
train_ratio = 0.8;
max_iter = 6000;

% build up the training and testing datasets
load covtype.mat;
X = covtype(:, 2:end); y = covtype(:, 1); y(y == 2) = -1;

X = [X, ones(size(X, 1), 1)]; % the bias parameter is absorbed by including 1 as an entry in x
[N, d] = size(X); D = d + 1; % w and alpha

% building the training and testing datasets
train_idx = randperm(N, round(train_ratio*N)); test_idx = setdiff(1:N, train_idx);
X_train = X(train_idx, :); y_train = y(train_idx);
X_test = X(test_idx, :); y_test = y(test_idx);

n_train = length(train_idx); n_test = length(test_idx);

% example of Bayesian logistic regression
batchsize = 100; % subsampled mini-batch size
a0 = 1; b0 = .01; % hyper-parameters
dlog_p = @(theta, X, y) dlog_p_lr(theta, X, y, batchsize, a0, b0); % returns the first-order derivative of the log posterior

% initialization of particles using the prior distribution
alpha0 = gamrnd(a0, b0, M, 1); theta0 = zeros(M, D);
for i = 1:M
    theta0(i, :) = [normrnd(0, sqrt(1/alpha0(i)), 1, d), log(alpha0(i))]; % w and log(alpha)
end

% our variational gradient descent algorithm

% searching for the best master_stepsize using a development set
% master_stepsize = cv_search_stepsize(X_train, y_train, theta0, dlog_p);
master_stepsize = 0.05;

tic
dlog_p = @(theta) dlog_p(theta, X_train, y_train); % fix the training set
theta_vgd = vgd(theta0, dlog_p, max_iter, master_stepsize);
time = toc;

% evaluation
[acc_vgd, llh_vgd] = bayeslr_evaluation(theta_vgd, X_test, y_test);
fprintf('Result of VGD: testing accuracy: %f; testing loglikelihood: %f\n', acc_vgd, llh_vgd);
@@ -0,0 +1,48 @@
function dlog_p = dlog_p_lr(theta, X, Y, batchsize, a0, b0)
%%%%%%%
% Output: first-order derivative of the log posterior of Bayesian logistic regression.
%
% The inference is applied to the posterior p(theta|X, Y) with theta = [w, log(alpha)],
% where p(theta|X, Y) is the Bayesian logistic regression posterior.
% We use the same settings as http://icml.cc/2012/papers/360.pdf
%
% When the number of observations is large, computing the derivative of
% log p(x) can be the major computational bottleneck. We can conveniently
% address this problem by approximating the derivative with subsampled mini-batches.
%
% Input:
%   -- theta: a set of particles, M*d matrix (M is the number of particles)
%   -- X, Y: observations, where X is the feature matrix and Y contains the target labels
%   -- batchsize: sub-sampling size of each mini-batch; if batchsize = -1, the derivative is calculated exactly
%   -- a0, b0: hyper-parameters
%%%%%%%

[N, ~] = size(X); % N is the total number of observations

if nargin < 4; batchsize = min(N, 100); end % default batch size 100
if nargin < 5; a0 = 1; end
if nargin < 6; b0 = 1; end

if batchsize > 0
    ridx = randperm(N, batchsize);
    X = X(ridx, :); Y = Y(ridx, :); % stochastic version
end

w = theta(:, 1:end-1);      % logistic weights
alpha = exp(theta(:, end)); % the last column is log(alpha)
D = size(w, 2);

wt = (alpha/2) .* (sum(w.*w, 2));
y_hat = 1 ./ (1 + exp(-X*w'));

dw_data = ((repmat(Y, 1, size(theta, 1)) + 1)/2 - y_hat)' * X; % Y \in {-1, 1}
dw_prior = -repmat(alpha, 1, D) .* w;
dw = dw_data * N / size(X, 1) + dw_prior; % re-scale the mini-batch gradient to the full dataset

dalpha = D/2 - wt + (a0 - 1) - b0.*alpha + 1; % the last term is the Jacobian term of the log(alpha) reparameterization

dlog_p = [dw, dalpha]; % first-order derivative

end
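A hypothetical shape check on random data (illustrative only): the returned gradient has one row per particle and one column per coordinate of theta:

```matlab
% Hypothetical shape check: the gradient matches theta, i.e. M x (d+1).
M = 10; d = 4; N = 500;
X = [randn(N, d-1), ones(N, 1)];          % features with an absorbed bias column
Y = sign(randn(N, 1));                    % labels in {-1, +1}
theta = randn(M, d+1);                    % each row is [w, log(alpha)]
g = dlog_p_lr(theta, X, Y, 100, 1, 0.01); % mini-batch gradient of the log posterior
assert(isequal(size(g), size(theta)));
```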
@@ -0,0 +1,51 @@
function theta = vgd(theta0, dlog_p, max_iter, master_stepsize, h, auto_corr, method)
%%%%%%%%
% Bayesian Inference via Variational Gradient Descent
%
% input:
%   -- theta0: initialization of particles, m * d matrix (m is the number of particles, d is the dimension)
%   -- dlog_p: function handle of the first-order derivative of log p(x)
%   -- max_iter: maximum number of iterations
%   -- master_stepsize: the general learning rate for AdaGrad
%   -- h/bandwidth: bandwidth for the RBF kernel; the median trick is used by default (h = -1)
%   -- auto_corr: momentum term
%   -- method: use AdaGrad to adaptively select the stepsize \epsilon
%
% output:
%   -- theta: a set of particles that approximates p(x)
%
% Copyright (c) 2016, Qiang Liu & Dilin Wang
% All rights reserved.
%%%%%%%%

if nargin < 4; master_stepsize = 0.1; end

% for the following parameters, we always use the default settings
if nargin < 5; h = -1; end
if nargin < 6; auto_corr = 0.9; end
if nargin < 7; method = 'adagrad'; end

switch lower(method)

    case 'adagrad'
        %% AdaGrad with momentum
        theta = theta0;

        fudge_factor = 1e-6;
        historical_grad = 0;

        for iter = 1:max_iter
            grad = KSD_KL_gradxy(theta, dlog_p, h); % \Phi(theta)
            if historical_grad == 0 % first iteration: initialize the accumulator
                historical_grad = historical_grad + grad.^2;
            else
                historical_grad = auto_corr * historical_grad + (1 - auto_corr) * grad.^2;
            end
            adj_grad = grad ./ (fudge_factor + sqrt(historical_grad));
            theta = theta + master_stepsize * adj_grad; % update
        end

    otherwise
        error('wrong method');
end
end
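A hypothetical multimodal example (not in the repository): because VGD maintains a set of interacting particles, it can cover both modes of a 1D Gaussian mixture, which a single point estimate cannot:

```matlab
% Hypothetical target: the mixture 0.5*N(-2, 1) + 0.5*N(2, 1).
% Normalization constants cancel inside the gradient of the log density.
n1 = @(x) exp(-(x+2).^2/2); n2 = @(x) exp(-(x-2).^2/2);
dlog_p = @(x) (-(x+2).*n1(x) - (x-2).*n2(x)) ./ (n1(x) + n2(x));
theta0 = randn(100, 1) - 10;            % start all particles far to the left
theta = vgd(theta0, dlog_p, 2000, 0.1); % particles should spread over both modes
```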
@@ -0,0 +1,13 @@
Copyright 2016 Qiang Liu & Dilin Wang

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.