Commit b6dc08b (0 parents)
Some matlab code and python code for bayesian nn example
Showing 10 changed files with 605 additions and 0 deletions.
@@ -0,0 +1,16 @@
# Variational Gradient Descent (VGD)
Implementation of the Variational Gradient Descent method for scalable Bayesian logistic regression and Bayesian neural networks.

The folders "matlab" and "python" contain the MATLAB and Python implementations, respectively.
--The MATLAB code contains an example of Bayesian logistic regression.
--The Python code reproduces our table for the Bayesian neural network example.


Installation
--Our Python code is based on Theano 0.8.2

Citation
--[TO DO]

% Copyright (c) 2016, Qiang Liu & Dilin Wang
% All rights reserved.
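To give a feel for the MATLAB interface, here is a minimal, hypothetical quick-start sketch (not part of the repository) that runs VGD on a 1D standard Gaussian; it assumes the "matlab" folder is on the MATLAB path:

```matlab
% Hypothetical quick-start: approximate N(0, 1) with 100 particles.
theta0 = randn(100, 1);                 % initial particles, n = 100, d = 1
dlog_p = @(theta) -theta;               % gradient of log N(theta; 0, 1)
theta = vgd(theta0, dlog_p, 1000, 0.1); % returns particles approximating N(0, 1)
```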
@@ -0,0 +1,52 @@
function [Akxy, info] = KSD_KL_gradxy(x, dlog_p, h)
%%%%%%%%%%%%%%%%%%%%%%
% Input:
%   -- x: particles, n*d matrix, where n is the number of particles and d is the dimension of x
%   -- dlog_p: a function handle that returns the first-order derivative of log p(x), as an n*d matrix
%   -- h: bandwidth. If h == -1, h is selected by the median trick
%
% Output:
%   -- Akxy: n*d matrix, the smooth function \Phi(x) that characterizes the perturbation direction
%   -- info: kernel bandwidth
%
% Copyright (c) 2016, Qiang Liu & Dilin Wang
% All rights reserved.
%%%%%%%%%%%%%%%%%%%%%%

if nargin < 3; h = -1; end % median trick as default

[n, d] = size(x);

%%%%%%%%%%%%%% Main part %%%%%%%%%%
Sqy = dlog_p(x);

% Using the RBF kernel as default
XY = x*x';
x2 = sum(x.^2, 2);
X2e = repmat(x2, 1, n);

H = (X2e + X2e' - 2*XY); % calculate pairwise squared distances

% median trick for bandwidth
if h == -1
    h = sqrt(0.5*median(H(:)) / log(n+1)); % rbf_dot has factor two in kernel
end

Kxy = exp(-H/(2*h^2)); % calculate RBF kernel

dxKxy = -Kxy*x;
sumKxy = sum(Kxy, 2);
for i = 1:d
    dxKxy(:,i) = dxKxy(:,i) + x(:,i).*sumKxy;
end
dxKxy = dxKxy/h^2; % sum over y of grad_y k(y, x)
Akxy = (Kxy*Sqy + dxKxy)/n; % \Phi(x) = E_y[ k(y,x) dlog_p(y) + grad_y k(y,x) ]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

info.bandwidth = h;

return;
end
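As a hypothetical sanity check (not from the original file), particles already drawn from the target sit near a fixed point of the update, so the returned direction should be comparatively small on average:

```matlab
% Hypothetical sanity check: for particles from N(0, I), the perturbation
% direction \Phi(x) should be comparatively small on average.
x = randn(50, 2);                            % 50 particles in 2D
dlog_p = @(x) -x;                            % gradient of log N(x; 0, I)
[Akxy, info] = KSD_KL_gradxy(x, dlog_p, -1); % h = -1 triggers the median trick
fprintf('mean |Phi| = %f, bandwidth = %f\n', mean(abs(Akxy(:))), info.bandwidth);
```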
@@ -0,0 +1,21 @@
function [acc, llh] = bayeslr_evaluation(theta, X_test, y_test)
% calculate the prediction accuracy and log-likelihood
%   -- theta: M * d, logistic regression weights
%   -- X_test: N0 * d, input data
%   -- y_test: N0 * 1, labels (+1/-1)

theta = theta(:, 1:end-1); % only w is needed to evaluate accuracy and likelihood

M = size(theta, 1);      % number of particles
n_test = length(y_test); % number of evaluation data points

prob = zeros(n_test, M);
for t = 1:M
    % p(y | x, w) = 1 / (1 + exp(-y * w'x)), evaluated for particle t
    prob(:, t) = ones(n_test, 1) ./ (1 + exp(y_test .* sum(-repmat(theta(t,:), n_test, 1) .* X_test, 2)));
end
prob = mean(prob, 2);    % average the predictive probability over particles
acc = mean(prob > 0.5);  % a prediction is correct when the true label gets probability > 0.5
llh = mean(log(prob));   % average test log-likelihood

end
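A hypothetical usage sketch on random data, purely to illustrate the expected input shapes (the accuracy here is meaningless):

```matlab
% Hypothetical shapes: M particles, d+1 parameters each, n test points.
M = 20; d = 5; n = 100;
theta = randn(M, d+1);                % last column is log(alpha), dropped internally
X_test = [randn(n, d-1), ones(n, 1)]; % bias absorbed as a constant feature
y_test = sign(randn(n, 1));           % labels in {-1, +1}
[acc, llh] = bayeslr_evaluation(theta, X_test, y_test);
```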
@@ -0,0 +1,37 @@
function master_stepsize_star = cv_search_stepsize(X, y, theta0, dlog_p)
%%%%%%%%%%%%%
% In practice, we need to tune the general learning rate for AdaGrad.
% We exhaustively search over a grid of specified values for VGD.
%%%%%%%%%%%%%

% AdaGrad master stepsize grid
master_stepsize_grid = [1e0, 1e-1, 5e-2, 1e-2, 5e-3, 1e-3, 5e-4, 1e-4, 5e-5, 1e-5, 5e-6, 1e-6];

max_iter = 1000;

% randomly partition 20% of the dataset for validation
validation_ratio = 0.2; N = size(X, 1);
validation_idx = randperm(N, round(validation_ratio*N)); train_idx = setdiff(1:N, validation_idx);

X_train = X(train_idx, :); y_train = y(train_idx);
X_validation = X(validation_idx, :); y_validation = y(validation_idx);

best_acc = 0; master_stepsize_star = 0.1;

dlog_p_cross_validation = @(theta) dlog_p(theta, X_train, y_train);

% grid-search strategy
for master_stepsize = master_stepsize_grid
    theta = vgd(theta0, dlog_p_cross_validation, max_iter, master_stepsize);
    [acc, ~] = bayeslr_evaluation(theta, X_validation, y_validation);
    if acc > best_acc
        best_acc = acc;
        master_stepsize_star = master_stepsize;
    end
    fprintf('master_stepsize = %f, current acc = %f, best acc = %f, best master_stepsize = %f\n', master_stepsize, acc, best_acc, master_stepsize_star);
end

end
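A hypothetical call pattern, mirroring the commented-out line in the demo script below; it assumes X_train, y_train, and theta0 have already been built as in that demo:

```matlab
% Hypothetical usage: tune the AdaGrad master stepsize on a held-out split,
% then run the final VGD pass with the selected value.
dlog_p = @(theta, X, y) dlog_p_lr(theta, X, y, 100, 1, 0.01);
master_stepsize = cv_search_stepsize(X_train, y_train, theta0, dlog_p);
theta = vgd(theta0, @(theta) dlog_p(theta, X_train, y_train), 6000, master_stepsize);
```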
@@ -0,0 +1,52 @@
clear

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Sample code to reproduce our results for Bayesian logistic regression.
% The Covertype dataset is available at: https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

M = 100; % number of particles

% we partition the data into 80% for training and 20% for testing
train_ratio = 0.8;
max_iter = 6000;

% build up the training and testing datasets
load covtype.mat;
X = covtype(:, 2:end); y = covtype(:, 1); y(y == 2) = -1;

X = [X, ones(size(X, 1), 1)]; % the bias parameter is absorbed by including 1 as an entry in x
[N, d] = size(X); D = d + 1; % w and alpha

% building the training and testing datasets
train_idx = randperm(N, round(train_ratio*N)); test_idx = setdiff(1:N, train_idx);
X_train = X(train_idx, :); y_train = y(train_idx);
X_test = X(test_idx, :); y_test = y(test_idx);

n_train = length(train_idx); n_test = length(test_idx);

% example of Bayesian logistic regression
batchsize = 100; % subsampled mini-batch size
a0 = 1; b0 = .01; % hyper-parameters
dlog_p = @(theta, X, y) dlog_p_lr(theta, X, y, batchsize, a0, b0); % returns the first-order derivative of the log posterior

% initialization of particles using the prior distribution
alpha0 = gamrnd(a0, b0, M, 1); theta0 = zeros(M, D);
for i = 1:M
    theta0(i, :) = [normrnd(0, sqrt(1/alpha0(i)), 1, d), log(alpha0(i))]; % w and log(alpha)
end

% our variational gradient descent algorithm

% searching for the best master_stepsize using a development set
% master_stepsize = cv_search_stepsize(X_train, y_train, theta0, dlog_p);
master_stepsize = 0.05;

tic
dlog_p = @(theta) dlog_p(theta, X_train, y_train); % fix the training set
theta_vgd = vgd(theta0, dlog_p, max_iter, master_stepsize);
time = toc;

% evaluation
[acc_vgd, llh_vgd] = bayeslr_evaluation(theta_vgd, X_test, y_test);
fprintf('Result of VGD: testing accuracy: %f; testing loglikelihood: %f\n', acc_vgd, llh_vgd);
@@ -0,0 +1,48 @@
function dlog_p = dlog_p_lr(theta, X, Y, batchsize, a0, b0)
%%%%%%%
% Output: first-order derivative of the log posterior of Bayesian logistic regression.
%
% The inference is applied to the posterior p(theta|X, Y) with theta = [w, log(alpha)],
% where p(theta|X, Y) is the Bayesian logistic regression posterior.
% We use the same settings as http://icml.cc/2012/papers/360.pdf
%
% When the number of observations is large, computing the derivative of
% log p(x) can be the major computational bottleneck. We can conveniently
% address this problem by approximating the derivative with subsampled mini-batches.
%
% Input:
%   -- theta: a set of particles, M*d matrix (M is the number of particles)
%   -- X, Y: observations, where X is the feature matrix and Y contains the target labels
%   -- batchsize: sub-sampling size of each mini-batch; if batchsize = -1, the derivative is calculated exactly
%   -- a0, b0: hyper-parameters
%%%%%%%

[N, ~] = size(X); % N is the total number of observations

if nargin < 4; batchsize = min(N, 100); end % default batch size 100
if nargin < 5; a0 = 1; end
if nargin < 6; b0 = 1; end

if batchsize > 0
    ridx = randperm(N, batchsize);
    X = X(ridx, :); Y = Y(ridx, :); % stochastic version
end

w = theta(:, 1:end-1);      % logistic weights
alpha = exp(theta(:, end)); % the last column is log(alpha)
D = size(w, 2);

wt = (alpha/2) .* (sum(w.*w, 2));
y_hat = 1 ./ (1 + exp(-X*w'));

dw_data = ((repmat(Y, 1, size(theta, 1)) + 1)/2 - y_hat)' * X; % Y \in {-1, 1}
dw_prior = -repmat(alpha, 1, D) .* w;
dw = dw_data * N / size(X, 1) + dw_prior; % re-scale the mini-batch gradient to the full dataset

dalpha = D/2 - wt + (a0 - 1) - b0.*alpha + 1; % the last term is the Jacobian term of the log(alpha) reparameterization

dlog_p = [dw, dalpha]; % first-order derivative

end
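A hypothetical shape check on random data (illustrative only): the returned gradient has one row per particle and one column per coordinate of theta:

```matlab
% Hypothetical shape check: the gradient matches theta, i.e. M x (d+1).
M = 10; d = 4; N = 500;
X = [randn(N, d-1), ones(N, 1)];          % features with an absorbed bias column
Y = sign(randn(N, 1));                    % labels in {-1, +1}
theta = randn(M, d+1);                    % each row is [w, log(alpha)]
g = dlog_p_lr(theta, X, Y, 100, 1, 0.01); % mini-batch gradient of the log posterior
assert(isequal(size(g), size(theta)));
```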
@@ -0,0 +1,51 @@
function theta = vgd(theta0, dlog_p, max_iter, master_stepsize, h, auto_corr, method)
%%%%%%%%
% Bayesian Inference via Variational Gradient Descent
%
% input:
%   -- theta0: initialization of particles, m * d matrix (m is the number of particles, d is the dimension)
%   -- dlog_p: function handle of the first-order derivative of log p(x)
%   -- max_iter: maximum number of iterations
%   -- master_stepsize: the general learning rate for AdaGrad
%   -- h/bandwidth: bandwidth for the RBF kernel; the median trick is used by default (h = -1)
%   -- auto_corr: momentum term
%   -- method: use AdaGrad to adaptively select the stepsize \epsilon
%
% output:
%   -- theta: a set of particles that approximates p(x)
%
% Copyright (c) 2016, Qiang Liu & Dilin Wang
% All rights reserved.
%%%%%%%%

if nargin < 4; master_stepsize = 0.1; end

% for the following parameters, we always use the default settings
if nargin < 5; h = -1; end
if nargin < 6; auto_corr = 0.9; end
if nargin < 7; method = 'adagrad'; end

switch lower(method)

    case 'adagrad'
        %% AdaGrad with momentum
        theta = theta0;

        fudge_factor = 1e-6;
        historical_grad = 0;

        for iter = 1:max_iter
            grad = KSD_KL_gradxy(theta, dlog_p, h); % \Phi(theta)
            if historical_grad == 0 % first iteration: initialize the accumulator
                historical_grad = historical_grad + grad.^2;
            else
                historical_grad = auto_corr * historical_grad + (1 - auto_corr) * grad.^2;
            end
            adj_grad = grad ./ (fudge_factor + sqrt(historical_grad));
            theta = theta + master_stepsize * adj_grad; % update
        end

    otherwise
        error('wrong method');
end
end
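A hypothetical multimodal example (not in the repository): because VGD maintains a set of interacting particles, it can cover both modes of a 1D Gaussian mixture, which a single point estimate cannot:

```matlab
% Hypothetical target: the mixture 0.5*N(-2, 1) + 0.5*N(2, 1).
% Normalization constants cancel inside the gradient of the log density.
n1 = @(x) exp(-(x+2).^2/2); n2 = @(x) exp(-(x-2).^2/2);
dlog_p = @(x) (-(x+2).*n1(x) - (x-2).*n2(x)) ./ (n1(x) + n2(x));
theta0 = randn(100, 1) - 10;            % start all particles far to the left
theta = vgd(theta0, dlog_p, 2000, 0.1); % particles should spread over both modes
```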
@@ -0,0 +1,13 @@
Copyright 2016 Qiang Liu & Dilin Wang

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.