/
gpr.m
365 lines (315 loc) · 11.1 KB
/
gpr.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
function varargout = gpr(x, y, varargin)
%GPR   Gaussian process regression.
%   F = GPR(X, Y) returns a CHEBFUN F defined on [min(X),max(X)]
%   representing the posterior mean of a Gaussian process with prior mean 0
%   and squared exponential kernel
%       k(x,x') = SIGMA^2*exp(-1/(2*L^2)*(x-x')^2).
%   The default signal variance is SIGMA = max(abs(Y)). L is chosen such
%   that it maximizes the log marginal likelihood (see eq. (2.30) from [1]).
%   F matches Y at X.
%
%   [F, FVAR] = GPR(X, Y) also returns a CHEBFUN representing an estimate
%   of the variance in the posterior.
%
%   [F, FVAR, SAMPLES] = GPR(X, Y, 'samples', N) also computes N
%   independent samples from the posterior distribution, returning them
%   as the N columns of the quasimatrix SAMPLES.
%
%   [...] = GPR(..., 'domain', DOM) computes the results on the domain
%   DOM = [A, B].
%
%   [...] = GPR(..., 'trig') uses a periodic version of the squared
%   exponential kernel (see eq. (4.31) from [1]), namely
%       k(x,x') = SIGMA^2*exp(-2/L^2*sin(pi*(x-x')/P)^2),
%   where P is the period length, corresponding to the size of the
%   approximation domain. If the domain is not specified, it is chosen to
%   be [min(X), max(X) + .1*(max(X)-min(X))] to account for the fact that
%   the Y values might not be identical at min(X) and max(X).
%
%   [...] = GPR(..., 'sigma', SIGMA) specifies the signal variance of the
%   kernel function.
%
%   [...] = GPR(..., 'L', L) specifies the length scale parameter of the
%   kernel function.
%
%   [...] = GPR(..., 'noise', SIGMAY) specifies that the input is noisy
%   i.i.d. with noise distribution N(0,SIGMAY^2). The kernel function
%   takes this into account by updating its values at the sample points;
%   it becomes
%       k'(x,x') = k(x,x') + SIGMAY^2*delta_xx',
%   where delta_xx' is the Kronecker delta function, taking one if x = x'
%   and zero otherwise (see eq. (2.20) from [1]).
%
%   Examples:
%
%       rng(1)
%       n = 10; x = -2 + 4*rand(n,1);
%       y = sin(exp(x));
%       [f,fvar,smpl] = gpr(x,y,'domain',[-2,2],'samples',3);
%       plot(f), hold on
%       plot(smpl,'color',[.8 .8 .8]), plot(x,y,'.k','markersize',14)
%       hold off
%
%       % add noise to the inputs
%       y = y + .1*randn(n,1);
%       f2 = gpr(x,y,'domain',[-2,2],'noise',.1);
%       plot(f2), hold on
%       plot(x,y,'.k','markersize',14), hold off
%
%   Reference:
%
%   [1] C. E. Rasmussen & C. K. I. Williams, "Gaussian Processes
%       for Machine Learning", MIT Press, 2006
%
% Copyright 2017 by The University of Oxford and The Chebfun Developers.
% See http://www.chebfun.org/ for Chebfun information.

x = x(:); y = y(:);

% Normalize the data to unit scale for numerical robustness. When SIGMA is
% not supplied, the default SIGMA = max(abs(Y)) restores the original scale.
scalingFactor = 1;
sigmaGiven = 0;
for k = 1:length(varargin)
    if ( strcmpi('sigma', varargin{k}) )
        sigmaGiven = 1;
    end
end
if ( ~isempty(y) && ~sigmaGiven )
    scalingFactor = max(abs(y));
end
yn = y/scalingFactor;

opts = parseInputs(x, yn, varargin{:});
if ( ~opts.sigmaGiven )
    opts.sigma = scalingFactor;
end

if ( ~isempty(x) )
    % Construct the (noise-free) kernel matrix corresponding to x. For the
    % moment, we assume a Gaussian squared exponential kernel (see for
    % instance eq. (2.31) from [1]). The observation noise sigmaY^2*I is
    % added exactly once below, at the factorization step.
    n = length(x);
    r = repmat(x,1,n) - repmat(x',n,1);
    if ( opts.trig )
        K = opts.sigma^2*exp(-2/(opts.lenScale^2) * ...
            sin(pi/(opts.dom(end)-opts.dom(1))*r).^2);
    else
        K = (opts.sigma^2)*exp(-1/(2*opts.lenScale^2)*r.^2);
    end

    % Compute the Cholesky decomposition of K. In the noisy case add the
    % noise term sigmaY^2*I (eq. (2.20) from [1]); otherwise add a tiny
    % jitter to keep the factorization numerically stable.
    if ( opts.sigmaY == 0 )
        L = chol(K + 1e-15*scalingFactor^2*n*eye(n), 'lower');
    else
        L = chol(K + opts.sigmaY^2*eye(n), 'lower');
    end

    % Coefficients of the radial basis function expansion of the mean.
    alpha = L'\(L\y);

    % Construct a chebfun approximation for the posterior distribution mean.
    if ( opts.trig && ~opts.sigmaY )
        f = chebfun(@(z) mean(alpha, x, z, opts), opts.dom, 'trig', ...
            'eps', 1e-12, 'splitting', 'on');
    else
        f = chebfun(@(z) mean(alpha, x, z, opts), opts.dom, ...
            'eps', 1e-12, 'splitting', 'on');
    end

    % Compute the predictive variance based on a large sample set.
    sampleSize = min(20*n, 2000);
    xSample = chebpts(sampleSize, opts.dom);

    % Check if we need to add the noise term in the covariance matrix.
    inX = ismember(xSample, x);

    % Construct the cross- and test-point difference matrices.
    rx = repmat(xSample,1,n) - repmat(x',sampleSize,1);
    rxs = repmat(xSample,1,sampleSize) - repmat(xSample',sampleSize,1);
    if ( opts.trig )
        Ks = opts.sigma^2*exp(-2/(opts.lenScale^2) * ...
            sin(pi/(opts.dom(end)-opts.dom(1))*rx).^2) + ...
            opts.sigmaY^2*(rx == 0);
        Kss = opts.sigma^2*exp(-2/(opts.lenScale^2) * ...
            sin(pi/(opts.dom(end)-opts.dom(1))*rxs).^2) + ...
            opts.sigmaY^2*diag(inX);
    else
        Ks = opts.sigma^2*exp(-1/(2*opts.lenScale^2)*rx.^2) + ...
            opts.sigmaY^2*(rx == 0);
        Kss = opts.sigma^2*exp(-1/(2*opts.lenScale^2)*rxs.^2) + ...
            opts.sigmaY^2*diag(inX);
    end
    v = L\(Ks');
    fvar = spdiags(Kss - v'*v, 0);   % pointwise posterior variance
    fvar = chebfun(fvar, opts.dom);
else
    % No data points given: return the prior, which has zero mean.
    f = chebfun(0, opts.dom);
    fvar = chebfun(opts.sigma^2, opts.dom);
end
fvar = simplify(fvar);

% Take samples from the posterior and construct chebfun representations of
% them. For the moment, just sample at a large number of points and
% construct chebfun representations.
if ( opts.samples > 0 )
    if ( ~isempty(x) )
        Ls = chol(Kss - v'*v + 1e-12*scalingFactor^2*n*eye(sampleSize), ...
            'lower');
        fSample = repmat(f(xSample), 1, opts.samples) + ...
            Ls*randn(sampleSize, opts.samples);
        fSample = chebfun(fSample, opts.dom);
    else
        sampleSize = 1000;
        if ( opts.trig )
            xSample = linspace(opts.dom(1), opts.dom(end), sampleSize)';
            rxs = repmat(xSample,1,sampleSize) - ...
                repmat(xSample',sampleSize,1);
            Kss = opts.sigma^2*exp(-2/(opts.lenScale^2) * ...
                sin(pi/diff(opts.dom)*rxs).^2);
            Ls = chol(Kss + 1e-12*scalingFactor^2*eye(sampleSize), 'lower');
        else
            xSample = chebpts(sampleSize, opts.dom);
            rxs = repmat(xSample,1,sampleSize) - ...
                repmat(xSample',sampleSize,1);
            Kss = (opts.sigma^2)*exp(-1/(2*opts.lenScale^2)*rxs.^2);
            Ls = chol(Kss + 1e-12*scalingFactor^2*eye(sampleSize), 'lower');
        end
        fSample = repmat(f(xSample), 1, opts.samples) + ...
            Ls*randn(sampleSize, opts.samples);
        if ( opts.trig )
            fSample = chebfun(fSample, opts.dom, 'trig');
        else
            fSample = chebfun(fSample, opts.dom);
        end
    end
    varargout = {f, fvar, fSample};
else
    varargout = {f, fvar};
end

end
function opts = parseInputs(x, y, varargin)
%PARSEINPUTS   Parse the optional arguments of GPR into an options struct.
%   Fields: samples, sigma, sigmaGiven, sigmaY, lenScale, dom, trig.
%   When no length scale is supplied, one is chosen by maximizing the log
%   marginal likelihood over a heuristic search interval.

if ( length(x) ~= length(y) )
    error('CHEBFUN:CHEBFUN:gpr:badInput', ...
        'The number of points and data values must be equal.');
end

% Defaults.
opts.samples = 0;
opts.sigma = 0;
opts.sigmaGiven = 0;
opts.sigmaY = 0;
opts.lenScale = 0;
opts.dom = [];
opts.trig = 0;

% 'trig' is a lone flag (it carries no value); record its position and
% remove it so that the remaining arguments form name-value pairs.
for k = 1:length(varargin)
    if ( strcmpi('trig', varargin{k}) )
        opts.trig = k;
    end
end
if ( opts.trig )
    varargin(opts.trig) = [];
end

for k = 1:2:length(varargin)
    if ( strcmpi('samples', varargin{k}) )
        opts.samples = varargin{k+1};
    elseif ( strcmpi('sigma', varargin{k}) )
        opts.sigmaGiven = 1;
        opts.sigma = varargin{k+1};
    elseif ( strcmpi('L', varargin{k}) )
        opts.lenScale = varargin{k+1};
    elseif ( strcmpi('domain', varargin{k}) )
        opts.dom = varargin{k+1};
    elseif ( strcmpi('noise', varargin{k}) )
        opts.sigmaY = varargin{k+1};
    else
        error('CHEBFUN:CHEBFUN:gpr:badInput', ...
            'Unrecognized sequence of input parameters.');
    end
end

if ( isempty(opts.dom) )   % domain not provided, default to [min(x) max(x)]
    if ( isempty(x) )
        opts.dom = [-1 1];
    elseif ( length(x) == 1 )
        opts.dom = [x-1 x+1];
    elseif ( opts.trig )
        % Pad the right endpoint by a tenth of the data range so that the
        % values at min(x) and max(x) need not coincide for a periodic fit.
        % (Named 'pad' rather than 'diff' to avoid shadowing the builtin.)
        pad = (max(x) - min(x))/10;
        opts.dom = [min(x) max(x)+pad];
    else
        opts.dom = [min(x) max(x)];
    end
end

if ( ~isempty(x) && opts.trig )
    % If both domain endpoints are among the data points, the data must
    % itself enforce periodicity: equal values at the two endpoints.
    [~, idMin] = min(x);
    [~, idMax] = max(x);
    if ( opts.dom(1) == x(idMin) && opts.dom(end) == x(idMax) )
        if ( y(idMin) ~= y(idMax) )
            error('CHEBFUN:CHEBFUN:gpr:trig', ...
                ['For a periodic fit, the data values at the two ' ...
                'domain endpoints must be equal.']);
        end
    end
end

if ( ~opts.lenScale )   % hyperparameters not specified
    n = length(x);
    if ( ~opts.sigmaGiven )
        opts.sigma = 1;
    end
    % Construct a chebfun approximation of the log marginal likelihood
    % parametrized on the length scale. Use the length scale maximizing
    % this function.
    domSize = opts.dom(end) - opts.dom(1);
    if ( opts.trig )
        searchDom = [1/(2*n) 10];
    else
        searchDom = [1/(2*pi*n)*domSize 10/pi*domSize];
    end
    % Heuristic for reducing the optimization domain for the max log
    % marginal likelihood estimation: while the left endpoint dominates,
    % shrink the interval from the right.
    fdom1 = logML(searchDom(1), x, y, opts);
    fdom2 = logML(searchDom(2), x, y, opts);
    while ( fdom1 > fdom2 && searchDom(2)/searchDom(1) > 1 + 1e-4 )
        newBound = searchDom(1) + (searchDom(2) - searchDom(1))/10;
        fdomnew = logML(newBound, x, y, opts);
        if ( fdomnew > fdom1 )
            break;
        else
            searchDom(2) = newBound;
            fdom2 = fdomnew;
        end
    end
    f = chebfun(@(z) logML(z, x, y, opts), searchDom, ...
        'eps', 1e-6, 'splitting', 'on');
    [~, opts.lenScale] = max(f);
end

end
% Computes the mean function estimate of the GP (using a Gaussian squared
% exponential kernel) at the points xEval, given the coefficients alpha of
% the radial basis function expansion of the posterior mean. The noise
% variance sigmaY^2 is added at coincident points, consistent with the
% kernel k'(x,x') = k(x,x') + sigmaY^2*delta_xx' (eq. (2.20) from [1]).
function fxEval = mean(alpha, x, xEval, opts)
n = length(x);
xEval = xEval(:);
m = length(xEval);
% Pairwise differences between evaluation points and data points.
rx = repmat(xEval,1,n) - repmat(x',m,1);
if ( opts.trig )
    Kss = opts.sigma^2*exp(-2/(opts.lenScale^2) * ...
        sin(pi/diff(opts.dom)*rx).^2) + opts.sigmaY^2*(rx == 0);
else
    Kss = opts.sigma^2*exp(-1/(2*opts.lenScale^2)*rx.^2) + ...
        opts.sigmaY^2*(rx == 0);
end
fxEval = Kss*alpha;
end
% Computes the log marginal likelihood estimate for a given array of
% hyperparameters (i.e., length scales), following Alg. 2.1 from [1].
function fxEval = logML(lenScale, x, y, opts)
fxEval = zeros(size(lenScale));
[r, c] = size(lenScale);
n = length(x);
% Pairwise differences between the data points.
rx = repmat(x,1,n) - repmat(x',n,1);
for i = 1:r
    for j = 1:c
        if ( opts.trig )
            K = opts.sigma^2*exp(-2/(lenScale(i,j)^2) * ...
                sin(pi/diff(opts.dom)*rx).^2);
        else
            K = opts.sigma^2*exp(-1/(2*lenScale(i,j)^2) * rx.^2);
        end
        % Compute the Cholesky decomposition of K (with noise term, or a
        % tiny jitter in the noise-free case for numerical stability).
        if ( opts.sigmaY ~= 0 )
            L = chol(K + opts.sigmaY^2*eye(n), 'lower');
        else
            L = chol(K + 1e-15*n*opts.sigma^2*eye(n), 'lower');
        end
        alpha = L'\(L\y);
        % Log marginal likelihood (see line 7 from Alg. 2.1 in [1]);
        % log(det(K)) = 2*sum(log(diag(L))), which avoids the elementwise
        % log of the whole triangular factor.
        fxEval(i,j) = -.5*(y'*alpha) - sum(log(diag(L))) - n/2*log(2*pi);
    end
end
end