forked from amaas/stanford_dl_ex
-
Notifications
You must be signed in to change notification settings - Fork 0
/
supervised_dnn_cost.m
153 lines (131 loc) · 5.07 KB
/
supervised_dnn_cost.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
function [ cost, grad, pred_prob] = supervised_dnn_cost( theta, ei, data, labels, pred_only)
% SUPERVISED_DNN_COST Cost function for supervised neural network
% Does all the work of cost / gradient computation
% Inputs:
%   theta     - all weights/biases flattened into one vector (see params2stack)
%   ei        - experiment info struct; uses ei.layer_sizes, ei.output_type
%               ('categorical' or 'continuous'), and ei.lambda (L2 penalty)
%   data      - input matrix, one sample per COLUMN
%   labels    - class indices (categorical) or target matrix (continuous)
%   pred_only - optional flag; if true, skip cost/gradient and return preds
% Returns: total cost;
%          gradient for each W and b at each layer (flattened to a vector);
%          and prediction probabilities for each class for each sample
%% default values
po = false;
if exist('pred_only','var')
  po = pred_only;
end;
%% reshape into network
% stack{1} has weights W and biases b that combine with data X
% to form inputs to 1st hidden layer;
% stack{end} has W and b that combine with last hidden layer
% to form output pred_probs (unnormalized)
stack = params2stack(theta, ei);
nrLayers = numel(ei.layer_sizes) + 1;
stackDepth = nrLayers - 1;
a = cell(nrLayers, 1); % 1st layer is data, last is output (unnormalized)
gradStack = cell(stackDepth, 1);
nrSamples = size(data, 2);
%% forward prop
% For layer l = 2:nrLayers
% (to be consistent with the tutorial notes:
% with a single hidden layer,
% X is layer 1, hidden is layer 2, output is layer 3;
% so that's a stack depth of 2 = nr of layers BESIDES output)
a{1} = data; % a(1)=X;
for l = 2:nrLayers
  % z(l) = [W*a + b](l-1), for each sample,
  % where b is col.vector with same nr rows as W
  z = bsxfun(@plus, stack{l-1}.W*a{l-1}, stack{l-1}.b);
  if l == nrLayers && strcmp(ei.output_type, 'categorical')
    % If classifying into categories, then for last layer,
    % don't do full sigmoid, just exp, to calculate softmax below in pred_prob.
    % Subtract each sample's max logit first so exp() cannot overflow to Inf;
    % the shift cancels exactly when we normalize columns below, so the
    % resulting pred_prob is mathematically unchanged.
    a{l} = exp(bsxfun(@minus, z, max(z, [], 1)));
  else
    % For other layers, or for continuous output, DO calculate sigmoid
    a{l} = f(z);
  end
end
% IF categorical,
% normalize final layer, so each sample's outputs sum to 1,
% so that they are actual prediction probabilities.
% Final layer outputs are in a, with nrClasses rows and nrSamples cols,
% so we need something like (a ./ sum(a,1)), arranged right...
% but IF continuous, just use a{nrLayers} as the prediction.
switch ei.output_type
  case 'categorical'
    pred_prob = bsxfun(@rdivide, a{nrLayers}, sum(a{nrLayers},1));
  case 'continuous'
    pred_prob = a{nrLayers};
end
%% return here if only predictions desired.
if po
  cost = -1; % sentinel; cost is not computed in prediction-only mode
  grad = [];
  return;
end;
%% compute cost
switch ei.output_type
  case 'categorical'
    % Find linear equivalents of matrix indices (i,j)
    % where, in j'th sample (column), i=y(j)
    % i.e. the row ID is the class of that column
    IDs = sub2ind(size(pred_prob), labels', 1:nrSamples);
    % Compute binary matrix where IDs are 1 and rest are 0
    y_matrix = zeros(size(pred_prob));
    y_matrix(IDs) = 1;
    % Cross-entropy cost f=J(theta), NOT yet including L2 penalty on the weights
    cost = -sum(log(pred_prob(IDs)));
  case 'continuous' % use squared error loss, but DON'T divide by NrSamples
    cost = .5 .* sum(sum((pred_prob - labels).^2));
end
%% compute gradients using backpropagation
% For layers 2:nrLayers, we will need a delta vector
% (so 1st element of deltaStack will just stay empty)
deltaStack = cell(nrLayers, 1);
% The last layer's delta vector is just the residuals,
% NOT summed over samples like the tutorial mistakenly says.
% We want one delta per output unit AND per sample.
switch ei.output_type
  case 'categorical'
    % Softmax + cross-entropy: output delta simplifies to (p - y)
    deltaStack{nrLayers} = pred_prob - y_matrix;
  case 'continuous'
    % Squared error through a sigmoid output: chain in sigmoid derivative
    deltaStack{nrLayers} = (pred_prob - labels) .* (a{nrLayers}.*(1-a{nrLayers}));
end
% Backpropagate to get deltas for previous layers
for l=((nrLayers-1):-1:2)
  % delta{l} = [W{l}' * delta{l+1}] .* [deriv of f wrt z{l}]
  % TODO: if generalizing to different function f,
  % replace the last product a.*(1-a) with the appropriate derivative of f at z
  deltaStack{l} = (stack{l}.W'*deltaStack{l+1}) .* (a{l}.*(1-a{l}));
end
% Compute gradient (sum of contributions of each sample)
% at each layer, for W and b
% (still WITHOUT the L2 penalty on the weights)
% NOTE: Take SUMS not MEANS since our loss function is a sum not a mean
% (unlike sq.err. loss example in tutorial)
% ...and for simplicity, let's also do sums not means for the sq.err. loss too with continuous outputs
for l=1:stackDepth
  % Grad of W{l}(i,j) is matrix of sums of delta{l+1}(i)*a{l}(j) across samples
  gradStack{l}.W = deltaStack{l+1} * a{l}';
  % For grad of b, take sum within rows of delta{l+1} i.e. across samples
  gradStack{l}.b = sum(deltaStack{l+1}, 2);
end
%% compute L2 weight penalty cost and gradient for non-bias terms
if ei.lambda ~= 0
  % For each layer with weights W in it...
  for l=1:stackDepth
    % Add to the cost: sum of squared weights multiplied by lambda/2
    cost = cost + (ei.lambda/2)*sum(sum(stack{l}.W .^ 2));
    % Add to the gradient: weight times lambda
    gradStack{l}.W = gradStack{l}.W + ei.lambda.*stack{l}.W;
  end
end
%% reshape gradients into vector, so gradient search can optimize them all at once
[grad] = stack2params(gradStack);
end
% Helper subfunctions:
% Let f be the sigmoid, logistic, or inverse-logit function.
% We can apply it element-wise.
function h=f(z)
% Compute the sigmoid directly in base MATLAB/Octave instead of calling
% logsig, which requires the Deep Learning Toolbox; the two are identical:
% logsig(z) == 1 ./ (1 + exp(-z)).
h = 1 ./ (1 + exp(-z));
end