In [4]:
using Plots
plotly()

# data: X is a 3×2 input matrix; `forward` multiplies it as X * w1, so each row
# is one training sample. y holds one target value per row of X.
X = [3 5; 5 1; 10 2];
y = [75; 82; 93];

# weights, drawn at random with randn:
#   w1 (2×3) maps the inputs to the hidden layer,
#   w2 (3×1) maps the hidden layer to the single output.
w1 = randn(2,3);
w2 = randn(3,1);

# stat functions for normalization of data

"""
    mu(xs, n)

Return the sum of the elements of `xs` divided by `n` (the mean of `xs` when
`n` is its length).
"""
function mu(xs, n)
    total = sum(xs)
    return total / n
end;
"""
    norm(xs, μ, min, max)

Mean-normalize every element `x` of `xs` as `(x - μ) / (max - min)` and return
the results in a collection shaped like `xs`.

# Arguments
- `xs`: values to normalize.
- `μ`: value subtracted from every element (typically the mean of `xs`).
- `min`, `max`: bounds whose difference is used as the scale.

NOTE(review): this name coincides with `LinearAlgebra.norm`; if that module is
ever loaded alongside this definition the two will clash.
"""
norm(xs, μ, min, max) = map(x -> (x - μ) / (max - min), xs);

# simple normalization: divide the inputs by the largest entry of X so they lie
# in (0, 1], and divide the targets by 100 so they fall inside the (0, 1) range
# that the sigmoid output of the network can reach.
X = X./maximum(X);
y = y./100;

"""
    sigmoid(x)

Logistic activation function, `1 / (1 + exp(-x))`, for a scalar `x`.
The result lies in the interval [0, 1].
"""
function sigmoid(x)
    # `exp(-x)` rather than `e^(-x)`: the bare constant `e` is not exported by
    # Julia 0.7 and later, while `exp` gives the same value on every version.
    return 1 / (1 + exp(-x))
end;

"""
    sigmoidPrime(x)

Derivative of the logistic activation function at a scalar `x`:
`exp(-x) / (1 + exp(-x))^2`.
"""
function sigmoidPrime(x)
    # The derivative is an even function, so it is evaluated at -|x|. This keeps
    # the exponential in (0, 1]; the direct formula computes Inf / Inf = NaN for
    # very negative x. `exp` also replaces the constant `e`, which Julia 0.7 and
    # later no longer export.
    t = exp(-abs(x))
    return t / (1 + t)^2
end;

"""
    g(xs)

Apply the activation function `sigmoid` to every element of `xs`.
"""
g(xs) = map(sigmoid, xs);

"""
    gPrime(xs)

Apply the activation derivative `sigmoidPrime` to every element of `xs`.
"""
gPrime(xs) = map(sigmoidPrime, xs);

"""
    forward(X, w1, w2)

Propagate the inputs `X` through the two-layer network with weights `w1` and
`w2`.

Returns the tuple `(yhat, z3, z2, a2)`:
- `z2 = X * w1`, the hidden-layer pre-activations,
- `a2 = g(z2)`, the hidden-layer activations,
- `z3 = a2 * w2`, the output-layer pre-activations,
- `yhat = g(z3)`, the network output.
"""
function forward(X, w1, w2)
    hidden_in = X * w1
    hidden_out = g(hidden_in)
    output_in = hidden_out * w2

    return g(output_in), output_in, hidden_in, hidden_out
end;

"""
    cost(X, y, yhat)

Return half the sum of squared differences between the targets `y` and the
predictions `yhat`. `X` is accepted but not used.
"""
function cost(X, y, yhat)
    residual = y - yhat
    return 0.5 * sum(residual .^ 2)
end;

"""
    costPrime(X, y, yhat, z3, z2, a2, lambda, weights1=w1, weights2=w2)

Compute the gradient of the regularized cost, ∇J = (dJ/dW1, dJ/dW2), by
backpropagation.

# Arguments
- `X`: input matrix, one sample per row.
- `y`: target values.
- `yhat`, `z3`, `z2`, `a2`: the values returned by `forward`.
- `lambda`: regularization parameter.
- `weights1`, `weights2`: the weights at which the regularization term is
  differentiated. They default to the globals `w1` and `w2` only so that
  existing seven-argument calls keep working; pass the current weights.

# Returns
The tuple `(dJdW1, dJdW2)`, shaped like `weights1` and `weights2`.
"""
function costPrime(X, y, yhat, z3, z2, a2, lambda, weights1=w1, weights2=w2)
    # Error signal at the output layer (samples × outputs).
    delta3 = -(y - yhat) .* gPrime(z3)
    # a2 is samples × hidden, so it has to be transposed for the product to be
    # shaped like weights2 (hidden × outputs).
    dJdW2 = transpose(a2) * delta3 + lambda .* weights2
    # Push the error back through weights2 with a matrix product, then scale it
    # element-wise by the hidden-layer activation derivative (samples × hidden).
    delta2 = (delta3 * transpose(weights2)) .* gPrime(z2)
    dJdW1 = transpose(X) * delta2 + lambda .* weights1

    return dJdW1, dJdW2
end;

"""
    regTerm(lambda, w1, w2)

Return the L2 regularization penalty `(lambda / 2) * (sum(w1.^2) + sum(w2.^2))`.
"""
function regTerm(lambda, w1, w2)
    return (lambda / 2) * (sum(abs2, w1) + sum(abs2, w2))
end

"""
    train(w1, w2, alpha, lambda, history, j, steps)

Run gradient descent on the network weights and return `(w1, w2, history)`.

# Arguments
- `w1`, `w2`: initial weights.
- `alpha`: learning rate.
- `lambda`: regularization parameter.
- `history`: vector that receives the cost J of every iteration (mutated).
- `j`: starting value of the iteration counter.
- `steps`: counter value at which training stops, so `steps - j` updates are
  made; nothing is done when `j >= steps`.

The training data is read from the globals `X` and `y`.
"""
function train(w1, w2, alpha, lambda, history, j, steps)
    # A loop rather than self-recursion: Julia does not eliminate tail calls, so
    # recursing once per step can overflow the stack for large `steps` and never
    # terminates when called with j > steps.
    while j < steps
        yhat, z3, z2, a2 = forward(X, w1, w2)
        J = cost(X, y, yhat) + regTerm(lambda, w1, w2)
        # Hand over the current weights so that the regularization gradient is
        # evaluated at them instead of at the global initial weights.
        dJdW1, dJdW2 = costPrime(X, y, yhat, z3, z2, a2, lambda, w1, w2)
        w1 = w1 - alpha .* dJdW1
        w2 = w2 - alpha .* dJdW2
        push!(history, J)
        j += 1
    end

    return w1, w2, history
end;

# Train with learning rate 0.3 and regularization parameter 0.0001, collecting
# the cost of every iteration in a fresh empty vector. The counter starts at 1
# and training stops once it reaches 30, so 29 weight updates are performed.
w1, w2, history = train(w1, w2, 0.3, 0.0001, zeros(0), 1, 30);

In [5]:
# Plot the cost recorded at each training iteration.
plot(history, title="Cost")