In [1]:
using Flux
using Flux: update!

In [2]:
# Set-up for the regression experiment.

# Synthetic "observed" data.
# fσ computes x / (1 + exp(-x)) element-wise, i.e. x·sigmoid(x).
fσ(x) = @. x / (1 + exp(-x))
Wₜ = [1.1, 2.2, 3.3, 4, 5]      # True weights; treated as unknown by the fit below.
xₒ = rand(8, 5)                 # Observed inputs: 8 samples × 5 features.
yₒ = fσ(xₒ * Wₜ)                # Observed targets generated from the true weights.

# The model: a linear map x * W passed through fσ.
function regress(x, W)
    return fσ(x * W)
end
regress(W) = regress(xₒ, W)     # Convenience method bound to the observed inputs.

# The loss: sum of squared residuals between prediction and target.
function loss(x, y, W)
    residual = regress(x, W) .- y
    return sum(residual .^ 2)
end
loss(W) = loss(xₒ, yₒ, W);

In [3]:
# Initialise Wᵢ to random values (a 5×1 matrix, one row per weight).
Wᵢ = rand(5,1)

# Gradient of the loss with respect to W (dloss/dW), obtained by automatic differentiation.
# `gradient` returns one entry per differentiated argument, so `[1]` selects the one for W.
dloss(W) = gradient(loss, W)[1]

# Descent(η) constructs a plain gradient-descent optimiser with learning rate η = 0.0001.
# It is an optimiser object rather than a function; it is created once, outside the loop,
# so the same optimiser is reused on every iteration (other optimisers may carry state
# between steps — NOTE(review): confirm against the Flux version in use).
opt = Descent(0.0001);

# update! mutates Wᵢ in place using the gradient dloss(Wᵢ).
# The iteration count is an integer range (the float literal 1E6 would yield a
# floating-point range), and the loop index is unused, hence `_`.
for _ in 1:1_000_000
    update!(opt, Wᵢ, dloss(Wᵢ))
end

Wᵢ

5×1 Matrix{Float64}:
 1.112074221721476
 2.205466017258204
 3.2778446054317927
 4.007941741352166
 4.99612555672165

In [None]:
# Let's try doing it with a better optimization algorithm