# Machine Learning Exercise 2: Logistic Regression
From Week 3 of the Coursera course Machine Learning by Andrew Ng: https://www.coursera.org/learn/machine-learning/. The topic is logistic regression for classification.

Eric Nam, https://github.com/eric-nam, 2020

# First Part of Exercise 2: Logistic Regression

## Load the first dataset
Load the dataset in the text file into a DataFrame.

In [None]:
import CSV
using DataFrames
# Path of the text file holding the first dataset (read without a header row).
fpath_csv = "ex2data1.txt"
# `DataFrame!` was deprecated and later removed from DataFrames.jl, so the parsed file is
# piped into the plain `DataFrame` constructor instead. Because there is no header, the
# columns get the auto-generated names Column1, Column2, Column3 used by later cells.
df_data1 = CSV.File(fpath_csv, header=false) |> DataFrame;

## Plot the first dataset

In [None]:
using Plots
# Axis titles for the two exam-score features.
xlabel = "Exam 1 Score"
ylabel = "Exam 2 Score"
# Scatter the first two columns against each other, one series per distinct value of the
# third column. NOTE(review): the label order presumably maps 0 -> "Not admitted" and
# 1 -> "Admitted" — confirm against the data.
scatter(df_data1.Column1, df_data1.Column2, group=df_data1.Column3,
    xlabel=xlabel, ylabel=ylabel, label=["Not admitted" "Admitted"])

## Define a sigmoid function
\begin{equation}
g(z) = \frac{1}{1 + e^{-z}}
\end{equation}

In [None]:
"""
    sigmoid(z)

Calculate the sigmoid function, ``g(z) = \\frac{1}{1 + e^{-z}}``

# Argument
- `z::Number`: input variable

# Return
`Number`
"""
function sigmoid(z)
    # 1 + e^(-z); computed in Float64 arithmetic because of the 1.0 literal.
    denominator = 1.0 + exp(-z)
    return 1.0 / denominator
end

### Test the sigmoid function
Zero should yield 0.5.

In [None]:
# Broadcast over a 1x3 row: 0 gives exactly 0.5, while 100 and -100 saturate towards 1 and 0.
sigmoid.([0 100 -100])

### Plot the sigmoid

In [None]:
# Sample the interval [-10, 10] in steps of 0.25 and plot the sigmoid evaluated at each point.
xx = -10:0.25:10
plot(xx, sigmoid.(xx), label="Sigmoid")

## Define a cost function

\begin{align*}
h_{\theta} (x) &= g(\theta ^T x) \\
J(\theta) &= \frac{1}{m} \sum_i^m [ -y^{(i)} \log(h_{\theta}(x^{(i)})) - (1 - y^{(i)}) \log(1 - h_{\theta}(x^{(i)}))] \\
\frac{\partial J(\theta)}{\partial \theta_j} &= \frac{1}{m} \sum_i^m ( h_{\theta}(x^{(i)}) - y^{(i)} ) x_j^{(i)}
\end{align*}

In [None]:
"""
    cost_function(theta, x, y)

Compute the cost and its gradient for the dataset and theta.

# Arguments
- `theta::Array{Number, 1}`: the coefficients of the cost function, intercept first
- `x::Array{Number, 2}` : the independent variable matrix. The rows are examples, the columns features.
- `y::Array{Number, 1}` : the dependent vector.

# Returns
- `Number`: cost
- `Array{Number, 1}`: gradient
"""
function cost_function(theta, x, y)
    # Only the number of examples is needed; `size(x, 1)` also accepts a single-feature vector.
    m = size(x, 1)
    # Prepend the intercept column of ones so that theta[1] acts as the bias term.
    x1 = hcat(ones(m), x)
    z = x1 * theta
    # The per-example loss -y*log(g(z)) - (1 - y)*log(1 - g(z)) equals softplus(z) - y*z.
    # Evaluating softplus as max(z, 0) + log1p(exp(-|z|)) never takes log(0), so the cost
    # stays finite where the sigmoid saturates to exactly 0 or 1 (0 * -Inf would give NaN).
    losses = @. max(z, 0) + log1p(exp(-abs(z))) - y * z
    cost = sum(losses) / m
    # Gradient (1/m) * X' * (h - y), returned as a plain vector with one entry per coefficient.
    grad = x1' * (sigmoid.(z) .- y) / m
    return (cost, grad)
end

### Test the cost function
The cost function is written following the function signature in the exercise instruction though it is not ideal for the optimizer package used in the exercise.

With the zero thetas, the expected cost and gradient are 0.693 and [-0.1000, -12.0092, -11.2628].

With the test thetas, the expected cost and gradient are 0.218 and [0.043, 2.566, 2.647].

In [None]:
# Feature matrix: the first two columns of the dataset, one row per example.
x = Matrix(df_data1[:, 1:2])
# Target vector: the third column (the trailing `;` suppresses the notebook output).
y = df_data1[:, 3];

In [None]:
# Evaluate cost and gradient at all-zero coefficients (intercept plus two features).
theta = [0, 0, 0]
cost_function(theta, x, y)

In [None]:
# Evaluate cost and gradient at the non-zero test coefficients from the exercise.
theta = [-24; 0.2; 0.2]
cost_function(theta, x, y)

## Optimize the cost with an optimizer
With zero initial coefficients, the expected cost and solution are 0.203 and [-25.161, 0.206, 0.201].

Optim package is used with the default options.

In [None]:
using Optim

In [None]:
# Minimize the cost starting from zero coefficients. `cost_function` returns a
# (cost, gradient) tuple, so the objective closure takes element [1] and the gradient closure
# element [2]; each closure evaluates `cost_function` on its own. `inplace=false` is passed
# because the gradient closure returns a new array rather than filling a supplied buffer.
result = optimize(t -> cost_function(t, x, y)[1], t -> cost_function(t, x, y)[2], [0., 0., 0.], inplace=false)

In [None]:
# Cost value reached by the optimizer.
Optim.minimum(result)

In [None]:
# Coefficient vector at which the optimizer stopped; used below to draw the decision boundary.
theta_opt = Optim.minimizer(result)

## Plot the decision boundary

In [None]:
# End points of the boundary line: the smallest and largest value of the first feature.
x_ends = collect(extrema(x[:, 1]))
# On the boundary theta[1] + theta[2]*x1 + theta[3]*x2 = 0; solve for x2 at both end points.
y_ends = @. -(theta_opt[2] * x_ends + theta_opt[1]) / theta_opt[3];

In [None]:
# Axis titles for the two exam-score features.
xlabel = "Exam 1 Score"
ylabel = "Exam 2 Score"
# Scatter the examples, one series per distinct value of y.
scatter(x[:, 1], x[:, 2], group=y,
    xlabel=xlabel, ylabel=ylabel, label=["Not admitted" "Admitted"])
# Overlay the straight line through the two boundary end points on the same plot.
plot!(x_ends, y_ends, label="Decision Boundary")

# Second Part of Exercise 2: Regularized Logistic Regression

## Read the data 

In [None]:
# Path of the text file holding the second dataset (read without a header row).
fpath_csv = "ex2data2.txt"
# `DataFrame!` was deprecated and later removed from DataFrames.jl, so the parsed file is
# piped into the plain `DataFrame` constructor instead.
df_data2 = CSV.File(fpath_csv, header=false) |> DataFrame;

## Plot the data

In [None]:
using Plots
# Axis titles for the two microchip-test features.
xlabel = "Microchip Test 1"
ylabel = "Microchip Test 2"
# Rebind the globals `x` and `y` to the second dataset: the first two columns are the
# features and the third column is the target.
x = Matrix(df_data2[:, 1:2])
y = df_data2[:, 3]
# Scatter the examples, one series per distinct value of y.
scatter(x[:, 1], x[:, 2], group=y, xlabel=xlabel, ylabel=ylabel, label=["y=0" "y=1"])

## Define a feature mapping function
Map the two input variables to all polynomial terms up to the sixth degree (28 features, including the constant term).

In [None]:
"""
    map_feature(x1, x2; degree=6)

Map two input variables to polynomial features.

The result starts with the constant `1` (the intercept term), followed by every term
`x1^(i - j) * x2^j` for `i` from 1 to `degree` and `j` from 0 to `i`, in that order.
The default `degree` of 6 yields 28 features.

# Arguments
- `x1::Number`: first input variable.
- `x2::Number`: second input variable.

# Keywords
- `degree::Integer=6`: highest total degree of the generated terms.

# Returns
- `Vector`: the constant term followed by the polynomial terms.

# Throws
- `ArgumentError`: if `degree` is negative.
"""
function map_feature(x1, x2; degree=6)
    degree >= 0 || throw(ArgumentError("degree must be non-negative, got $degree"))
    terms = [x1^(i - j) * x2^j for i in 1:degree for j in 0:i]
    return vcat(1, terms)
end

In [None]:
# Dimensions of the raw feature matrix (examples x features) before the polynomial mapping.
size(x)

In [None]:
# Apply `map_feature` to every row of `x` (dims=2 hands each row to the function), giving
# one row of mapped features per example.
features = mapslices(r -> map_feature(r...), x, dims=2)

## Define a cost and a gradient function with regularization
\begin{align*}
h_{\theta} (x) &= g(\theta ^T x) \\
J(\theta) &= \frac{1}{m} \sum_i^m [ -y^{(i)} \log(h_{\theta}(x^{(i)})) - (1 - y^{(i)}) \log(1 - h_{\theta}(x^{(i)}))] + \frac{\lambda}{2 m} \sum_{j=1}^n \theta_j^2\\
\frac{\partial J(\theta)}{\partial \theta_0} &= \frac{1}{m} \sum_i^m ( h_{\theta}(x^{(i)}) - y^{(i)} ) x_j^{(i)} \quad \textrm{for} \quad j=0\\
\frac{\partial J(\theta)}{\partial \theta_j} &= \frac{1}{m} \sum_i^m ( h_{\theta}(x^{(i)}) - y^{(i)} ) x_j^{(i)} + \frac{\lambda}{m} \theta_j \quad \textrm{for} \quad j \geq 1
\end{align*}

In [None]:
"""
    cost_function_reg(theta, x, y, lambda)

Compute the regularized logistic-regression cost for coefficients `theta`.

Unlike `cost_function`, no intercept column is added here: `x` is used as given, and the
first coefficient `theta[1]` is excluded from the regularization penalty.

# Arguments
- `theta::AbstractVector{<:Number}`: coefficients, one per column of `x`.
- `x::AbstractMatrix{<:Number}`: feature matrix; rows are examples, columns are features.
- `y::AbstractVector{<:Number}`: dependent variable, one entry per row of `x`.
- `lambda::Number`: regularization strength.

# Returns
- `Number`: the cost, i.e. the mean logistic loss plus `lambda / (2m)` times the sum of
  squares of `theta[2:end]`.
"""
function cost_function_reg(theta, x, y, lambda)
    m = size(x, 1)
    z = x * theta
    # The per-example loss -y*log(g(z)) - (1 - y)*log(1 - g(z)) equals softplus(z) - y*z.
    # Evaluating softplus as max(z, 0) + log1p(exp(-|z|)) never takes log(0), so the cost
    # stays finite where the sigmoid saturates to exactly 0 or 1 (0 * -Inf would give NaN).
    losses = @. max(z, 0) + log1p(exp(-abs(z))) - y * z
    # Penalize every coefficient except the first; the view avoids copying the slice.
    penalty = lambda * sum(abs2, @view(theta[2:end])) / (2 * m)
    return sum(losses) / m + penalty
end

In [None]:
"""
    cost_gradient_function_reg(theta, x, y, lambda)

Compute the gradient of the regularized logistic-regression cost with respect to `theta`.

# Arguments
- `theta`: coefficient vector, one entry per column of `x`.
- `x`: feature matrix; rows are examples, columns are features.
- `y`: dependent-variable vector, one entry per row of `x`.
- `lambda`: regularization strength; it is not applied to the first coefficient.

# Returns
- The gradient vector, one entry per coefficient.
"""
function cost_gradient_function_reg(theta, x, y, lambda)
    n_examples, n_features = size(x)
    residual = sigmoid.(x * theta) - y
    # Per-coefficient regularization weights; the first coefficient is left unpenalized.
    penalty_weights = fill(lambda, n_features)
    penalty_weights[1] = 0.0
    return (x' * residual + penalty_weights .* theta) / n_examples
end

### Test the cost function
With all zero $\theta$s and $\lambda=1$, the expected cost is 0.693.

With all one $\theta$s and $\lambda=10$, the expected cost is 3.16.

In [None]:
# Regularized cost at all-zero coefficients (one per mapped feature) with lambda = 1.
theta = zeros(size(features, 2))
lambda = 1.0

cost_function_reg(theta, features, y, lambda)

In [None]:
# Regularized cost at all-one coefficients (one per mapped feature) with lambda = 10.
theta = ones(size(features, 2))
lambda = 10.0

cost_function_reg(theta, features, y, lambda)

### Test the gradient function
With all zero $\theta$s and $\lambda=1$, the expected result is [.0085, 0.0188, 0.0001, 0.0503, 0.0115, ...].

With all one $\theta$s and $\lambda=10$, the expected result is [0.3460, 0.1614, 0.1948, 0.2269, 0.0922, ...].

In [None]:
# First five gradient entries at all-zero coefficients with lambda = 1.
theta = zeros(size(features, 2))
lambda = 1.0

cost_gradient_function_reg(theta, features, y, lambda)[1:5]

In [None]:
# First five gradient entries at all-one coefficients with lambda = 10.
theta = ones(size(features, 2))
lambda = 10.0

cost_gradient_function_reg(theta, features, y, lambda)[1:5]

## Optimize

In [None]:
# Start from all-zero coefficients, one per mapped feature, with lambda = 1.
theta_init = zeros(size(features)[2])
lambda = 1.

# Minimize the regularized cost using the separate gradient function. The closures read the
# global `lambda` when they are called; `inplace=false` is passed because the gradient
# function returns a new array rather than filling a supplied buffer.
result = optimize(t -> cost_function_reg(t, features, y, lambda),
                  t -> cost_gradient_function_reg(t, features, y, lambda), theta_init, inplace=false)

In [None]:
# Coefficient vector at which the optimizer stopped for lambda = 1.
theta_opt = Optim.minimizer(result)

## Draw the decision boundary

In [None]:
# Grid axes covering each feature's observed range padded by 0.2 on both sides, 50 points each.
xx = range(minimum(x[:, 1]) - 0.2, maximum(x[:, 1]) + 0.2; length=50)
yy = range(minimum(x[:, 2]) - 0.2, maximum(x[:, 2]) + 0.2; length=50)

In [None]:
# Evaluate theta' * map_feature(x1, x2) at every grid point; zz[i, j] belongs to (xx[i], yy[j]).
zz = map(v -> theta_opt' * map_feature(v...), Iterators.product(xx, yy))
scatter(x[:, 1], x[:, 2], group=y, xlabel=xlabel, ylabel=ylabel, label=["y=0" "y=1"])
# Draw only the zero level, where the sigmoid equals 0.5, i.e. the decision boundary.
# NOTE(review): zz is transposed presumably because the plot expects rows to follow yy — confirm.
contour!(xx, yy, zz', levels=[0], label="Decision Boundary",
         colorbar=nothing, color=cgrad(:rainbow), aspect_ratio=:equal)

In [None]:
# Surface of the linear score theta' * features over the grid, with the z axis limited to [-5, 5].
surface(xx, yy, zz', zlim=[-5, 5])

### Try it again with different lambdas

In [None]:
# Repeat the optimization from all-zero coefficients with a much larger lambda of 100.
theta_init = zeros(size(features)[2])
lambda = 100.

# The closures read the global `lambda` when they are called; `inplace=false` is passed
# because the gradient function returns a new array rather than filling a supplied buffer.
result = optimize(t -> cost_function_reg(t, features, y, lambda),
                  t -> cost_gradient_function_reg(t, features, y, lambda), theta_init, inplace=false)

In [None]:
# Coefficient vector at which the optimizer stopped for lambda = 100.
theta_opt = Optim.minimizer(result)

In [None]:
# Evaluate theta' * map_feature(x1, x2) at every grid point; zz[i, j] belongs to (xx[i], yy[j]).
zz = map(v -> theta_opt' * map_feature(v...), Iterators.product(xx, yy))
scatter(x[:, 1], x[:, 2], group=y, xlabel=xlabel, ylabel=ylabel, label=["y=0" "y=1"])
# Draw only the zero level, where the sigmoid equals 0.5, i.e. the decision boundary.
# NOTE(review): zz is transposed presumably because the plot expects rows to follow yy — confirm.
contour!(xx, yy, zz', levels=[0], label="Decision Boundary",
         colorbar=nothing, color=cgrad(:rainbow), aspect_ratio=:equal)

In [None]:
# Surface of the linear score theta' * features over the grid, with the z axis limited to [-1, 1].
surface(xx, yy, zz', zlim=[-1, 1])

In [None]:
# Repeat the optimization from all-zero coefficients with lambda = 0, i.e. no regularization.
theta_init = zeros(size(features)[2])
lambda = 0.

# The closures read the global `lambda` when they are called; `inplace=false` is passed
# because the gradient function returns a new array rather than filling a supplied buffer.
result = optimize(t -> cost_function_reg(t, features, y, lambda),
                  t -> cost_gradient_function_reg(t, features, y, lambda), theta_init, inplace=false)

In [None]:
# Coefficient vector at which the optimizer stopped for lambda = 0.
theta_opt = Optim.minimizer(result)

In [None]:
# Evaluate theta' * map_feature(x1, x2) at every grid point; zz[i, j] belongs to (xx[i], yy[j]).
zz = map(v -> theta_opt' * map_feature(v...), Iterators.product(xx, yy))
scatter(x[:, 1], x[:, 2], group=y, xlabel=xlabel, ylabel=ylabel, label=["y=0" "y=1"])
# Draw only the zero level, where the sigmoid equals 0.5, i.e. the decision boundary.
# NOTE(review): zz is transposed presumably because the plot expects rows to follow yy — confirm.
contour!(xx, yy, zz', levels=[0], label="Decision Boundary",
         colorbar=nothing, color=cgrad(:rainbow), aspect_ratio=:equal)

In [None]:
# Surface of the linear score theta' * features over the grid, with the z axis limited to [-10, 10].
surface(xx, yy, zz', zlim=[-10, 10])