# ECE4179 - Assignment 1
Evan Tan 27401995 \
etan0008@student.monash.edu

## COMMENT OUT THE CELL BELOW WHEN SUBMITTING

In [None]:
%load_ext lab_black
%matplotlib inline

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm

## Functions

### 2.1

In [None]:
def sigmoid(x: np.ndarray):
    """Compute the logistic sigmoid 1 / (1 + exp(-x)) element-wise.

    The exponential is only ever evaluated at ``-|x|``, so it cannot
    overflow. The naive form evaluates ``exp(-x)``, which overflows (and
    emits a RuntimeWarning) for very negative ``x``.

    :param x: input value(s)
    :type x: np.ndarray
    :return: sigmoid of x, element-wise, with values in [0, 1]
    :rtype: np.ndarray
    """
    # z = exp(-|x|) always lies in [0, 1]
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x))
    # x <  0: exp(x) / (1 + exp(x)), which is the same value written safely
    return np.where(x >= 0, 1.0, z) / (1.0 + z)

### 2.4

In [None]:
def predict(X, theta):
    """Predict binary labels based on learned model parameters

    A sample is labelled 1 when its predicted probability is at least 0.5.

    :param X: (M,N) samples, where M is the number of samples
    :type X: np.ndarray
    :param theta: (1,N) model parameters, N = number of features
    :type theta: np.ndarray
    :return: (M,1) predicted labels, each either 0 or 1
    :rtype: np.ndarray
    """
    # sigmoid(z) >= 0.5 exactly when z >= 0, so threshold the logits
    # directly: no exponential is evaluated, hence no overflow warning for
    # large-magnitude logits and no rounding error at the decision boundary
    return (X @ theta.T >= 0).astype(int)

### 2.2

In [None]:
def compute_grad_loss(X, y, theta):
    """Compute the mean binary cross-entropy loss and its gradient

    :param X: (M,N) samples, where M is the number of samples
    :type X: np.ndarray
    :param y: (M,1) binary labels; a flat (M,) vector is also accepted
    :type y: np.ndarray
    :param theta: (1,N) model parameters, N = number of features
    :type theta: np.ndarray
    :return: (loss, grad_vec), the scalar mean loss and the gradient of
        that loss with respect to theta (same shape as theta)
    :rtype: tuple
    """
    y = np.asarray(y)
    if y.ndim == 1:
        # a flat label vector would broadcast against the (M,1) logits
        # into an (M,M) matrix, so turn it into a column first
        y = y[:, np.newaxis]

    logits = X @ theta.T

    # -[y*log(s) + (1-y)*log(1-s)] with s = sigmoid(z) simplifies to
    # log(1 + exp(z)) - y*z. logaddexp evaluates the first term without
    # overflow, so no epsilon has to be added inside the logs (which would
    # bias the loss and cap it at -log(eps) per sample).
    loss = np.mean(np.logaddexp(0, logits) - y * logits)

    # gradient of the mean loss: average of (sigmoid(z) - y) * x over samples
    grad_vec = (sigmoid(logits) - y).T @ X / X.shape[0]

    return loss, grad_vec

### 2.3

In [None]:
def do_gradient_descent(LR=5e-2, epochs=1e3):
    """Train the logistic model with full-batch gradient descent

    Uses the notebook-level arrays ``X_train`` / ``y_train`` for the updates
    and ``X_test`` / ``y_test`` to track accuracy. The random seed is reset
    on every call, so each run starts from the same initial parameters.

    :param LR: learning rate (step size) of the GD update
    :type LR: float
    :param epochs: number of GD iterations; converted with ``int()``.
        With full-batch GD one iteration equals one epoch.
    :type epochs: int or float
    :return: (theta, loss, accuracy), the learned (1,N) parameters, the
        training loss measured before each update, and the test accuracy
        measured after each update
    :rtype: tuple
    """
    np.random.seed(0)
    # randomly initialize theta, the parameters of the logistic model
    theta = np.random.randn(1, X_train.shape[1])  # shape (1,N)

    max_epoch = int(epochs)

    # keep track of the loss/accuracy values for plotting
    loss = np.zeros(max_epoch)
    accuracy = np.zeros(max_epoch)

    # report progress about ten times per run; clamp to 1 so that runs
    # shorter than 10 epochs do not end up taking a modulo by zero
    log_interval = max(1, max_epoch // 10)

    # flatten the labels once so the comparison below cannot broadcast
    labels = np.ravel(y_test)

    for epoch in range(max_epoch):
        # loss and gradient at the current parameters
        loss[epoch], grad_vec = compute_grad_loss(X_train, y_train, theta)
        # GD step
        theta -= LR * grad_vec

        # fraction of test samples classified correctly after the step
        y_test_hat = predict(X_test, theta)
        accuracy[epoch] = np.mean(y_test_hat.ravel() == labels)

        if (epoch + 1) % log_interval == 0:
            # NOTE: the backslash continuations sit inside the string
            # literal, so the next line's leading spaces are printed too
            print(
                f"Epoch:{epoch+1}/{max_epoch} \
                Loss: {loss[epoch]:.6f} \
                Acc: {accuracy[epoch]:.6f}"
            )
    return theta, loss, accuracy

### Data Augmentation

In [None]:
def make_nonlinear(X):
    """Append quadratic terms of the first two feature columns.

    The squares of column 0 and column 1 and their product are added as
    three extra columns, which takes 2D samples to a 5D feature space.
    """
    first, second = X[:, 0], X[:, 1]
    return np.column_stack((X, first**2, second**2, first * second))

# Data Loading

In [None]:
# Open the dataset archive; swap the two lines below to use the two-circles
# data instead. The handle stays open in `npzfile` because a later cell
# reads the file name from it.
npzfile = np.load("toy_data.npz")  # toy_data.npz or toy_data_two_circles.npz
# npzfile = np.load("toy_data_two_circles.npz")

# Every row of X_train / X_test is one sample, e.g. X_train[0, :] is the
# first training sample.
X_train, X_test, y_train, y_test = (
    npzfile[key] for key in ("arr_0", "arr_1", "arr_2", "arr_3")
)

In [None]:
# Preprocessing switches read by the cells below.
add_bias: bool = True  # append a constant column of ones to the features
augment_data: bool = True  # add quadratic features (two-circles data only)

In [None]:
# Lift the samples into the quadratic feature space, but only when the
# loaded file is the "circles" dataset and augmentation is switched on.
if "circles" in npzfile.fid.name and augment_data:
    X_train, X_test = make_nonlinear(X_train), make_nonlinear(X_test)

# Append a constant column of ones to both feature matrices.
if add_bias:
    X_train = np.column_stack((X_train, np.ones(X_train.shape[0])))
    X_test = np.column_stack((X_test, np.ones(X_test.shape[0])))

# Data Visualization

In [None]:
# Training set (left) and test set (right), plotted on the first two
# feature columns and coloured by label.
scatter_style = dict(marker="o", s=25, edgecolor="k")

plt.subplot(1, 2, 1)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train[:, 0], **scatter_style)

plt.subplot(1, 2, 2)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test[:, 0], **scatter_style)

# Model Training and Evaluation

In [None]:
# train the model, then evaluate it: call the predict function on the test
# data with the parameters obtained by GD
theta, loss, accuracy = do_gradient_descent(LR=0.5, epochs=1e3)
y_test_hat = predict(X_test, theta)

print()

# make sure that the predictions are either 0 or 1 and that y_test_hat has
# the same shape as y_test
print((y_test_hat >= 0).all() and (y_test_hat <= 1).all())
print(y_test_hat.shape == y_test.shape)

# fraction of test samples classified correctly by the model. Both arrays
# are flattened so the comparison is element-by-element, and np.mean avoids
# converting a 1-element array to a float (deprecated since NumPy 1.25).
score = float(np.mean(y_test_hat.ravel() == np.ravel(y_test)))
print(score)

# Loss and Accuracy Curves

In [None]:
# Accuracy (red, left axis) and loss (green, right axis) against the epoch
# index, drawn on twin y-axes that share the x-axis.
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()

ax1.plot(accuracy, "r-")
ax1.set_xlabel("Number of epochs")
ax1.set_ylabel("Accuracy", color="r")

ax2.plot(loss, "g-")
ax2.set_ylabel("Loss", color="g")

plt.title("Test Accuracy and Training Loss")
fig.set_size_inches(8, 6)  # width, height

plt.show()

# Learning Rate Tuning
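Instability: begins at a learning rate of about 15 judging by the accuracy curve, but only at about 20 judging by the loss curve \
Slow convergence: learning rates of 5e-5 and below (if the rate is too low, training does not converge within 1000 epochs) \
Ideal convergence: a learning rate of 0.5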

In [None]:
# Train with LR=0.5 and keep only [loss, accuracy]; the leading `_`
# receives (and discards) the learned theta.
_, *ideal = do_gradient_descent(LR=0.5, epochs=1e3)

In [None]:
# Train with a small learning rate and keep only [loss, accuracy].
# NOTE(review): the notes above quote 5e-5 for slow convergence while this
# run uses 0.005; confirm which value is intended.
_, *slow = do_gradient_descent(LR=0.005, epochs=1e3)

In [None]:
# Train with a large learning rate and keep only [loss, accuracy], so that
# unstable[0] is the loss history and unstable[1] the accuracy history.
# The commented-out line is an alternative, shorter run.
# _, *unstable = do_gradient_descent(LR=22, epochs=50) # mild instability
_, *unstable = do_gradient_descent(LR=5, epochs=1e3)  # full instability

In [None]:
# Same twin-axis plot for the high learning-rate run: unstable[1] holds the
# accuracy history (red, left axis), unstable[0] the loss history (green,
# right axis).
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()

ax1.plot(unstable[1], "r-")
ax1.set_xlabel("Number of epochs")
ax1.set_ylabel("Accuracy", color="r")

ax2.plot(unstable[0], "g-")
ax2.set_ylabel("Loss", color="g")

plt.title("Test Accuracy and Training Loss")
fig.set_size_inches(8, 6)  # width, height

plt.show()

# Decision Boundary Plotting

In [None]:
# Sample the square [-5, 5] x [-5, 5] on an N x N grid.
N = 100
x = np.linspace(-5, 5, N)
y = np.linspace(-5, 5, N)
f1, f2 = np.meshgrid(x, y)

# One row per grid point: an (N * N, 2) input for the predict function.
model_input = np.column_stack((f1.ravel(), f2.ravel()))

# Apply the same preprocessing as for the training data.
if "circles" in npzfile.fid.name and augment_data:
    model_input = make_nonlinear(model_input)

if add_bias:
    model_input = np.column_stack((model_input, np.ones(model_input.shape[0])))

In [None]:
# Retrain and keep only the parameters; `_` collects the loss and accuracy
# histories.
theta, *_ = do_gradient_descent(LR=0.5, epochs=1e3)
# predict() gives one label per grid point; fold the labels back onto the
# grid so they line up with f1 / f2.
predictions = predict(model_input, theta).reshape(f1.shape)

In [None]:
fig, ax = plt.subplots()

colors = ["blue", "red"]

# Colour every test sample by its ground-truth label (0 -> blue, 1 -> red).
gt_colors = [colors[int(label)] for label in y_test.ravel()]
plt.scatter(
    X_test[:, 0],
    X_test[:, 1],
    c=gt_colors,
    marker="o",
    s=25,
    edgecolor="k",
    alpha=0.5,
)

# Shade the predicted class over the grid; the single boundary level at 0.5
# separates predictions of 0 from predictions of 1.
colorbar_levels = [0, 0.5, 1]
img = ax.contourf(f1, f2, predictions, levels=colorbar_levels, cmap=cm.bwr, alpha=0.75)
plt.colorbar(img)

## Exploration

In [None]:
# Number of test samples whose label is 1.
n_ones = int((y_test == 1).sum())
n_ones

In [None]:
# Number of test samples whose label is 0.
n_zeros = int((y_test == 0).sum())
n_zeros

In [None]:
# Display the parameters from the most recent training run.
theta