# Zeroth Pipeline

This pipeline compares zeroth-order gradient descent (GD) against standard GD. Three different zeroth-order gradient estimates --- one-point, two-point, and coordinate --- are tested.

## Imports

In [None]:
import os
import sys

# Make the project's source directory importable: resolve '../src/' (relative to
# the current working directory) to an absolute path and add it to the module
# search path once, so re-running this cell does not append duplicates.
p = os.path.abspath('../src/')
if p not in sys.path:
    sys.path.append(p)

In [None]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange
import time

from difference_methods import one_point_estimate, two_point_estimate, coordinate_estimate
from linear_zeroth_function import Linear_Zeroth
from relu_zeroth_function import ReLU_Zeroth

## Data preprocessing

In [None]:
# Dataset switch:
#   True  -> run on the pendigits data set
#   False -> run on the iris training set (used for testing the pipeline)
proper_test = False

# Load the selected CSV into a DataFrame; both paths are relative to the
# current working directory.
if proper_test:
    data_train = pd.read_csv('../data/pendigits.csv')
else:
    data_train = pd.read_csv('../data/iris_train.csv')

# Quick visual sanity check of the loaded table
print(data_train)

In [None]:
# Count how many samples each class has (and, for iris, make the table numeric first)
if not proper_test:
    # Iris targets are strings; encode them as integer class ids:
    #       class 0: Iris-setosa
    #       class 1: Iris-versicolor
    #       class 2: Iris-virginica
    species_codes = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    data_train['species'] = data_train['species'].map(species_codes)
    # Parse every column as a number, e.g. '2' -> 2
    data_train = data_train.apply(pd.to_numeric)
    # Number of rows per species id
    class_freq = data_train['species'].value_counts()
else:
    # Number of rows per value of the 'class' column
    class_freq = data_train['class'].value_counts()

# Order the counts by class id so that position k holds the count of class k
class_freq = list(class_freq.sort_index())
nb_classes = len(class_freq)
# Every column except one (the target) is treated as a feature
nb_features = data_train.shape[1]-1

# Bar chart of the class distribution, one bar per class id
class_ids = list(range(nb_classes))
graph = plt.bar(class_ids, class_freq)
plt.xticks(class_ids)
plt.title('Full data')
plt.xlabel('Class')
plt.ylabel('Frequency')

plt.show() # makes any class imbalance visible at a glance

In [None]:
# Preprocess data for training: DataFrame -> numpy array -> torch tensors

# Underlying array of the whole table
data_train_array = data_train.values

# The first nb_features columns are the inputs (x); the column right after
# them holds the target (y)
x_array, y_array = data_train_array[:, :nb_features], data_train_array[:, nb_features]

# Tensorify: float32 features, int64 targets
X = torch.tensor(x_array, dtype=torch.float32)
Y = torch.tensor(y_array, dtype=torch.int64)

print(X.shape)

## Network Setup

In [None]:
# INPUT/OUTPUT layer sizes, taken from the data set:
# one input neuron per feature column and one output neuron per class.
INPUT_NEURONS = nb_features
OUTPUT_NEURONS = nb_classes

In [None]:
# Network architectures: 1 hidden layer

# Normal network with relu
def create_standard_relu_network(hidden_neurons, mu, n, difference_method):
    """Build the baseline network: Linear -> ReLU -> Linear from torch.nn.

    Args:
        hidden_neurons: Width of the single hidden layer.
        mu: Unused; accepted so this factory can be called with the same
            arguments as the zeroth-order factory.
        n: Unused; see ``mu``.
        difference_method: Unused; see ``mu``.

    Returns:
        A ``torch.nn.Sequential`` mapping ``INPUT_NEURONS`` inputs to
        ``OUTPUT_NEURONS`` outputs.

    NOTE(review): these Linear layers keep torch's default bias, while the
    zeroth-order factory in this file passes ``bias=False`` - confirm the
    mismatch is intended for the comparison.
    """
    # Re-seed on every call so each new network starts from the same weights
    torch.manual_seed(1)
    layers = [
        torch.nn.Linear(INPUT_NEURONS, hidden_neurons),
        torch.nn.ReLU(),
        torch.nn.Linear(hidden_neurons, OUTPUT_NEURONS),
    ]
    return torch.nn.Sequential(*layers)

# Full Zeroth Order relu
def create_zeroth_relu_network(hidden_neurons, mu, n, difference_method):
    """Build the zeroth-order network: Linear_Zeroth -> ReLU_Zeroth -> Linear_Zeroth.

    Args:
        hidden_neurons: Width of the single hidden layer.
        mu: Forwarded unchanged to every zeroth-order layer (presumably the
            perturbation size of the difference estimate - confirm in
            ``difference_methods``).
        n: Forwarded unchanged to every zeroth-order layer (presumably the
            number of samples used by the estimate - confirm in
            ``difference_methods``).
        difference_method: Forwarded unchanged to every zeroth-order layer;
            selects which difference estimate the layers use.

    Returns:
        A ``torch.nn.Sequential`` mapping ``INPUT_NEURONS`` inputs to
        ``OUTPUT_NEURONS`` outputs; both linear layers are created with
        ``bias=False``.
    """
    # Re-seed on every call so each new network starts from the same weights
    torch.manual_seed(1)
    # Settings shared by all three zeroth-order layers
    zeroth_kwargs = dict(difference_method=difference_method, mu=mu, n=n)
    layers = [
        Linear_Zeroth(INPUT_NEURONS, hidden_neurons, bias=False, **zeroth_kwargs),
        ReLU_Zeroth(**zeroth_kwargs),
        Linear_Zeroth(hidden_neurons, OUTPUT_NEURONS, bias=False, **zeroth_kwargs),
    ]
    return torch.nn.Sequential(*layers)

# (name, factory) pairs for the available architectures. The order matters:
# the training cell picks index 0 for the "sgd" baseline and index 1 for the
# zeroth-order methods.
networks_funs = [
    ("standard_relu", create_standard_relu_network),
    ("zeroth_relu", create_zeroth_relu_network),
]

## Train and Plot Losses

In [None]:
# Plot or save? False => Save
# (True is presumably meant to display the figure instead of writing it to disk)
plot = False
# Prefix prepended to the file name of every saved figure
path = "plots/"

# Plotting
# Line colours; the training cell picks one per method by its position in the method list
colours = ["black", "red", "darkblue", "green", "violet",]
# Line styles available to the plots
linestyles = ['solid', 'dashdot', 'dashed', 'dotted']

### Initial Plot

In [None]:
# Training constants
LR = 0.01          # learning rate handed to torch.optim.SGD
NUM_EPOCH = 100    # maximum number of passes over the training data
BATCH_SIZE = 32    # samples per mini-batch

# Loss function (applied to the raw network outputs and integer class targets)
loss_func = torch.nn.CrossEntropyLoss()

# Test Variables
# Hidden layer is as wide as the larger of the input and output layers
HIDDEN_NEURONS = max(INPUT_NEURONS, OUTPUT_NEURONS)
# MU and N are passed to the network factories as `mu` and `n`
# (presumably perturbation size and sample count of the difference estimate - TODO confirm)
MU = 0.01
N = 1000

# Methods
# 'sgd' selects the standard network; any other entry is passed to the
# zeroth-order network as its difference_method
initial_plot_difference_method_range = ['two', 'coord', 'sgd']

In [None]:
# Train one network per method on the same data and draw two loss curves per
# method: loss against epoch (left) and loss against elapsed time (right).

# Setup plot
fig = plt.figure(figsize=(10, 6), constrained_layout=True)
fig.suptitle("Zeroth Order Descent Methods vs SGD", fontsize=15, x=0.5)

subplots = fig.subplots(nrows=1, ncols=2, sharey=True)

# Axis formatting is identical for every method, so it is applied once here
# instead of on every loop iteration.
subplots[0].set_xlabel("Epoch", weight='bold', fontsize=10)
subplots[0].set_ylabel("Loss", weight='bold', fontsize=10)
subplots[1].set_xscale('log')
subplots[1].set_xlabel("Computation Time (s)", weight='bold', fontsize=10)

# Legend text per method; anything not listed is labelled "sgd"
method_labels = {'two': "two-point", 'coord': "coordinate estimate"}

# Number of full mini-batches per epoch. Samples beyond the last full batch are
# not visited. Clamped to 1 so a data set smaller than BATCH_SIZE is trained as
# a single batch instead of causing a division by zero below.
num_batches = max(1, len(X) // BATCH_SIZE)

# Iterate methods (tqdm wraps the list itself so the bar knows the total)
for i, diff_method in enumerate(tqdm(initial_plot_difference_method_range, desc="method", leave=False, position=0)):
    # Create new instance of network: index 0 is the standard (backprop)
    # network, index 1 the zeroth-order one
    _, network_fun = networks_funs[0] if diff_method == "sgd" else networks_funs[1]
    network = network_fun(HIDDEN_NEURONS, MU, N, diff_method)

    # Optimiser
    optimiser = torch.optim.SGD(network.parameters(), lr=LR)

    # Loss history for visualisation; entry 0 is the loss of the untrained
    # network on the full data set
    losses = [loss_func(network(X), Y).item()]
    # Timestamps matching `losses`; perf_counter is monotonic, so differences
    # between entries are reliable elapsed times
    times = [time.perf_counter()]

    # train a neural network
    for epoch in trange(NUM_EPOCH, desc="epoch", leave=False, position=1):
        epoch_loss = 0.0
        for batch in range(num_batches):
            # Get batch data
            batch_start = batch * BATCH_SIZE
            batch_end = batch_start + BATCH_SIZE
            batch_X = X[batch_start:batch_end]
            batch_Y = Y[batch_start:batch_end]

            # Perform forward pass
            Y_pred = network(batch_X)
            # Compute loss
            loss = loss_func(Y_pred, batch_Y)
            epoch_loss += loss.item()

            # Clear the gradients before running the backward pass
            network.zero_grad()
            # Perform backward pass
            loss.backward()
            # Step optimiser
            optimiser.step()

        # Mean batch loss of this epoch: the same scale as losses[0]
        mean_loss = epoch_loss / num_batches

        # Prevent gradient explosion: stop this method once the loss is
        # NaN/inf or has grown far beyond the initial loss. The diverged
        # epoch is not recorded.
        if not np.isfinite(mean_loss) or mean_loss > 100000 * losses[0]:
            break

        losses.append(mean_loss)
        times.append(time.perf_counter())

    # Plot: Epoch vs Loss (only this curve carries the legend label)
    subplots[0].plot(losses, label=method_labels.get(diff_method, "sgd"),
                     c=colours[i], linestyle=linestyles[0], linewidth=2.0, alpha=0.6)

    # Plot: Time vs Loss. Times are relative to the start of training; the
    # 0.001 s offset keeps the first point positive for the log-scaled x axis.
    elapsed = [t - times[0] + 0.001 for t in times]
    subplots[1].plot(elapsed, losses,
                     c=colours[i], linestyle=linestyles[0], linewidth=2.0, alpha=0.6)

# Format, then show or save according to the `plot` flag (False => save)
fig.legend(loc="lower center", fontsize=10, bbox_to_anchor=(0.5, -0.0075), labelspacing=1.0, ncols=4)
if plot:
    plt.show()
else:
    out_file = f"{path}hn{HIDDEN_NEURONS}_mu{MU}_n{N}_epoch{NUM_EPOCH}_initial.png"
    # savefig does not create missing directories, so make sure the target exists
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    fig.savefig(out_file)