# Gradient Checking

We will check the gradient computation using the circle data. We will check it for both the `bce` loss and the `mse` loss.

## Todo

1. Address issues with the derivative at the boundary of the ReLU function.

## Import Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import node
import layer
import importlib
import neuralnetwork

from node import mse, mse_grad, bce, bce_grad, cce, cce_grad

from typing import List
from operator import add, mul, matmul

import torch
import time

## Set Precision

In [2]:
# Make float64 the default floating-point dtype for tensors created from here on.
# The gradient checks below use central differences with a step of h = 1e-5,
# so the extra precision keeps round-off from dominating the estimate.
torch.set_default_dtype(torch.float64)

## For BCE 

In [3]:
# Gradient check for the BCE loss on the circle data.
#
# For every weight and bias entry, the analytic gradient produced by
# `model.compute_grads` is compared with a central finite-difference estimate
# of the loss. A warning is printed if the relative error exceeds `limit`
# (10^-4). A warning doesn't necessarily mean there is an error.

training_points = 10000

run_circle_check = True

use_gpu = False

# Fall back to the cpu when a gpu is requested but none is available, so that
# `my_device` is always defined.
if use_gpu and torch.cuda.is_available():
    my_device = torch.device("cuda")
    print("Using CUDA gpu!")
elif use_gpu and torch.backends.mps.is_available():
    my_device = torch.device("mps")
    print("Using MPS gpu!")
else:
    my_device = torch.device("cpu")
    print("Using the cpu!")

if run_circle_check:

    # Points are drawn uniformly from [-2, 2) x [-2, 2); the label is True
    # when the point lies outside the unit circle.
    X_train = np.random.rand(training_points, 2) * 4 - 2
    y_train_true = np.power(X_train, 2).sum(axis=1) > 1

    # One column per sample: X is (2, training_points), y is (1, training_points).
    X_train_formatted = torch.Tensor(X_train.T).to(my_device)
    y_train_formatted = torch.Tensor(y_train_true.T).to(my_device).unsqueeze(dim=0)

    architecture = [(2, "id"), (10, "tanh"), (10, "tanh"), (1, "id")]

    model = neuralnetwork.NeuralNetwork(list(architecture))
    model.randomize_params()
    model.to_device(my_device)

    # Scratch network whose parameters are perturbed for the finite differences.
    model_delta = neuralnetwork.NeuralNetwork(list(architecture))

    epochs = 3
    batch_size = training_points // 10

    h = 0.00001     # finite-difference step
    limit = 0.0001  # relative-error threshold for printing a warning

    start_time = time.perf_counter()

    for epoch in range(epochs):

        # over each epoch, we randomly split the data into batches then train

        print(f"Starting epoch {epoch}")

        shuffled = torch.randperm(training_points)

        X_shuffled = X_train_formatted[:, shuffled]
        y_shuffled = y_train_formatted[:, shuffled]

        X_partitions = torch.split(X_shuffled, batch_size, dim=1)
        y_partitions = torch.split(y_shuffled, batch_size, dim=1)

        for X_batch, y_batch in zip(X_partitions, y_partitions):
            y_target = y_batch.float()

            # NOTE(review): presumably copies model's current parameters into
            # model_delta -- confirm against NeuralNetwork.copy.
            model.copy(model_delta)
            model.compute_grads(X_batch, y_target, error="bce")

            for k, param in enumerate(model.params):
                # The weight and bias checks are identical apart from the name.
                for kind in ("weight", "bias"):
                    component = getattr(param, kind)

                    # Allocate the perturbation on the same device as the model.
                    delta = torch.zeros(component.forward_value.shape, device=my_device)
                    m, n = delta.shape

                    for row in range(m):
                        for col in range(n):

                            delta[row][col] = h

                            # Loss at (parameter + h).
                            model_delta.add_delta(k, kind, delta)
                            plus_loss = bce(y_target, model_delta.predict(X_batch))

                            # Loss at (parameter - h).
                            model_delta.add_delta(k, kind, -2 * delta)
                            minus_loss = bce(y_target, model_delta.predict(X_batch))

                            estimated_grad = (plus_loss - minus_loss) / (2 * h)
                            true_grad = component.backward_value[row][col]
                            diff = estimated_grad - true_grad

                            # Normalise by the sum of the magnitudes so the
                            # error stays in [0, 1], even when the two
                            # gradients have opposite signs.
                            relative_error = torch.norm(diff) / (
                                torch.norm(estimated_grad) + torch.norm(true_grad)
                            )

                            if relative_error > limit:
                                print(f"Concerning relative error detected for {kind} in layer {k} in entry {row, col}! The error was {relative_error}")

                            # Restore the parameter and clear the perturbation.
                            model_delta.add_delta(k, kind, delta)
                            delta[row][col] = 0

            model.update(learning_rate=0.1)

    end_time = time.perf_counter()
    print(f"Training took {end_time - start_time} seconds on the {my_device}")
    

Using the cpu!
Starting epoch 0
Starting epoch 1
Starting epoch 2
Training took 2.3763794580008835 seconds on the cpu


## For MSE

In [4]:
# Gradient check for the MSE loss on the circle data.
#
# For every weight and bias entry, the analytic gradient produced by
# `model.compute_grads` is compared with a central finite-difference estimate
# of the loss. A warning is printed if the relative error exceeds `limit`
# (10^-4). A warning doesn't necessarily mean there is an error.

training_points = 10000

run_circle_check = True

use_gpu = False

# Fall back to the cpu when a gpu is requested but none is available, so that
# `my_device` is always defined.
if use_gpu and torch.cuda.is_available():
    my_device = torch.device("cuda")
    print("Using CUDA gpu!")
elif use_gpu and torch.backends.mps.is_available():
    my_device = torch.device("mps")
    print("Using MPS gpu!")
else:
    my_device = torch.device("cpu")
    print("Using the cpu!")

if run_circle_check:

    # Points are drawn uniformly from [-2, 2) x [-2, 2); the label is True
    # when the point lies outside the unit circle.
    X_train = np.random.rand(training_points, 2) * 4 - 2
    y_train_true = np.power(X_train, 2).sum(axis=1) > 1

    # One column per sample: X is (2, training_points), y is (1, training_points).
    X_train_formatted = torch.Tensor(X_train.T).to(my_device)
    y_train_formatted = torch.Tensor(y_train_true.T).to(my_device).unsqueeze(dim=0)

    architecture = [(2, "id"), (10, "tanh"), (10, "tanh"), (1, "id")]

    model = neuralnetwork.NeuralNetwork(list(architecture))
    model.randomize_params()
    model.to_device(my_device)

    # Scratch network whose parameters are perturbed for the finite differences.
    model_delta = neuralnetwork.NeuralNetwork(list(architecture))

    epochs = 3
    batch_size = training_points // 10

    h = 0.00001     # finite-difference step
    limit = 0.0001  # relative-error threshold for printing a warning

    start_time = time.perf_counter()

    for epoch in range(epochs):

        # over each epoch, we randomly split the data into batches then train

        print(f"Starting epoch {epoch}")

        shuffled = torch.randperm(training_points)

        X_shuffled = X_train_formatted[:, shuffled]
        y_shuffled = y_train_formatted[:, shuffled]

        X_partitions = torch.split(X_shuffled, batch_size, dim=1)
        y_partitions = torch.split(y_shuffled, batch_size, dim=1)

        for X_batch, y_batch in zip(X_partitions, y_partitions):
            y_target = y_batch.float()

            # NOTE(review): presumably copies model's current parameters into
            # model_delta -- confirm against NeuralNetwork.copy.
            model.copy(model_delta)
            model.compute_grads(X_batch, y_target, error="mse")

            for k, param in enumerate(model.params):
                # The weight and bias checks are identical apart from the name.
                for kind in ("weight", "bias"):
                    component = getattr(param, kind)

                    # Allocate the perturbation on the same device as the model.
                    delta = torch.zeros(component.forward_value.shape, device=my_device)
                    m, n = delta.shape

                    for row in range(m):
                        for col in range(n):

                            delta[row][col] = h

                            # Loss at (parameter + h).
                            model_delta.add_delta(k, kind, delta)
                            plus_loss = mse(y_target, model_delta.predict(X_batch))

                            # Loss at (parameter - h).
                            model_delta.add_delta(k, kind, -2 * delta)
                            minus_loss = mse(y_target, model_delta.predict(X_batch))

                            estimated_grad = (plus_loss - minus_loss) / (2 * h)
                            true_grad = component.backward_value[row][col]
                            diff = estimated_grad - true_grad

                            # Normalise by the sum of the magnitudes so the
                            # error stays in [0, 1], even when the two
                            # gradients have opposite signs.
                            relative_error = torch.norm(diff) / (
                                torch.norm(estimated_grad) + torch.norm(true_grad)
                            )

                            if relative_error > limit:
                                print(f"Concerning relative error detected for {kind} in layer {k} in entry {row, col}! The error was {relative_error}")

                            # Restore the parameter and clear the perturbation.
                            model_delta.add_delta(k, kind, delta)
                            delta[row][col] = 0

            model.update(learning_rate=0.1)

    end_time = time.perf_counter()
    print(f"Training took {end_time - start_time} seconds on the {my_device}")

Using the cpu!
Starting epoch 0
Starting epoch 1
Starting epoch 2
Training took 1.8731855420010106 seconds on the cpu
