In [None]:
# Clear the kernel namespace; -f forces the reset without a confirmation prompt
%reset -f

In [None]:
# Imports and dataset
import math
import os
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms, datasets

from utils import get_optimizer

# MNIST Handwriting Dataset
# Both splits use the same preprocessing: convert each image to a tensor
to_tensor = transforms.Compose([transforms.ToTensor()])

train = datasets.MNIST('', train=True, download=True, transform=to_tensor)
test = datasets.MNIST('', train=False, download=True, transform=to_tensor)

# Mini-batch loaders: the training split is reshuffled, the test split keeps its order
batch_size = 64
trainset = torch.utils.data.DataLoader(dataset=train, batch_size=batch_size, shuffle=True)
testset = torch.utils.data.DataLoader(dataset=test, batch_size=batch_size, shuffle=False)

print(f"Num training batches: {len(trainset)}")

In [None]:
# Simple MLP network for MNIST classification
class Net(nn.Module):
    """
    Fully connected classifier: 784 -> 64 -> 64 -> 64 -> 10

    The three hidden layers use ReLU activations; the output layer is followed by
    a log-softmax over the class dimension, so forward() returns log-probabilities.
    """
    def __init__(self):
        super().__init__()
        in_features, hidden, num_classes = 28 * 28, 64, 10
        self.fc1 = nn.Linear(in_features, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, hidden)
        self.fc4 = nn.Linear(hidden, num_classes)

    def forward(self, x):
        """
        Maps a batch of flattened images to per-class log-probabilities

        Args:
            x (torch.Tensor[batch, 784]): Flattened input images

        Returns:
            torch.Tensor[batch, 10]: Log-probabilities of each class
        """
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            x = F.relu(hidden_layer(x))
        logits = self.fc4(x)
        return F.log_softmax(logits, dim=1)
    
# NOTE(review): this criterion is not referenced by any visible cell; the training loop
# calls F.nll_loss on the log-probabilities returned by Net.forward instead.
loss_function = nn.CrossEntropyLoss()

In [None]:
def register_nan_checks(model):
    """
    Registers a backward hook on every module of the model (the model itself and
    all of its sub-modules) that reports NaN gradients during the backward pass

    Args:
        model (nn.Module): model to register hooks for
    """
    def check_grad(module, grad_input, grad_output):
        """
        Prints a message if any element of the gradients w.r.t. the module's inputs is NaN
        """
        # Test on the tensor's own device; a single NaN element is enough to report.
        # Entries of grad_input are None for inputs that do not require gradients.
        if any(torch.isnan(gi).any().item() for gi in grad_input if gi is not None):
            print('NaN gradient in ' + type(module).__name__)

    def attach_hook(module):
        # Prefer the full backward hook: the legacy register_backward_hook is deprecated.
        # Fall back to the legacy hook on PyTorch versions that lack the full variant.
        register = getattr(module, 'register_full_backward_hook', None)
        if register is None:
            register = module.register_backward_hook
        register(check_grad)

    model.apply(attach_hook)


In [None]:
# Initialize network, hooks, and optimizer
import torchvision
from collections import deque

# Use the first CUDA device when one is available, otherwise the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

net = Net()
register_nan_checks(net)
net.to(device)

# Training configuration
epochs = 4
optimizer_name = "SGD"
optimizer = get_optimizer(f"{optimizer_name}", net.parameters())

# Containers for the per-epoch losses and the per-step gradients
losses = []
avg_grads = []
pre_gaus_grads_total, post_gaus_grads_total, biases = deque(), deque(), deque()

# Running gradient statistics (only referenced by commented-out code in the visible cells)
grad_sum = grad_sq_sum = num_grads = 0


# Train for certain number of epochs, recording the gradients of every mini-batch step
for epoch in range(epochs):
    print('Epoch:', epoch)
    # Sum of the batch losses as a plain float: accumulating the loss tensors themselves
    # would keep every batch's autograd graph alive until the end of the epoch
    epoch_loss = 0.0
    # Mini-batch gradient descent
    for X, y in trainset:
        X, y = X.to(device), y.to(device)
        net.zero_grad()
        output = net(X.view(-1, 784))
        # Net.forward already returns log-probabilities, so NLL loss is applied directly
        loss = F.nll_loss(output, y)
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()
        # For each mini-batch step, get the gradients of the weight matrices and biases
        for name, param in net.named_parameters():
            gradient = param.grad.detach()
            # Store the original and gaussian blurred gradients
            if "weight" in name:
                # np.array(...) copies, so the stored array does not alias param.grad
                pre_gaus_grads_total.append(np.array(gradient.cpu().numpy()))
                # unsqueeze(0) adds a leading dimension of size 1 before blurring
                gaus_grad = torchvision.transforms.functional.gaussian_blur(gradient.unsqueeze(0), kernel_size=(3, 3))
                post_gaus_grads_total.append(gaus_grad.cpu().numpy())
            elif "bias" in name:
                # Clone before storing: otherwise the stored tensor aliases param.grad, which
                # zero_grad() may zero in place on the next step (depends on set_to_none)
                biases.append(gradient.clone().cpu())
    # Record the same per-epoch mean loss that is printed
    mean_epoch_loss = epoch_loss / len(trainset)
    print("Loss:", mean_epoch_loss)
    losses.append(mean_epoch_loss)


In [None]:
# Separating the total gradients (raw and gaussian blurred) into layers: np.array[step_num, output_dim, input_dim]
# The training loop appends one record per weight matrix at every step, always in layer
# order, so the records of layer k sit at positions k, k + 4, k + 8, ...
num_layers = 4
pre_records = list(pre_gaus_grads_total)
post_records = list(post_gaus_grads_total)

# Stack each layer's records along a new leading step axis, then drop the size-1 dimensions
pre_gaus_layer1, pre_gaus_layer2, pre_gaus_layer3, pre_gaus_layer4 = [
    np.array(pre_records[layer_idx::num_layers]).squeeze()
    for layer_idx in range(num_layers)
]
post_gaus_layer1, post_gaus_layer2, post_gaus_layer3, post_gaus_layer4 = [
    np.array(post_records[layer_idx::num_layers]).squeeze()
    for layer_idx in range(num_layers)
]


In [None]:
def grad_min_max(array, curr_min, curr_max):
    r"""
    Updates a running minimum and maximum with the extreme values of an array

    Args:
        array(np.array): A numpy array
        curr_min (float): The current minimum
        curr_max (float): The current maximum

    Returns:
        min (float): The minimum value, either curr_min or minimum value of array
        max (float): The maximum value, either curr_max or maximum value of array
    """
    array_max = np.max(array)
    array_min = np.min(array)

    # On a tie the array's value replaces the running one
    new_max = array_max if array_max >= curr_max else curr_max
    new_min = array_min if array_min <= curr_min else curr_min

    return new_min, new_max

# Calculate min and max grad over all four layers
# Seed with +/-inf rather than 0 so the result is the true extremum even when every
# gradient has the same sign
min_grad, max_grad = np.inf, -np.inf
for layer_grads in (pre_gaus_layer1, pre_gaus_layer2, pre_gaus_layer3, pre_gaus_layer4):
    min_grad, max_grad = grad_min_max(layer_grads, min_grad, max_grad)
print("Min Grad:", min_grad)
print("Max Grad:", max_grad)
# Calculate average and std grad
# (left disabled: grad_sum, grad_sq_sum and num_grads are never updated in the visible cells)
# avg_grad = grad_sum / num_grads
# std_grad = math.sqrt((grad_sq_sum / num_grads) - avg_grad**2)
# print("Average Grad:", avg_grad)
# print("Std Grad:", std_grad)

In [None]:
# "Interesting" Gradients
def interesting_gradients(layer, min, max):
    r"""
    Locates every entry of a layer whose value lies outside the range [min, max]

    Args:
        layer ([num_steps, output_dim, input_dim,]): Layer that is passed through to observe "interesting" gradients
        min (int): Minimum value for "uninteresting gradients"
        max (int): Maximum value for "uninteresting gradients"

    Returns:
        final (np.array[num_found, 3]): One (step_num, output_dim, input_dim) index triple per
            "interesting" entry; entries above max come first, followed by entries below min.
            If nothing is found, the string "Nothing to see!" is returned instead.
    """
    above_max = np.array(np.where(layer > max))
    below_min = np.array(np.where(layer < min))
    # One row per axis, one column per matching entry
    wheres = np.concatenate((above_max, below_min), axis=1)

    if wheres[0].size == 0:
        return "Nothing to see!"

    # Turn the per-axis index rows into one (step, output, input) triple per match
    return np.stack((wheres[0], wheres[1], wheres[2]), axis=1)

In [None]:
# Saving the gaussian blurred per-layer gradient arrays with np.save()
saved_grads_dir = "./grads/"
# np.save does not create missing directories, so make sure the target exists first
os.makedirs(saved_grads_dir, exist_ok=True)
for layer_num, layer_grads in enumerate(
    (post_gaus_layer1, post_gaus_layer2, post_gaus_layer3, post_gaus_layer4), start=1
):
    # np.save appends the ".npy" extension to the file name
    np.save(saved_grads_dir + f"{optimizer_name}_layer{layer_num}", layer_grads)

In [None]:
# Checking Gaussian filters: plot the same step before and after blurring.
# Pretty sure it works since the second heat map looks blurrier
import seaborn as sns

cur_layer, gauss_cur_layer = pre_gaus_layer2, post_gaus_layer2
epoch = 900  # index along the first (step) axis of the layer arrays, despite the name

avg_grad = np.average(cur_layer)
std_grad = np.std(cur_layer)
print("Average Grad:", avg_grad)
print("Standard Deviation of Grad:", std_grad)

# Both heat maps share one colour scale: mean +/- one standard deviation of the unblurred layer
for grads_to_plot in (cur_layer, gauss_cur_layer):
    sns.heatmap(data=grads_to_plot[epoch], vmin=avg_grad-std_grad, vmax=avg_grad+std_grad)
    plt.show()


In [None]:
# Visualizing the gradients through the steps (rendered as gifs with moviepy)
import os
os.environ["IMAGEIO_FFMPEG_EXE"] = "/usr/bin/ffmpeg" # Path to the ffmpeg binary. Must be set before moviepy is imported, so keep this line above the moviepy imports
from moviepy.editor import VideoClip
from moviepy.video.io.bindings import mplfig_to_npimage
import matplotlib.pyplot as plt
import seaborn as sns

def make_gif(array, save_directory, file_name, fps=60, vmin=None, vmax=None):
    r"""
    Makes a gif from a seaborn heat map, drawing one step of the layer per frame

    Args:
        array (np.array[step_num, output_dim, input_dim]): The layer that will be visualized
        save_directory (str): Directory the gif will be placed in
        file_name (str): Name of the file (without the ".gif" extension)
        fps (int): Number of frames per second in the gif
        vmin (float): Lower limit of the colour scale. Defaults to the notebook-level min_grad
        vmax (float): Upper limit of the colour scale. Defaults to the notebook-level max_grad

    Raises:
        ValueError: If array contains no steps
    """
    num_steps = array.shape[0]
    if num_steps == 0:
        raise ValueError("array must contain at least one step")
    # Fall back to the notebook-level limits so existing three-argument calls behave as before
    if vmin is None:
        vmin = min_grad
    if vmax is None:
        vmax = max_grad

    # True division keeps the trailing steps that floor division would silently drop
    duration = num_steps / fps # Duration of the gif in seconds.
    fig = plt.figure()

    def make_frame(t):
        r"""
        Creates a numpy array of shape (W x H x 3) that is saved and spliced together in the VideoClip class
        Args:
            t (float): The time in seconds of the frame being rendered, supplied by the VideoClip class
        """
        # frame number = t*fps. Round instead of truncating, because t is a float and
        # t*fps can land just below the intended integer; clamp to the last valid step
        step = min(int(round(t * fps)), num_steps - 1)
        # Draw on this function's own figure rather than whichever figure is current
        fig.clf()
        ax = fig.add_subplot(111)
        ax.set_title(f"Step: {step}")
        sns.heatmap(data=array[step], vmin=vmin, vmax=vmax, ax=ax)
        return mplfig_to_npimage(fig)

    try:
        animation = VideoClip(make_frame, duration=duration) # make_frame is called once per rendered frame
        animation.write_gif(f"{save_directory}/{file_name}.gif", fps=fps)
    finally:
        # Release the figure so repeated calls do not accumulate open figures
        plt.close(fig)

# Render one gif per layer. Each layer is rendered exactly once: wrapping the four calls
# in a range(4) loop re-rendered every gif four times and overwrote the same files.
# NOTE: machine-specific output directory; adjust for your environment
gif_dir = "/mnt/c/Users/Jeffrey/Downloads"
for gif_name, layer_grads in (
    ("post_gaus_layer1", post_gaus_layer1),
    ("post_gaus_layer2", post_gaus_layer2),
    ("post_gaus_layer3", post_gaus_layer3),
    ("post_gaus_layer4", post_gaus_layer4),
):
    make_gif(layer_grads, gif_dir, gif_name)
