# Visualization of training and test curves with different optimizers
This notebook is modified from https://github.com/Luolc/AdaBound/blob/master/demos/cifar10/visualization.ipynb.
We compare the performance of the AdaBelief optimizer with that of 8 other optimizers (SGDM, AdaBound, Yogi, Adam, MSVAG, RAdam, AdamW, Fromage).
The training setting is the same as the official implementation of AdaBound: https://github.com/Luolc/AdaBound,
hence we exactly reproduce the results of AdaBound.
AdaBound is claimed to achieve "fast convergence and good generalization", and in this project we will show that AdaBelief outperforms AdaBound and other optimizers.

In [1]:
import os
%matplotlib notebook
import matplotlib.pyplot as plt
import torch
import numpy as np

# Use font size 20 for axis labels and axis titles in every figure drawn below.
params = dict.fromkeys(('axes.labelsize', 'axes.titlesize'), 20)
plt.rcParams.update(params)

In [6]:
def get_data(names, folder_path='./probecurve'):
    """Load one saved record per file name.

    Args:
        names: iterable of file names located inside ``folder_path``.
        folder_path: directory holding the saved files. Defaults to
            ``'./probecurve'``, the folder this notebook reads from.

    Returns:
        dict mapping each name to whatever ``torch.load`` returns for
        ``os.path.join(folder_path, name)``. Duplicate names collapse to a
        single entry.
    """
    # NOTE: torch.load unpickles the file contents, so only point this at
    # files you trust.
    return {name: torch.load(os.path.join(folder_path, name)) for name in names}

def plot(names, curve_type='train', labels = None, ylim=None, loc = 'upper left'):
    """Draw one loss curve per loaded run on a new figure.

    Args:
        names: file names handed to ``get_data``.
        curve_type: selects the series to draw. The lower-cased value is used
            to look up ``'<curve_type>_loss'`` in each loaded record; the
            value exactly as given is used in the figure title.
        labels: legend labels, paired in order with the loaded runs. When
            omitted, ``names`` is used (previously the ``None`` default was
            passed straight to ``zip`` and raised ``TypeError``).
        ylim: optional y-axis limits forwarded to ``plt.ylim``.
        loc: legend location forwarded to ``plt.legend``.
    """
    if labels is None:
        labels = names

    plt.figure()
    if ylim is not None:
        plt.ylim(ylim)

    curve_data = get_data(names)
    loss_key = '{}_loss'.format(curve_type.lower())
    for name, label in zip(curve_data.keys(), labels):
        loss = np.array(curve_data[name][loss_key])
        # The curve labelled 'AdaBelief' is drawn solid; all others are dashed.
        line_style = '-' if label == 'AdaBelief' else '--'
        plt.plot(loss, line_style, label=label)

    plt.grid()
    plt.legend(fontsize=14, loc=loc)
    plt.title('{} loss ~ Training epoch'.format(curve_type))
    plt.xlabel('Training Epoch')
    plt.ylabel('Loss')
    #plt.xlim([0, 200])
    plt.show()

In [8]:
# Load the saved record for run 'ab-clip.5-00'; the analysis cell that follows
# reads its "nc_logs" entry.
data = torch.load('./probecurve/ab-clip.5-00')

In [11]:
# Flatten the lists stored under data["nc_logs"] into one sequence of records,
# then transpose so that each logged quantity becomes its own tuple.
newton_cap_logs = [d for log in data["nc_logs"] for d in log]
zipped = tuple(zip(*newton_cap_logs))
if len(zipped) == 4:
    delt_sqnorm, grad_sqnorm, delt_dot_grad, loss = zipped
else:
    # Five-field records: the fourth field is ignored and the loss comes last.
    delt_sqnorm, grad_sqnorm, delt_dot_grad, _, loss = zipped # buffered loss
    print("buf")
delt_sqnorm = np.array(delt_sqnorm)
grad_sqnorm = np.array(grad_sqnorm)
delt_dot_grad = np.array(delt_dot_grad)
loss = np.array(loss)

# Some loss entries may be None. Replace each one with the next recorded value.
# Unlike a plain `loss[idx] = loss[idx + 1]`, this also handles runs of
# consecutive None entries and a None in the final position (which falls back
# to the last recorded value instead of indexing past the end).
missing = np.array([v is None for v in loss], dtype=bool)
if missing.any():
    valid = np.flatnonzero(~missing)
    if valid.size == 0:
        raise ValueError("nc_logs contains no recorded loss values")
    # For every position, the rank of the first valid index at or after it;
    # positions past the last valid index are clamped to that last index.
    nxt = np.minimum(np.searchsorted(valid, np.arange(loss.size)), valid.size - 1)
    # Mixing None with numbers yields an object array; cast back to float
    # now that every gap has been filled.
    loss = loss[valid[nxt]].astype(float)
print("loss <= 0", (loss <= 0).any())

# Presumably the cosine of the angle between the update and the gradient,
# assuming the *_sqnorm values are squared norms -- TODO confirm.
dg_ang = delt_dot_grad / (delt_sqnorm**.5 * grad_sqnorm**.5)
newton_ratio = - loss / delt_dot_grad
#newton_cap = newton_ratio * ((0 < newton_ratio) & (newton_ratio < 1)).astype(float)
#newton_cap = np.maximum(0., np.minimum(1., newton_ratio))
newton_cap = np.where((0 < newton_ratio) & (newton_ratio < 1), newton_ratio, 1) # outside (0,1), acts like scale by 1
print("ratio <= 0", (newton_ratio <= 0).any())

# w = 800
# loss = np.convolve(loss, np.ones(w), 'valid')/w
# delt_dot_grad = np.convolve(delt_dot_grad, np.ones(w), 'valid')/w
# newton_cap = np.convolve(newton_cap, np.ones(w), 'valid')/w

# Every series that can be shown in the diagnostic figure, keyed by short name.
qtys = dict(
    ddg=delt_dot_grad,
    dn=-delt_sqnorm,
    loss=loss,
    ang=dg_ang,
    nr=newton_ratio,
    nc=newton_cap,
)

# gradnorm doesn't get smaller after lr adjust
# at the end, cap is highly correlated with dtg (dtg dependence version)
# at the end, angle between d and g gets bigger on average (dtg dependence version)

# Close the current figure first so re-running the cell does not stack figures.
plt.close()
plt.figure(figsize=(12, 3))

# Subset of qtys actually drawn; the legend lists them in the same order.
keys = ["nc", "loss", "ddg"]
for k in keys:
    series = np.array(qtys[k])
    plt.plot(series)
plt.legend(keys, loc='upper left')

# bars = np.arange(0,len(loss),390*20)
# for b in bars:
#     #plt.plot([b,b],[-1,5],'k:')
#     plt.plot([b,b],[0, max([np.max(v) for v in qtys.values()])],'k:')
#     plt.text(b,-1,str(int(b/390)))

# Horizontal reference lines spanning the whole x range: dashed at y=1, solid at y=0.
x_span = [0, len(loss)]
# NOTE(review): qtys always contains "nr" as defined above, so this check is
# always true; possibly `keys` was meant -- confirm.
if "nr" in qtys:
    plt.plot(x_span, [1, 1], 'k--')
plt.plot(x_span, [0, 0], 'k-', zorder=-1)
#plt.ylim([0, .25])
#plt.xlim([50000,90500])
#plt.ylim([-10, 10])
plt.show()

buf
loss <= 0 False
ratio <= 0 False


<IPython.core.display.Javascript object>

In [7]:
# Single run to display; its file name doubles as the legend label.
names = ['ab-clip.5-00']
labels = ['ab-clip.5-00']

# One figure per curve type: plot() looks up 'train_loss' and then 'val_loss'.
for curve_type in ('Train', 'Val'):
    plot(names, curve_type, labels)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>