In [4]:
import os
import sys
# Register the parent directory on the import path (once), presumably so the
# project-local imports further down this cell can be resolved.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import numpy as np

import torch
from torch.utils.data import DataLoader
import torch.utils.data as data_utils


from torch.optim import SGD, Adam
from momo import Momo

import matplotlib.pyplot as plt

from datasets import get_dataset    
from loss_functions import logreg
from utils import solve

# Make float64 the default floating-point dtype; the dataset tensors are
# cast to this default after loading.
torch.set_default_dtype(torch.float64)

%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [5]:
# All computation is pinned to the CPU. To use a GPU when one is available,
# replace the assignment below with:
#   device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

In [8]:
# Seed torch's global random number generator before anything else in this cell.
torch.random.manual_seed(0)

# Mini-batch size; passed to both get_dataset and the DataLoader below.
batch_size=64

# Training configuration (read as globals by train_optimizer).
EPOCHS = 5
loss_function = logreg

# Load the "mushrooms" dataset as a (features, targets) pair.
# NOTE(review): the meaning of percentage=1.0 and scale=0 is defined by
# get_dataset, which is not visible here; presumably "use all samples" and
# "no feature scaling" - confirm against datasets.get_dataset.
train_data, train_target = get_dataset(name="mushrooms", batch_size=batch_size, percentage=1.0, scale=0)
# Cast both tensors to the current default dtype.
train_data = train_data.to(torch.get_default_dtype())
train_target = train_target.to(torch.get_default_dtype())
# Pair features with targets and serve them as reshuffled mini-batches.
train_load = data_utils.TensorDataset(train_data, train_target)
train_dataloader = DataLoader(train_load, batch_size=batch_size, shuffle=True)

/home/farshed.abdukhakimov/projects/sps2/datasets


In [14]:
def train_optimizer(optimizer, lr):
    """Train a weight vector with ``optimizer`` and log full-dataset progress.

    The weights start at zero, with one entry per column of the module-level
    ``train_data``. Training runs for ``EPOCHS`` passes over
    ``train_dataloader``. The loss and the squared gradient norm on the full
    training set are printed and recorded once before training and once after
    every epoch.

    Relies on notebook globals: ``train_data``, ``train_target``,
    ``train_dataloader``, ``device``, ``loss_function``, ``EPOCHS`` and
    ``Momo``.

    Args:
        optimizer: Optimizer class (or factory) called as
            ``optimizer([w], lr=lr)``.
        lr: Learning rate forwarded to the optimizer.

    Returns:
        A list of ``[loss, grad_norm_squared]`` pairs of Python floats with
        ``EPOCHS + 1`` entries; entry 0 is the state before any update.
    """
    # Move the full dataset to the target device once instead of on every
    # evaluation.
    full_data = train_data.to(device)
    full_target = train_target.to(device)

    # parameters
    w = torch.zeros(train_data.shape[1], device=device, requires_grad=True)
    opt = optimizer([w], lr=lr)

    def evaluate():
        """Return (loss, squared gradient norm) on the full training set."""
        loss = loss_function(w, full_data, full_target)
        # Only gradient values are logged, so no higher-order graph is built
        # (no create_graph). autograd.grad returns the gradient directly and
        # leaves w.grad untouched, so it does not interfere with the optimizer.
        grad, = torch.autograd.grad(loss, w)
        return loss.item(), (torch.linalg.norm(grad) ** 2).item()

    def compute_loss(w, data, target):
        """Evaluate the mini-batch loss and backpropagate into ``w.grad``."""
        loss = loss_function(w, data, target)
        loss.backward()
        return loss

    # save loss and grad size to history
    hist = []
    loss_value, grad_norm_sq = evaluate()
    print(f"Loss: {loss_value} | GradNorm^2: {grad_norm_sq}")
    hist.append([loss_value, grad_norm_sq])

    for step in range(EPOCHS):
        for batch_data, batch_target in train_dataloader:
            batch_data = batch_data.to(device)
            batch_target = batch_target.to(device)
            opt.zero_grad()
            if isinstance(opt, Momo):
                # Momo's step() is handed a closure that evaluates the loss
                # and backpropagates.
                def closure():
                    return compute_loss(w, batch_data, batch_target)

                opt.step(closure=closure)
            else:
                # Other optimizers only need the gradients populated first.
                compute_loss(w, batch_data, batch_target)
                opt.step()

        loss_value, grad_norm_sq = evaluate()
        print(f"Epoch: [{step}/{EPOCHS}] | Loss: {loss_value} | GradNorm^2: {grad_norm_sq}")
        hist.append([loss_value, grad_norm_sq])

    return hist

In [18]:
# Baseline run: torch.optim.SGD with learning rate 0.1; keeps the per-epoch
# [loss, squared gradient norm] history.
hist_sgd = train_optimizer(SGD, 0.1)

Loss: 0.6931471805599453 | GradNorm^2: 6.057377027322154
Epoch: [0/5] | Loss: 0.0067412308688859415 | GradNorm^2: 0.0005006705468246634
Epoch: [1/5] | Loss: 0.003482427450427738 | GradNorm^2: 0.00013121430252664953
Epoch: [2/5] | Loss: 0.0023583187148119174 | GradNorm^2: 5.954730152242412e-05
Epoch: [3/5] | Loss: 0.0017869995997133105 | GradNorm^2: 3.394032275300847e-05
Epoch: [4/5] | Loss: 0.0014403064839403094 | GradNorm^2: 2.1924316228493653e-05


In [19]:
# Same training loop with the Momo optimizer at learning rate 1.0; keeps the
# per-epoch [loss, squared gradient norm] history.
hist_momo = train_optimizer(Momo, 1.0)

Loss: 0.6931471805599453 | GradNorm^2: 6.057377027322154
Epoch: [0/5] | Loss: 0.001048931754401928 | GradNorm^2: 1.1569714907356672e-05
Epoch: [1/5] | Loss: 0.0004181265076955042 | GradNorm^2: 1.7957530172969797e-06
Epoch: [2/5] | Loss: 0.0002672548630970547 | GradNorm^2: 7.255459723659826e-07
Epoch: [3/5] | Loss: 0.00019798505709332553 | GradNorm^2: 3.952586921289689e-07
Epoch: [4/5] | Loss: 0.0001574049080086827 | GradNorm^2: 2.4846185995680393e-07
