In [13]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import numpy as np

import torch
from torch.utils.data import DataLoader
import torch.utils.data as data_utils
from torch.optim import SGD, Adam, Adagrad, Adadelta, RMSprop
from torch_optimizer import Adahessian
from optimizers import PSPS

import matplotlib.pyplot as plt

from datasets import get_dataset    
from loss_fns import get_loss
from utils import solve

torch.set_default_dtype(torch.float64)

%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [2]:
device = torch.device('cpu')

In [3]:
def logistic_reg(w, X, y):
    return torch.mean(torch.log(1 + torch.exp(-y * (X @ w))))

def nllsq(w, X, y):
    return torch.mean( ( y - (1/(1 + torch.exp(-X @ w ))) )**2 )

def rademacher_old(weights):
    return torch.round(torch.rand_like(weights)) * 2 - 1

def diag_estimate_old(weights, grad, iters):
    Ds = []
    for j in range(iters):
        z = rademacher_old(weights)
        with torch.no_grad():
            hvp = torch.autograd.grad(grad, weights, grad_outputs=z, retain_graph=True)[0]
        Ds.append((hvp*z))

    return torch.mean(torch.stack(Ds), 0)


def load_result(dataset, percent, scale, batch_size, epochs, loss_class, optimizer_class, lr, preconditioner, slack_method, lmd, mu, seed):
    
    results_dir = os.getenv("RESULTS_DIR")
    
    directory = f"{results_dir}/{dataset}/percent_{percent}/scale_{scale}/bs_{batch_size}" \
    f"/epochs_{epochs}/{loss_class}/{optimizer_class}/lr_{lr}/precond_{preconditioner}/slack_{slack_method}/lmd_{lmd}/mu_{mu}/seed_{seed}"

    if not os.path.isdir(directory):
        return None

    grad_norm_sq = []
    loss = []
    slack = []

    grad_norm_sq = torch.load(f"{directory}/grad_norm_sq")
    loss = torch.load(f"{directory}/loss")

    if slack_method != "none" and optimizer_class != "bsps2":
        slack = torch.load(f"{directory}/slack")
    else:
        slack = [0 for x in loss]

    return [[x, y, z] for x, y, z in zip(loss, grad_norm_sq, slack)]

def citardouq_solve(a, b, c):
    det = b * b - 4 * a * c
    if det < 1e-40:
        x1 = 0.0
        x2 = 0.0
    else:
        x1 = (2 * c) / (-b - np.sqrt(det))
        x2 = (2 * c) / (-b + np.sqrt(det))
    return np.asarray([x1, x2])

In [4]:
psps_classes = (PSPS)

optimizers_dict = {
    "psps": PSPS,
    "sgd": SGD,
    "adam": Adam,
    "adagrad": Adagrad,
    "adadelta": Adadelta,
    "rmsprop": RMSprop,
    "adahessian": Adahessian,
}

lmd = 0.01
mu = 0.1

def train(seed, loss, train_data, train_target, batch_size, EPOCHS, optimizer_class, **optimizer_kwargs):
    
    torch.random.manual_seed(seed)

    params = torch.zeros(train_data.shape[1], device=device).requires_grad_()

    train_load = data_utils.TensorDataset(train_data, train_target)
    train_dataloader = DataLoader(train_load, batch_size=batch_size, shuffle=True)

    criterion = loss(params)
    optimizer = optimizer_class([params], **optimizer_kwargs)

    train_loss = criterion(train_data.to(device), train_target.to(device))
    g, = torch.autograd.grad(train_loss, params)
    grad_norm_sq = torch.linalg.norm(g).item() ** 2
    slack = 0
    step_size = 0


    hist = [[train_loss.item(), grad_norm_sq, slack, step_size]]
   
    for epoch in range(EPOCHS):
        for i, (batch_data, batch_target) in enumerate(train_dataloader):  
            batch_data = batch_data.to(device)
            batch_target = batch_target.to(device)

            optimizer.zero_grad()
            
            def closure():
                return criterion(batch_data, batch_target)
            
            loss = closure()

            if isinstance(optimizer, psps_classes):     
                optimizer.step(closure) 
                slack = optimizer.replay_buffer[-1]["slack"]
                step_size = optimizer.replay_buffer[-1]["step_size"]
            elif isinstance(optimizer, Adahessian): 
                loss.backward(create_graph=True)    
                optimizer.step()
            else:
                loss.backward()
                optimizer.step()

        train_loss = criterion(train_data.to(device), train_target.to(device))
        g, = torch.autograd.grad(train_loss, params)
        grad_norm_sq = torch.linalg.norm(g).item() ** 2

        hist.append([train_loss.item(), grad_norm_sq, slack, step_size])

        if epoch % 100 == 0:
            print(f"[{epoch}] / [{EPOCHS}] | Loss: {train_loss.item()} | GradNorm^2: {grad_norm_sq} | s: {slack} | step_size: {step_size}")

    return hist


def main(optimizer_class, lr, scale, preconditioner, 
        slack_method, lmd, mu, seed, modified, save):
    
    np.random.seed(seed)
    n = 1000
    d = 100
    percent = 1.0
    dataset_name = f"synthetic-{n}x{d}"
    A = np.random.randn(n,d)

    if modified:
        U, S, VH = np.linalg.svd(A)
        S *= 0.0
        S = np.asarray([1/((x+1)**2) for x in range(S.shape[0])])
        A = np.dot(U[:, :S.shape[0]] * S, VH)
        dataset_name += "-modified"

    print(dataset_name)

    xopt = np.random.randn(d)
    b = A @ xopt 
    train_data = torch.Tensor(A)
    train_target = torch.Tensor(b)
    xopt = torch.Tensor(xopt)

    if scale != None:
        r1 = -scale
        r2 = scale
        scaling_vec = (r1 - r2) * torch.rand(train_data.shape[1]) + r2
        scaling_vec = torch.pow(torch.e, scaling_vec)
        train_data = scaling_vec * train_data

    EPOCHS = 1000
    train_load = torch.utils.data.TensorDataset(train_data, train_target)
    batch_size = 128
    train_dataloader = torch.utils.data.DataLoader(train_load, batch_size=batch_size, shuffle=True)


    lmd = 0.01
    mu = 0.1

    class MSELoss(torch.nn.Module):
        def __init__(self, params):
            self.params = params
            super().__init__()

        def forward(self, input_, target):
            return 1/2 * torch.mean( torch.norm(input_ @ self.params - target )**2 )

    loss_class = "mse"
    loss = MSELoss

    optimizer = optimizers_dict[optimizer_class]

    if optimizer_class in ("sgd", "adam", "adagrad", "adadelta", "rmsprop", "adahessian"):
        result = train(
            seed,
            loss,
            train_data, 
            train_target, 
            batch_size,
            EPOCHS,
            optimizer,
            lr=lr
        )
    elif  optimizer_class in ("psps", "psps2", "psps2_b"):
       result = train(
            seed,
            loss,
            train_data, 
            train_target, 
            batch_size,
            EPOCHS,
            optimizer,
            preconditioner=preconditioner,
            slack_method=slack_method,
            lmd=lmd,
            mu=mu
        ) 

    if save:
        results_path = os.getenv("RESULTS_DIR")
        directory = f"{results_path}/{dataset_name}/percent_{percent}/scale_{scale}/bs_{batch_size}" \
        f"/epochs_{EPOCHS}/{loss_class}/{optimizer_class}/lr_{lr}/precond_{preconditioner}/slack_{slack_method}/lmd_{lmd}/mu_{mu}/seed_{seed}"
        print(directory)
        if not os.path.exists(directory):
            os.makedirs(directory)
            
        torch.save([x[0] for x in result], f"{directory}/loss")
        torch.save([x[1] for x in result], f"{directory}/grad_norm_sq")
        
        if optimizer_class in ("psps", "psps2", "psps2_b"):
            torch.save([x[2] for x in result], f"{directory}/slack")
            torch.save([x[3] for x in result], f"{directory}/step_size")


In [6]:
optimizer_class = "psps"
lr = 0.1
save = True
modified = False

for seed in [0, 1, 2, 3, 4, 5]:
    for preconditioner in ["none", "hutch"]:
        for slack_method in ["none", "L1", "L2"]:
            print(seed, preconditioner, slack_method)
            main(optimizer_class, lr, preconditioner, slack_method, lmd, mu, seed, modified, save)

0 none none
synthetic-1000x100
[0] / [1000] | Loss: 643.1228095939124 | GradNorm^2: 958388.0149619443 | s: 0.0 | step_size: 0.002366662250834929
[100] / [1000] | Loss: 1.3590616157173615e-15 | GradNorm^2: 1.6781128549040162e-12 | s: 0.0 | step_size: 0.002201159710483585
[200] / [1000] | Loss: 1.3590616157173615e-15 | GradNorm^2: 1.6781128549040162e-12 | s: 0.0 | step_size: 0.002201159710483585
[300] / [1000] | Loss: 1.3590616157173615e-15 | GradNorm^2: 1.6781128549040162e-12 | s: 0.0 | step_size: 0.002201159710483585
[400] / [1000] | Loss: 8.372437990597025e-16 | GradNorm^2: 1.0474183255555564e-12 | s: 0.0 | step_size: 0.002535240276699443
[500] / [1000] | Loss: 8.372437990597025e-16 | GradNorm^2: 1.0474183255555564e-12 | s: 0.0 | step_size: 0.002535240276699443
[600] / [1000] | Loss: 8.372437990597025e-16 | GradNorm^2: 1.0474183255555564e-12 | s: 0.0 | step_size: 0.002535240276699443
[700] / [1000] | Loss: 8.372437990597025e-16 | GradNorm^2: 1.0474183255555564e-12 | s: 0.0 | step_size

  torch.tensor(s - self.lmd + step_size_temp))
  torch.tensor(0.0), torch.tensor(loss - s + self.lmd) ) / (1 + gnorm_square)


[100] / [1000] | Loss: 1.3590616157173615e-15 | GradNorm^2: 1.6781128549040162e-12 | s: 0.0 | step_size: 0.002201159710483585
[200] / [1000] | Loss: 1.3590616157173615e-15 | GradNorm^2: 1.6781128549040162e-12 | s: 0.0 | step_size: 0.002201159710483585
[300] / [1000] | Loss: 1.3590616157173615e-15 | GradNorm^2: 1.6781128549040162e-12 | s: 0.0 | step_size: 0.002201159710483585
[400] / [1000] | Loss: 8.372437990597025e-16 | GradNorm^2: 1.0474183255555564e-12 | s: 0.0 | step_size: 0.002535240276699443
[500] / [1000] | Loss: 8.372437990597025e-16 | GradNorm^2: 1.0474183255555564e-12 | s: 0.0 | step_size: 0.002535240276699443
[600] / [1000] | Loss: 8.372437990597025e-16 | GradNorm^2: 1.0474183255555564e-12 | s: 0.0 | step_size: 0.002535240276699443
[700] / [1000] | Loss: 8.372437990597025e-16 | GradNorm^2: 1.0474183255555564e-12 | s: 0.0 | step_size: 0.002535240276699443
[800] / [1000] | Loss: 8.372437990597025e-16 | GradNorm^2: 1.0474183255555564e-12 | s: 0.0 | step_size: 0.0025352402766994

  group["step_size"] = torch.max(torch.tensor(0.0), torch.tensor(loss - self.lmd_hat * s)) / (gnorm_square + self.lmd_hat)


[100] / [1000] | Loss: 0.00971935581439133 | GradNorm^2: 10.667293871260703 | s: 0.00142252959675061 | step_size: 0.0
[200] / [1000] | Loss: 0.003740326408003752 | GradNorm^2: 4.071000851815519 | s: 0.0005438801518170964 | step_size: 0.0
[300] / [1000] | Loss: 0.0022647912774342955 | GradNorm^2: 2.446040146849091 | s: 0.0003326231052602654 | step_size: 0.0
[400] / [1000] | Loss: 0.0016079690541572994 | GradNorm^2: 1.731004099849621 | s: 0.0002227773058586957 | step_size: 0.0
[500] / [1000] | Loss: 0.0012477134859057257 | GradNorm^2: 1.3388958908868034 | s: 0.0001884634234010987 | step_size: 0.0
[600] / [1000] | Loss: 0.0010195593045364735 | GradNorm^2: 1.0923314013932441 | s: 0.00015317820161437143 | step_size: 0.0
[700] / [1000] | Loss: 0.000861315123160272 | GradNorm^2: 0.921200683055746 | s: 0.0001192448207163805 | step_size: 0.0
[800] / [1000] | Loss: 0.0007429074292948219 | GradNorm^2: 0.7934449233317165 | s: 0.00011854340518678149 | step_size: 0.0
[900] / [1000] | Loss: 0.0006566

In [None]:
optimizer_class = "psps"
lr = 0.1
save = True
modified = True

for seed in [0, 1, 2, 3, 4, 5]:
    for preconditioner in ["none", "hutch"]:
        for slack_method in ["none", "L1", "L2"]:
            print(seed, preconditioner, slack_method)
            main(optimizer_class, lr, preconditioner, slack_method, lmd, mu, seed, modified, save)

0 none none
synthetic-1000x100-modified
[0] / [1000] | Loss: 0.012503584262081098 | GradNorm^2: 0.01538341920197431
[100] / [1000] | Loss: 0.0003885561017831141 | GradNorm^2: 0.0006256000599330003
[200] / [1000] | Loss: 7.657733430982813e-05 | GradNorm^2: 4.6299119258424796e-05
[300] / [1000] | Loss: 4.059300599337263e-05 | GradNorm^2: 2.4365863772633643e-06
[400] / [1000] | Loss: 0.0024023692999651544 | GradNorm^2: 0.00474133495132066
[500] / [1000] | Loss: 3.419185463331657e-05 | GradNorm^2: 1.9465650596742855e-05
[600] / [1000] | Loss: 2.8436811534830483e-05 | GradNorm^2: 1.2648448111544145e-05
[700] / [1000] | Loss: 2.226737450157829e-05 | GradNorm^2: 4.335555069657096e-06
[800] / [1000] | Loss: 0.00010016528328413899 | GradNorm^2: 0.0001631255529056886
[900] / [1000] | Loss: 8.164308267254599e-05 | GradNorm^2: 0.00012916687547133153
/home/farshed.abdukhakimov/projects/sps2/results/synthetic-1000x100-modified/percent_1.0/scale_0/bs_128/epochs_1000/mse/psps/lr_0.1/precond_none/slack

  torch.tensor(s - self.lmd + step_size_temp))
  torch.tensor(0.0), torch.tensor(loss - s + self.lmd) ) / (1 + gnorm_square)


[100] / [1000] | Loss: 0.020599656541801365 | GradNorm^2: 0.012018843104084078
[200] / [1000] | Loss: 0.01476657704330741 | GradNorm^2: 0.0025396758695541847
[300] / [1000] | Loss: 0.013104173471538763 | GradNorm^2: 0.0011686636612570987
[400] / [1000] | Loss: 0.012094866794750652 | GradNorm^2: 0.0008984724296081353
[500] / [1000] | Loss: 0.011254634289321762 | GradNorm^2: 0.0007853944561691832
[600] / [1000] | Loss: 0.010509897555677957 | GradNorm^2: 0.0007023301429734793
[700] / [1000] | Loss: 0.009842247819102848 | GradNorm^2: 0.0006310497784167268
[800] / [1000] | Loss: 0.009241715246212966 | GradNorm^2: 0.0005681991376631212
[900] / [1000] | Loss: 0.008700579445707847 | GradNorm^2: 0.0005125167846526797
/home/farshed.abdukhakimov/projects/sps2/results/synthetic-1000x100-modified/percent_1.0/scale_0/bs_128/epochs_1000/mse/psps/lr_0.1/precond_none/slack_L1/lmd_0.01/mu_0.1/seed_0
0 none L2
synthetic-1000x100-modified
[0] / [1000] | Loss: 0.057953083777540396 | GradNorm^2: 0.084239454

  step_size = torch.max(torch.tensor(0.0), torch.tensor(loss - self.lmd_hat * s)) / (gnorm_square + self.lmd_hat)


[100] / [1000] | Loss: 0.0571029117096945 | GradNorm^2: 0.08256626842899767
[200] / [1000] | Loss: 0.05629910521012097 | GradNorm^2: 0.08098460033863399
[300] / [1000] | Loss: 0.05549767907633446 | GradNorm^2: 0.07940778028277086
[400] / [1000] | Loss: 0.05472147637205391 | GradNorm^2: 0.0778812583922283
[500] / [1000] | Loss: 0.053964672032715134 | GradNorm^2: 0.0763937192462584
[600] / [1000] | Loss: 0.053209078738274976 | GradNorm^2: 0.07490816132919847
[700] / [1000] | Loss: 0.0525171458156594 | GradNorm^2: 0.0735486781147643
[800] / [1000] | Loss: 0.05183450920914973 | GradNorm^2: 0.07220789139896057
[900] / [1000] | Loss: 0.051178702759299324 | GradNorm^2: 0.07092008437598973
/home/farshed.abdukhakimov/projects/sps2/results/synthetic-1000x100-modified/percent_1.0/scale_0/bs_128/epochs_1000/mse/psps/lr_0.1/precond_none/slack_L2/lmd_0.01/mu_0.1/seed_0
0 hutch none
synthetic-1000x100-modified
[0] / [1000] | Loss: 0.012503584262081265 | GradNorm^2: 0.01538341920197474
[100] / [1000] 

In [5]:
optimizer_class = "sgd"
preconditioner = "none"
slack_method = "none"
save = True
modified = True

for scale in [3, 5]:
    for lr in [0.1, 0.01, 0.001, 0.0001]:
        for seed in [0, 1, 2, 3, 4]:
                print(scale, seed, preconditioner, slack_method)
                main(optimizer_class, lr, scale, preconditioner, slack_method, lmd, mu, seed, modified, save)

3 0 none none
synthetic-1000x100-modified
[0] / [1000] | Loss: 0.010519184479212821 | GradNorm^2: 0.04543071523816906 | s: 0 | step_size: 0
[100] / [1000] | Loss: 0.0005266229419559805 | GradNorm^2: 8.907340697605849e-05 | s: 0 | step_size: 0
[200] / [1000] | Loss: 0.0004145732592592672 | GradNorm^2: 1.5744462852453176e-05 | s: 0 | step_size: 0
[300] / [1000] | Loss: 0.0003540980428223387 | GradNorm^2: 4.694282401359894e-06 | s: 0 | step_size: 0
[400] / [1000] | Loss: 0.00031621642987755176 | GradNorm^2: 3.12884358052466e-05 | s: 0 | step_size: 0
[500] / [1000] | Loss: 0.00028855418896363126 | GradNorm^2: 2.116945105770253e-05 | s: 0 | step_size: 0
[600] / [1000] | Loss: 0.0002669429495258345 | GradNorm^2: 8.18492692869307e-06 | s: 0 | step_size: 0
[700] / [1000] | Loss: 0.0002494436367926447 | GradNorm^2: 7.515809552930062e-06 | s: 0 | step_size: 0
[800] / [1000] | Loss: 0.00023460253998240277 | GradNorm^2: 2.642020539010247e-06 | s: 0 | step_size: 0
[900] / [1000] | Loss: 0.000221949

In [6]:
optimizer_class = "adam"
preconditioner = "none"
slack_method = "none"
save = True
modified = True

for scale in [3, 5]:
    for lr in [0.1, 0.01, 0.001, 0.0001]:
        for seed in [0, 1, 2, 3, 4]:
                print(scale, seed, preconditioner, slack_method)
                main(optimizer_class, lr, scale, preconditioner, slack_method, lmd, mu, seed, modified, save)

3 0 none none
synthetic-1000x100-modified
[0] / [1000] | Loss: 0.9261230130146552 | GradNorm^2: 82.98329611984957 | s: 0 | step_size: 0
[100] / [1000] | Loss: 0.0001034369558037857 | GradNorm^2: 0.0016209637106834867 | s: 0 | step_size: 0
[200] / [1000] | Loss: 5.14093381187163e-05 | GradNorm^2: 2.2952442041728023e-05 | s: 0 | step_size: 0
[300] / [1000] | Loss: 3.924914658382738e-05 | GradNorm^2: 0.00021993452221065235 | s: 0 | step_size: 0
[400] / [1000] | Loss: 3.9612682735821396e-05 | GradNorm^2: 0.0007497365696607876 | s: 0 | step_size: 0
[500] / [1000] | Loss: 0.00012477361910846678 | GradNorm^2: 0.008801462906572745 | s: 0 | step_size: 0
[600] / [1000] | Loss: 6.605892238097658e-05 | GradNorm^2: 0.0034567829629461944 | s: 0 | step_size: 0
[700] / [1000] | Loss: 0.0009306093200089628 | GradNorm^2: 0.08078284832985187 | s: 0 | step_size: 0
[800] / [1000] | Loss: 2.7974892876685365e-05 | GradNorm^2: 0.0008276756871759989 | s: 0 | step_size: 0
[900] / [1000] | Loss: 2.89673773578986

In [7]:
optimizer_class = "adagrad"
preconditioner = "none"
slack_method = "none"
save = True
modified = True

for scale in [3, 5]:
    for lr in [0.1, 0.01, 0.001, 0.0001]:
        for seed in [0, 1, 2, 3, 4]:
                print(scale, seed, preconditioner, slack_method)
                main(optimizer_class, lr, scale, preconditioner, slack_method, lmd, mu, seed, modified, save)

3 0 none none
synthetic-1000x100-modified
[0] / [1000] | Loss: 0.004890936585169861 | GradNorm^2: 0.006884430022637889 | s: 0 | step_size: 0
[100] / [1000] | Loss: 0.00017609593247195114 | GradNorm^2: 0.00014441340366925127 | s: 0 | step_size: 0
[200] / [1000] | Loss: 0.00014923603217435998 | GradNorm^2: 8.358822908043682e-05 | s: 0 | step_size: 0
[300] / [1000] | Loss: 0.00013365217694546682 | GradNorm^2: 3.653662875856805e-05 | s: 0 | step_size: 0
[400] / [1000] | Loss: 0.00012395727621529936 | GradNorm^2: 0.00013874288047175263 | s: 0 | step_size: 0
[500] / [1000] | Loss: 0.000117580782514736 | GradNorm^2: 0.00033267740063456624 | s: 0 | step_size: 0
[600] / [1000] | Loss: 0.00010784743637563022 | GradNorm^2: 9.102931425992835e-05 | s: 0 | step_size: 0
[700] / [1000] | Loss: 0.00010089377689750845 | GradNorm^2: 6.002689894361735e-06 | s: 0 | step_size: 0
[800] / [1000] | Loss: 9.651112737629347e-05 | GradNorm^2: 7.786897960318105e-05 | s: 0 | step_size: 0
[900] / [1000] | Loss: 9.21

In [None]:
optimizers_dict = {
    "psps": PSPS,
    "sgd": SGD,
    "adam": Adam,
    "adagrad": Adagrad,
    "adadelta": Adadelta,
    "rmsprop": RMSprop,
    "adahessian": Adahessian,
}

In [8]:
optimizer_class = "adadelta"
preconditioner = "none"
slack_method = "none"
save = True
modified = True

for scale in [3, 5]:
    for lr in [0.1, 0.01, 0.001, 0.0001]:
        for seed in [0, 1, 2, 3, 4]:
                print(scale, seed, preconditioner, slack_method)
                main(optimizer_class, lr, scale, preconditioner, slack_method, lmd, mu, seed, modified, save)

3 0 none none
synthetic-1000x100-modified
[0] / [1000] | Loss: 0.03924459516779757 | GradNorm^2: 2.028094973708312 | s: 0 | step_size: 0
[100] / [1000] | Loss: 0.001132994314270532 | GradNorm^2: 0.0006695880430142447 | s: 0 | step_size: 0
[200] / [1000] | Loss: 0.0006931269759449516 | GradNorm^2: 5.346920505105014e-05 | s: 0 | step_size: 0
[300] / [1000] | Loss: 0.0005005535850263683 | GradNorm^2: 1.5500844872426408e-05 | s: 0 | step_size: 0
[400] / [1000] | Loss: 0.00039846813819160396 | GradNorm^2: 0.00021887519205480234 | s: 0 | step_size: 0
[500] / [1000] | Loss: 0.0003328868350018072 | GradNorm^2: 3.829020080638324e-05 | s: 0 | step_size: 0
[600] / [1000] | Loss: 0.00029081395135991496 | GradNorm^2: 0.00011416592660395551 | s: 0 | step_size: 0
[700] / [1000] | Loss: 0.0002583562262775612 | GradNorm^2: 3.559397739272226e-06 | s: 0 | step_size: 0
[800] / [1000] | Loss: 0.00023466372342691633 | GradNorm^2: 4.011932842907801e-06 | s: 0 | step_size: 0
[900] / [1000] | Loss: 0.000216192

In [9]:
optimizer_class = "rmsprop"
preconditioner = "none"
slack_method = "none"
save = True
modified = True

for scale in [3, 5]:
    for lr in [0.1, 0.01, 0.001, 0.0001]:
        for seed in [0, 1, 2, 3, 4]:
                print(scale, seed, preconditioner, slack_method)
                main(optimizer_class, lr, scale, preconditioner, slack_method, lmd, mu, seed, modified, save)

3 0 none none
synthetic-1000x100-modified
[0] / [1000] | Loss: 0.006269154147273324 | GradNorm^2: 0.008614587135331487 | s: 0 | step_size: 0
[100] / [1000] | Loss: 0.032803864904071944 | GradNorm^2: 2.871175527493193 | s: 0 | step_size: 0
[200] / [1000] | Loss: 0.0001846163392529453 | GradNorm^2: 0.0006744438998597218 | s: 0 | step_size: 0
[300] / [1000] | Loss: 0.36941914205591453 | GradNorm^2: 32.499043311696525 | s: 0 | step_size: 0
[400] / [1000] | Loss: 0.01015330059781263 | GradNorm^2: 0.6651334410792223 | s: 0 | step_size: 0
[500] / [1000] | Loss: 0.005792004662700169 | GradNorm^2: 0.5021879603310856 | s: 0 | step_size: 0
[600] / [1000] | Loss: 0.7797170632272992 | GradNorm^2: 70.13258691495359 | s: 0 | step_size: 0
[700] / [1000] | Loss: 0.05782500829507374 | GradNorm^2: 5.16794813032771 | s: 0 | step_size: 0
[800] / [1000] | Loss: 0.07337061192587371 | GradNorm^2: 6.5680400396752985 | s: 0 | step_size: 0
[900] / [1000] | Loss: 0.37787039116110205 | GradNorm^2: 33.9269296895673

In [10]:
optimizer_class = "adahessian"
preconditioner = "none"
slack_method = "none"
save = True
modified = True

for scale in [3, 5]:
    for lr in [0.1, 0.01, 0.001, 0.0001]:
        for seed in [0, 1, 2, 3, 4]:
                print(scale, seed, preconditioner, slack_method)
                main(optimizer_class, lr, scale, preconditioner, slack_method, lmd, mu, seed, modified, save)

3 0 none none
synthetic-1000x100-modified
[0] / [1000] | Loss: 0.01169321577503165 | GradNorm^2: 0.09296042287578879 | s: 0 | step_size: 0


  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


[100] / [1000] | Loss: 0.000793195890247194 | GradNorm^2: 3.9720778215471915e-05 | s: 0 | step_size: 0
[200] / [1000] | Loss: 0.0004670347641655172 | GradNorm^2: 2.0923806336871746e-05 | s: 0 | step_size: 0
[300] / [1000] | Loss: 0.00033523980765620047 | GradNorm^2: 4.1704036088051736e-05 | s: 0 | step_size: 0
[400] / [1000] | Loss: 0.00026998286052679955 | GradNorm^2: 3.6874370539070733e-06 | s: 0 | step_size: 0
[500] / [1000] | Loss: 0.0002316888226039092 | GradNorm^2: 6.696203812456296e-06 | s: 0 | step_size: 0
[600] / [1000] | Loss: 0.00020621142162575572 | GradNorm^2: 1.8625792072933062e-06 | s: 0 | step_size: 0
[700] / [1000] | Loss: 0.00018782690069713633 | GradNorm^2: 1.3873080681148406e-06 | s: 0 | step_size: 0
[800] / [1000] | Loss: 0.00017404739106154972 | GradNorm^2: 5.205694394981951e-06 | s: 0 | step_size: 0
[900] / [1000] | Loss: 0.00016327868822612048 | GradNorm^2: 1.2627566995783504e-06 | s: 0 | step_size: 0
/home/farshed.abdukhakimov/projects/sps2/results/synthetic-10

In [35]:
def train_sp2plus(seed, loss, train_data, train_target, batch_size, EPOCHS):
    
    torch.random.manual_seed(seed)

    params = torch.zeros(train_data.shape[1], device=device).requires_grad_()

    train_load = data_utils.TensorDataset(train_data, train_target)
    train_dataloader = DataLoader(train_load, batch_size=batch_size, shuffle=True)

    criterion = loss(params)

    train_loss = criterion(train_data.to(device), train_target.to(device))
    g, = torch.autograd.grad(train_loss, params)
    grad_norm_sq = torch.linalg.norm(g).item() ** 2
    slack = 0


    hist = [[train_loss.item(), grad_norm_sq, slack]]
   
    for epoch in range(EPOCHS):
        for i, (batch_data, batch_target) in enumerate(train_dataloader):  
            batch_data = batch_data.to(device)
            batch_target = batch_target.to(device)

            loss = criterion(batch_data, batch_target)
            g, = torch.autograd.grad(loss, params, create_graph=True)
            f_grad = g.clone().detach()

            loss_closure = lambda w: criterion(batch_data, batch_target)
            hgp = torch.autograd.functional.hvp(loss_closure, params, g, create_graph=True)[1]

            with torch.no_grad():
                gnormsq = torch.norm(f_grad)**2
                sps_step = loss.item() / gnormsq
                params.sub_(sps_step * f_grad)
                gdiffHgp = torch.sub(f_grad, hgp, alpha=sps_step)
                if torch.norm(gdiffHgp)**2 > 1e-10:
                        params.sub_(0.5 * (sps_step**2) * gdiffHgp * torch.dot(f_grad, gdiffHgp)/ (torch.norm(gdiffHgp)**2))


        train_loss = criterion(train_data.to(device), train_target.to(device))
        g, = torch.autograd.grad(train_loss, params)
        grad_norm_sq = torch.linalg.norm(g).item() ** 2

        hist.append([train_loss.item(), grad_norm_sq, slack])

        if epoch % 100 == 0:
            print(f"[{epoch}] / [{EPOCHS}] | Loss: {train_loss.item()} | GradNorm^2: {grad_norm_sq}")

    return hist

In [50]:
def train_psps2_DD(seed, loss, train_data, train_target, batch_size, EPOCHS):
    
    torch.random.manual_seed(seed)

    alpha = 1e-4
    beta=0.999

    w = torch.zeros(train_data.shape[1], device=device).requires_grad_()

    train_load = data_utils.TensorDataset(train_data, train_target)
    train_dataloader = DataLoader(train_load, batch_size=batch_size, shuffle=True)

    criterion = loss(w)

    train_loss = criterion(train_data.to(device), train_target.to(device))
    g, = torch.autograd.grad(train_loss, w, create_graph=True)
    grad_norm_sq = torch.linalg.norm(g).item() ** 2
    slack = 0

    lmd_star = torch.tensor(0.0)

    # preconditioninig matrix
    Dk = diag_estimate_old(w, g, 100)

    hist = [[train_loss.item(), grad_norm_sq, slack]]
   
    for epoch in range(EPOCHS):
        for i, (batch_data, batch_target) in enumerate(train_dataloader):  
            batch_data = batch_data.to(device)
            batch_target = batch_target.to(device)

            loss = criterion(batch_data, batch_target)
            g, = torch.autograd.grad(loss, w, create_graph=True)
            f_grad = g.clone().detach()

            vk = diag_estimate_old(w, g, 1)
            # Smoothing and Truncation 
            Dk = beta * Dk + (1 - beta) * vk
            Dk_hat = torch.abs(Dk)
            Dk_hat[Dk_hat < alpha] = alpha
            Dk_hat_inv = 1 / Dk_hat 

            gnorm_sq = (f_grad * Dk_hat_inv).dot(f_grad)
            det = 1 - (2 * loss.item() / gnorm_sq )
            if det <= 1e-40:
                continue
            else:
                t = torch.sqrt(det)/det
                root1 = -1 + t
                root2 = -1 - t
                lmd_star = torch.maximum(root1, root2)

            precond = lmd_star/(1 + lmd_star) * Dk_hat_inv

            with torch.no_grad():
                w.sub_(precond * f_grad)


        train_loss = criterion(train_data.to(device), train_target.to(device))
        g, = torch.autograd.grad(train_loss, w)
        grad_norm_sq = torch.linalg.norm(g).item() ** 2

        hist.append([train_loss.item(), grad_norm_sq, slack])

        if epoch % 100 == 0:
            print(f"[{epoch}] / [{EPOCHS}] | Loss: {train_loss.item()} | GradNorm^2: {grad_norm_sq}")

    return hist

In [51]:
def train_psps2_DB(seed, loss, train_data, train_target, batch_size, EPOCHS):
    
    torch.random.manual_seed(seed)

    alpha = 1e-4
    beta=0.999

    w = torch.zeros(train_data.shape[1], device=device).requires_grad_()

    train_load = data_utils.TensorDataset(train_data, train_target)
    train_dataloader = DataLoader(train_load, batch_size=batch_size, shuffle=True)

    criterion = loss(w)

    train_loss = criterion(train_data.to(device), train_target.to(device))
    g, = torch.autograd.grad(train_loss, w, create_graph=True)
    grad_norm_sq = torch.linalg.norm(g).item() ** 2
    slack = 0

    lmd_star = torch.tensor(0.0)

    # preconditioninig matrix
    Dk = diag_estimate_old(w, g, 100)
    
    hist = [[train_loss.item(), grad_norm_sq, slack]]
   
    for epoch in range(EPOCHS):
        for i, (batch_data, batch_target) in enumerate(train_dataloader):  
            batch_data = batch_data.to(device)
            batch_target = batch_target.to(device)

            loss = criterion(batch_data, batch_target)
            g, = torch.autograd.grad(loss, w, create_graph=True)
            f_grad = g.clone().detach()

            vk = diag_estimate_old(w, g, 1)
            # Smoothing and Truncation 
            Dk = beta * Dk + (1 - beta) * vk
            Dk_hat = torch.abs(Dk)
            Dk_hat[Dk_hat < alpha] = alpha
            Dk_hat_inv = 1 / Dk_hat 

            sk = f_grad.clone()
            yk = torch.autograd.grad(g, w, grad_outputs=sk, retain_graph=True)[0]
            gnorm = (g * Dk_hat_inv).dot(g)

            if gnorm < 1e-25:
                continue

            D_inv = torch.diagflat(Dk_hat_inv.clone().detach())

            D_inv_B = (((Dk_hat_inv * yk).reshape(-1, 1) @ yk.reshape(1, -1)) / (yk.dot(sk)))
            D_inv_B_D_inv = D_inv_B * Dk_hat_inv

            a_torch = torch.dot(f_grad, Dk_hat_inv*f_grad)
            a = a_torch.cpu().detach().numpy()

            b_torch = torch.dot(f_grad, D_inv_B_D_inv@f_grad)
            b = b_torch.cpu().detach().numpy() 

            c_torch = torch.trace(D_inv_B)
            c = c_torch.cpu().detach().numpy()

            AA = 2 * c**2 * ( torch.dot(f_grad, (D_inv - (1/c) * D_inv_B_D_inv ) @ f_grad) )
            BB = c_torch * ( torch.dot(f_grad, (4 * D_inv - (3/c) * D_inv_B_D_inv) @ f_grad) )
            CC = 2 * a - 4 * c * loss.item()
            DD = - 2 * loss.item()

            BB = BB / AA
            CC = CC / AA
            DD = DD / AA
            AA = 1.0
            
            def lagr(lmd):
                b = lambda lmd: D_inv - (lmd * (D_inv_B_D_inv) / (1 + lmd * c_torch)) 
                return lmd * loss -  (1/2)*lmd**2 * torch.dot(f_grad, b(lmd)@f_grad)

            lmd_star_old = lmd_star
            lmds = solve(AA, BB, CC, DD)
            lmds = torch.from_numpy(lmds).to(device)
            lmd_max = torch.max(lmds)
            lmd_min = torch.maximum(torch.min(lmds), torch.tensor(0))
            lmd_star = lmd_max
            if lagr(lmd_max) < lagr(lmd_min):
                lmd_star = lmd_min

            if lmd_star > 1e8:
                print(f"lmd_star is: {lmd_star}")
                lmd_star = lmd_star_old
            
            precond = lmd_star * ( D_inv - (lmd_star * (D_inv_B_D_inv) / (1 + lmd_star * c_torch)) )

            with torch.no_grad():
                w.sub_(precond @ f_grad)


        train_loss = criterion(train_data.to(device), train_target.to(device))
        g, = torch.autograd.grad(train_loss, w)
        grad_norm_sq = torch.linalg.norm(g).item() ** 2

        hist.append([train_loss.item(), grad_norm_sq, slack])

        if epoch % 100 == 0:
            print(f"[{epoch}] / [{EPOCHS}] | Loss: {train_loss.item()} | GradNorm^2: {grad_norm_sq}")

    return hist

In [55]:
def train_psps2_BB(seed, loss, train_data, train_target, batch_size, EPOCHS):
    
    torch.random.manual_seed(seed)

    alpha = 1e-4
    beta=0.999

    w = torch.zeros(train_data.shape[1], device=device).requires_grad_()

    train_load = data_utils.TensorDataset(train_data, train_target)
    train_dataloader = DataLoader(train_load, batch_size=batch_size, shuffle=True)

    criterion = loss(w)

    train_loss = criterion(train_data.to(device), train_target.to(device))
    g, = torch.autograd.grad(train_loss, w, create_graph=True)
    f_grad = g.clone().detach()
    grad_norm_sq = torch.linalg.norm(g).item() ** 2
    slack = 0

    lmd_star = torch.tensor(0.0)

    # preconditioninig matrix
    s = torch.zeros_like(w) # s = H_inv * grad
    r = f_grad - torch.autograd.grad(g, w, grad_outputs=s, retain_graph=True)[0]
    p = r.detach().clone()
    r_prev = torch.dot(r, r)
    MAX_ITER = 100


    hist = [[train_loss.item(), grad_norm_sq, slack]]
   
    for epoch in range(EPOCHS):
        for i, (batch_data, batch_target) in enumerate(train_dataloader):  
            batch_data = batch_data.to(device)
            batch_target = batch_target.to(device)

            loss = criterion(batch_data, batch_target)
            g, = torch.autograd.grad(loss, w, create_graph=True)
            f_grad = g.clone().detach()

            s = torch.zeros_like(w) # s = H_inv * grad
            r = f_grad.clone()
            p = r.detach().clone()

            for cg_step in range(MAX_ITER):
                hvp = torch.autograd.grad(g, w, grad_outputs=p, retain_graph=True)[0]
                alpha_k = torch.dot(r, r) / torch.dot(p, hvp)
                s = s + alpha_k * p
                r_prev = r.clone()
                r = r - alpha_k * hvp
                if torch.norm(r) < 1e-10:
                    Ax = torch.autograd.grad(g, w, grad_outputs=s, retain_graph=True)[0]    
                    diff = torch.norm(Ax - f_grad)
                    # print(f"Took {cg_step} to reach diff={diff}")
                    break

                beta_k = torch.dot(r, r) / torch.dot(r_prev, r_prev)
                p = r + beta_k * p
            
            gnorm_sq = torch.dot(f_grad, s)
            det = (-gnorm_sq) / (2 * loss.item() - gnorm_sq)

            if det < 0.0:
                print(f"det: {det}")
                continue
            root_1 = -1 + torch.sqrt(det)
            root_2 = -1 - torch.sqrt(det)
            lmd_star = torch.maximum(torch.tensor(0.0), torch.maximum(root_1, root_2))

            step = lmd_star / (1 + lmd_star) * s 
            with torch.no_grad():
                w.sub_(step)

        train_loss = criterion(train_data.to(device), train_target.to(device))
        g, = torch.autograd.grad(train_loss, w)
        grad_norm_sq = torch.linalg.norm(g).item() ** 2

        hist.append([train_loss.item(), grad_norm_sq, slack])

        if epoch % 100 == 0:
            print(f"[{epoch}] / [{EPOCHS}] | Loss: {train_loss.item()} | GradNorm^2: {grad_norm_sq}")

    return hist

In [53]:
def main2(optimizer_class, lr, preconditioner, 
        slack_method, lmd, mu, seed, modified, save):
    
    optimizers_run = {
        "sp2plus": train_sp2plus,
        "psps2_DD": train_psps2_DD,
        "psps2_BB": train_psps2_BB,
        "psps2_DB": train_psps2_DB,
    }
    
    np.random.seed(seed)
    n = 1000
    d = 100
    percent = 1.0
    scale = 0
    dataset_name = f"synthetic-{n}x{d}"
    A = np.random.randn(n,d)

    if modified:
        U, S, VH = np.linalg.svd(A)
        S *= 0.0
        S = np.asarray([1/((x+1)**2) for x in range(S.shape[0])])
        A = np.dot(U[:, :S.shape[0]] * S, VH)
        dataset_name += "-modified"

    print(dataset_name)

    xopt = np.random.randn(d)
    b = A @ xopt 
    train_data = torch.Tensor(A)
    train_target = torch.Tensor(b)
    xopt = torch.Tensor(xopt)

    EPOCHS = 1000
    train_load = torch.utils.data.TensorDataset(train_data, train_target)
    batch_size = 128
    train_dataloader = torch.utils.data.DataLoader(train_load, batch_size=batch_size, shuffle=True)


    lmd = 0.01
    mu = 0.1

    class MSELoss(torch.nn.Module):
        def __init__(self, params):
            self.params = params
            super().__init__()

        def forward(self, input_, target):
            return 1/2 * torch.mean( torch.norm(input_ @ self.params - target )**2 )

    loss_class = "mse"
    loss = MSELoss

    result = optimizers_run[optimizer_class](seed, loss, train_data, train_target, batch_size, EPOCHS)

    if save:
        results_path = os.getenv("RESULTS_DIR")
        directory = f"{results_path}/{dataset_name}/percent_{percent}/scale_{scale}/bs_{batch_size}" \
        f"/epochs_{EPOCHS}/{loss_class}/{optimizer_class}/lr_{lr}/precond_{preconditioner}/slack_{slack_method}/lmd_{lmd}/mu_{mu}/seed_{seed}"
        print(directory)
        if not os.path.exists(directory):
            os.makedirs(directory)
            
        torch.save([x[0] for x in result], f"{directory}/loss")
        torch.save([x[1] for x in result], f"{directory}/grad_norm_sq")
        
        if optimizer_class in ("psps", "psps2", "psps2_b"):
            torch.save([x[2] for x in result], f"{directory}/slack")


In [43]:
optimizer_class = "sp2plus"
preconditioner = "none"
slack_method = "none"
lr = 0.1
save = True

for modified in (True):
    for seed in [0, 1, 2, 3, 4]:
            print(seed, modified)
            main2(optimizer_class, lr, preconditioner, slack_method, lmd, mu, seed, modified, save)

0 False
synthetic-1000x100
[0] / [1000] | Loss: 640.8371278593952 | GradNorm^2: 955326.3439309997
[100] / [1000] | Loss: 1.9014259525188078e-27 | GradNorm^2: 1.289800293758974e-24
[200] / [1000] | Loss: 1.905055945277989e-27 | GradNorm^2: 1.0242174068231826e-24
[300] / [1000] | Loss: 1.7600596131128737e-27 | GradNorm^2: 1.0641927477807814e-24
[400] / [1000] | Loss: 1.9352052229994048e-27 | GradNorm^2: 1.4150730429863362e-24
[500] / [1000] | Loss: 1.8529726366059363e-27 | GradNorm^2: 1.3425913020190235e-24
[600] / [1000] | Loss: 2.0618358872148433e-27 | GradNorm^2: 1.568936618080339e-24
[700] / [1000] | Loss: 2.0269534440621017e-27 | GradNorm^2: 1.7941421110820753e-24
[800] / [1000] | Loss: 2.015514960936397e-27 | GradNorm^2: 1.678325480061476e-24
[900] / [1000] | Loss: 1.651344719612103e-27 | GradNorm^2: 9.249716057330488e-25
/home/farshed.abdukhakimov/projects/sps2/results/synthetic-1000x100/percent_1.0/scale_0/bs_128/epochs_1000/mse/sp2plus/lr_0.1/precond_none/slack_none/lmd_0.01/mu_

In [60]:
optimizer_class = "psps2_DD"
preconditioner = "none"
slack_method = "none"
lr = 0.1
save = True

for modified in (False, True):
    for seed in [0, 1, 2, 3, 4, 5]:
            print(seed, modified)
            main2(optimizer_class, lr, preconditioner, slack_method, lmd, mu, seed, modified, save)

0 False
synthetic-1000x100
[0] / [1000] | Loss: 48759.842141441084 | GradNorm^2: 100330785.65777351
[100] / [1000] | Loss: 48759.842141441084 | GradNorm^2: 100330785.65777351
[200] / [1000] | Loss: 48759.842141441084 | GradNorm^2: 100330785.65777351
[300] / [1000] | Loss: 1.4214054055728966e-16 | GradNorm^2: 2.0774631049436802e-13
[400] / [1000] | Loss: 2.4822001420844903e-27 | GradNorm^2: 2.33991012437125e-24
[500] / [1000] | Loss: 2.0229659987052423e-27 | GradNorm^2: 1.4495773481031068e-24
[600] / [1000] | Loss: 2.4680622755487325e-27 | GradNorm^2: 2.353658009447116e-24
[700] / [1000] | Loss: 2.575143980456662e-27 | GradNorm^2: 2.673743870083272e-24
[800] / [1000] | Loss: 2.1220604869478103e-27 | GradNorm^2: 1.5916184405864694e-24
[900] / [1000] | Loss: 2.673696126826891e-27 | GradNorm^2: 2.3834720533323082e-24
/home/farshed.abdukhakimov/projects/sps2/results/synthetic-1000x100/percent_1.0/scale_0/bs_128/epochs_1000/mse/psps2_DD/lr_0.1/precond_none/slack_none/lmd_0.01/mu_0.1/seed_0
1

In [61]:
optimizer_class = "psps2_BB"
preconditioner = "none"
slack_method = "none"
lr = 0.1
save = True

for modified in (False, True):
    for seed in [0, 1, 2, 3, 4, 5]:
            print(seed, modified)
            main2(optimizer_class, lr, preconditioner, slack_method, lmd, mu, seed, modified, save)

0 False
synthetic-1000x100
det: -1597466505168.0876
det: -4081417997665.5186
det: -3711.916057829514
det: -2061.2029462954874
det: -3088.428317984737
det: -1488.3597453152563
[0] / [1000] | Loss: 3.600903569390952e-18 | GradNorm^2: 7.424116124331386e-15
det: -3550.5314921094323
det: -5782.109366925915
det: -2688.137960459415
det: -1945.1808883203141
det: -5765.769915561527
det: -6594.753978394391
det: -1791.1669460258913
det: -816.5421375652484
det: -5629.543062320349
det: -3377.8360730696504
det: -2640.038179326343
det: -4311.858830168716
det: -4291.3187276682
det: -3118.89542782113
det: -4047.150830539331
det: -1276.7308024074653
det: -5267.847784479342
det: -2139.866166879071
det: -4234.30953589279
det: -4448.068244558731
det: -3296.5172377422355
det: -2391.98183914026
det: -1951.8156291372934
det: -1576.985047868559
det: -2436.5128092249547
det: -3709.8775841756774
det: -3894.6932286743395
det: -5802.69003562129
det: -2259.196627047416
det: -3379.5886085413995
det: -2418.3161237622

In [62]:
optimizer_class = "psps2_DB"
preconditioner = "none"
slack_method = "none"
lr = 0.1
save = True

for modified in (False, True):
    for seed in [0, 1, 2, 3, 4, 5]:
            print(seed, modified)
            main2(optimizer_class, lr, preconditioner, slack_method, lmd, mu, seed, modified, save)

0 False
synthetic-1000x100
[0] / [1000] | Loss: 247.95744522422476 | GradNorm^2: 394598.73118175974
[100] / [1000] | Loss: 5.756714921040663e-25 | GradNorm^2: 6.937496428239475e-22
[200] / [1000] | Loss: 3.408470078306347e-25 | GradNorm^2: 4.1670452917595835e-22
[300] / [1000] | Loss: 2.244045746507628e-25 | GradNorm^2: 2.7381223605284424e-22
[400] / [1000] | Loss: 1.3628600122060097e-25 | GradNorm^2: 1.680130079878938e-22
[500] / [1000] | Loss: 1.3628600122060097e-25 | GradNorm^2: 1.680130079878938e-22
[600] / [1000] | Loss: 1.3628600122060097e-25 | GradNorm^2: 1.680130079878938e-22
[700] / [1000] | Loss: 1.3628600122060097e-25 | GradNorm^2: 1.680130079878938e-22
[800] / [1000] | Loss: 1.3628600122060097e-25 | GradNorm^2: 1.680130079878938e-22
[900] / [1000] | Loss: 1.3628600122060097e-25 | GradNorm^2: 1.680130079878938e-22
/home/farshed.abdukhakimov/projects/sps2/results/synthetic-1000x100/percent_1.0/scale_0/bs_128/epochs_1000/mse/psps2_DB/lr_0.1/precond_none/slack_none/lmd_0.01/mu_