In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import numpy as np

import torch
from torch.utils.data import DataLoader
import torch.utils.data as data_utils
from optimizers import PSPS

import matplotlib.pyplot as plt

from datasets import get_dataset    
from loss_fns import get_loss
from utils import solve

torch.set_default_dtype(torch.float64)

%load_ext line_profiler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cpu')

In [3]:
def logistic_reg(w, X, y):
    return torch.mean(torch.log(1 + torch.exp(-y * (X @ w))))

def nllsq(w, X, y):
    return torch.mean( ( y - (1/(1 + torch.exp(-X @ w ))) )**2 )

def rademacher_old(weights):
    return torch.round(torch.rand_like(weights)) * 2 - 1

def diag_estimate_old(weights, grad, iters):
    Ds = []
    for j in range(iters):
        z = rademacher_old(weights)
        with torch.no_grad():
            hvp = torch.autograd.grad(grad, weights, grad_outputs=z, retain_graph=True)[0]
        Ds.append((hvp*z))

    return torch.mean(torch.stack(Ds), 0)

def citardouq_solve(a, b, c):
    det = b * b - 4 * a * c
    if det < 1e-40:
        x1 = 0.0
        x2 = 0.0
    else:
        x1 = (2 * c) / (-b - np.sqrt(det))
        x2 = (2 * c) / (-b + np.sqrt(det))
    return np.asarray([x1, x2])

In [4]:
# parameters for D
alpha = 1e-4
beta=0.999
# parameters for slack
lmd = 0.01
mu = 0.1

In [5]:
optimizers_dict = {
    "psps": PSPS
}


def train(seed, loss, train_data, train_target, batch_size, EPOCHS, optimizer_class, **optimizer_kwargs):
    
    torch.random.manual_seed(seed)

    params = torch.zeros(train_data.shape[1], device=device).requires_grad_()

    train_load = data_utils.TensorDataset(train_data, train_target)
    train_dataloader = DataLoader(train_load, batch_size=batch_size, shuffle=True)

    criterion = loss(params)
    optimizer = optimizer_class([params], **optimizer_kwargs)

    train_loss = criterion(train_data.to(device), train_target.to(device))
    g, = torch.autograd.grad(train_loss, params)
    grad_norm_sq = torch.linalg.norm(g).item() ** 2
    slack = 0
    step_size = 0


    hist = [[train_loss.item(), grad_norm_sq, slack, step_size]]
   
    for epoch in range(EPOCHS):
        for i, (batch_data, batch_target) in enumerate(train_dataloader):  
            batch_data = batch_data.to(device)
            batch_target = batch_target.to(device)

            optimizer.zero_grad()
            
            def closure():
                return criterion(batch_data, batch_target)
            
            loss = closure()    
            optimizer.step(closure) 
            slack = optimizer.replay_buffer[-1]["slack"]
            step_size = optimizer.replay_buffer[-1]["step_size"]

        train_loss = criterion(train_data.to(device), train_target.to(device))
        g, = torch.autograd.grad(train_loss, params)
        grad_norm_sq = torch.linalg.norm(g).item() ** 2

        hist.append([train_loss.item(), grad_norm_sq, slack, step_size])

        if epoch % 100 == 0:
            print(f"[{epoch}] / [{EPOCHS}] | Loss: {train_loss.item()} | GradNorm^2: {grad_norm_sq} | s:{slack} | step_size: {step_size}")

    return hist

In [6]:
def train_psps2(seed, train_data, train_target, train_dataloader, loss_class, EPOCHS, update_precond, constr_precond, slack_method):
    
    torch.random.manual_seed(seed)

    # parameters
    alpha = 1e-4
    beta=0.999
    w = torch.rand(train_data.shape[1], device=device).requires_grad_()

    loss_function = loss_class(w)

    # slack
    s = torch.tensor(0.0)
    delta = mu + lmd

    # save loss and grad size to history
    hist = []
    loss = loss_function(train_data.to(device), train_target.to(device))
    g, = torch.autograd.grad(loss, w, create_graph=True)
    f_grad = g.clone().detach()
    print(f"Loss: {loss.item()} | GradNorm^2: {(torch.linalg.norm(g) ** 2 ).item()} | s: {s.item()}")

    lmd_star = torch.tensor(0.0)
    hist.append([loss.item(), (torch.linalg.norm(g) ** 2).item(), s.item(), lmd_star.item()])

    assert constr_precond != "none", "Constraint preconditioner cannot be Identity"

    # preconditioninig matrix
    Dk = diag_estimate_old(w, g, 100)
    MAX_ITER = 100


    for epoch in range(EPOCHS):

        for i, (batch_data, batch_target) in enumerate(train_dataloader):

            batch_data = batch_data.to(device)
            batch_target = batch_target.to(device)

            loss = loss_function(batch_data, batch_target)
            g, = torch.autograd.grad(loss, w, create_graph=True)
            f_grad = g.clone().detach()

            s_nil = s - lmd/(2*mu)
            t = loss.item() - s_nil

            if update_precond == "hutch":
                vk = diag_estimate_old(w, g, 1)
                # Smoothing and Truncation 
                Dk = beta * Dk + (1 - beta) * vk
                Dk_hat = torch.abs(Dk)
                Dk_hat[Dk_hat < alpha] = alpha
                Dk_hat_inv = 1 / Dk_hat 

                if constr_precond == "hutch":
                    gnorm_sq = (f_grad * Dk_hat_inv).dot(f_grad)
                    a = torch.dot(f_grad, Dk_hat_inv*f_grad).cpu().detach().numpy()

                    if slack_method == "none":
                        det = 1 - (2 * loss.item() / gnorm_sq )
                        if det <= 1e-40:
                            continue
                        else:
                            t = torch.sqrt(det)/det
                            root1 = -1 + t
                            root2 = -1 - t
                            lmd_star = torch.maximum(root1, root2)
                    elif slack_method == "L1":
                        AA = 1
                        BB = 2 + mu * a - 2 * mu * t
                        CC = 1 + 2 * mu * a - 4 * mu * t
                        DD = -2 * mu * t

                        roots = solve(AA, BB, CC, DD)
                        try:
                            roots = torch.from_numpy(roots)      
                        except TypeError as err:
                            print(roots, err)
                            continue
                        lmd_star = torch.relu(torch.max(roots))

                        s = torch.relu(s - ((lmd - lmd_star)/2 * mu)).item()
                    elif slack_method == "L2":
                        AA = 1
                        BB = 2 + delta * a - 2 * delta * t
                        CC = 1 + 2 * delta * a - 4 * delta * t
                        DD = -2 * delta * t

                        roots = solve(AA, BB, CC, DD)
                        try:
                            roots = torch.from_numpy(roots)      
                        except TypeError as err:
                            print(roots, err)
                            continue
                        lmd_star = torch.relu(torch.max(roots))

                        s = (1/delta) * (mu * s + (lmd_star/2)).item()

                    precond = lmd_star/(1 + lmd_star) * Dk_hat_inv    
                    with torch.no_grad():
                        w.sub_(precond  * f_grad)

                        

                elif constr_precond == "sr1":
                    sk = f_grad.clone()
                    yk = torch.autograd.grad(g, w, grad_outputs=sk, retain_graph=True)[0]
                    gnorm = (g * Dk_hat_inv).dot(g)

                    if gnorm < 1e-25:
                        continue

                    D_inv = torch.diagflat(Dk_hat_inv.clone().detach())

                    D_inv_B = (((Dk_hat_inv * yk).reshape(-1, 1) @ yk.reshape(1, -1)) / (yk.dot(sk)))
                    D_inv_B_D_inv = D_inv_B * Dk_hat_inv

                    a_torch = torch.dot(f_grad, Dk_hat_inv*f_grad)
                    a = a_torch.cpu().detach().numpy()

                    b_torch = torch.dot(f_grad, D_inv_B_D_inv@f_grad)
                    b = b_torch.cpu().detach().numpy() 

                    c_torch = torch.trace(D_inv_B)
                    c = c_torch.cpu().detach().numpy()

                    if slack_method == "L1":
                        d = a + (1 / 2 * mu)
                    elif slack_method == "L2":
                        d = a + 1/(2 * delta)

                    if slack_method == "none":

                        AA = 2 * c**2 * ( torch.dot(f_grad, (D_inv - (1/c) * D_inv_B_D_inv ) @ f_grad) )
                        BB = c_torch * ( torch.dot(f_grad, (4 * D_inv - (3/c) * D_inv_B_D_inv) @ f_grad) )
                        CC = 2 * a - 4 * c * loss.item()
                        DD = - 2 * loss.item()

                        BB = BB / AA
                        CC = CC / AA
                        DD = DD / AA
                        AA = 1.0
                        
                        def lagr(lmd):
                            b = lambda lmd: D_inv - (lmd * (D_inv_B_D_inv) / (1 + lmd * c_torch)) 
                            return lmd * loss -  (1/2)*lmd**2 * torch.dot(f_grad, b(lmd)@f_grad)

                        lmd_star_old = lmd_star
                        lmds = solve(AA, BB, CC, DD)
                        try:
                            lmds = torch.from_numpy(lmds).to(device)
                        except TypeError as err:
                            print(lmds, err)
                            continue
                        lmd_max = torch.max(lmds)
                        lmd_min = torch.maximum(torch.min(lmds), torch.tensor(0))
                        lmd_star = lmd_max
                        if lagr(lmd_max) < lagr(lmd_min):
                            lmd_star = lmd_min

                        if lmd_star > 1e8:
                            print(f"lmd_star is: {lmd_star}")
                            lmd_star = lmd_star_old
                        
                        precond = lmd_star * ( D_inv - (lmd_star * (D_inv_B_D_inv) / (1 + lmd_star * c_torch)) )

                        with torch.no_grad():
                            w.sub_(precond @ f_grad)

                    else:
                        # AA = 2 * d * c**2 - 2 * b * c
                        # BB = 4 * d * c - 3 * b - 2 * t * c**2
                        AA = 2 * c_torch * ((c/2*mu) + torch.dot(f_grad, (c_torch * D_inv - D_inv_B_D_inv) @ f_grad))
                        BB = ( 2 * c_torch * (1 - t * c * mu) / mu ) + torch.dot(f_grad, (4 * c_torch * D_inv - 3 * D_inv_B_D_inv) @ f_grad) 
                        CC = 2 * d - 4 * t * c
                        DD = -2 * t

                        BB = BB / AA
                        CC = CC / AA
                        DD = DD / AA
                        AA = torch.tensor(1.0)

                        lmds = solve(AA, BB, CC, DD)
                        try:
                            lmds = torch.from_numpy(lmds).to(device)
                        except TypeError as err:
                            print(lmds, err)
                            continue
                        lmd_max = torch.max(lmds)
                        lmd_min = torch.maximum(torch.min(lmds), torch.tensor(0))
                        lmd_star = lmd_max

                        if lmd_star > 1e6:
                            print(f"lmd_star is: {lmd_star}")
                            pass
                        else:
                            precond = lmd_star * ( D_inv - (lmd_star * (D_inv_B_D_inv) / (1 + lmd_star * c_torch)) )
                            if slack_method == "L1":
                                s = torch.maximum(torch.tensor(0.0), s - 1/(2*mu)*(lmd - lmd_star)).item()
                            elif slack_method == "L2":
                                s = (1/delta * (mu * s + lmd_star/2)).item()

                        with torch.no_grad():
                            w.sub_(precond  @ f_grad)


            if update_precond == "sr1":

                if constr_precond == "hutch":

                    vk = diag_estimate_old(w, g, 1)
                    # Smoothing and Truncation 
                    Dk = beta * Dk + (1 - beta) * vk
                    Dk_hat = torch.abs(Dk)
                    Dk_hat[Dk_hat < alpha] = alpha
                    Dk_hat_inv = 1 / Dk_hat

                    sk = f_grad.clone()
                    yk = torch.autograd.grad(g, w, grad_outputs=sk, retain_graph=True)[0]
                    gnorm_sq = (g * Dk_hat_inv).dot(g)

                    D_inv = torch.diagflat(Dk_hat_inv.clone().detach())

                    D_inv_B = (((Dk_hat_inv * yk).reshape(-1, 1) @ yk.reshape(1, -1)) / (yk.dot(sk)))
                    D_inv_B_D_inv = D_inv_B * Dk_hat_inv

                    a_torch = torch.dot(f_grad, Dk_hat_inv*f_grad)
                    a = a_torch.cpu().detach().numpy()

                    b_torch = torch.dot(f_grad, D_inv_B_D_inv@f_grad)
                    b = b_torch.cpu().detach().numpy() 

                    c_torch = torch.trace(D_inv_B)
                    c = c_torch.cpu().detach().numpy()
                    
                    d = torch.dot(f_grad, (D_inv_B @ D_inv_B_D_inv) @ f_grad)
                    
                    det = d / (a - 2 * loss.item())
                    if det < 0.0:
                        print(det)
                        continue

                    root_1 = -c + torch.sqrt(det)
                    root_2 = -c - torch.sqrt(det)
                    lmd_star = torch.maximum(torch.tensor(0.0), torch.maximum(root_1, root_2))
                    
                    if lmd_star > 1e8:
                        print(f"lmd_star is: {lmd_star}")
                        continue

                    precond = D_inv - (D_inv_B_D_inv / (lmd_star + c_torch))

                    with torch.no_grad():
                        w.sub_(precond @ f_grad)

                # CG is here
                elif constr_precond == "sr1":
                    hgp = torch.zeros_like(w) # s = H_inv * grad
                    r = f_grad.clone()
                    p = r.detach().clone()

                    for cg_step in range(MAX_ITER):
                        hvp = torch.autograd.grad(g, w, grad_outputs=p, retain_graph=True)[0]
                        alpha_k = torch.dot(r, r) / torch.dot(p, hvp)
                        hgp = hgp + alpha_k * p
                        r_prev = r.clone()
                        r = r - alpha_k * hvp
                        if torch.norm(r) < 1e-10:
                            Ax = torch.autograd.grad(g, w, grad_outputs=hgp, retain_graph=True)[0]    
                            diff = torch.norm(Ax - f_grad)
                            print(f"Took {cg_step} to reach diff={diff}")
                            break

                        beta_k = torch.dot(r, r) / torch.dot(r_prev, r_prev)
                        p = r + beta_k * p
                    
                    gnorm_sq = torch.dot(f_grad, hgp)
                    if slack_method == "none":
                        det = (-gnorm_sq) / (2 * loss.item() - gnorm_sq)
                        if det < 0.0:
                            print(f"det: {det}")
                            continue
                        root_1 = -1 + torch.sqrt(det)
                        root_2 = -1 - torch.sqrt(det)
                        lmd_star = torch.maximum(torch.tensor(0.0), torch.maximum(root_1, root_2))
                        step = lmd_star / (1 + lmd_star) * hgp
                        with torch.no_grad():
                            w.sub_(step)
                    else:
                        if slack_method == "L1":
                            AA = 1
                            BB = 2 + mu * gnorm_sq - 2 * mu * t
                            CC = 1 + 2 * mu * gnorm_sq - 4 * mu * t
                            DD = -2 * mu * t
                            
                            roots = solve(AA, BB, CC, DD)
                            try:
                                roots = torch.from_numpy(roots)      
                            except TypeError as err:
                                print(roots, err)
                                continue
                            lmd_star = torch.relu(torch.max(roots))

                        elif slack_method == "L2":
                            AA = 1
                            BB = 2 + delta * gnorm_sq - 2 * delta * t
                            CC = 1 + 2 * delta * gnorm_sq - 4 * delta * t
                            DD = -2 * delta * t

                            roots = solve(AA, BB, CC, DD)
                            try:
                                roots = torch.from_numpy(roots)      
                            except TypeError as err:
                                print(roots, err)    
                                continue
                            lmd_star = torch.relu(torch.max(roots))

                        if lmd_star > 1e2:
                            print(f"lmd_star is: {lmd_star}")
                            continue
                    
                        if slack_method == "L1":
                            s = torch.relu(s - ((lmd - lmd_star)/2 * mu)).item()
                        elif slack_method == "L2":
                            s = (1/delta) * (mu * s + (lmd_star/2)).item()
                        precond = lmd_star/(1 + lmd_star) * hgp
                            
                        with torch.no_grad():
                            w.sub_(precond)


        loss = loss_function(train_data.to(device), train_target.to(device))
        g, = torch.autograd.grad(loss, w, create_graph=True)
        if epoch % 100 == 0:
            print(f"[{epoch}/{EPOCHS}] | Loss: {loss.item()} | GradNorm^2: {(torch.linalg.norm(g) ** 2 ).item()} | s: {s}")
        hist.append([loss.item(), (torch.linalg.norm(g) ** 2).item(), s, lmd_star.item()])

                

    return hist





In [7]:
def train_sp2plus(seed, train_data, train_target, train_dataloader, loss_class, EPOCHS, slack_method):
    torch.random.manual_seed(seed)

    # w = torch.zeros(train_data.shape[1], device=device).requires_grad_()
    w = torch.rand(train_data.shape[1], device=device).requires_grad_()
    s = torch.tensor(0.0)


    loss_function = loss_class(w)
    # save loss and grad size to history
    hist = []
    loss = loss_function(train_data.to(device), train_target.to(device))
    g, = torch.autograd.grad(loss, w, create_graph=True)
    print(f"Loss: {loss.item()} | GradNorm^2: {(torch.linalg.norm(g) ** 2 ).item()} | s: {s.item()}")
    hist.append([loss.item(), (torch.linalg.norm(g) ** 2).item(), s.item()])
        
    for epoch in range(EPOCHS):

        for i, (batch_data, batch_target) in enumerate(train_dataloader):

            batch_data = batch_data.to(device)
            batch_target = batch_target.to(device)

            loss = loss_function(batch_data, batch_target)
            g, = torch.autograd.grad(loss, w, create_graph=True)
            f_grad = g.clone().detach()
            gnorm_sq = torch.norm(f_grad)**2

            loss_closure = lambda w: loss_function(batch_data, batch_target)
            hgp = torch.autograd.functional.hvp(loss_closure, w, g, create_graph=True)[1]

            if slack_method == "none":
                with torch.no_grad():
                    sps_step = loss.item() / gnorm_sq
                    w.sub_(sps_step * f_grad)
                    gdiffHgp = torch.sub(f_grad, hgp, alpha=sps_step)
                    if torch.norm(gdiffHgp)**2 > 1e-10:
                            w.sub_(0.5 * (sps_step**2) * gdiffHgp * torch.dot(f_grad, gdiffHgp)/ (torch.norm(gdiffHgp)**2))


            elif slack_method == "L1":
                Gamma3 = torch.relu(loss.item() - (s - (lmd / (2 * (1 - lmd) ))))
                Gamma3 /= 1 + gnorm_sq

                Gamma4 = torch.minimum(Gamma3, loss.item() / gnorm_sq)

                Lambda1 = loss.item() - Gamma4 * gnorm_sq + 0.5 * Gamma4 * Gamma4 * torch.dot(hgp, f_grad)

                Gamma5 = torch.relu(Lambda1 - (s - lmd / (2 * (1 - lmd))))
                t_norm_sq = 1 + torch.norm( f_grad - Gamma4 * hgp  ) ** 2
                Gamma5 /= t_norm_sq

                Gamma6 = torch.minimum(Gamma5, Lambda1 / t_norm_sq )
                sps_step = (Gamma4 + Gamma6)
                ps2_step = Gamma6 * Gamma4
                with torch.no_grad():                
                    w.sub_(sps_step * f_grad)
                    if torch.norm(ps2_step)**2 > 1e-10:
                            w.add_(ps2_step * hgp)

                s = torch.relu( torch.relu(s - (lmd / (2 * (1 - lmd))) + Gamma3) - (lmd / (2 * (1 - lmd))) + Gamma5)

            elif slack_method == "L2":
                Gamma1 = torch.relu( loss.item() - (1 - lmd) * s ) / (1 - lmd + gnorm_sq)
                t1 = loss.item() - Gamma1 * gnorm_sq - (1 - lmd)**2 * (s + Gamma1) + 0.5 * Gamma1 * Gamma1 * torch.dot(hgp, f_grad)
                t2 = 1 - lmd + torch.norm(f_grad - Gamma1 * hgp)**2
                Gamma2 = torch.relu(t1 / t2)
                sps_step = Gamma1 + Gamma2
                ps2_step = Gamma2 * Gamma1
                
                with torch.no_grad():                
                    w.sub_(sps_step * f_grad)
                    if torch.norm(ps2_step)**2 > 1e-10:
                        w.add_(ps2_step * hgp)


                s = (1 - lmd) * ((1 - lmd) * (s + Gamma1) + Gamma2)


        loss = loss_function(train_data.to(device), train_target.to(device))
        g, = torch.autograd.grad(loss, w, create_graph=True)
        if epoch % 100 == 0:
            print(f"[{epoch} / {EPOCHS}] | Loss: {loss.item()} | GradNorm^2: {(torch.linalg.norm(g) ** 2 ).item()} | s: {s.item()}")

        hist.append([loss.item(), (torch.linalg.norm(g) ** 2).item(), s.item()])

    return hist




In [8]:
def main(dataset_name, batch_size, percentage, scale_k, loss_name, optimizer_class, lr, preconditioner, 
        slack_method, lmd, mu, seed, save):
    
 
    torch.random.manual_seed(seed)
    
    # Setup
    uninterpolatible = False

    # training 
    EPOCHS = 500
    loss_class = get_loss(loss_name)
    train_data, train_target = get_dataset(dataset_name, batch_size, percentage, scale_k)

    X = train_data
    Y = train_target
    if uninterpolatible:
        X = torch.cat((train_data, train_data, train_data))
        Y = torch.cat((train_target, train_target, -train_target))
        
    train_data = X.to(torch.get_default_dtype())
    train_target = Y.to(torch.get_default_dtype())
    train_load = data_utils.TensorDataset(train_data, train_target)
    train_dataloader = DataLoader(train_load, batch_size=batch_size, shuffle=True)
    

    if optimizer_class == "psps":
        result = train(
            seed,
            loss_class,
            train_data, 
            train_target, 
            batch_size,
            EPOCHS,
            PSPS,
            preconditioner=preconditioner,
            slack_method=slack_method,
            lmd=lmd,
            mu=mu
        ) 
    elif optimizer_class.split("_")[0] == "psps2":
        precond = optimizer_class.split("_")[1]
        if precond == "DD":
            update_precond, constr_precond = "hutch", "hutch"
        elif precond == "BB":
            update_precond, constr_precond = "sr1", "sr1"
        elif precond == "DB":
            update_precond, constr_precond = "hutch", "sr1"

        result = train_psps2(seed, train_data, train_target, train_dataloader, loss_class, EPOCHS, update_precond, 
                                   constr_precond, slack_method)
        
    elif optimizer_class == "sp2plus":
        result = train_sp2plus(seed, train_data, train_target, train_dataloader, loss_class, EPOCHS, slack_method)


    if save:
        results_path = os.getenv("RESULTS_DIR")
        directory = f"{results_path}/{dataset_name}/percent_{percentage}/scale_{scale_k}/bs_{batch_size}" \
        f"/epochs_{EPOCHS}/{loss_name}/{optimizer_class}/lr_{lr}/precond_{preconditioner}/slack_{slack_method}/lmd_{lmd}/mu_{mu}/seed_{seed}"
        print(directory)
        if not os.path.exists(directory):
            os.makedirs(directory)
            
        torch.save([x[0] for x in result], f"{directory}/loss")
        torch.save([x[1] for x in result], f"{directory}/grad_norm_sq")
        
        if optimizer_class in ("psps", "psps2", "sp2plus"):
            torch.save([x[2] for x in result], f"{directory}/slack")

In [9]:
optimizer_class = "psps"
batch_size = 8
dataset_name = "colon-cancer"
percentage = 1.0
loss_name = "logreg"

lr = 0.1
save = True

for scale_k in [0, 3, 5]:
    for seed in [0, 1, 2, 3, 4, 5]:
        for preconditioner in ["none", "hutch"]:
            for slack_method in ["none", "L1", "L2"]:
                print(seed, preconditioner, slack_method)
                main(dataset_name, batch_size, percentage, scale_k, loss_name, optimizer_class, lr, preconditioner, slack_method, lmd, mu, seed, save)

0 none none
/home/farshed.abdukhakimov/projects/sps2/datasets
[0] / [500] | Loss: 0.27302133996016886 | GradNorm^2: 2.061224676812145 | s:0.0 | step_size: 0.008249932565568118
[100] / [500] | Loss: 2.9471436508090984e-06 | GradNorm^2: 1.6262220019880394e-08 | s:0.0 | step_size: 269.3821383141328
[200] / [500] | Loss: 6.997347245184873e-09 | GradNorm^2: 2.541839157708763e-15 | s:0.0 | step_size: 97280.0146994915
[300] / [500] | Loss: 6.222438705005202e-09 | GradNorm^2: 1.1600618949455627e-15 | s:0.0 | step_size: 118113.90484568503
[400] / [500] | Loss: 6.222438705005202e-09 | GradNorm^2: 1.1600618949455627e-15 | s:0.0 | step_size: 118113.90484568503
/home/farshed.abdukhakimov/projects/sps2/results/colon-cancer/percent_1.0/scale_0/bs_8/epochs_500/logreg/psps/lr_0.1/precond_none/slack_none/lmd_0.01/mu_0.1/seed_0
0 none L1
/home/farshed.abdukhakimov/projects/sps2/datasets
[0] / [500] | Loss: 0.2621804899927087 | GradNorm^2: 1.480419336953697 | s:0.02331556237957834 | step_size: 0.013135793

  torch.tensor(s - self.lmd + step_size_temp))
  torch.tensor(0.0), torch.tensor(loss - s + self.lmd) ) / (1 + gnorm_square)


[100] / [500] | Loss: 0.006551389441575352 | GradNorm^2: 0.0007823727236634557 | s:0.00641627673220799 | step_size: 0.011937592821946029
[200] / [500] | Loss: 0.0034394359290412005 | GradNorm^2: 0.00019484767348011141 | s:0.0036247687456241132 | step_size: 0.010892360169631451
[300] / [500] | Loss: 0.002356005668210686 | GradNorm^2: 9.071120721354876e-05 | s:0.0014641010524859693 | step_size: 0.00838493491544598
[400] / [500] | Loss: 0.0017972884983108199 | GradNorm^2: 5.199870994275943e-05 | s:0.0005925268529583866 | step_size: 0.00761544796547419
/home/farshed.abdukhakimov/projects/sps2/results/colon-cancer/percent_1.0/scale_0/bs_8/epochs_500/logreg/psps/lr_0.1/precond_none/slack_L1/lmd_0.01/mu_0.1/seed_0
0 none L2
/home/farshed.abdukhakimov/projects/sps2/datasets
[0] / [500] | Loss: 0.28224632881745804 | GradNorm^2: 0.9996666076072585 | s:0.07074917477964138 | step_size: 0.009130802025244016


  group["step_size"] = torch.max(torch.tensor(0.0), torch.tensor(loss - self.lmd_hat * s)) / (gnorm_square + self.lmd_hat)


[100] / [500] | Loss: 0.05072972399347931 | GradNorm^2: 0.07666423804679856 | s:0.06944133080815057 | step_size: 5.3614413831561926e-05
[200] / [500] | Loss: 0.03472656249152889 | GradNorm^2: 0.034771140681258805 | s:0.04625667253857985 | step_size: 0.0
[300] / [500] | Loss: 0.027734084431663336 | GradNorm^2: 0.024353334598783753 | s:0.04191407496158966 | step_size: 0.0
[400] / [500] | Loss: 0.023643384121264037 | GradNorm^2: 0.017436056639133758 | s:0.03031179292909648 | step_size: 0.0
/home/farshed.abdukhakimov/projects/sps2/results/colon-cancer/percent_1.0/scale_0/bs_8/epochs_500/logreg/psps/lr_0.1/precond_none/slack_L2/lmd_0.01/mu_0.1/seed_0
0 hutch none
/home/farshed.abdukhakimov/projects/sps2/datasets
[0] / [500] | Loss: 0.27656652976973345 | GradNorm^2: 1.5287028354932282 | s:0.0 | step_size: 0.001379008310932659
[100] / [500] | Loss: 0.00027095845328029694 | GradNorm^2: 0.00012818411168323231 | s:0.0 | step_size: 0.010252229417600147
[200] / [500] | Loss: 2.552335277753336e-09 

In [10]:
optimizer_class = "psps"
batch_size = 8
dataset_name = "colon-cancer"
percentage = 1.0
loss_name = "nllsq"

lr = 0.1
save = True

for scale_k in [0, 3, 5]:
    for seed in [0, 1, 2, 3, 4, 5]:
        for preconditioner in ["none", "hutch"]:
            for slack_method in ["none", "L1", "L2"]:
                print(seed, preconditioner, slack_method)
                main(dataset_name, batch_size, percentage, scale_k, loss_name, optimizer_class, lr, preconditioner, slack_method, lmd, mu, seed, save)

0 none none
/home/farshed.abdukhakimov/projects/sps2/datasets
[0] / [500] | Loss: 1.1055083357768114 | GradNorm^2: 1.942117387766138 | s:0.0 | step_size: 0.2724461031133273
[100] / [500] | Loss: nan | GradNorm^2: nan | s:0.0 | step_size: nan
[200] / [500] | Loss: nan | GradNorm^2: nan | s:0.0 | step_size: nan
[300] / [500] | Loss: nan | GradNorm^2: nan | s:0.0 | step_size: nan
[400] / [500] | Loss: nan | GradNorm^2: nan | s:0.0 | step_size: nan
/home/farshed.abdukhakimov/projects/sps2/results/colon-cancer/percent_1.0/scale_0/bs_8/epochs_500/nllsq/psps/lr_0.1/precond_none/slack_none/lmd_0.01/mu_0.1/seed_0
0 none L1
/home/farshed.abdukhakimov/projects/sps2/datasets
[0] / [500] | Loss: 1.1100206945087774 | GradNorm^2: 1.3595584469320934 | s:1.0345021923187574 | step_size: 0.48825641524674496
[100] / [500] | Loss: 0.7939912433495593 | GradNorm^2: 0.012196825747392824 | s:1.839177682377566 | step_size: 0.0
[200] / [500] | Loss: 0.7911131939682673 | GradNorm^2: 0.00013464518793721743 | s:1.3

In [11]:
optimizer_class = "sp2plus"
batch_size = 8
dataset_name = "colon-cancer"
percentage = 1.0
loss_name = "logreg"

lr = 0.1
save = True

for scale_k in [0, 3, 5]:
    for seed in [0, 1, 2, 3, 4]:
        for preconditioner in ["none"]:
            for slack_method in ["none", "L1", "L2"]:
                print(seed, preconditioner, slack_method)
                main(dataset_name, batch_size, percentage, scale_k, loss_name, optimizer_class, lr, preconditioner, slack_method, lmd, mu, seed, save)

0 none none
/home/farshed.abdukhakimov/projects/sps2/datasets
Loss: 12.018237610142496 | GradNorm^2: 37.89289270558744 | s: 0.0
[0 / 500] | Loss: 3.2960787582015394 | GradNorm^2: 8.0912958835967 | s: 0.0
[100 / 500] | Loss: 0.0 | GradNorm^2: 8.044777759214729e-33 | s: 0.0
[200 / 500] | Loss: 0.0 | GradNorm^2: 8.044777759214729e-33 | s: 0.0
[300 / 500] | Loss: 0.0 | GradNorm^2: 8.044777759214729e-33 | s: 0.0
[400 / 500] | Loss: 0.0 | GradNorm^2: 8.044777759214729e-33 | s: 0.0
/home/farshed.abdukhakimov/projects/sps2/results/colon-cancer/percent_1.0/scale_0/bs_8/epochs_500/logreg/sp2plus/lr_0.1/precond_none/slack_none/lmd_0.01/mu_0.1/seed_0
0 none L1
/home/farshed.abdukhakimov/projects/sps2/datasets
Loss: 12.018237610142497 | GradNorm^2: 37.892892705587435 | s: 0.0
[0 / 500] | Loss: 2.3753639630776444 | GradNorm^2: 7.132999212742506 | s: 0.6048532806460634
[100 / 500] | Loss: 0.003971966653926006 | GradNorm^2: 0.0004984449768667383 | s: 0.0
[200 / 500] | Loss: 0.002302055907082842 | Grad

In [12]:
optimizer_class = "sp2plus"
batch_size = 8
dataset_name = "colon-cancer"
percentage = 1.0
loss_name = "nllsq"

lr = 0.1
save = True

for scale_k in [0, 3, 5]:
    for seed in [0, 1, 2, 3, 4]:
        for preconditioner in ["none"]:
            for slack_method in ["none", "L1", "L2"]:
                print(scale_k, seed, preconditioner, slack_method)
                main(dataset_name, batch_size, percentage, scale_k, loss_name, optimizer_class, lr, preconditioner, slack_method, lmd, mu, seed, save)

0 0 none none
/home/farshed.abdukhakimov/projects/sps2/datasets
Loss: 1.9726952421658364 | GradNorm^2: 0.4760072523691324 | s: 0.0
[0 / 500] | Loss: nan | GradNorm^2: nan | s: 0.0
[100 / 500] | Loss: nan | GradNorm^2: nan | s: 0.0
[200 / 500] | Loss: nan | GradNorm^2: nan | s: 0.0
[300 / 500] | Loss: nan | GradNorm^2: nan | s: 0.0
[400 / 500] | Loss: nan | GradNorm^2: nan | s: 0.0
/home/farshed.abdukhakimov/projects/sps2/results/colon-cancer/percent_1.0/scale_0/bs_8/epochs_500/nllsq/sp2plus/lr_0.1/precond_none/slack_none/lmd_0.01/mu_0.1/seed_0
0 0 none L1
/home/farshed.abdukhakimov/projects/sps2/datasets
Loss: 1.9726952421658364 | GradNorm^2: 0.4760072523691324 | s: 0.0
[0 / 500] | Loss: 2.109682052176463 | GradNorm^2: 0.0075151552847513995 | s: 3.886177858647136
[100 / 500] | Loss: 2.0724683431395174 | GradNorm^2: 0.5680500333486196 | s: 3.132753184852634
[200 / 500] | Loss: 1.8074153850380135 | GradNorm^2: 0.003293504759939361 | s: 2.765367162299196
[300 / 500] | Loss: 1.750435843670

In [13]:
optimizer_class = "psps2_DD"
batch_size = 8
dataset_name = "colon-cancer"
percentage = 1.0
loss_name = "logreg"

lr = 0.1
save = True

for scale_k in [0, 3, 5]:
    for seed in [0, 1, 2, 3, 4]:
        for preconditioner in ["none"]:
            for slack_method in ["none", "L1", "L2"]:
                print(scale_k, seed, preconditioner, slack_method)
                main(dataset_name, batch_size, percentage, scale_k, loss_name, optimizer_class, lr, preconditioner, slack_method, lmd, mu, seed, save)

0 0 none none
/home/farshed.abdukhakimov/projects/sps2/datasets
Loss: 12.018237610142497 | GradNorm^2: 37.89289270558746 | s: 0.0
[0/500] | Loss: 2.8001893992359714 | GradNorm^2: 7.741772814670385 | s: 0.0
[100/500] | Loss: 4.8228854278128245e-06 | GradNorm^2: 9.244134736949503e-10 | s: 0.0
[200/500] | Loss: 2.7051361983961043e-06 | GradNorm^2: 7.376023347088342e-10 | s: 0.0
[300/500] | Loss: 1.3460916473452693e-06 | GradNorm^2: 7.604764072817095e-11 | s: 0.0
[400/500] | Loss: 7.944794091741808e-07 | GradNorm^2: 4.9873152141434244e-11 | s: 0.0
/home/farshed.abdukhakimov/projects/sps2/results/colon-cancer/percent_1.0/scale_0/bs_8/epochs_500/logreg/psps2_DD/lr_0.1/precond_none/slack_none/lmd_0.01/mu_0.1/seed_0
0 0 none L1
/home/farshed.abdukhakimov/projects/sps2/datasets
Loss: 12.018237610142497 | GradNorm^2: 37.89289270558744 | s: 0.0
[0/500] | Loss: 3.6436882157368258 | GradNorm^2: 9.9606259595437 | s: 0.0
[100/500] | Loss: 2.5081969909768158e-05 | GradNorm^2: 1.4749495302353469e-08 | 

TypeError: expected np.ndarray (got NoneType)

In [None]:
optimizer_class = "psps2_DD"
batch_size = 8
dataset_name = "colon-cancer"
percentage = 1.0
loss_name = "nllsq"

lr = 0.1
save = True

for scale_k in [0, 3, 5]:
    for seed in [0, 1, 2, 3, 4]:
        for preconditioner in ["none"]:
            for slack_method in ["none", "L1", "L2"]:
                print(scale_k, seed, preconditioner, slack_method)
                main(dataset_name, batch_size, percentage, scale_k, loss_name, optimizer_class, lr, preconditioner, slack_method, lmd, mu, seed, save)

0 0 none
/home/farshed.abdukhakimov/projects/sps2/datasets
Loss: 0.5180695024209936 | GradNorm^2: 1.1608098403691494e-07 | s: 0.0
[0/500] | Loss: 0.5180695024209936 | GradNorm^2: 1.1608098403691494e-07 | s: 0.0
[100/500] | Loss: 0.5180695024209936 | GradNorm^2: 1.1608098403691494e-07 | s: 0.0
[200/500] | Loss: 0.5180695024209936 | GradNorm^2: 1.1608098403691494e-07 | s: 0.0
[300/500] | Loss: 0.5180695024209936 | GradNorm^2: 1.1608098403691494e-07 | s: 0.0
[400/500] | Loss: 0.5180695024209936 | GradNorm^2: 1.1608098403691494e-07 | s: 0.0
/home/farshed.abdukhakimov/projects/sps2/results/mushrooms/percent_1.0/scale_0/bs_64/epochs_500/nllsq/psps2_DD/lr_0.1/precond_none/slack_none/lmd_0.01/mu_0.1/seed_0
0 0 L1
/home/farshed.abdukhakimov/projects/sps2/datasets
Loss: 0.5180695024209936 | GradNorm^2: 1.1608098403691491e-07 | s: 0.0
[0/500] | Loss: 0.5179725438159356 | GradNorm^2: 1.3649229142632996e-11 | s: 0.3750210279161014
[100/500] | Loss: 0.5179715036241304 | GradNorm^2: 4.044630918227172

In [None]:
optimizer_class = "psps2_BB"
batch_size = 8
dataset_name = "colon-cancer"
percentage = 1.0
loss_name = "logreg"

lr = 0.1
save = True

for scale_k in [0, 3, 5]:
    for seed in [0, 1, 2, 3, 4]:
        for preconditioner in ["none"]:
            for slack_method in ["none", "L1", "L2"]:
                print(scale_k, seed, preconditioner, slack_method)
                main(dataset_name, batch_size, percentage, scale_k, loss_name, optimizer_class, lr, preconditioner, slack_method, lmd, mu, seed, save)

0 0 none
/home/farshed.abdukhakimov/projects/sps2/datasets
Loss: 6.057553132841753e-05 | GradNorm^2: 4.763586415281014e-08 | s: 0.0
Took 84 to reach diff=7.82389588028807e-11
det: -1.0003663872437585
Took 49 to reach diff=8.787699908325502e-11
det: -1.0002364043778726
Took 69 to reach diff=4.9775246885210425e-11
det: -1.000250118069328
Took 80 to reach diff=9.522458385841695e-11
det: -1.0002023266973978
det: -1.0001355197603838
Took 68 to reach diff=7.527209129898138e-11
det: -1.0002883266916462
Took 88 to reach diff=9.465156791417998e-11
det: -1.0002543060694593
Took 58 to reach diff=5.5926401161523853e-11
det: -1.0003566506644892
Took 77 to reach diff=6.963256151868026e-11
det: -1.000269874396387
det: -1.000234975672748
det: -1.0003046230026837
Took 71 to reach diff=7.437872233117948e-11
det: -1.0003870606064411
Took 94 to reach diff=8.669560026858463e-11
det: -1.0001867978327288
Took 75 to reach diff=5.687924928982815e-11
det: -1.0001816411290376
det: -1.0002179632569264
Took 75 to 

KeyboardInterrupt: 

In [None]:
optimizer_class = "psps2_BB"
batch_size = 8
dataset_name = "colon-cancer"
percentage = 1.0
loss_name = "nllsq"

lr = 0.1
save = True

for scale_k in [0, 3, 5]:
    for seed in [0, 1, 2, 3, 4]:
        for preconditioner in ["none"]:
            for slack_method in ["none", "L1", "L2"]:
                print(scale_k, seed, preconditioner, slack_method)
                main(dataset_name, batch_size, percentage, scale_k, loss_name, optimizer_class, lr, preconditioner, slack_method, lmd, mu, seed, save)

In [None]:
optimizer_class = "psps2_DB"
batch_size = 8
dataset_name = "colon-cancer"
percentage = 1.0
loss_name = "logreg"

lr = 0.1
save = True

for scale_k in [0, 3, 5]:
    for seed in [0, 1, 2, 3, 4]:
        for preconditioner in ["none"]:
            for slack_method in ["none", "L1", "L2"]:
                print(scale_k, seed, preconditioner, slack_method)
                main(dataset_name, batch_size, percentage, scale_k, loss_name, optimizer_class, lr, preconditioner, slack_method, lmd, mu, seed, save)

In [None]:
optimizer_class = "psps2_DB"
batch_size = 8
dataset_name = "colon-cancer"
percentage = 1.0
loss_name = "nllsq"

lr = 0.1
save = True

for scale_k in [0, 3, 5]:
    for seed in [0, 1, 2, 3, 4]:
        for preconditioner in ["none"]:
            for slack_method in ["none", "L1", "L2"]:
                print(scale_k, seed, preconditioner, slack_method)
                main(dataset_name, batch_size, percentage, scale_k, loss_name, optimizer_class, lr, preconditioner, slack_method, lmd, mu, seed, save)