In [6]:
import torch
from torch import nn
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import pandas as pd

# I'm assuming samples want to maximize 0-score

In [2]:
def softmax(v, temp=0.02):
    sm = nn.Sigmoid()
    v = (v + 1) / 2
    return sm(v / temp) * 2 - 1 # bc -1 1 instead of 0 1

def f(X, theta):
    return X @ theta.T

def R(X, y, theta):
    return torch.maximum(1 - f(X, theta) * y.reshape(-1, 1), torch.zeros(X.shape[0], theta.shape[0]))

def alpha_reg(alpha):
    one_m = torch.ones(alpha.shape[1])
    # return 0.5 * torch.trace(alpha @ alpha.T) # - torch.sum(torch.minimum(torch.zeros(4, 2), alpha) * 1000)
    return 0.5 * (alpha @ one_m).T @ (alpha @ one_m)

def alpha_loss(X, theta, alpha):
    return - torch.trace(alpha @ f(X, theta).T) + alpha_reg(alpha)

def theta_reg(theta):
    return torch.trace(theta @ theta.T)

def memory(alphas, p=0.5):
    v = []
    m = []
    for i, a in enumerate(alphas):
        m.append(p ** (len(alphas) - i - 1))
        v.append(m[-1] * a)
    v = torch.sum(torch.stack(v), dim=0) / sum(m)
    return v

def cache_memory(alpha, mem, p=0.5):
    return (alpha + p * mem) / (1 + p)

# def memory(alphas, ip=2):
#     v = []
#     m = []
#     for i, a in enumerate(alphas):
#         m.append(ip ** i)
#         v.append(m[-1] * a)
#     v = torch.sum(torch.stack(v), dim=0) / sum(m)
#     return v

def theta_loss(X, y, alphas, theta, p=0.5, c=1):
    return torch.trace(memory(alphas, p=p) @ R(X, y, theta).T) + c * theta_reg(theta)

def theta_loss_mem(X, y, mem, theta, c=1):
    return torch.trace(mem @ R(X, y, theta).T) + c * theta_reg(theta)

In [3]:
def setup():
    theta = torch.tensor([[1, 0, 0], [0, 1, 0]], dtype=torch.float)
    X = torch.tensor([[1, 1, 1], [1, -1, 1], [-1, 1, 1], [-1, -1, 1]], dtype=torch.float)
    y = torch.tensor([1, -1, -1, -1], dtype=torch.float)
    p = 0
    c = 0
    return { 'theta' : theta,  'X' : X, 'y' : y, 'p' : p, 'c' : c }

def setup_2p():
    theta = torch.tensor([[1, 0, 0], [0, 1, 0]], dtype=torch.float)
    X = torch.tensor([[1, 1, 1], [1, 1, 1], [1, -1, 1], [-1, 1, 1], [-1, -1, 1]], dtype=torch.float)
    y = torch.tensor([1, 1, -1, -1, -1], dtype=torch.float)
    p = 0
    c = 1
    return { 'theta' : theta,  'X' : X, 'y' : y, 'p' : p, 'c' : c }


def setup_3():
    theta = torch.tensor([[-1, 1, 0]], dtype=torch.float)
    X = torch.tensor([[0, 0.5, 1], [1, 0, 1], [-1, 0, 1]], dtype=torch.float)
    y = torch.tensor([1, -1, -1], dtype=torch.float)
    p = 1
    c = 0
    return { 'theta' : theta,  'X' : X, 'y' : y, 'p' : p, 'c' : c }

In [4]:
# Iteratively run non-closed-form optimization
alphas = []
thetas = []
diffs = []
mems = []
alpha_losses = []
theta_losses = []
vals = setup_2p()
theta = vals['theta']
X = vals['X']
y = vals['y']
p = vals['p']
c = vals['c']
old_theta = theta.detach()

for e in (pbar := tqdm(range(20), position=0, leave=True)):
    diffs.append(torch.linalg.norm(old_theta - theta))
    pbar.set_description(f"Diff: {diffs[-1]}")
    alpha = torch.zeros(X.shape[0], theta.shape[0])
    for _ in range(50000):
        alpha = torch.maximum(torch.zeros(X.shape[0], theta.shape[0]), alpha).detach()
        alpha.requires_grad_(True)
        optimizer = torch.optim.Adam([alpha], lr=0.0001)
        optimizer.zero_grad()
        loss = alpha_loss(X, theta, alpha)
        loss.backward()
        optimizer.step()
    alphas.append(torch.maximum(torch.zeros(X.shape[0], theta.shape[0]), alpha).detach())
    alpha_losses.append(alpha_loss(X, theta, alpha).detach())
    if e == 0:
        mem = alpha
    mem = cache_memory(alpha, mem, p=p).detach()
    mems.append(mem)

#     print('ON:', old_theta, theta, torch.linalg.norm(old_theta - theta))
    old_theta = torch.clone(theta.detach())
    theta = torch.ones_like(theta)
    theta.requires_grad_(True)
    optimizer = torch.optim.Adam([theta], lr=0.005)
    for i in range(10000):
#         theta = torch.maximum(torch.ones(2, 3), theta).detach()
#         theta.requires_grad_(True)
#         optimizer = torch.optim.Adam([theta], lr=0.005)
        optimizer.zero_grad()
        loss = theta_loss_mem(X, y, mem, theta, c=c)
        loss.backward()
        optimizer.step()
    thetas.append(theta.detach())
    theta_losses.append(theta_loss(X, y, alphas, theta, p=p, c=c).detach())

ncf_alphas = alphas
ncf_thetas = thetas
ncf_diffs = diffs
ncf_alpha_losses = alpha_losses
ncf_theta_losses = theta_losses

  return 0.5 * (alpha @ one_m).T @ (alpha @ one_m)
Diff: 1.9242945909500122: 100%|████████████████████████████████████████████████████████| 20/20 [12:20<00:00, 37.02s/it]


In [None]:
def theta_grad(X, y, mem, theta, k, c):
    grad = 2 * c * theta[k]
    grad -= sum([mem[i,k] * y[i] * X[i].T for i in range(X.shape[0]) if X[i] @ theta[k].T * y[i] < 1])
    return grad

In [None]:
# Iteratively run explicit gradient step optimization
# Iteratively run closed-form optimization (NO HINGE LOSS)
vals = setup_3()
theta = vals['theta']
X = vals['X']
y = vals['y']
p = vals['p']
c = vals['c']
old_theta = theta.detach()
alphas = []
thetas = []
diffs = []
mems = []
f_x = []
alpha_losses = []
theta_losses = []
        

for e in (pbar := tqdm(range(20), position=0, leave=True)):
    diffs.append(torch.linalg.norm(old_theta - theta))
    pbar.set_description(f"Diff: {diffs[-1]}")
    alpha = torch.maximum(torch.zeros(X.shape[0], theta.shape[0]), f(X, theta))
    for i in range(X.shape[0]):
        if torch.sum(alpha[i]) > 1.5:
            alpha[i] *= 1.5 / torch.sum(alpha[i])
    alpha_losses.append(alpha_loss(X, theta, alpha).detach())
    alphas.append(alpha)
    if e == 0:
        mem = alpha
    print(alpha)

    old_theta = theta.detach().clone()
    theta = theta.detach().clone()
    mem = cache_memory(alpha, mem, p=p)
    mems.append(mem)
    for i in range(10_000):
        lr = 1 / (1 * (i + 1))
        theta_list = [theta_grad(X, y, mem, theta, k) for k in range(theta.shape[0])]
        theta -= lr * torch.stack(theta_list)
    thetas.append(theta.detach())
    theta_losses.append(theta_loss(X, y, alphas, theta, p=p, c=c).detach())
    f_x.append(f(X, theta))
    print(theta)

In [5]:
to_disp = [ncf_diffs, ncf_alphas, ncf_thetas, ncf_alpha_losses, ncf_theta_losses]
for i in range(20):
    # print(f'Time {i}: {[a[i].tolist() for a in to_disp]}')
    print('Time ', i)
    for a in to_disp:
        print(a[i].tolist())
    print('')

Time  0
0.0
[[0.49997058510780334, 0.49997058510780334], [0.49997058510780334, 0.49997058510780334], [1.0000535249710083, 0.0], [0.0, 1.0000535249710083], [0.0, 0.0]]
[[-0.0004241214192006737, 0.968101978302002, -0.00022413194528780878], [0.968101978302002, -0.0004241214192006737, -0.00022413194528780878]]
-2.0006003379821777
2.002035140991211

Time  1
1.9687914848327637
[[0.48377275466918945, 0.48377275466918945], [0.48377275466918945, 0.48377275466918945], [0.0, 0.9682483077049255], [0.9682483077049255, 0.0], [0.0, 0.0]]
[[0.9678969383239746, -0.0004513925814535469, -0.000251405785093084], [-0.0004513925814535469, 0.9678969383239746, -0.000251405785093084]]
-1.8741563558578491
1.9979382753372192

Time  2
1.936874508857727
[[0.483572781085968, 0.483572781085968], [0.483572781085968, 0.483572781085968], [0.9680482745170593, 0.0], [0.0, 0.9680482745170593], [0.0, 0.0]]
[[-0.0005513495998457074, 0.9675968885421753, -0.00035136277438141406], [0.9675968885421753, -0.0005513495998457074, -0

In [None]:
to_disp = [diffs, alphas, thetas, alpha_losses, theta_losses]
for i in range(20):
    # print(f'Time {i}: {[a[i].tolist() for a in to_disp]}')
    print('Time ', i)
    for a in to_disp:
        print(a[i].tolist())
    print('')