In [1]:
import numpy as np
import torch
from hdmm.templates import TemplateStrategy
from hdmm import workload, matrix
import time

In [2]:
class McKennaConvex(TemplateStrategy):
    """
    A highly optimized GPU implementation of OPT_0.
    Requires memory for 6*n^2 4 byte floats

    For n = 2^14 = 16384, requires ~25 seconds per iteration

    The optimization variable is a dense n x n matrix X whose diagonal is
    pinned to 1.  The objective is sum(inv(X) * V), where V is the dense gram
    matrix of the workload, and the reported strategy is the transposed
    (lower) Cholesky factor of X.

    Parameters
    ----------
    n : int
        Domain size; X, V and every scratch buffer are n x n.
    device : str or torch.device, optional
        Device that holds all buffers.  Defaults to "cuda", which is what the
        class always used before this parameter existed.
    """
    def __init__(self, n, device="cuda"):
        self.n = n
        self._device = device
        # Pre-allocated scratch buffers (torch default dtype), reused on every
        # iteration so the hot loop performs no n x n allocations of its own.
        self.iX = torch.empty(n, n, device=device)    # inverse of the current X
        self.tmp = torch.empty(n, n, device=device)   # Cholesky factor / matmul scratch
        self.grad = torch.empty(n, n, device=device)  # negative gradient

    def strategy(self):
        """
        Return the optimized strategy as an EkteloMatrix.

        The strategy is the transpose of the lower Cholesky factor of the X
        found by optimize(); the factor is copied to the CPU first.

        Raises
        ------
        AttributeError
            If optimize() has not been run yet, so no X exists.
        """
        X = getattr(self, "X", None)
        if X is None:
            raise AttributeError(
                "McKennaConvex.strategy() called before optimize(); "
                "no solution X is available"
            )
        torch.cholesky(X, out=self.tmp)
        A = self.tmp.to("cpu").numpy()
        return matrix.EkteloMatrix(A.T)

    def _set_workload(self, W):
        """
        Store the workload and its dense gram matrix V on the target device.

        V is cast to the dtype of the scratch buffers.  Without the cast a
        workload whose gram matrix is float64 would not match the buffers
        passed as ``out=`` to the factorization/matmul calls, and _loss would
        report that failure as an infinite loss.
        """
        gram = W.gram().dense_matrix()
        self.V = torch.tensor(gram, dtype=self.iX.dtype, device=self._device)
        self.W = W

    def _loss(self, X):
        """
        Evaluate sum(inv(X) * V) for the candidate X.

        Side effects: overwrites self.tmp with the Cholesky factor of X and
        self.iX with inv(X); _ngrad() relies on that self.iX.

        Returns a 0-dim tensor.  If the factorization fails (X is not
        positive definite) the result is +inf so the line search rejects the
        step.
        """
        try:
            torch.cholesky(X, out=self.tmp)
            torch.cholesky_inverse(self.tmp, out=self.iX)
        except RuntimeError:
            # Only torch's own failure is treated as "infeasible point".
            # Anything else (notably KeyboardInterrupt during a long
            # factorization) must propagate instead of becoming an inf loss.
            return torch.tensor(np.inf, device=self._device)

        return torch.dot(self.iX.flatten(), self.V.flatten())

    def _ngrad(self):
        """
        Return the negative gradient, inv(X) @ V @ inv(X), with a zero diagonal.

        Must be called immediately after a successful _loss(), because it
        reuses the inverse left in self.iX.  The result lives in self.grad
        (the returned tensor IS that buffer) and self.tmp is clobbered.
        """
        # The gradient itself is G = -iX @ V @ iX; the sign is dropped here.
        torch.mm(self.iX, self.V, out=self.tmp)
        torch.mm(self.tmp, self.iX, out=self.grad)
        # The diagonal of X is held fixed at 1, so it never receives a step.
        self.grad.diagonal().zero_()
        return self.grad

    def optimize(self, W, iters=5000):
        """
        Run gradient descent with a backtracking line search on X.

        Parameters
        ----------
        W : workload
            Object providing gram().dense_matrix() and shape.
        iters : int
            NOTE(review): currently ignored -- the outer loop always runs 250
            iterations.  Left as is because honouring the default of 5000
            would change the running time of every existing call.

        Returns
        -------
        The loss (0-dim tensor) at the last evaluated X.  Also stores that X
        in self.X for strategy().
        """
        self._set_workload(W)

        # Initial point: symmetric square root of V (eigenvalues below 1e-10
        # are clamped to zero first, so sqrt never sees a negative number),
        # scaled by its largest diagonal entry and then given a unit diagonal.
        eig, P = torch.symeig(self.V, eigenvectors=True)
        eig[eig < 1e-10] = 0.0
        X = P @ torch.diag(torch.sqrt(eig)) @ torch.t(P)
        X /= torch.diag(X).max()
        X.diagonal().fill_(1.0)

        # Y keeps the point the current line search started from.
        Y = torch.empty_like(X)

        # have to implement the optimization loop manually :(

        loss = self._loss(X)

        max_outer = 250
        max_backtracks = 25

        beta = 0.25
        for it in range(max_outer):
            # Be optimistic: start each line search at 4x the last accepted step.
            beta *= 4.0
            t0 = time.time()
            curr_loss = loss
            # Scaled in place, so self.grad now holds the trial step itself.
            ngrad = self._ngrad().mul_(beta)
            # Norm of the initial trial step; not refreshed while backtracking.
            m = ngrad.norm()
            t1 = time.time()
            Y.copy_(X)
            for i in range(max_backtracks):
                torch.add(Y, ngrad, out=X)
                loss = self._loss(X)
                if curr_loss - loss >= 0.5*beta*m:
                    break
                # Step rejected: halve the step size and the step together.
                beta *= 0.5
                ngrad.mul_(0.5)
            # NOTE(review): if every backtrack is rejected, X stays at the
            # last (unaccepted) trial point and loss may be inf.
            t2 = time.time()
            # Columns: iteration, gradient time, line-search time, beta, sqrt(loss / rows of W).
            # NOTE(review): no explicit device synchronisation is done before
            # t1, so on an asynchronous device the first timing may under-report.
            print('%d, %.2f, %.2f, %.6f, %.4f' % (it, t1-t0, t2-t1, beta,torch.sqrt(loss/self.W.shape[0])))

        self.X = X

        return loss
              

In [None]:
# Domain size 2^14 = 16384; every matrix involved is n x n.
n = 1 << 14
# Prefix workload built in float32.  workload.AllRange(n) is an alternative
# workload that was tried in this cell.
W = workload.Prefix(n, dtype=np.float32)
temp = McKennaConvex(n)
temp.optimize(W)

# Materialise the optimized strategy as a dense array and save it, with the
# domain size encoded in the file name.
out_path = 'prefix-%d.npy' % n
A = temp.strategy().dense_matrix()
np.save(out_path, A)

0, 0.00, 24.75, 0.015625, 4.4598
1, 0.00, 25.52, 0.000122, 4.4593
2, 0.00, 23.69, 0.000488, 4.4574
3, 0.00, 23.82, 0.001953, 4.4502
4, 0.00, 24.05, 0.001953, 4.4433
5, 0.00, 24.21, 0.001953, 4.4367
6, 0.00, 23.98, 0.003906, 4.4242
7, 0.00, 23.88, 0.007812, 4.4016
8, 0.00, 25.13, 0.000122, 4.4010
9, 0.00, 23.87, 0.000488, 4.3998
