# Pytorch Correlsted Gaussian Wave function 

Performance optimizations testing

Optimize a wave function expanded in correlated gaussian basis functions.


## Integral formulas  (Matrix Elements)

### Skl (Overlap), Tkl (Kinetic Energy), Vkl (Potential Energy) Hkl (Hamiltonion)

 **py_matel**: returnes symmetry projected matrix elements in a
 basis of simple correlated gaussians phi_kl = exp[-r'(Lk*Ll' kron I3)r]

 * n:      the number of "psuedo" particles i.e. Center of mass translationl degrees of freedom are removed so n is N-1 if N is the number of "real" particles.
            
 * vechLk: nonlinear exponent parameters n(n+1)/2 x 1
 * vechLl:    These will form the lower triangle matrices Lk and Ll


 * Sym:     symmetry projection matrix for the term being computed
 * Mass:    mass matrix for kinetic energy (reduced masses of particles )
 * vecQ:    charge products for potential energy (elements q_i x q_j where q are particle charges)

In [1]:
import numpy as np
import torch as th

import time
import cProfile

In [65]:
dtype = th.float64

gpuid = 0
#device = th.device("cuda:"+ str(gpuid))
device = th.device("cpu")

print("Execution device: ",device)
print("PyTorch version: ", th.__version__ )
print("CUDA available: ", th.cuda.is_available())
print("CUDA version: ", th.version.cuda)
print("CUDA device:", th.cuda.get_device_name(gpuid))

Execution device:  cpu
PyTorch version:  0.4.1
CUDA available:  True
CUDA version:  9.0.176
CUDA device: TITAN V


In [109]:
# Utility functions

# return the lower triangle of A in column order i.e. vech(A)
def vech(A):
    count = 0
    c = A.shape[0]
    v = th.zeros(c * (c + 1) // 2,)
    for j in range(c):
        for i in range(j,c):
            v[count] = A[i,j]
            count += 1
    return th.tensor(v , device=device, dtype=dtype)

# vech2L   create lower triangular matrix L from vechA
def vech2L(v,n):
    count = 0
    L = th.zeros((n,n))
    for j in range(n):
        for i in range(j,n):
            L[i,j]=v[count]
            count += 1
    return th.tensor(L , device=device, dtype=dtype)

In [145]:
def py_matel(n, Ak, Al, detLk,detLl, Sym, Mass, Qmat):
    

    # build Lk and Ll

    #Lk = vech2L(vechLk,n);
    #Ll = vech2L(vechLl,n);
    
    # apply symmetry projection on Ll
    
    # th.t() is shorthand for th.transpose(X, 0,1)
    #PLl = th.t(Sym) @ Ll;
    
    # build Ak, Al, Akl, invAkl, invAk, invAl

    #Ak = Lk@th.t(Lk);
    #Al = PLl@th.t(PLl);
    Akl = Ak+Al;
    
    invAkl = th.inverse(Akl);
    #invAk  = th.inverse(Ak);
    #invAl  = th.inverse(Al);
    
    # Overlap: (normalized)
    skl = 2**(n*1.5) * th.sqrt( th.pow(detLk*detLl/th.det(Akl) ,3) );

    # kinetic energy

    #tkl = skl*(6*th.trace(Mass@Ak@invAkl@Al));    
    tkl = skl*(6*th.sum(Mass*(Ak@invAkl@Al)))
    
    # potential energy
    
    TWOoSqrtPI = 1.1283791670955126 # 2/sqrt(pi)
    
    RIJ = th.zeros((n,n), device=device, dtype=dtype);
    # 1/rij i~=j
    for j in range(0,n-1):
        for i in range(j+1,n):
            tmp2 = invAkl[i,i] + invAkl[j,j] - 2*invAkl[i,j];
            #RIJ[i,j] = TWOoSqrtPI * skl/th.sqrt(tmp2);
            RIJ[i,j] = th.rsqrt(tmp2)


    # 1/rij i=j
    for i in range(0,n):
        #RIJ[i,i] = TWOoSqrtPI * skl/th.sqrt(invAkl[i,i]);
        RIJ[i,i] = th.rsqrt(invAkl[i,i])
    
    RIJ = TWOoSqrtPI*skl*RIJ
    
    #Q = vech2L(vecQ,n);

    vkl = th.sum(RIJ*Qmat)

    #hkl = tkl + vkl
    
    # Gradient Terms
    
    #gradient with respect to vechLk vechLl
    #checkdsk = vech( 3/2 * skl * (th.diag(1/th.diag(Lk)) - 2*invAkl@Lk) )
    #checkdsl = vech( 3/2 * skl * (th.diag(1/th.diag(Ll)) - 2*Sym@invAkl@PLl) )
    
    #dsk = th.autograd.grad(skl, vechLk, retain_graph=True)  
    #dsl = th.autograd.grad(skl, vechLl, retain_graph=True)
    
    #dtk = th.autograd.grad(tkl, vechLk, retain_graph=True)
    #dtl = th.autograd.grad(tkl, vechLl, retain_graph=True)
    
    #dvk = th.autograd.grad(vkl, vechLk, retain_graph=True)
    #dvl = th.autograd.grad(vkl, vechLl, retain_graph=True)
    
    #dhk = dtk[0] + dvk[0]
    #dhl = dtl[0] + dvl[0]
    
    #chkdhk = th.autograd.grad(hkl, vechLk, retain_graph=True)
    #chkdhl = th.autograd.grad(hkl, vechLl)
    
    return {'skl':skl, 'tkl':tkl, 'vkl':vkl#, 'hkl':hkl,
            #'dsk':dsk, 'dsl':dsl, 
            #'checkdsk':checkdsk, 'checkdsl':checkdsl,
            #'dtk':dtk, 'dtl':dtl, 'dvk':dvk, 'dvl':dvl,
            #'dhk':dhk, "dhl":dhl,
            #'chkdhk':chkdhk, "chkdhl":chkdhl
           }


In [46]:
def test_matel():
    n = 3;
    vechLk = th.tensor([  1.00000039208682, 
              0.02548044275764261, 
              0.3525161612610669,
              1.6669144815242515,
              0.9630555318946559,
              1.8382882034659822 ], device=device, dtype=dtype, requires_grad=True);
    
    vechLl = th.tensor([  1.3353550436464964,
               0.9153272033682132,
               0.7958636766525028,
               1.8326931436447955,
               0.3450426931160630,
               1.8711839323167831 ], device=device, dtype=dtype, requires_grad=True);
    
    Sym = th.tensor([[0,0,1],
                    [0,1,0],
                    [1,0,0]], device=device, dtype=dtype);
    
    Mass = th.tensor([[5.446170e-4, 2.723085077e-4, 2.723085077e-4],
                     [2.723085077e-4, .5002723085, 2.723085077e-4],
                     [2.723085077e-4, 2.723085077e-4, .5002723085 ]], device=device, dtype=dtype);
    
    vecQ = th.tensor([1, -1, -1, -1, 1, -1], device=device, dtype=dtype);
    
    matels = py_matel(n, vechLk, vechLl, Sym, Mass, vecQ)
    
    print('skl: ',matels['skl'])
    print('tkl: ',matels['tkl'])
    print('vkl: ',matels['vkl'])
    #print('hkl: ',matels['hkl'])
    #print('dsk: ',matels['dsk'])
    #print('dsl: ',matels['dsl'])
    #print('checkdsk: ',matels['checkdsk'])
    #print('checkdsl: ',matels['checkdsl'])
    #print('dhk: ',matels['dhk'])
    #print('dhl: ',matels['dhl'])
    #print('dtk: ',matels['dtk'])
    #print('dtl: ',matels['dtl'])
    #print('dvk: ',matels['dvk'])
    #print('dvl: ',matels['dvl'])
    #print('dhk: ',matels['dhk'])
    #print('dhl: ',matels['dhl'])
    #print('chkdhk: ',matels['chkdhk'])
    #print('chkdhl: ',matels['chkdhl'])


In [47]:
start_time = time.time()
test_matel()
print(" took {} seconds ".format(time.time() - start_time))


skl:  tensor(0.5334, dtype=torch.float64, grad_fn=<MulBackward>)
tkl:  tensor(4.3509, dtype=torch.float64, grad_fn=<ThMulBackward>)
vkl:  tensor(-2.3840, dtype=torch.float64, grad_fn=<SumBackward0>)
 took 0.009138822555541992 seconds 


## Energy Calculation

 **energyrc**: returns the energy Rayleigh quotient c'Hc/c'Sc
 in a basis of simple correlated Gaussians. The minimum of hte Rayleigh quotent 
 is the minimum of the smallest eigenvalue of the matrix representation of the Schrodinger euqation,
 (H-eS)c = 0 It is simpiler to compute than the full set of eigenvalues and verctors. A good optimization will determine the linear coeffients c i.e the eigenvector of the enery e.

* x:		the optimization parameters
* n:		the number of pseudo particles (size of Lk)
* nb:		the number of basis functions

* the first n(n+1)/2 * nb elements of x are exponent parameters (vechLk)
* the last nb elements of x are the linear coeff's c

In [146]:
def py_energyrc(x,n,nb,Mass,Charge,Sym,symc):
    
    nx = len(x);
    nn = int(n*(n+1)/2);
    nsym = len(symc);
    
    # extract linear coefs "eigen vector"
    c = x[-nb:];
    # reshape non-linear variables for easier indexing
    X = th.reshape(x[:nb*nn], (nb,nn))
    #npX = X.detach().numpy()
    L = th.zeros((nb,n,n), device=device, dtype=dtype)
    for i in range(nb):
        #L[i][np.tril_indices(n)]=npX[i,:]
        L[i][:,:] = vech2L(X[i,:],n)
    #L = th.from_numpy(L)
    detL = th.abs(th.prod(th.diagonal(L, offset=0, dim1=-1, dim2=-2),1))
    A = th.matmul(L,th.transpose(L, 1, 2))
    
    # Build H and S
    H = th.zeros((nb,nb), device=device, dtype=dtype);
    S = th.zeros((nb,nb), device=device, dtype=dtype);
    T = th.zeros((nb,nb), device=device, dtype=dtype);
    V = th.zeros((nb,nb), device=device, dtype=dtype);
    #dS = th.zeros((nb,nb, nn), device=device, dtype=dtype);
    #dT = th.zeros((nb,nb, nn), device=device, dtype=dtype);
    #dV = th.zeros((nb,nb, nn), device=device, dtype=dtype);
    
    # outer loop is over symmetry terms
    for k in range(0,nsym):
        P = Sym[:,:,k]
        PA = th.matmul(th.matmul(P,th.transpose(A, 1, 2)), th.t(P))
        for j in range(0,nb):
            for i in range(j,nb):
                #idxi = i*nn;
                #idxj = j*nn;
                
                #vechLi = x[idxi:idxi+nn];
                #vechLj = x[idxj:idxj+nn];
                Ai = A[i,:,:]
                Aj = PA[j,:,:]
                detLi = detL[i]
                detLj = detL[j]
                
                matels = py_matel(n,Ai,Aj,detLi,detLj,Sym[:,:,k],Mass,Charge);
                
                S[i,j] += symc[k]*matels['skl'];
                T[i,j] += symc[k]*matels['tkl'];
                V[i,j] += symc[k]*matels['vkl'];
                #dS[i,j,:] += symc[k]*matels['dsk'][0]
                #dS[j,i,:] += symc[k]*matels['dsl'][0]
                #dT[i,j,:] += symc[k]*matels['dtk'][0]
                #dT[j,i,:] += symc[k]*matels['dtl'][0]
                #dV[i,j,:] += symc[k]*matels['dvk'][0]
                #dV[j,i,:] += symc[k]*matels['dvl'][0]
                
    H = T + V
    #dH = dT + dV
    
    # complete upper triangle of H and S
    for i in range(0,nb):
        for j in range(i+1,nb):
            H[i,j] = H[j,i];
            S[i,j] = S[j,i];
            
    # and the energy is:
    #c = x[-nb:];
    #cHc = th.t(c)@H@c;
    #cSc = th.t(c)@S@c;
    cHc = c@H@c;
    cSc = c@S@c;
    eng = cHc/cSc;
    #cc = th.ger(c,c)
    #C = 2*cc - th.diag(th.diag(cc))
    #G = dH - eng*dS
    #for i in range(nn):
    #    G[:,:,i] *= C
    #G = th.sum(G,0).view(-1)
    #return (eng, G)
    #print(th.autograd.grad(eng, x , retain_graph=True))
    return eng           

In [154]:
def test_energyrc():
        
    n=3;
    nb=8;
    
    Mass = th.tensor([[0.5, 0.0, 0.0],
                     [0.0, 0.5, 0.0],
                     [0.0, 0.0, 0.5]], device=device, dtype=dtype);
    
    Charge = th.tensor([-3, 1, 1, -3, 1, -3], device=device, dtype=dtype);
    Charge = vech2L(Charge,n)
    
    # symmetry projection terms
    Sym = th.zeros((3,3,6), device=device, dtype=dtype)
    # (1)(2)(3)
    Sym[:,:,0] = th.tensor([[1,0,0],[0,1,0],[0,0,1]], device=device, dtype=dtype);
    # (12)
    Sym[:,:,1] = th.tensor([[0,1,0],[1,0,0],[0,0,1]], device=device, dtype=dtype);
    # (13)
    Sym[:,:,2] = th.tensor([[0,0,1],[0,1,0],[1,0,0]], device=device, dtype=dtype);
    # (23)
    Sym[:,:,3] = th.tensor([[1,0,0],[0,0,1],[0,1,0]], device=device, dtype=dtype);
    # (123)
    Sym[:,:,4] = th.tensor([[0,1,0],[0,0,1],[1,0,0]], device=device, dtype=dtype);
    # (132)
    Sym[:,:,5] = th.tensor([[0,0,1],[1,0,0],[0,1,0]], device=device, dtype=dtype);

    # coeff's
    symc = th.tensor([4.0,4.0,-2.0,-2.0,-2.0,-2.0], device=device, dtype=dtype);

    
    xvechL=th.tensor([
     1.6210e+00,
    -2.1504e-01,
     9.0755e-01,
     9.7866e-01,
    -2.8418e-01,
    -3.5286e+00,
    -3.3045e+00,
    -4.5036e+00,
    -3.2116e-01,
    -7.1901e-02,
     1.5167e+00,
    -8.4489e-01,
    -2.1377e-01,
    -3.6127e-03,
    -5.3774e-03,
    -2.1263e+00,
    -2.5191e-01,
     2.1235e+00,
    -2.1396e-01,
    -1.4084e-03,
    -1.0092e-02,
     4.5349e+00,
     9.4837e-03,
     1.1225e+00,
    -2.1315e-01,
     5.8451e-02,
    -4.9410e-03,
     5.0853e+00,
     7.3332e-01,
     5.0672e+00,
    -2.1589e-01,
    -6.8986e-03,
    -1.4310e-02,
     1.5979e+00,
     3.3946e-02,
    -8.7965e-01,
    -1.1121e+00,
    -2.1903e-03,
    -4.6925e-02,
     2.1457e-01,
     3.3045e-03,
     4.5120e+00,
    -2.1423e-01,
    -1.6493e-02,
    -2.3429e-03,
    -8.6715e-01,
    -6.7070e-02,
     1.5998e+00
     ], device=device, dtype=dtype, requires_grad=False)
    
    evec = th.tensor([
      -6.0460e-02,
       7.7708e-05,
       1.6152e+00,
       9.5443e-01,
       1.1771e-01,
       3.2196e+00,
       9.6344e-01,
       3.1398e+00
    ], device=device, dtype=dtype, requires_grad=False)
    
    #x1 = th.tensor(th.cat((xvechL,evec)), device=device, dtype=dtype, requires_grad=True)
    n=3;
    nb=96;
    th.manual_seed(42)
    #x1 = th.randn(int(nb*n*(n+1)/2 + nb) , device=device, dtype=dtype, requires_grad=True)
    x1 = xrestart
    #print(x1)
    #energy, G = py_energyrc(x1,n,nb,Mass,Charge,Sym,symc)
    #energy = py_energyrc(x1,n,nb,Mass,Charge,Sym,symc)
    #print(energy)
    #print(th.autograd.grad(energy, x1))
    
    #optimizer = th.optim.LBFGS([x1])
    optimizer = th.optim.Adadelta([x1], lr=0.5)
    #optimizer = th.optim.Adam([x1], lr=0.1)
    
    #scheduler = th.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', verbose=True,patience=2)
    
    for i in range(2):
        optimizer.zero_grad()
        loss = py_energyrc(x1,n,nb,Mass,Charge,Sym,symc)
        loss.backward()
        #def closure():
        #    return py_energyrc(x1,n,nb,Mass,Charge,Sym,symc)
        optimizer.step()
        #scheduler.step(loss)
        
        print('step: {} f: {} gradNorm: {}'.format(i, loss, th.norm(x1.grad)))
    
    return x1
    #from scipy import optimize
    #eps = th.tensor(1e-08)
    #optimize.approx_fprime(x1, py_energyrc, eps, n,nb,Mass,Charge,Sym,symc)

In [155]:
start_time = time.time()
xrestart = test_energyrc()
print(" took {} seconds ".format(time.time() - start_time))
#cProfile.run('xrestart = test_energyrc()')

step: 0 f: -7.460833513991575 gradNorm: 0.17131797946997435
step: 1 f: -7.4619883653930215 gradNorm: 0.11479722281867605
 took 52.56346678733826 seconds 


Exception in callback BaseAsyncIOLoop._handle_events(11, 1)
handle: <Handle BaseAsyncIOLoop._handle_events(11, 1)>
Traceback (most recent call last):
  File "/home/kinghorn/anaconda3/envs/pytorch/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/home/kinghorn/anaconda3/envs/pytorch/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 117, in _handle_events
    handler_func(fileobj, events)
  File "/home/kinghorn/anaconda3/envs/pytorch/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/kinghorn/anaconda3/envs/pytorch/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/home/kinghorn/anaconda3/envs/pytorch/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/kinghorn/anaconda3/envs/pytorch/lib/python3.6/site-packages/

Doing 2 optimization steps  initial time 42 sec

Changing Charge to a matrix removed 25000 ops and took time to 31.4 sec

converted vevhL to L before matel time now 22.8 sec

convert trace to sum for tkl, 1/2 to 0.5, remove 1 abs()  -> 21.9 sec

batched abs(det(Lk))  -> 21.5 sec

batched Ak, PAlP' -> 19.8 sec

th.rsqrt(invAkl) -> 21.3 (timeing has changed with new restart pt!)

In [152]:
th.save(xrestart, 'Libo-nb96-7.460.pt')

In [147]:
xrestart = th.load('Libo-nb96-7.457.pt')
