In [4]:
import torch
import numpy as np
from numpy.core._multiarray_umath import ndarray
from torch import LongTensor as LT
import os
import sys
import tangent
import warnings
warnings.filterwarnings("ignore")
sys.path.append('./..')


# ========================================
# scipy's qr orthonormalises the columns of its input, whereas the vectors to orthonormalise are stored as rows of V; hence the transposes
# ========================================
def gramSchmidt(V):
    """Return an orthonormal basis (as rows) for the row vectors of ``V``.

    The basis is obtained from an economic QR decomposition of ``V`` transposed:
    QR orthonormalises columns, so the rows of ``V`` are fed in as columns and
    the resulting ``Q`` is transposed back.

    Parameters
    ----------
    V : array_like
        Either a 2-D array with one vector per row, or a single 1-D vector
        (treated as a one-row matrix instead of raising ``ValueError``).

    Returns
    -------
    numpy.ndarray
        2-D array whose rows are orthonormal. The sign of each row is whatever
        the QR routine produces, so rows may point opposite to the inputs.
    """
    from scipy.linalg import qr

    # qr only accepts 2-D input; promote a lone vector to a 1 x d matrix.
    rows = np.atleast_2d(V)
    _basis, _ = qr(rows.transpose(), mode='economic')
    return _basis.transpose()

# -----------------------------------------------------
# Minimise the average cosine loss between W and (x1*x2),
# so that the cosine similarity of W and (x1*x2) is maximised
# -----------------------------------------------------
def cosine_loss(X, Y):
    # Cosine loss between X and Y: 1 - sum(X*Y) / (||X|| * ||Y||).
    # All sums run over every element of the element-wise products, so X and Y
    # are treated as flat vectors regardless of their shape.
    # A zero-norm X or Y makes the denominator 0; no guard is applied here.
    # NOTE(review): this function is passed to tangent.grad in onlineGD.__init__;
    # presumably its statement form should stay simple for that source
    # transformation, hence `#` comments rather than a docstring — confirm.
    xnorm = np.sqrt(np.sum(X*X))
    ynorm = np.sqrt(np.sum(Y*Y))
    # Normalised dot product (cosine similarity).
    similarity = np.sum(X*Y) / (xnorm * ynorm)
    return 1 - similarity

# =====================================================================
# Combine Projected GD and Confidence weighted GD 
# =====================================================================
class onlineGD:
    """Online gradient-descent updater for a matrix of coefficient vectors.

    Each call to :meth:`update_weight` averages per-coefficient gradients of
    ``cosine_loss`` over the positively labelled samples of a batch, optionally
    projects the averaged gradient onto the span of the gradients previously
    stored for the same coefficient, and subtracts the result from the current
    weights.

    Attributes are plain instance fields:
      num_coeff           number of coefficient vectors (rows of W)
      emb_dim             dimension of each coefficient vector
      coeff_mask          all-zero template row copied for every sample mask
      prior_grad_vectors  dict: coefficient index -> list of past avg gradients
      gradient_fn         callable(w, x) returning d(loss)/d(w)
      W_orig / W_cur      weights as first set / after the latest update
    """

    def __init__(self, num_coeff, emb_dim):
        self.num_coeff = num_coeff
        self.coeff_mask: np.ndarray = np.zeros(num_coeff)
        self.prior_grad_vectors = {
            k: [] for k in range(num_coeff)
        }
        self.W_cur = None
        self.emb_dim = emb_dim
        self.gradient_fn = tangent.grad(cosine_loss, verbose = False)
        self.W_orig = None

    def set_original_W(self, W):
        """Store ``W`` as both the original and the current weight matrix."""
        self.W_orig = W
        self.W_cur = W

    def _build_update_mask(self, label, list_feature_mod_idx):
        """Return a (num_samples, num_coeff) 0/1 mask of coefficients to update."""
        rows = []
        for _label, _feat_idx in zip(label, list_feature_mod_idx):
            _mask = self.coeff_mask.copy()
            # Update on the positive labels only
            if _label == 1:
                for _f in _feat_idx:
                    _mask[_f] = 1
            rows.append(_mask)
        if not rows:
            # Keep a well-formed 2-D shape for an empty batch.
            return np.zeros((0, self.num_coeff))
        return np.array(rows)

    def _project_on_prior(self, coeff_idx, gradient):
        """Project ``gradient`` onto the span of the stored gradients of one coefficient.

        With fewer than two stored gradients the gradient is returned unchanged.
        """
        prior = self.prior_grad_vectors[coeff_idx]
        if len(prior) <= 1:
            return gradient
        # Gram-Schmidt process: orthonormal basis of the prior gradients
        bases = gramSchmidt(np.array(prior))
        projection = np.zeros(gradient.shape)
        # Sum of the projections on each basis vector. The rows are unit
        # length, so the denominator is ~1; it is kept so the expression stays
        # a proper projection formula.
        for orth_base in bases:
            projection += (
                np.dot(gradient, orth_base) / np.dot(orth_base, orth_base)
            ) * orth_base
        return projection

    # ------------------------------------
    # list_feature_mod_idx: A list of list of indices for each sample.
    # empty list for a sample means no explanation
    # ------------------------------------
    def update_weight(
            self,
            label = [],
            list_feature_mod_idx = [],
            X = None
    ):
        """Run one projected gradient step and return ``(final_gradient, W)``.

        Parameters
        ----------
        label : sequence of int
            One label per sample; only samples labelled ``1`` trigger updates.
        list_feature_mod_idx : sequence of sequences of int
            For each sample, the coefficient indices to update.
        X : array indexable as ``X[i][j]``
            Per-sample, per-coefficient input vectors passed to ``gradient_fn``
            together with the matching row of the current weights.

        Returns
        -------
        (final_gradient, W) : tuple of numpy.ndarray
            The applied gradient (zero rows for untouched coefficients) and
            the updated weight matrix, which is also stored in ``self.W_cur``.

        Raises
        ------
        ValueError
            If the weights were never set or ``X`` is not supplied.
        """
        if self.W_cur is None:
            raise ValueError("set_original_W must be called before update_weight")
        if X is None:
            raise ValueError("X must be provided to update_weight")

        W = self.W_cur
        update_mask = self._build_update_mask(label, list_feature_mod_idx)

        print(update_mask)

        # --------------------------------
        # Sum the gradients of the flagged (sample, coefficient) pairs only;
        # masked-out pairs contribute nothing, so they are not evaluated.
        gradient_sum = np.zeros(W.shape)
        for i, j in zip(*np.nonzero(update_mask)):
            gradient_sum[j] += update_mask[i][j] * self.gradient_fn(W[j], X[i][j])

        # --------------------------------
        # Average gradients over the batch (0 where no sample touched the
        # coefficient, without dividing by zero)
        counts = np.sum(update_mask, axis=0).reshape([-1, 1])
        avg_gradients = np.divide(
            gradient_sum,
            counts,
            out=np.zeros_like(gradient_sum),
            where=counts > 0,
        )

        # =================================
        # Project the current gradient on the prior gradients of the same term.
        # An orthonormal basis is built only when more than one prior vector
        # is available; otherwise the averaged gradient is used as is.
        # =================================
        coeff_update_flag = counts.reshape(-1) > 0
        final_gradient = np.zeros_like(avg_gradients)
        for i in np.flatnonzero(coeff_update_flag):
            final_gradient[i] = self._project_on_prior(int(i), avg_gradients[i])

        # Save avg_gradients (after projecting, so a gradient is never
        # projected onto itself)
        for i in np.flatnonzero(coeff_update_flag):
            self.prior_grad_vectors[int(i)].append(avg_gradients[i].copy())

        # Update the weights
        W = W - final_gradient
        self.W_cur = W
        return final_gradient, W


In [13]:
# Problem size: number of coefficient vectors and the dimension of each one.
num_coeff, emb_dim = 6, 4
# Random starting weights, one row per coefficient.
W = np.random.random((num_coeff, emb_dim))
# Random batch of 4 samples with one vector per coefficient.
x = np.random.random((4, num_coeff, emb_dim))


In [14]:



# Build the updater and register W as both its original and current weights.
obj = onlineGD(num_coeff,emb_dim)
obj.set_original_W(W)

In [15]:
# Batch 1: samples 0 and 1 are positive and flag coefficients (1, 2) and (3,).
obj.update_weight(
    label=[1, 1, 0, 0],
    list_feature_mod_idx=[(1, 2), (3,), (), ()],
    X=x,
)


[[0. 1. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


(array([[ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.19448528, -0.01524198, -0.60491012, -0.35236459],
        [ 0.29224515, -0.13482543, -0.3727943 , -0.36146961],
        [-0.12759546, -0.10227231,  0.27686705, -0.36118204],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ]]),
 array([[0.81977526, 0.35695937, 0.01644474, 0.79943619],
        [0.59786294, 0.86302818, 0.63315193, 0.70454064],
        [0.63945738, 0.65238747, 0.38299497, 0.91117632],
        [0.9648154 , 0.78576212, 0.65959916, 0.58973466],
        [0.37025492, 0.12512023, 0.65381574, 0.27593393],
        [0.6044633 , 0.53715762, 0.50104947, 0.40531476]]))

In [16]:
# Batch 2: fresh random inputs; samples 2 and 3 are positive and flag
# coefficients (1, 3) and (4,).
x = np.random.random((4, num_coeff, emb_dim))
obj.update_weight(
    label=[0, 0, 1, 1],
    list_feature_mod_idx=[(), (), (1, 3), (4,)],
    X=x,
)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0.]]


(array([[ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.01803408,  0.02273411, -0.24438613,  0.17647168],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [-0.06388864,  0.04119739,  0.10523287, -0.0680681 ],
        [ 0.23717967, -0.61888577,  0.05226264, -0.16145877],
        [ 0.        ,  0.        ,  0.        ,  0.        ]]),
 array([[0.81977526, 0.35695937, 0.01644474, 0.79943619],
        [0.57982887, 0.84029406, 0.87753807, 0.52806896],
        [0.63945738, 0.65238747, 0.38299497, 0.91117632],
        [1.02870404, 0.74456473, 0.55436629, 0.65780277],
        [0.13307525, 0.74400599, 0.6015531 , 0.43739269],
        [0.6044633 , 0.53715762, 0.50104947, 0.40531476]]))

In [17]:
# Inspect the averaged gradients stored so far for each coefficient index.
obj.prior_grad_vectors

{0: [],
 1: [array([ 0.19448528, -0.01524198, -0.60491012, -0.35236459]),
  array([ 0.01803408,  0.02273411, -0.24438613,  0.17647168])],
 2: [array([ 0.29224515, -0.13482543, -0.3727943 , -0.36146961])],
 3: [array([-0.12759546, -0.10227231,  0.27686705, -0.36118204]),
  array([-0.06388864,  0.04119739,  0.10523287, -0.0680681 ])],
 4: [array([ 0.23717967, -0.61888577,  0.05226264, -0.16145877])],
 5: []}

In [18]:
# Batch 3: fresh random inputs; samples 1 and 3 are positive and flag
# coefficients (2,) and (4,).
x = np.random.random((4, num_coeff, emb_dim))
obj.update_weight(
    label=[0, 1, 0, 1],
    list_feature_mod_idx=[(), (2,), (), (4,)],
    X=x,
)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]]


(array([[ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [-0.10445204,  0.14417577, -0.4597388 ,  0.16331834],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [-0.59917113,  0.16085843, -0.2051899 ,  0.19087615],
        [ 0.        ,  0.        ,  0.        ,  0.        ]]),
 array([[0.81977526, 0.35695937, 0.01644474, 0.79943619],
        [0.57982887, 0.84029406, 0.87753807, 0.52806896],
        [0.74390941, 0.50821169, 0.84273377, 0.74785798],
        [1.02870404, 0.74456473, 0.55436629, 0.65780277],
        [0.73224637, 0.58314757, 0.806743  , 0.24651654],
        [0.6044633 , 0.53715762, 0.50104947, 0.40531476]]))

In [334]:
x = np.random.random([4,num_coeff,emb_dim])
grad3 = obj.update_weight(
    [0,1,0,1],
    [(),(2,),(),(4,)],
    x
)


# =================================
# Calculate the projection of current gradient on each of the prior gradients for the same term
# =================================
# NOTE(review): `update_mask` is not built in this cell; it must already exist
# in the kernel and may describe a different batch than the call above — confirm.
coeff_update_flag = np.sum(update_mask,axis=0)
coeff_update_flag = coeff_update_flag > 0
print(coeff_update_flag)
# update_weight returns the tuple (final_gradient, W); only the gradient
# matrix can be indexed per coefficient below.
cur_gradient = grad3[0]
sum_grad_projections = []

# ==================================
# Create orthonormal basis if and only if more than one prior vector is available
# ==================================
for i in range(num_coeff):
    _x = cur_gradient[i]
    # IF no update needed, store 0
    if not coeff_update_flag[i]:
        g_proj_i = np.zeros(_x.shape)
        sum_grad_projections.append(g_proj_i)
        continue
    # Gram-Schmidt process : get the bases from the gradients stored on the updater
    bases =  np.array( obj.prior_grad_vectors[i])
    if bases.shape[0] >  1:
        bases = gramSchmidt(bases)
        g_proj_i = np.zeros(_x.shape)
        # Add up sum of all projections
        for orth_base in bases:
            _g_proj = np.dot(_x, orth_base) / np.linalg.norm(orth_base) * orth_base
            g_proj_i += _g_proj
    else:
        g_proj_i = _x
    sum_grad_projections.append(g_proj_i)
# Add up the multiple projections
sum_grad_projections = np.array(sum_grad_projections)
final_gradient = sum_grad_projections

In [322]:
# Display the per-coefficient gradient matrix currently bound to cur_gradient.
cur_gradient

array([[ 0.  ,  0.  ,  0.  ,  0.  ],
       [-0.25,  0.07,  0.1 ,  0.14],
       [ 0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.43, -0.42, -0.38,  0.12],
       [ 0.12, -0.04, -0.02, -0.18],
       [ 0.  ,  0.  ,  0.  ,  0.  ]])

In [315]:
from scipy.linalg import qr
# NOTE(review): `tmp` is assigned only in a later cell of the notebook as saved,
# so this cell relies on out-of-order execution.
# tmp[0] selects a single row; presumably this is the 1-D case that qr rejects — confirm.
qr(tmp[0].transpose(),mode='economic')

In [306]:
# Two hand-written 4-dimensional row vectors used for the QR experiments.
tmp = np.array([
    [-0.44, -0.16, -0.2, 0.15],
    [0.16, 0.06, -0.21, 0.04],
])

In [316]:
# Transposing a single row leaves it 1-D (see the displayed result).
tmp[0].transpose()

array([-0.44, -0.16, -0.2 ,  0.15])

In [318]:
# Passing the 1-D row straight to qr fails: the recorded output is
# "ValueError: expected a 2-D array".
qr(tmp[0].transpose(),mode='economic')

ValueError: expected a 2-D array

0.9973464794142505

-0.001100000000000101

In [56]:
# Two manual gradient steps on the cosine loss between a point and a target.
# NOTE(review): `tangent_cosine_loss` is not defined in the visible cells;
# it must already exist in the kernel.
y1 = np.array([0.50, -0.250])
x0 = np.array([0.50, 0.250])
print(cosine_loss(x0, y1))

# Step 1: gradient at x0, then move against it.
x1 = tangent_cosine_loss(x0, y1)
print(x0, x1)
x2 = x0 - x1
print(x2)
print(cosine_loss(x2, y1))

# Step 2: gradient at the new point, then move again.
x1 = tangent_cosine_loss(x2, y1)
x2 = x2 - x1
print(cosine_loss(x2, y1))

0.40000000000000013
[0.5  0.25] [-0.64  1.28]
[ 1.14 -1.03]
0.03652380413201883
0.012477723095580373


array([0., 1., 2., 1., 1., 0.])

In [153]:
# Fresh random weights, sized from num_coeff so they stay consistent with
# coeff_mask below instead of repeating the literal 6.
W = np.random.random([num_coeff, emb_dim])
# One label per sample; only samples labelled 1 flag coefficients.
label =  [1,0,1,1]
# Coefficient indices named by each sample (empty tuple: none).
list_feature_mod_idx = [(1,2),(),(2,3),(4,)]
# All-zero template row copied for every sample's mask.
coeff_mask = np.zeros(num_coeff)

In [154]:
# Build one 0/1 row per sample marking the coefficients that sample updates.
update_mask = []
for _label, _feat_idx in zip(label, list_feature_mod_idx):
    print('>>', _feat_idx)
    _mask = coeff_mask.copy()
    # Only positively labelled samples flag their coefficients.
    for _f in (_feat_idx if _label == 1 else ()):
        _mask[_f] = 1
    update_mask.append(_mask)
update_mask = np.array(update_mask)
update_mask

>> (1, 2)
>> ()
>> (2, 3)
>> (4,)


array([[0., 1., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.]])

In [201]:
# Print numpy floats with two digits of precision from here on.
np.set_printoptions(precision=2)

In [202]:
num_samples = update_mask.shape[0]

# Output mask shape : num_samples, num_coeff, coeff_dim
output_mask = np.broadcast_to(update_mask.reshape([ update_mask.shape[0],update_mask.shape[1],1]), [update_mask.shape[0], update_mask.shape[1], emb_dim])
# tiled_W shape: [ Num_samples, num_coeff, coeff_dim ]
tiled_W = np.tile(W.reshape([1,W.shape[0],W.shape[1]] ),(num_samples ,1,1))

# Per-sample, per-coefficient gradient, zeroed where the mask is 0.
# NOTE(review): `tangent_cosine_loss` and `x` are not defined in this cell;
# they must already exist in the kernel.
grad_res = np.zeros(tiled_W.shape)
for i in range(num_samples):
    for j in range(num_coeff):
        g = tangent_cosine_loss(tiled_W[i][j],x[i][j])
        g = update_mask[i][j] * g
        grad_res[i][j]=g
# 1 / (number of samples touching each coefficient); coefficients touched by
# no sample get 0 directly, which avoids the divide-by-zero of a plain reciprocal.
divisor = np.sum(update_mask,axis=0)
divisor = np.divide(1.0, divisor, out=np.zeros_like(divisor, dtype=float), where=divisor > 0)
divisor = divisor.reshape([-1,1])
# Average the summed gradients over the samples that contributed.
avg_gradients = np.multiply(np.sum(grad_res,axis=0), divisor)
avg_gradients


  from ipykernel import kernelapp as app


array([[ 0.00e+00,  0.00e+00,  0.00e+00,  0.00e+00],
       [ 3.86e-01, -2.90e-01, -3.79e-01, -1.50e-01],
       [ 1.85e-01, -1.48e-02, -5.68e-01, -7.95e-02],
       [ 9.63e-02, -7.29e-02,  1.03e-01, -6.63e-01],
       [-4.64e-02, -1.07e-03,  1.34e-01,  2.17e-04],
       [ 0.00e+00,  0.00e+00,  0.00e+00,  0.00e+00]])