In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
print(torch.cuda.is_available())
# CUDA is deliberately switched off for this notebook, even when a GPU is present.
# Flip USE_CUDA to True to opt in.
USE_CUDA = False
if torch.cuda.is_available() and USE_CUDA:
    print ("cuda in use")
    device = torch.device('cuda')
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
else:
    print ("cuda not used")
    device = torch.device('cpu')
    torch.set_default_tensor_type('torch.FloatTensor')
# `dtype` is handed to torch.tensor(..., dtype=dtype) and torch.eye(..., dtype=dtype)
# throughout the notebook, so it has to be a torch.dtype on every device.
# (A tensor *type* such as torch.cuda.FloatTensor is not accepted for that keyword.)
dtype = torch.float

True
cuda not used


In [119]:
#from T. Kohonen '84, p. 183, equation 6.33

#Set cf (the certainty factor passed to addBasis) to one for one-step orthogonalization.
class Novelty_Filter():
    """Square operator ``O`` that maps a vector to its "novel" component.

    ``O`` starts out as the identity. Every accepted ``addBasis`` call subtracts
    a scaled rank-one term built from the current novelty of the given vector.
    """

    def __init__(self,size,epsilon=1e-5):
        """Create a ``size`` x ``size`` identity operator.

        ``epsilon`` is the relative threshold ``addBasis`` uses to decide whether
        an update is worth applying. The operator is allocated with the
        notebook-level ``dtype`` and ``device``.
        """
        self.epsilon = epsilon
        self.O = torch.eye(size, requires_grad=False, dtype=dtype, device=device)

    def addBasis(self,X,cf=1.):
        """Suppress the current novelty of ``X`` in ``O``, weighted by ``cf``.

        The update ``O -= cf * l l^T / |l|^2`` (with ``l = O @ X``) is applied in
        place, and only when ``cf * |l|`` exceeds ``epsilon * |X|``. Both norms
        are printed on every call; "suppressing" is printed when ``O`` changes.
        """
        residual = self.novelty(X)
        residual_norm = residual.norm()
        print("norms",residual_norm,X.norm())
        if not (cf * residual_norm > self.epsilon * X.norm()):
            # Novelty is negligible relative to X: leave the operator untouched.
            return
        print("suppressing")
        self.O -= cf * residual.ger(residual) / residual_norm.pow(2)

    def novelty(self,X):
        """Return ``O @ X``."""
        return self.O @ X

    def project(self,X):
        """Return ``X`` minus its novelty, i.e. ``X - O @ X``."""
        novel_part = self.novelty(X)
        return X - novel_part

In [120]:
def selu(x):
    """Scaled exponential linear unit: ``scale * elu(x, alpha)`` with fixed constants."""
    selu_alpha = 1.6732632423543772848170429916717
    selu_scale = 1.0507009873554804934193349852946
    activated = F.elu(x, selu_alpha)
    return selu_scale * activated

class Xor(nn.Module):
    """Bias-free two-layer network: 3 inputs -> 100 SELU units -> 1 sigmoid output.

    ``forward`` caches its input, hidden activation and output on the instance
    (``inputs``, ``l1_out``, ``value``). ``get_loss`` compares the cached output
    with a target and caches the result as ``loss``.
    """

    def __init__(self):
        super().__init__()
        hidden_nodes = 100
        # Both layers are created without a bias term and re-initialised from a
        # zero-mean normal with variance 1 / fan_in.
        self.l1 = nn.Linear(3,hidden_nodes,bias=False)
        nn.init.normal_(self.l1.weight, 0.0, np.sqrt(1./3.))
        self.head = nn.Linear(hidden_nodes, 1,bias=False)
        nn.init.normal_(self.head.weight, 0.0, np.sqrt(1./hidden_nodes))

    def forward(self, inputs):
        """Run the network on ``inputs`` and return the sigmoid output.

        The input, hidden activation and output are stored on ``self`` so that
        ``get_loss`` (and callers) can read them afterwards.
        """
        self.inputs = inputs
        self.l1_out = selu(self.l1(inputs))
        self.value = torch.sigmoid(self.head(self.l1_out))
        return self.value

    def get_loss(self,target):
        """Return (and cache as ``self.loss``) the MSE between the last output and ``target``.

        ``forward`` must have been called first so that ``self.value`` exists.
        """
        # F.mse_loss is documented as mse_loss(input, target): the prediction goes first.
        self.loss = F.mse_loss(self.value, target)
        return self.loss

class Stable_Xor(Xor):
    """``Xor`` network with one ``Novelty_Filter`` per weight matrix.

    After ``backward()``, ``do_post_gradient`` scales each weight gradient
    element-wise by the filter's novelty of the flattened weights.
    ``do_pre_update`` then feeds those flattened weights to the filters, weighted
    by a loss-dependent certainty factor.
    """

    def __init__(self,alpha=0.):
        super().__init__()
        self.alpha = alpha
        # Shape and flattened length of each weight matrix.
        self.l1_size = self.l1.weight.size()
        self.l1_length = self.l1.in_features * self.l1.out_features
        self.head_size = self.head.weight.size()
        self.head_length = self.head.in_features * self.head.out_features
        # One filter per layer, acting on the flattened weight vector.
        self.l1_filter = Novelty_Filter(self.l1_length)
        self.head_filter = Novelty_Filter(self.head_length)

    def do_post_gradient(self):
        """Multiply each layer's gradient by the novelty of that layer's weights.

        Also stores flattened views of the weights as ``l1_vector`` /
        ``l2_vector``; ``do_pre_update`` reads them, so this must run first.
        """
        with torch.no_grad():
            # First layer.
            self.l1_vector = self.l1.weight.view(self.l1_length)
            l1_gate = self.l1_filter.novelty(self.l1_vector).reshape(self.l1_size)
            self.l1.weight.grad *= l1_gate.detach()
            # Output layer (kept under the attribute name ``l2_vector``).
            self.l2_vector = self.head.weight.view(self.head_length)
            head_gate = self.head_filter.novelty(self.l2_vector).reshape(self.head_size)
            self.head.weight.grad *= head_gate.detach()

    def do_pre_update(self):
        """Register the current weight vectors with both filters.

        The certainty factor is ``alpha / exp(|loss|)``, using the loss cached by
        the most recent ``get_loss`` call.
        """
        certainty_factor = self.alpha / torch.exp(abs(self.loss.data))
        print("Critic cf:",certainty_factor)
        with torch.no_grad():
            for layer_filter, weights in ((self.l1_filter, self.l1_vector),
                                          (self.head_filter, self.l2_vector)):
                layer_filter.addBasis(weights,certainty_factor)
        
# The four training patterns, one per row: [constant 1, x1, x2, x1 XOR x2].
# Cells below slice columns 0:3 as the network input and 3:4 as the target.
data = torch.tensor([[1,0,0,0],[1,0,1,1],[1,1,0,1],[1,1,1,0]], requires_grad=False, dtype=dtype, device=device)
print("data",data)

data tensor([[1., 0., 0., 0.],
        [1., 0., 1., 1.],
        [1., 1., 0., 1.],
        [1., 1., 1., 0.]])


In [121]:
    
def showme():
    """Print the global ``xor`` model's output for each of the four rows of ``data``."""
    print("Current Forecasts")
    for j in range(4):
        row = data[j]
        features, target = row[0:3], row[3:4]
        print("input",features,"output",xor(features).data,"target",target)
    print("End")
    
xor = Stable_Xor(1.)
xopt = optim.Adam(xor.parameters(), lr=1e-1,weight_decay=0.000001)
iterations=100


print("---")
# The four patterns are presented strictly one after another. Each pattern goes
# through the same two phases, so the sequence is written once and looped.
for pattern in range(4):
    features = data[pattern][0:3]
    target = data[pattern][3:4]

    # Phase 1: `iterations` optimizer steps on this pattern alone, with every
    # gradient passed through do_post_gradient() before the update.
    for j in range(iterations):
        xor(features)
        xor.get_loss(target).backward()
        xor.do_post_gradient()
        xopt.step()
        xopt.zero_grad()

    print("input",features,"output",xor(features).data,"target",target)

    # Phase 2: one more step on the same pattern. do_pre_update() additionally
    # hands the current weight vectors to the novelty filters before the
    # optimizer applies the update.
    xor(features)
    loss=xor.get_loss(target)
    print("loss",loss,xor.value,target)
    loss.backward()
    xor.do_post_gradient()
    xor.do_pre_update()
    xopt.step()
    xopt.zero_grad()

    showme()

---
input tensor([1., 0., 0.]) output tensor([1.]) target tensor([0.])
loss tensor(1., grad_fn=<MeanBackward0>) tensor([1.], grad_fn=<SigmoidBackward>) tensor([0.])
Critic cf: tensor(0.3679)
norms tensor(8.1988) tensor(8.1988)
suppressing
norms tensor(6.7700) tensor(6.7700)
suppressing
Current Forecasts
input tensor([1., 0., 0.]) output tensor([1.]) target tensor([0.])
input tensor([1., 0., 1.]) output tensor([1.]) target tensor([1.])
input tensor([1., 1., 0.]) output tensor([1.]) target tensor([1.])
input tensor([1., 1., 1.]) output tensor([1.]) target tensor([0.])
End
input tensor([1., 0., 1.]) output tensor([1.]) target tensor([1.])
loss tensor(0., grad_fn=<MeanBackward0>) tensor([1.], grad_fn=<SigmoidBackward>) tensor([1.])
Critic cf: tensor(1.)
norms tensor(4.2137) tensor(6.5418)
suppressing
norms tensor(3.6762) tensor(5.6981)
suppressing
Current Forecasts
input tensor([1., 0., 0.]) output tensor([1.]) target tensor([0.])
input tensor([1., 0., 1.]) output tensor([1.]) target tenso

In [110]:
# NOTE(review): this cell creates nothing itself; it relies on `xor`, `xopt`,
# `iterations` and `data` already existing in the kernel from other cells.

# Consolidation step for pattern 1: one gated update that also feeds the filters.
xor(data[1][0:3])
loss=xor.get_loss(data[1][3:4])
print("loss",loss,xor.value,data[1][3:4])
loss.backward()
xor.do_post_gradient()
xor.do_pre_update()
xopt.step()
xopt.zero_grad()

# Train on pattern 2 alone.
for j in range(iterations):
    xor(data[2][0:3])
    xor.get_loss(data[2][3:4]).backward()
    xor.do_post_gradient()
    xopt.step()
    xopt.zero_grad()

print("input",data[2][0:3],"output",xor(data[2][0:3]).data,"target",data[2][3:4])
# print("novelty of input, is",xor.l1_filter.novelty(xor.inputs))
# print("novelty of hidden, is",xor.head_filter.novelty(xor.l1_out))

# Consolidation step for pattern 2.
xor(data[2][0:3])
loss=xor.get_loss(data[2][3:4])
print("loss",loss,xor.value,data[2][3:4])
loss.backward()
xor.do_post_gradient()
xor.do_pre_update()
xopt.step()
xopt.zero_grad()

# Train on pattern 3 alone.
for j in range(iterations):
    xor(data[3][0:3])
    xor.get_loss(data[3][3:4]).backward()
    xor.do_post_gradient()
    xopt.step()
    xopt.zero_grad()

print("input",data[3][0:3],"output",xor(data[3][0:3]).data,"target",data[3][3:4])
# print("novelty of input, is",xor.l1_filter.novelty(xor.inputs))
# print("novelty of hidden, is",xor.head_filter.novelty(xor.l1_out))

# Pattern 3 is only evaluated here: no backward pass and no filter update follow.
xor(data[3][0:3])
loss=xor.get_loss(data[3][3:4])
print("loss",loss,xor.value,data[3][3:4])
    
# Final forecast for all four patterns, then a look at the filter operators.
print("final")
for j in range(4):
    print("input",data[j][0:3],"output",xor(data[j][0:3]).data,"target",data[j][3:4])
#     print("novelty of input, is",xor.l1_filter.novelty(xor.inputs))
#     print("novelty of hidden, is",xor.head_filter.novelty(xor.l1_out))
    
print(xor.l1_filter.O)
print(xor.head_filter.O[0:5])

loss tensor(1., grad_fn=<MeanBackward0>) tensor([2.0785e-09], grad_fn=<SigmoidBackward>) tensor([1.])
Critic cf: tensor(0.3679)
norms tensor(15.2625) tensor(0.0003)
suppressing
norms tensor(22.7214) tensor(0.0003)
suppressing
input tensor([1., 1., 0.]) output tensor([0.]) target tensor([1.])
loss tensor(1., grad_fn=<MeanBackward0>) tensor([0.], grad_fn=<SigmoidBackward>) tensor([1.])
Critic cf: tensor(0.3679)
norms tensor(46.8379) tensor(0.0003)
suppressing
norms tensor(19.6946) tensor(0.0002)
suppressing
input tensor([1., 1., 1.]) output tensor([0.]) target tensor([0.])
loss tensor(0., grad_fn=<MeanBackward0>) tensor([0.], grad_fn=<SigmoidBackward>) tensor([0.])
final
input tensor([1., 0., 0.]) output tensor([5.6292e-11]) target tensor([0.])
input tensor([1., 0., 1.]) output tensor([1.3473e-30]) target tensor([1.])
input tensor([1., 1., 0.]) output tensor([0.]) target tensor([1.])
input tensor([1., 1., 1.]) output tensor([0.]) target tensor([0.])
tensor([[ 1.0000e+00,  3.4445e-28, -3.

In [111]:
    
# Repeatedly add the same vector X to a fresh 3x3 filter with a shrinking
# certainty factor, printing after each step how much of X (and of an
# unrelated vector X2) the filter still reports as novel.
f = Novelty_Filter(3)
X = torch.tensor([7,.1,11], requires_grad=False, dtype=dtype, device=device)
X2 = torch.tensor([7,-3,.1], requires_grad=False, dtype=dtype, device=device)
v = torch.ones([1], requires_grad=False, dtype=dtype, device=device)
print(X)
alpha = 0.2
for i in range(10):
    # cf = alpha * exp(-0.1 * i), as a one-element tensor (v has shape [1]).
    certainty_factor = alpha/torch.exp(abs(i*.1*v))
    print(certainty_factor)
    f.addBasis(X,certainty_factor)
    print(f.novelty(X),f.novelty(X2))
    print(torch.norm(X),torch.norm(f.novelty(X)))

tensor([ 7.0000,  0.1000, 11.0000])
tensor([0.2000])
norms tensor(13.0388) tensor(0.0001)
suppressing
tensor([5.6000, 0.0800, 8.8000]) tensor([ 6.5899, -3.0059, -0.5444])
tensor(13.0388) tensor(10.4310)
tensor([0.1810])
norms tensor(10.4310) tensor(0.0001)
suppressing
tensor([3.6207, 0.0517, 5.6896]) tensor([ 6.0101, -3.0141, -1.4555])
tensor(13.0388) tensor(6.7442)
tensor([0.1637])
norms tensor(6.7442) tensor(0.0001)
suppressing
tensor([-0.6637, -0.0095, -1.0430]) tensor([ 4.7551, -3.0321, -3.4277])
tensor(13.0388) tensor(1.2363)
tensor([0.1482])
norms tensor(1.2363) tensor(0.0001)
suppressing
tensor([-116.0302,   -1.6576, -182.3331]) tensor([-29.0385,  -3.5148, -56.5319])
tensor(13.0388) tensor(216.1275)
tensor([0.1341])
norms tensor(216.1275) tensor(0.0001)
suppressing
tensor([-116.0336,   -1.6576, -182.3385]) tensor([-29.0395,  -3.5148, -56.5335])
tensor(13.0388) tensor(216.1339)
tensor([0.1213])
norms tensor(216.1339) tensor(0.0001)
suppressing
tensor([-116.0367,   -1.6577, -182.3

In [74]:
# xor = Xor()
# xopt = optim.Adam(xor.parameters(), lr=3e-2,weight_decay=0.00001)

# print(data)
# for i in range(1000):
#     for j in range(4):
#         xor(data[j][0:2])
#         xor.get_loss(data[j][2:3]).backward()
#     xopt.step()
#     xopt.zero_grad()

# for j in range(4):
#     print(xor(data[j][0:2]).data,data[j][2:3])
    
# xor = Xor()
# xopt = optim.Adam(xor.parameters(), lr=3e-2,weight_decay=0.00001)

# print("---")
# for i in range(1000):
#     for j in range(4):
#         xor(data[j][0:2])
#         xor.get_loss(data[j][2:3]).backward()
#         xopt.step()
#         xopt.zero_grad()
        
# for j in range(4):
#     print(xor(data[j][0:2]).data,data[j][2:3])


In [None]:
    
#from T. Kohonen '84, p. 119, equation 4.63, p. 122, equation 4.68
class Adaptive_Novelty(Fast_Novelty):
    """Novelty filter that learns a pattern gradually and slowly forgets.

    ``alpha`` scales the learning term and ``gamma`` the forgetting term.
    NOTE(review): the base class ``Fast_Novelty`` is not visible in this part of
    the notebook; it is assumed to provide ``self.O`` -- confirm.
    """

    def __init__(self,size,alpha=0.95,gamma=1e-3):
        super().__init__(size,alpha)
        self.alpha = alpha
        self.gamma = gamma

    def addBasis(self,X):
        """Update ``O`` in place from one presentation of ``X``.

        NOTE(review): ``pow(2)`` and ``*`` below are element-wise operations, not
        matrix products -- confirm that this matches the referenced equations.
        """
        X = X.data
        # Element-wise square of O, taken once before either update.
        squared = self.O.pow(2)
        # Learning term: slowly learn to remember this (linear) pattern.
        self.O -= self.alpha*squared*X.ger(X)*squared
        # Forgetting term: very gradually forget everything learned so far.
        if self.gamma > 0.:
            self.O += self.gamma * (self.O - squared)

In [None]:
# Scratch experiment: apply the adaptive learn/forget update by hand to a raw
# 3x3 operator, first with X for 1000 steps and then with X2 for 1000 more.
O = torch.eye(3, requires_grad=False, dtype=dtype, device=device)
a = 0.003  # learning-rate coefficient
g = 0.01   # forgetting coefficient
# X and X2 are built from flat lists.
# NOTE(review): .t() on these 1-D tensors is presumably a no-op -- confirm.
X = torch.tensor([7,0.17,11], requires_grad=False, dtype=dtype, device=device).t()
X2 = torch.tensor([7,-3,.1], requires_grad=False, dtype=dtype, device=device).t()
# NOTE(review): np.square is applied to a torch tensor here; the type of the
# result depends on the NumPy/PyTorch versions in use -- confirm.
O2 = np.square(O)
# Three candidate spellings of the learning term, printed for comparison.
print(a*(O2*X*X.t()*O2))
print(a*(O2*(X*X.t())*O2))
print(a*O2*X.ger(X)*O2)
for i in range(1000):
    n = O @ X  # computed but not used by the update below
    O2 = np.square(O)
    O -= a*O2*X.ger(X)*O - g * (O - O2)
    
#     O = O - a*O2*X.ger(X)*O + g * (O - O2)
print(X,X2)
print(O @ X, O@X2)
print(X - O@X, X2 - O@X2)

print('---')
# Second phase continues from the operator learned above (the reset stays disabled).
# O = torch.eye(3, requires_grad=False, dtype=dtype, device=device)
for i in range(1000):
    n = O @ X2  # computed but not used by the update below
    O2 = np.square(O)
    O -= a*O2*X2.ger(X2)*O - g * (O - O2)
print(X,X2)
print(O @ X, O@X2)
print(X - O@X, X2 - O@X2)

In [None]:
# Compare two Adaptive_Novelty filters: one with forgetting (gamma=5e-2) and
# one without (gamma=0.).
X = torch.tensor([7,0.17,11], requires_grad=False, dtype=dtype, device=device).t()
X2 = torch.tensor([7,-3,.1], requires_grad=False, dtype=dtype, device=device).t()
# NOTE(review): the name `filter` shadows the Python builtin for the rest of the kernel session.
filter = Adaptive_Novelty(3,alpha=0.01,gamma=5e-2)
filter.addBasis(X)
# Novelty of X2 after one pass through the filter versus three passes.
print(filter.novelty(X2),filter.novelty(filter.novelty(filter.novelty(X2))))
filter.addBasis(X2)

filter2 = Adaptive_Novelty(3,alpha=0.01,gamma=0.)
print(filter2.novelty(X2))
filter2.addBasis(X)
print(filter2.novelty(X2))
# Here the filter is fed the novelty of X2 rather than X2 itself.
filter2.addBasis(filter2.novelty(X2))
print(filter.O,filter2.O)
print(filter.project(X2),filter2.project(X2))

In [None]:
# Scratch experiment: iterate the element-wise learning update 100000 times on X
# and compare the operator's response to X and X2 before and after.
X = torch.tensor([1,0,0], requires_grad=False, dtype=dtype, device=device).t()
X2 = torch.tensor([0,1,1], requires_grad=False, dtype=dtype, device=device).t()
O = torch.eye(3, requires_grad=False, dtype=torch.float)
a = 7e-3
print(O @ X,O@X2)
# Responses of the initial (identity) operator, kept as reference for the ratios below.
d=O@X
d2=O@X2
for i in range(100000):
    n = O @ X  # computed but not used by the update below
    O2 = O.pow(2)
    O = O - a*O2*X.ger(X)*O2
print(O @ X, O@X2)
print(X - O@X, X2 - O@X2)
# d and d2 equal X and X2 (O was the identity), so both contain zero entries:
# the ratios below divide by zero in those positions.
print(O@X/d)
print(O@X2/d2)
# for i in range(10000):
#     n = O @ X2
#     O2 = O.pow(2)
#     O = O - a*O2*X2.ger(X2)*O2
# print(O @ X, O@X2)
# print(X - O@X, X2 - O@X2)



In [None]:
# Compare the Orthogonalizer's operator with the projector Q Q^T obtained from NumPy's QR.
# NOTE(review): X and X2 are not defined in this cell and are taken from the kernel state.
# `Orthogonalizer` is defined in a later cell of this notebook.
ortho = Orthogonalizer(3)
print(ortho.novelty(X),ortho.novelty(X2))
ortho.addBasis(X2)
print(ortho.novelty(X),ortho.novelty(X2))
print(ortho.project(X),ortho.project(X2))

# Stack three copies of each vector as rows to obtain matrices for np.linalg.qr.
Y = torch.stack([X,X,X]).numpy()
Y2 = torch.stack([X2,X2,X2]).numpy()
print(Y,Y2)
Q,r = np.linalg.qr(Y)
print(Q@Q.T)
print(ortho.O)
print(Q @ Q.T @ Y)#Projection from Q
print(Q @ Q.T @ Y2)#Projection from Q

In [None]:
# Demo: add the basis vector V1 to a forgetting filter once, then 100 more times,
# then add a different vector VX 1000 times, printing after each stage how
# V1 is split into novelty and projection.
# NOTE(review): `Forgetalizer` is not defined in the visible part of this
# notebook; it is assumed to expose alpha, gamma, O, novelty, project and addBasis.
ortho = Forgetalizer(3,alpha=0.99)
# ortho = Adaptive_Novelty(3)
print(ortho.alpha,ortho.gamma)
VX = torch.tensor([1,0.0,1], requires_grad=False, dtype=dtype, device=device).t()
# A stray second comma after requires_grad=False made this line a SyntaxError.
V1 = torch.tensor([0,1,0], requires_grad=False, dtype=dtype, device=device).t()
print(V1)
print(ortho.O)
print(ortho.novelty(V1))
print(ortho.project(V1))
ortho.addBasis(V1)
print(ortho.novelty(V1))
print(ortho.project(V1))
print(ortho.O)


# Keep presenting V1 ...
for i in range(100):
    ortho.addBasis(V1)
print(ortho.novelty(V1))
print(ortho.project(V1))
# ... then present only VX and check what the filter still reports for V1.
for i in range(1000):
    ortho.addBasis(VX)
print(ortho.novelty(V1))
print(ortho.project(V1))

In [None]:

# Scratch check of column handling: V1 holds two basis vectors as its columns
# (a 2x3 literal, transposed to 3x2).
V1 = torch.tensor([[1,0,0],[0,1,0]], requires_grad=False, dtype=dtype, device=device).t()
ortho = Orthogonalizer(3)
print(ortho.novelty(V1))
# Novelty of each column taken separately, passed in as a 3x1 column vector ...
for x in V1.t():
    print(ortho.novelty(x.unsqueeze(0).t()))
# ... versus the columns of the novelty computed for the whole matrix at once.
for x in ortho.novelty(V1).t():
    print(x.unsqueeze(0).t())


In [None]:

# Learn an operator W by gradient descent so that W @ V1, and afterwards W @ V2,
# is driven towards zero. W starts as the identity and is the only trainable tensor.
I = torch.eye(3, requires_grad=False, dtype=dtype, device=device)
V1 = torch.tensor([1,0.5,0.5], requires_grad=False, dtype=dtype, device=device)
W = torch.eye(3, requires_grad=True, dtype=dtype, device=device)
print(V1)
print(W)
O = W @ V1
print(O,O.norm())
# Optimizer parameters must be an ordered collection, so pass a list, not a set.
optimizer = optim.Adam([W], lr=1e-4)


def shrink_response(vector, tolerance=0.0012, report_every=1000):
    """Run Adam steps on the global ``W`` until ``|W @ vector| < tolerance``.

    Uses the notebook-level ``W`` and ``optimizer``. The loss is printed every
    ``report_every`` steps. The tolerance check happens before the backward pass,
    so the returned response was computed with the final ``W``.

    Returns ``(steps, response, loss)``: the number of loop iterations, the last
    ``W @ vector`` and its norm.
    """
    step = 0
    while True:
        step += 1
        optimizer.zero_grad()
        response = W @ vector
        loss = response.norm()
        if step % report_every == 0:
            print(step,loss)
        if(loss < tolerance):
            return step, response, loss
        loss.backward()
        optimizer.step()


i, O, loss = shrink_response(V1)
print(i,W,O)
V2 = torch.tensor([0,1,0], requires_grad=False, dtype=dtype, device=device)
# The second phase reuses the same optimizer (and its accumulated Adam state).
i, O, loss = shrink_response(V2)
print(W@V1,W@V2)
print(W)

In [None]:
# Demo: add V1 to an Orthogonalizer six times in a row, then add V2, printing the
# novelty / projection of V1, V2 and V3 along the way.
# NOTE(review): V1..V3 are built from flat lists, while the Orthogonalizer defined
# later in this notebook calls torch.mm -- confirm which Orthogonalizer version
# this cell was written against.
ortho = Orthogonalizer(3)
V1 = torch.tensor([0.5,0,0.1], requires_grad=False, dtype=dtype, device=device).t()
V2 = torch.tensor([1,0,1], requires_grad=False, dtype=dtype, device=device).t()
V3 = torch.tensor([1,0.5,1], requires_grad=False, dtype=dtype, device=device).t()
# State before anything has been added.
print("n v1",ortho.novelty(V1))
print(ortho.O)
print("n v1",ortho.novelty(V1))
print("n v2",ortho.novelty(V2))
print("V1",V1)
# First addition of V1.
print("add V1")
ortho.addBasis(V1)
print(ortho.O)
print("p v1",ortho.project(V1))
print("n v1",ortho.novelty(V1))
print("n v2",ortho.novelty(V2))
# Re-adding the same vector several times to see how its novelty changes.
print("add V1")
ortho.addBasis(V1)
print(ortho.novelty(V1))
print("add V1")
ortho.addBasis(V1)
print(ortho.novelty(V1))
print("add V1")
ortho.addBasis(V1)
print(ortho.novelty(V1))
print("add V1")
ortho.addBasis(V1)
print(ortho.novelty(V1))
print("add V1")
ortho.addBasis(V1)
print("O",ortho.O)
print(ortho.novelty(V1))
print(ortho.novelty(V2))
print(ortho.novelty(V3))
print(ortho.project(V1))
print(ortho.project(V2))
print(ortho.project(V3))

# Now add a second vector and print the same six quantities again.
ortho.addBasis(V2)
print(ortho.novelty(V1))
print(ortho.novelty(V2))
print(ortho.novelty(V3))
print(ortho.project(V1))
print(ortho.project(V2))
print(ortho.project(V3))

In [None]:
# Add V1, V2 and V3 in turn to the existing `ortho`, printing the novelty of all
# three vectors after each addition.
# NOTE(review): `ortho`, V1, V2 and V3 are taken from the kernel state.
ortho.addBasis(V1)
print(ortho.novelty(V1))
print(ortho.novelty(V2))
print(ortho.novelty(V3))
ortho.addBasis(V2)
print(ortho.novelty(V1))
print(ortho.novelty(V2))
print(ortho.novelty(V3))
ortho.addBasis(V3)
print(ortho.novelty(V1))
print(ortho.novelty(V2))
print(ortho.novelty(V3))
print(ortho.O)

In [None]:
# Track the operator and the projections of V1 and V2 while adding V1 once and V2 twice.
# NOTE(review): `ortho`, V1 and V2 are taken from the kernel state.
print(V1,V1.t())
print(ortho.O)
print(ortho.project(V1))
print(ortho.project(V2))
ortho.addBasis(V1)
print(ortho.O)
print(ortho.project(V1))
print(ortho.project(V2))
ortho.addBasis(V2)
print(ortho.O)
print(ortho.project(V1))
print(ortho.project(V2))
# Adding V2 a second time.
ortho.addBasis(V2)
print(ortho.O)
print(ortho.project(V1))
print(ortho.project(V2))

In [None]:
# Fresh Orthogonalizer, this time with V1 as an explicit 3x1 column vector
# (a 1x3 literal, transposed). V1 is added four times; the operator, projection
# and novelty are printed after each addition.
ortho = Orthogonalizer(3)
V1 = torch.tensor([[0.5,0.25,0.25]], requires_grad=False, dtype=dtype, device=device).t()
print("V1.p",ortho.project(V1))
print("V1.n",ortho.novelty(V1))
print(ortho.O)
ortho.addBasis(V1)
print(ortho.O)
print("V1.p",ortho.project(V1))
print("V1.n",ortho.novelty(V1))
ortho.addBasis(V1)
print(ortho.O)
print("V1.p",ortho.project(V1))
print("V1.n",ortho.novelty(V1))
ortho.addBasis(V1)
print(ortho.O)
print("V1.p",ortho.project(V1))
print("V1.n",ortho.novelty(V1))
ortho.addBasis(V1)
print(ortho.O)
print("V1.p",ortho.project(V1))
print("V1.n",ortho.novelty(V1))

In [None]:
# Add a second column vector V2 four times and print its projection after each
# addition, then the projection of V1.
# NOTE(review): `ortho` and V1 are taken from the kernel state. The class
# definitions further down in this same cell come after this code.
V2 = torch.tensor([[1,3,4]], requires_grad=False, dtype=dtype, device=device).t()
print("x",ortho.novelty(V2))
ortho.addBasis(V2)
print(ortho.project(V2))
ortho.addBasis(V2)
print(ortho.project(V2))
ortho.addBasis(V2)
print(ortho.project(V2))
ortho.addBasis(V2)
print(ortho.project(V2))
print(ortho.project(V1))

##### gram-schmidt orthogonalization. Add a new basis one column at a time.
class Orthogonalizer:
    """Incremental Gram-Schmidt style operator.

    ``O`` starts as the identity. For each column added, the normalised outer
    product of that column's current novelty is subtracted from ``O``. ``O2``
    is a second identity matrix that this class never modifies.
    """

    def __init__(self,size,thresh=1e-10):
        self.threshold = thresh
        self.O = torch.eye(size, requires_grad=False, dtype=torch.float)
        self.O2 = torch.eye(size, requires_grad=False, dtype=torch.float)

    def novelty(self,X):
        """Return ``O @ X`` for a 2-D ``X`` (vectors stored as columns)."""
        return torch.mm(self.O,X)

    def project(self,X):
        """Return ``X - O @ X``."""
        return X - self.novelty(X)

    def addBasis(self,X):
        """Add every column of ``X`` to the basis, one at a time, updating ``O`` in place.

        A column is skipped unless the largest absolute entry of its novelty
        exceeds ``threshold``.
        """
        for column in X.t():
            # Turn the 1-D column back into an (n, 1) matrix for torch.mm.
            residual = self.novelty(column.unsqueeze(0).t())
            if not (torch.abs(residual).max() > self.threshold):
                continue
            outer = torch.mm(residual,residual.t())
            self.O -= torch.div(outer,torch.norm(residual).pow(2))
    
#gram-schmidt orthogonalization. Add a new basis one column at a time.
class Orthogonalizer2:
    """Experimental variant of ``Orthogonalizer`` with different update/projection formulas.

    NOTE(review): ``addBasis`` uses the 1x1 inner product ``a^T a`` (broadcast
    over all of ``O``) where ``Orthogonalizer`` uses the outer product
    ``a a^T``, and ``project`` divides element-wise by ``O`` -- confirm these
    are intended.
    """

    def __init__(self,size,thresh=1e-10):
        self.threshold = thresh
        self.O = torch.eye(size, requires_grad=False, dtype=torch.float)
        self.O2 = torch.eye(size, requires_grad=False, dtype=torch.float)

    def novelty(self,X):
        """Return ``O @ X`` for a 2-D ``X``."""
        return torch.mm(self.O,X)

    def project(self,X):
        """Return ``O @ X`` divided element-wise (with broadcasting) by ``O``."""
        response = torch.mm(self.O,X)
        return torch.div(response,self.O)

    def addBasis(self,X):
        """Process every column of ``X`` in turn, updating ``O`` in place.

        For each column whose novelty has a largest absolute entry above
        ``threshold``, the 1x1 matrix ``a^T a / |a|^2`` is subtracted from
        every entry of ``O``.
        """
        for column in X.t():
            residual = self.novelty(column.unsqueeze(0).t())
            if not (torch.abs(residual).max() > self.threshold):
                continue
            inner = torch.mm(residual.t(),residual)
            self.O -= torch.div(inner,torch.norm(residual).pow(2))
    
class Forgetful_Orthogonalizer(Orthogonalizer):
    """``Orthogonalizer`` that squashes its deviation from ``O2`` after every update."""

    def addBasis(self,X):
        """Add the columns of ``X``, then shrink the accumulated orthogonalization.

        After the regular update, ``O`` is replaced by ``tanh(O - O2) + O2``, so
        the difference between ``O`` and ``O2`` is passed through ``tanh``.
        """
        super().addBasis(X)
        deviation = self.O - self.O2
        self.O = torch.tanh(deviation) + self.O2
    
def compare(V,ortho,fortho):
    """Add ``V`` to both filters and print their novelty and projection side by side."""
    for basis_filter in (ortho, fortho):
        basis_filter.addBasis(V)
    template = 'Value {}\tNovelty: {}\tF Novelty: {}\tProjection: {}\tF Projection {}'
    novelty, f_novelty = ortho.novelty(V), fortho.novelty(V)
    projection, f_projection = ortho.project(V), fortho.project(V)
    print(template.format(V, novelty, f_novelty, projection, f_projection))
def compare2(V,ortho):
    """Add ``V`` to ``ortho`` and print its operator next to NumPy's QR factor of ``V``.

    Prints ``ortho.O``, the ``Q`` factor from ``np.linalg.qr(V)``, ``V`` itself
    and the filter's novelty and projection of ``V``. Returns ``None``.
    """
    ortho.addBasis(V)
    Q,_ = np.linalg.qr(V)
    # The format string has exactly five fields. A surplus sixth argument that
    # read an undeclared global `fortho` was dropped; it never reached the output
    # and made the function fail whenever that global did not exist.
    print('\tO {}\t Q P{}\tValue {}\tNovelty: {}\tProjection: {}'.format(
        ortho.O,Q,V, ortho.novelty(V),ortho.project(V)))

def make_householder(a):
    """Build the Householder matrix ``H = I - 2 v v^T / (v^T v)`` for the 1-D vector ``a``.

    ``v`` is ``a`` scaled by ``1 / (a[0] + sign(a[0]) * |a|)`` with its first
    entry then set to 1. ``a`` itself is not modified.
    """
    pivot = a[0] + np.copysign(np.linalg.norm(a), a[0])
    v = a / pivot
    v[0] = 1
    size = a.shape[0]
    reflector = np.eye(size)
    scale = 2 / np.dot(v, v)
    reflector -= scale * np.dot(v[:, None], v[None, :])
    return reflector
def householder_v(a):
    """Return the Householder vector ``v`` and scalar ``tau`` for ``a``.

    Use this version of householder to reproduce the output of np.linalg.qr
    exactly (specifically, to match the sign convention it uses).

    ``v`` is ``a`` scaled by ``1 / (a[0] + sign(a[0]) * |a|)`` with its first
    entry set to 1, and ``tau = 2 / (v^T v)``.

    based on https://rosettacode.org/wiki/QR_decomposition#Python
    """
    pivot = a[0] + np.copysign(np.linalg.norm(a), a[0])
    v = a / pivot
    v[0] = 1
    squared_length = v.T @ v
    tau = 2 / squared_length
    return v,tau
    
# Smoke test of the classes defined above: add the same column vector four
# times and print its projection after each addition.
ortho = Orthogonalizer(3)
# `fortho` is created here but not used again within this cell.
fortho = Forgetful_Orthogonalizer(3)
V1 = torch.tensor([[0.5,0.25,0.25]], requires_grad=False, dtype=torch.float).t()
print("x")
print(ortho.project(V1))
print(ortho.novelty(V1))
ortho.addBasis(V1)
print(ortho.project(V1))
ortho.addBasis(V1)
print(ortho.project(V1))
ortho.addBasis(V1)
print(ortho.project(V1))
ortho.addBasis(V1)
print(ortho.project(V1))

In [None]:
# Scratch comparison of projections built from NumPy's QR factor Q (as Q Q^T)
# with the Householder vector returned by householder_v.
V1 = torch.tensor([[0.5,0.25,0.25]], requires_grad=False, dtype=torch.float).t()
V2 = torch.tensor([[2,3,4]], requires_grad=False, dtype=torch.float).t()
Q,r = np.linalg.qr(V1)
print("Qr",np.linalg.qr(V1))
print("householder",householder_v(V1.numpy()))
# print("householder2",make_householder(V1.numpy()))
h,r2 = householder_v(V1.numpy())
Q = torch.tensor(Q)
# NOTE(review): the "Qt.Q" label on the next line prints Q @ Q.t(), not Q.t() @ Q.
print("Qt.Q",Q @ Q.t())
print("Q.Qt",torch.mm(Q,Q.t()))
print("Qt.Q",torch.mm(Q.t(),Q))
print(Q @ Q.t() @ V1)#Projection from Q
print(h @ h.T @ V1.numpy())#Projection from h
print(V1 - torch.mm(torch.mm(Q,Q.t()),V1))#Novelty from Q
print("")


# Same quantities for V2.
Q,r = np.linalg.qr(V2)
Q = torch.tensor(Q)
print("Q",Q)
print("Q.Qt",torch.mm(Q,Q.t()))
print("Qt.Q",torch.mm(Q.t(),Q))
print(torch.mm(torch.mm(Q,Q.t()),V2))#Projection from Q
print(V2 - torch.mm(torch.mm(Q,Q.t()),V2))#Novelty from Q
print("")


# V1 projected with the Q obtained from V2.
print(torch.mm(torch.mm(Q,Q.t()),V1))#Projection from Q
print(V1 - torch.mm(torch.mm(Q,Q.t()),V1))#Novelty from Q
print("")

In [None]:
B: tensor([[3., 1., 3., 3., 3., 3.],
        [3., 3., 6., 4., 4., 4.],
        [4., 3., 5., 5., 5., 5.]]), projection: tensor([[5.9071, 2.0330, 7.4282, 5.7807, 5.7807, 5.7807],
        [4.1150, 2.9704, 7.2674, 4.8306, 4.8306, 4.8306],
        [1.4824, 1.9224, 1.4342, 2.4399, 2.4399, 2.4399]])

def gram_schmidt(A):
    """Orthonormalize, in place, the vectors stored as the columns of matrix A.

    Columns are processed left to right: each one has its components along the
    already-processed columns removed and is then scaled to unit length.
    ``A`` is modified and also returned.
    """
    n_vectors = A.shape[1]
    for j in range(n_vectors):
        current = A[:, j]  # view into A, so in-place updates write through
        # Remove from column j its projection onto each previous column in turn.
        for k in range(j):
            earlier = A[:, k]
            current -= np.dot(earlier, current) * earlier
        A[:, j] = current / np.linalg.norm(current)
    return A
# Example float matrix for gram_schmidt (the call itself is left disabled).
A = np.array([[1.0, 1.0, 0.0], [1.0, 3.0, 1.0], [2.0, -1.0, 1.0]])
# print(gram_schmidt(A))

def gram_schmidt_columns(X):
    """Return the ``(Q, R)`` factors of ``X`` as computed by ``np.linalg.qr``."""
    factors = np.linalg.qr(X)
    return factors[0], factors[1]
# Compare the QR-based projection of V4 with the Orthogonalizer's novelty / projection.
# NOTE(review): V4 is not defined in the visible part of this notebook; the
# .numpy() calls suggest it is a torch tensor -- confirm.
Q,R = gram_schmidt_columns(V4)
print("zz",V4.numpy().dot(R.T))
print(V4)
print('sss',gram_schmidt_columns(V4))
z = Orthogonalizer(3)
z.addBasis(V4)
print(Q.dot(Q.T).dot(V4))
print(V4.numpy()-Q.dot(Q.T).dot(V4))
print(z.novelty(V4))
print(z.project(V4))

In [4]:
# Tabulate 1 / exp(|x|) for integer x in [-6, 5]: the shape of the certainty
# factor used in do_pre_update (there with an extra alpha in the numerator).
eps = 1e-12  # not used in this cell
#            certainty_factor = self.alpha/torch.exp(abs(self.loss))
for i in range(-6,6):
    X = torch.tensor([i], requires_grad=False, dtype=dtype, device=device)
    print(i,1./torch.exp(abs(X)))

-6 tensor([0.0025])
-5 tensor([0.0067])
-4 tensor([0.0183])
-3 tensor([0.0498])
-2 tensor([0.1353])
-1 tensor([0.3679])
0 tensor([1.])
1 tensor([0.3679])
2 tensor([0.1353])
3 tensor([0.0498])
4 tensor([0.0183])
5 tensor([0.0067])


In [13]:
# Same expression evaluated for a value very close to zero (1e-13).
1/torch.exp(abs(torch.tensor([1e-13], requires_grad=False, dtype=dtype, device=device)))

tensor([1.])

In [89]:

# Probe: feed the first-layer *gradient* (not the weights) to the layer's filter
# twice with cf=1, printing the norm of its novelty and of its projection
# before and after each addition.
xor = Stable_Xor(1.)
print(xor.l1.weight)
xor(data[0][0:3])
xor.get_loss(data[0][3:4]).backward()
# Norm of the raw flattened gradient, then of its novelty and projection under
# the freshly created filter.
print(xor.l1.weight.grad.view(xor.l1_length).norm())
print(xor.l1_filter.novelty(xor.l1.weight.grad.view(xor.l1_length)).norm())
print(xor.l1_filter.project(xor.l1.weight.grad.view(xor.l1_length)).norm())
# print(xor.l1_filter.O)
xor.l1_filter.addBasis(xor.l1.weight.grad.view(xor.l1_length),1.)
# print(xor.l1_filter.O)
print(xor.l1_filter.novelty(xor.l1.weight.grad.view(xor.l1_length)).norm())
print(xor.l1_filter.project(xor.l1.weight.grad.view(xor.l1_length)).norm())
# Adding the same gradient vector a second time.
xor.l1_filter.addBasis(xor.l1.weight.grad.view(xor.l1_length),1.)
# print(xor.l1_filter.O)
print(xor.l1_filter.novelty(xor.l1.weight.grad.view(xor.l1_length)).norm())
print(xor.l1_filter.project(xor.l1.weight.grad.view(xor.l1_length)).norm())

Parameter containing:
tensor([[ 7.4518e-01, -2.9972e-01,  1.5403e-01],
        [ 7.7383e-01,  1.5247e-01, -2.9849e-01],
        [ 1.3003e-01, -1.8735e-01,  7.7313e-01],
        [ 1.0315e+00, -6.5747e-01, -7.7077e-01],
        [ 4.6050e-01, -4.6900e-01,  8.4203e-01],
        [ 1.7573e+00,  1.0854e+00, -4.3304e-01],
        [-2.3718e-01,  7.7311e-01, -4.1140e-01],
        [-6.9385e-01,  8.8059e-01,  9.6271e-01],
        [-2.3377e-01,  5.3642e-01, -3.8494e-01],
        [ 9.3499e-01,  5.7672e-01,  8.4462e-01],
        [-8.7055e-01, -7.5306e-01,  4.6861e-02],
        [ 3.2025e-02, -2.1910e-01, -4.0968e-01],
        [ 8.6351e-01,  3.4322e-01,  1.5330e+00],
        [ 7.5326e-02,  3.9386e-01,  5.3867e-02],
        [-2.4905e-01, -7.0694e-01,  8.3978e-01],
        [ 1.1292e+00,  1.7568e-01,  1.1436e+00],
        [-1.7406e-01, -1.2942e-01,  1.3245e+00],
        [-2.0527e-01, -3.2130e-02,  5.3246e-01],
        [ 1.3162e-01, -9.5116e-01,  3.9188e-01],
        [ 3.3760e-01, -1.4998e-01,  7.7361e-01]