In [1]:
class Session:
    """Registry of every tensor created for the current computation graph.

    Tensors register themselves through ``Session.get().add_tensor(self)``
    in their constructors. The session can then clear their cached values,
    collect gradients for every ``Variable`` and apply a gradient-descent
    update to them.
    """

    # Shared instance, created lazily by ``get``.
    session = None

    def __init__(self):
        self.all_tensors = []

    @classmethod
    def get(cls):
        """Return the shared session, creating it on first use."""
        if cls.session is None:
            cls.session = Session()
        return cls.session

    def add_tensor(self, tensor):
        """Register ``tensor`` with this session."""
        self.all_tensors.append(tensor)

    def reset(self):
        """Forget every registered tensor by starting a fresh list."""
        self.all_tensors = []

    def clear(self):
        """Call ``clear()`` on every registered tensor, in registration order."""
        for registered in self.all_tensors:
            registered.clear()

    def gradients(self):
        """Map each registered ``Variable`` to the result of its ``CE_gradient()``."""
        return {
            candidate: candidate.CE_gradient()
            for candidate in self.all_tensors
            if isinstance(candidate, Variable)
        }

    def learn(self, learning_rate):
        """Move every variable against its gradient, scaled by ``learning_rate``.

        All gradients are collected before the first variable is modified.
        """
        for variable, gradient in self.gradients().items():
            updated = variable.value() - gradient * learning_rate
            variable.set_value(updated)

In [2]:
import numpy as np

class SigmoidTensor:
    """Graph node computing the element-wise logistic sigmoid of one input.

    The forward value and the gradient are cached until ``clear()`` is called.
    """

    def __init__(self, input_tensor):
        self.input_tensor = input_tensor
        self.subsequent_tensors = []
        # Let the producer know who consumes it so gradients can flow back.
        input_tensor.subsequent_tensors.append(self)
        self._value = None
        self._CE_gradient = None
        Session.get().add_tensor(self)

    def value(self):
        """Return ``1 / (1 + exp(-input))``, computing it on first use."""
        if self._value is None:
            self._value = 1 / (1 + np.exp(-self.input_tensor.value()))
        return self._value

    def CE_gradient(self, input_tensor):
        """Return the gradient of the loss with respect to ``input_tensor``.

        Args:
            input_tensor: must be the very tensor this node was built from.

        Returns:
            ``s * (1 - s) * upstream`` where ``s`` is this node's value and
            ``upstream`` is the gradient reported by the single consumer.

        Raises:
            ValueError: if ``input_tensor`` is not this node's input, or if
                the node does not have exactly one consumer.
        """
        # Identity comparison: tensors are graph nodes, not values.
        if input_tensor is not self.input_tensor:
            raise ValueError('Unknown input tensor')
        if len(self.subsequent_tensors) != 1:
            raise ValueError('Incorrect number of subsequent tensors')
        if self._CE_gradient is None:
            activation = self.value()
            upstream = self.subsequent_tensors[0].CE_gradient(self)
            self._CE_gradient = activation * (1 - activation) * upstream
        return self._CE_gradient

    def clear(self):
        """Drop the cached forward value and gradient."""
        self._value = None
        self._CE_gradient = None

In [3]:
import pdb

class AddTensor:
    """Graph node computing the sum of two input tensors.

    The forward value and the gradient are cached until ``clear()`` is called.
    """

    def __init__(self, input_tensor1, input_tensor2):
        self.input_tensor1 = input_tensor1
        self.input_tensor2 = input_tensor2
        self.subsequent_tensors = []
        # Register as a consumer of both operands so gradients can flow back.
        input_tensor1.subsequent_tensors.append(self)
        input_tensor2.subsequent_tensors.append(self)
        self._value = None
        self._CE_gradient = None
        Session.get().add_tensor(self)

    def value(self):
        """Return ``input_tensor1 + input_tensor2``, computing it on first use."""
        if self._value is None:
            self._value = self.input_tensor1.value() + self.input_tensor2.value()
        return self._value

    def CE_gradient(self, input_tensor):
        """Return the gradient of the loss with respect to ``input_tensor``.

        The consumer's gradient is passed through unchanged for either
        operand; no reduction is applied, so this is only shape-correct when
        both operands have the same shape as the sum.

        Raises:
            ValueError: if ``input_tensor`` is neither operand, or if the
                node does not have exactly one consumer.
        """
        is_operand = (input_tensor is self.input_tensor1
                      or input_tensor is self.input_tensor2)
        if not is_operand:
            raise ValueError('Unknown input tensor')
        if len(self.subsequent_tensors) != 1:
            raise ValueError('Incorrect number of subsequent tensors')
        if self._CE_gradient is None:
            self._CE_gradient = self.subsequent_tensors[0].CE_gradient(self)
        return self._CE_gradient

    def clear(self):
        """Drop the cached forward value and gradient."""
        self._value = None
        self._CE_gradient = None

In [4]:
class Placeholder:
    """Graph input whose value is supplied from outside via ``set_value``."""

    def __init__(self):
        self._value = None
        self.subsequent_tensors = []
        Session.get().add_tensor(self)

    def value(self):
        """Return whatever was last passed to ``set_value`` (``None`` initially)."""
        return self._value

    def set_value(self, value):
        """Store ``value`` as this placeholder's current value."""
        self._value = value

    def clear(self):
        """Do nothing: the fed value is kept when the session is cleared."""
        return None

In [5]:
# Small demo graph: result = sigmoid(p1 + p2).
p1, p2 = Placeholder(), Placeholder()
result = SigmoidTensor(
    AddTensor(p1, p2)
)

In [6]:
# Feed concrete inputs into the two placeholders of the demo graph.
p1.set_value(np.array([0,-5,5]))
p2.set_value(np.array([0,5,8]))

In [7]:
# Forward pass: evaluates sigmoid(p1 + p2) element-wise for the values fed to p1 and p2.
result.value()

array([0.5       , 0.5       , 0.99999774])

In [8]:
class MatrixMultiplyTensor:
    """Graph node computing ``np.matmul(input_tensor1, input_tensor2)``.

    The forward value and one gradient per operand are cached until
    ``clear()`` is called.
    """

    def __init__(self, input_tensor1, input_tensor2):
        self.input_tensor1 = input_tensor1
        self.input_tensor2 = input_tensor2
        self.subsequent_tensors = []
        # Register as a consumer of both operands so gradients can flow back.
        input_tensor1.subsequent_tensors.append(self)
        input_tensor2.subsequent_tensors.append(self)
        self._value = None
        self._CE_gradient1 = None
        self._CE_gradient2 = None
        Session.get().add_tensor(self)

    def value(self):
        """Return the matrix product of the two operands, computing it on first use."""
        if self._value is None:
            self._value = np.matmul(self.input_tensor1.value(), self.input_tensor2.value())
        return self._value

    def _upstream_gradient(self):
        """Return the gradient reported by the single consumer of this node.

        Raises:
            ValueError: if the node does not have exactly one consumer.
        """
        if len(self.subsequent_tensors) != 1:
            raise ValueError('Incorrect number of subsequent tensors')
        return self.subsequent_tensors[0].CE_gradient(self)

    def CE_gradient1(self):
        """Return the gradient with respect to ``input_tensor1``.

        Computed as ``np.dot(input_tensor2, upstream)``.
        NOTE(review): this form assumes input_tensor1 is a 1-D vector and
        input_tensor2 a 2-D matrix - confirm before using other shapes.
        """
        if self._CE_gradient1 is None:
            upstream = self._upstream_gradient()
            self._CE_gradient1 = np.dot(self.input_tensor2.value(), upstream)
        return self._CE_gradient1

    def CE_gradient2(self):
        """Return the gradient with respect to ``input_tensor2``.

        Computed as ``np.outer(input_tensor1, upstream)``.
        NOTE(review): same 1-D vector times 2-D matrix assumption as
        ``CE_gradient1`` - confirm before using other shapes.
        """
        if self._CE_gradient2 is None:
            upstream = self._upstream_gradient()
            self._CE_gradient2 = np.outer(self.input_tensor1.value(), upstream)
        return self._CE_gradient2

    def CE_gradient(self, input_tensor):
        """Return the gradient of the loss with respect to ``input_tensor``.

        Raises:
            ValueError: if ``input_tensor`` is neither operand, or if the
                node does not have exactly one consumer.
        """
        if input_tensor is self.input_tensor1:
            return self.CE_gradient1()
        if input_tensor is self.input_tensor2:
            return self.CE_gradient2()
        raise ValueError('Unknown input tensor')

    def clear(self):
        """Drop the cached forward value and both cached gradients."""
        self._value = None
        self._CE_gradient1 = None
        self._CE_gradient2 = None

In [9]:
class Variable:
    """Trainable graph input holding a value that ``Session.learn`` updates."""

    def __init__(self, initial_value):
        self.subsequent_tensors = []
        self._value = initial_value
        Session.get().add_tensor(self)

    def set_value(self, value):
        """Replace the stored value."""
        self._value = value

    def value(self):
        """Return the stored value."""
        return self._value

    def CE_gradient(self):
        """Return the gradient of the loss with respect to this variable.

        The gradient is obtained from the single node that consumes the
        variable.

        Raises:
            ValueError: if the variable does not have exactly one consumer.
        """
        if len(self.subsequent_tensors) != 1:
            raise ValueError('Incorrect number of subsequent tensors')
        return self.subsequent_tensors[0].CE_gradient(self)

    def clear(self):
        """Do nothing: a variable's value persists across session clears."""
        pass

In [22]:
def log_softmax(logits):
    """Return the log of the softmax of ``logits``, normalised over all elements.

    The maximum logit is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing
    when the logits are large.

    Args:
        logits: array-like of scores.

    Returns:
        Array of the same shape holding ``logits - log(sum(exp(logits)))``.
    """
    logits = np.asarray(logits)
    if logits.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return logits.astype(float)
    shifted = logits - np.max(logits)
    return shifted - np.log(np.sum(np.exp(shifted)))


def softmax(logits):
    """Return the softmax of ``logits``, normalised over all elements.

    Built on ``log_softmax`` so it shares its overflow protection.
    """
    return np.exp(log_softmax(logits))

class SoftmaxCELoss:
    """Graph node computing softmax cross-entropy between logits and a target.

    ``input_tensor`` supplies the logits and ``y`` is a tensor whose value is
    the target distribution. The loss value and the gradient are cached until
    ``clear()`` is called.
    """

    def __init__(self, input_tensor, y):
        self.input_tensor = input_tensor
        self.y = y
        # Present on every node so that graph code can treat nodes uniformly.
        self.subsequent_tensors = []
        self._value = None
        self._CE_gradient = None
        input_tensor.subsequent_tensors.append(self)
        Session.get().add_tensor(self)

    def value(self):
        """Return ``-dot(log_softmax(logits), y)``, computing it on first use."""
        if self._value is None:
            self._value = -np.dot(log_softmax(self.input_tensor.value()), self.y.value())
        return self._value

    def CE_gradient(self, input_tensor):
        """Return the gradient of the loss with respect to the logits.

        Args:
            input_tensor: must be the very tensor this loss was built from.

        Returns:
            ``softmax(logits) - y``.

        Raises:
            ValueError: if ``input_tensor`` is not this node's input.
        """
        # Identity comparison: tensors are graph nodes, not values.
        if input_tensor is not self.input_tensor:
            raise ValueError('unknown input tensor')
        if self._CE_gradient is None:
            self._CE_gradient = softmax(self.input_tensor.value()) - self.y.value()
        return self._CE_gradient

    def clear(self):
        """Drop the cached loss value and gradient."""
        self._value = None
        self._CE_gradient = None

In [23]:
from keras.datasets import mnist
from keras.utils import to_categorical

(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Flatten every 28x28 image into one row of 784 features and divide by 255
# (presumably mapping 8-bit pixel intensities into [0, 1]).
x_train = x_train.reshape(-1, 28 * 28) / 255
x_test = x_test.reshape(-1, 28 * 28) / 255

# Convert the class labels with keras' to_categorical.
y_train, y_test = to_categorical(y_train), to_categorical(y_test)


In [24]:
# Sanity check: after flattening, each row holds one image as 28*28 = 784 features.
x_train.shape

(60000, 784)

In [43]:
# Start from an empty tensor registry so re-running this cell does not keep
# tensors from an earlier graph registered in the session.
Session.get().reset()
# Weights: 784x10 normal samples scaled down by sqrt(784); bias: 10 zeros.
# NOTE(review): no random seed is set in this cell, so the initial weights
# differ between runs unless numpy is seeded elsewhere.
W1 = Variable(np.random.normal(size=(784, 10)) / np.sqrt(784))
b1 = Variable(np.zeros((10,)))
# x and y are fed one sample at a time by train() and get_accuracy().
x = Placeholder()
y = Placeholder()
# Single linear layer: z1b = x . W1 + b1 are the logits fed to the loss.
z1a = MatrixMultiplyTensor(x, W1)
z1b = AddTensor(z1a, b1)
ce = SoftmaxCELoss(z1b, y)

In [74]:
def train(learning_rate):
    """Run one pass over the training set, updating the variables per sample.

    Each sample is fed into ``x`` and ``y``, then the session applies one
    gradient step with step size ``learning_rate / number_of_samples``.

    Args:
        learning_rate: step size before division by the dataset size.

    Returns:
        The mean of ``ce.value()`` over all training samples. The loss is
        read after the update but before the caches are cleared, so it
        appears to reflect the forward values computed before the update.
    """
    session = Session.get()
    sample_count = x_train.shape[0]
    step_size = learning_rate / sample_count
    total_loss = 0
    for features, target in zip(x_train, y_train):
        session.clear()
        x.set_value(features)
        y.set_value(target)
        session.learn(step_size)
        total_loss += ce.value()
    return total_loss / sample_count

In [75]:
def get_accuracy():
    """Return the fraction of training samples whose prediction is correct.

    A sample counts as correct when the index of the largest logit in
    ``z1b`` equals the index of the largest entry of its target vector.
    """
    session = Session.get()
    hits = 0
    for features, target in zip(x_train, y_train):
        session.clear()
        x.set_value(features)
        y.set_value(target)
        predicted_class = np.argmax(z1b.value())
        if predicted_class == np.argmax(target):
            hits += 1
    return hits / x_train.shape[0]

In [77]:
# Train for up to 100 passes over the training set; after each pass print the
# mean cross-entropy loss followed by the training-set accuracy.
for _ in range(100):
    print(train(0.1))
    print(get_accuracy())

1.0884800636575505
0.7953
1.0699529404109978
0.7975666666666666
1.0523691733731426
0.8001333333333334
1.0356604399953195
0.8023333333333333
1.019764495167759
0.8045833333333333
1.0046245596875538
0.8069166666666666
0.9901887735316132
0.8091333333333334
0.9764097074415681
0.8109
0.9632439267548177
0.81245
0.9506516019003959
0.8142833333333334
0.9385961604808717
0.8157
0.9270439763558406
0.8173166666666667
0.9159640916151639
0.8184333333333333
0.9053279677693089
0.8198333333333333
0.8951092628890619
0.8208666666666666
0.8852836317935575
0.8220333333333333
0.875828546715781
0.8232166666666667
0.8667231361704577
0.8245333333333333
0.8579480400120191
0.8256666666666667
0.849485278903912
0.8269333333333333
0.8413181366266131
0.8282333333333334
0.8334310538342946
0.8294166666666667
0.8258095320307643
0.8303
0.8184400466764397
0.8312
0.8113099684634868
0.8320666666666666
0.8044074919053517
0.83295
0.7977215704838732
0.8337666666666667
0.7912418576821887
0.8342833333333334
0.7849586533063324
0.

KeyboardInterrupt: 