In [45]:
class Tensor(object):

    
    def __init__(self, data,
                 autograd=False,
                 creators=None,
                 creation_op=None,
                 id=None):
        
        self.data = np.array(data)
        self.creators = creators
        self.creation_op = creation_op
        self.grad = None
        self.autograd = autograd
        self.children = {}
        
        if id is None:
            id = np.random.randint(0, 100000)
        self.id = id
        
        if creators is not None:
            for c in creators:
                if self.id not in c.children:
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1
                    
                    
    def all_children_grads_accounted_for(self):
        for id, cnt in self.children.items():
            if cnt != 0:
                return False
        return True
        
        
    def backward(self, grad=None, grad_origin=None):
        if self.autograd:
            if grad is None:
                grad = Tensor(np.ones_like(self.data))
            
            if grad_origin is not None:
                if self.children[grad_origin.id] == 0:
                    raise Exception('cannot backprop more than once')
                else:
                    self.children[grad_origin.id] -= 1
            
            if self.grad is None:
                self.grad = grad
            else:
                self.grad += grad
            
            if self.creators is not None and (self.all_children_grads_accounted_for() or grad_origin is None):
                if self.creation_op == 'add':
                    self.creators[0].backward(self.grad, self)
                    self.creators[1].backward(self.grad, self)
                
                if self.creation_op == 'neg':
                    self.creators[0].backward(self.grad.__neg__())
                    
                if self.creation_op == 'sub':
                    new = Tensor(self.grad.data)
                    self.creators[0].backward(new, self)
                    new = Tensor(self.grad.__neg__().data)
                    self.creators[1].backward(new, self)
                
                if self.creation_op == 'mul':
                    new = self.grad * self.creators[1]
                    self.creators[0].backward(new, self)
                    new = self.grad * self.creators[0]
                    self.creators[1].backward(new, self)
                    
                if self.creation_op == 'mm':
                    act = self.creators[0]     # usually an activation
                    weights = self.creators[1] # usually a weight matrix
                    new = self.grad.mm(weights.transpose())
                    act.backward(new)
                    new = self.grad.transpose().mm(act).transpose()
                    weights.backward(new)

                if self.creation_op == 'transpose':
                    self.creators[0].backward(self.grad.transpose())
                    
                if 'sum' in self.creation_op:
                    dim = int(self.creation_op.split('_')[1])
                    ds = self.creators[0].data.shape[dim]
                    self.creators[0].backward(self.grad.expand(dim,ds))
                    
                if 'expand' in self.creation_op:
                    dim = int(self.creation_op.split('_')[1])
                    self.creators[0].backward(self.grad.sum(dim))
                    
                if self.creation_op == 'sigmoid':
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (self * (ones - self)))
                    
                if self.creation_op == 'tanh':
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (ones - (self * self)))
    
                if self.creation_op == 'index_select':
                    new_grad = np.zeros_like(self.creators[0].data)
                    indices_ = self.index_select_indices.data.flatten()
                    grad_ = grad.data.reshape(len(indices_), -1)
                    for i in range(len(indices_)):
                        new_grad[indices_[i]] += grad_[i]
                    self.creators[0].backward(Tensor(new_grad))

                if self.creation_op == 'cross_entropy':
                    dx = self.softmax_output - self.target_dist
                    self.creators[0].backward(Tensor(dx))


    def __add__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data + other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op='add')
        return Tensor(self.data + other.data)

    
    def __neg__(self):
        if self.autograd:
            return Tensor(self.data * -1,
                          autograd=True,
                          creators=[self],
                          creation_op='neg')
        return Tensor(self.data * -1)
    
   
    def __sub__(self, other):
        if self.autograd and other.autograd: 
            return Tensor(self.data - other.data, 
                          autograd=True,
                          creators=[self,other],
                          creation_op="sub")
        return Tensor(self.data - other.data)

    
    def __mul__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data * other.data,
                          autograd=True,
                          creators=[self,other],
                          creation_op="mul")
        return Tensor(self.data * other.data)

    
    def sum(self, dim):
        if self.autograd:
            return Tensor(self.data.sum(dim),
                          autograd=True,
                          creators=[self],
                          creation_op="sum_"+str(dim))
        return Tensor(self.data.sum(dim))

    
    def expand(self, dim, copies):
        trans_cmd = list(range(0,len(self.data.shape)))
        trans_cmd.insert(dim,len(self.data.shape))
        new_shape = list(self.data.shape) + [copies]
        new_data = self.data.repeat(copies).reshape(new_shape)
        new_data = new_data.transpose(trans_cmd)
        if self.autograd:
            return Tensor(new_data, autograd=True, creators=[self], creation_op="expand_"+str(dim))
        return Tensor(new_data)

    
    def transpose(self):
        if self.autograd:
            return Tensor(self.data.transpose(),
                          autograd=True,
                          creators=[self],
                          creation_op="transpose")
        return Tensor(self.data.transpose())

    
    def mm(self, x):
        if self.autograd:
            return Tensor(self.data.dot(x.data),
                          autograd=True,
                          creators=[self,x],
                          creation_op="mm")
        return Tensor(self.data.dot(x.data))
    
    
    def sigmoid(self):
        if self.autograd:
            return Tensor(1 / (1 + np.exp(-self.data)),
                          autograd=True,
                          creators=[self],
                          creation_op='sigmoid')
        return Tensor(1 / (1 + np.exp(-self.data)))
    
    
    def tanh(self):
        if self.autograd:
            return Tensor(np.tanh(self.data),
                          autograd=True,
                          creators=[self],
                          creation_op='tanh')
        return Tensor(np.tanh(self.data))
    
    
    def softmax(self):
        e_x = np.exp(self.data - np.max(self.data))
        return e_x / e_x.sum(axis=0)

    
    def index_select(self, indices):
        if self.autograd:
            new = Tensor(self.data[indices.data],
                         autograd=True,
                         creators=[self],
                         creation_op='index_select')
            new.index_select_indices = indices
            return new
        return Tensor(self.data[indices.data])
                                              
                                              
    def cross_entropy(self, target_indices):
        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp,
                                       axis=len(self.data.shape)-1,
                                       keepdims=True)
        t = target_indices.data.flatten()
        p = softmax_output.reshape(len(t),-1)
    
        target_dist = np.eye(p.shape[1])[t]
        loss = -(np.log(p) * (target_dist)).sum(1).mean()
        
        if self.autograd:
            out = Tensor(loss,
                         autograd=True,
                         creators=[self],
                         creation_op='cross_entropy')
            out.softmax_output = softmax_output
            out.target_dist = target_dist
            return out
        return Tensor(loss)                                              
    
    
    def __repr__(self):
        return str(self.data.__repr__())
    
    
    def __str__(self):
        return str(self.data.__str__())


# Stochastic gradient descent

class SGD(object):

    
    def __init__(self, parameters, alpha=0.1):
        self.parameters = parameters
        self.alpha = alpha
        
    
    def zero(self):
        for p in self.parameters:
            p.grad.data *= 0
            
    
    def step(self, zero=True):
        for p in self.parameters:
            p.data -= p.grad.data * self.alpha
            
            if zero:
                p.grad.data *= 0
               

# Layers
            
class Layer(object):
    
    
    def __init__(self):
        self.parameters = list()
        
    
    def get_parameters(self):
        return self.parameters
    

class Linear(Layer):

    
    def __init__(self, n_inputs, n_outputs, bias=True):
        super().__init__()
        W = np.random.randn(n_inputs, n_outputs) * np.sqrt(2.0 / n_inputs)
        self.weight = Tensor(W, autograd=True)
        self.parameters.append(self.weight)
        if bias:
            self.bias = Tensor(np.zeros(n_outputs), autograd=True)
            self.parameters.append(self.bias)
        else:
            self.bias = None
    
    
    def forward(self, input):
        temp = input.mm(self.weight)
        if self.bias:
            temp += self.bias.expand(0, len(input.data))
        return temp
                

class Sequential(Layer):
    
    
    def __init__(self, layers=list()):
        super().__init__()
        self.layers = layers
        
        
    def add(self, layer):
        self.layers.append(layer)
        
        
    def forward(self, input):
        for layer in self.layers:
            input = layer.forward(input)
        return input
    
    
    def get_parameters(self):
        params = list()
        for l in self.layers:
            params += l.get_parameters()
        return params        
        
        
class Embedding(Layer):
    
    
    def __init__(self, vocab_size, dim):
        super().__init__()
        
        self.vocab_size = vocab_size
        self.dim = dim
        
        weight = (np.random.rand(vocab_size, dim) - 0.5) / dim
        self.weight = Tensor(weight, autograd=True)
        
        self.parameters.append(self.weight)
        
    
    def forward(self, input):
        return self.weight.index_select(input)


# Activate functions    
    
class Tanh(Layer):
    
    
    def __init__(self):
        super().__init__()
        
        
    def forward(self, input):
        return input.tanh()
    
    
class Sigmoid(Layer):
    
    
    def __init__(self):
        super().__init__()
        
    
    def forward(self, input):
        return input.sigmoid()
    

class CrossEntropyLoss(object):
    
    
    def __init__(self):
        super().__init__()
    
    
    def forward(self, input, target):
        return input.cross_entropy(target)    
    

# Recurrent cell
    
class RNNCell(Layer):
    
    
    def __init__(self, n_inputs, n_hidden, n_output, activation='sigmoid'):
        super().__init__()
        
        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output
        
        if activation == 'sigmoid':
            self.activation = Sigmoid()
        elif activation == 'tanh':
            self.activation = Tanh()
        else:
            raise Exception('unknown non-linear function')
        
        self.w_ih = Linear(n_inputs, n_hidden)
        self.w_hh = Linear(n_hidden, n_hidden)
        self.w_ho = Linear(n_hidden, n_output)
        
        self.parameters += self.w_ih.get_parameters()
        self.parameters += self.w_hh.get_parameters()
        self.parameters += self.w_ho.get_parameters()
        
        
    def forward(self, input, hidden):
        from_prev_hidden = self.w_hh.forward(hidden)
        combined = self.w_ih.forward(input) + from_prev_hidden
        new_hidden = self.activation.forward(combined)
        output = self.w_ho.forward(new_hidden)
        return output, new_hidden
    
    
    def init_hidden(self, batch_size=1):
        return Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)


# Error
                
class MSELoss(Layer):
    
    
    def __init(self):
        super().__init__()
        
    
    def forward(self, pred, target):
        return ((pred - target)*(pred - target)).sum(0)

# start

In [11]:
import numpy as np
np.random.seed(0)


with open('shakespear.txt') as f:
    raw = f.read()
vocab = list(set(raw))
word2index = {word:i for i,word in enumerate(vocab)}
indices = np.array(list(map(lambda x: word2index[x], raw)))

In [12]:
indices

array([32,  2, 57, ..., 20,  4, 41])

In [13]:
embed = Embedding(vocab_size=len(vocab), dim=512)
model = RNNCell(n_inputs=512, n_hidden=512, n_output=len(vocab))
criterion = CrossEntropyLoss()
optim = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=0.05)

In [26]:
BATCH_SIZE = 32
N_BATCHES = int(indices.shape[0] / BATCH_SIZE)
BPTT = 16
N_BPTT = int((N_BATCHES - 1) / BPTT)

In [28]:
trimmed_indices = indices[:N_BATCHES * BATCH_SIZE]
batched_indices = trimmed_indices.reshape(BATCH_SIZE, N_BATCHES)
batched_indices = batched_indices.transpose()

input_batched_indices = batched_indices[0:-1]
target_batched_indices = batched_indices[1:]

input_batches = input_batched_indices[0:N_BPTT * BPTT]
input_batches = input_batches.reshape(N_BPTT, BPTT, BATCH_SIZE)
target_batches = target_batched_indices[0:N_BPTT * BPTT]
target_batches = target_batches.reshape(N_BPTT, BPTT, BATCH_SIZE)

In [25]:
target_batched_indices.shape

(3123, 32)

In [39]:
import sys


def generate_sample(n=30, init_char=' '):
    s = ''
    hidden = model.init_hidden(batch_size=1)
    input = Tensor(np.array([word2index[init_char]]))
    for i in range(n):
        rnn_input = embed.forward(input)
        output, hidden = model.forward(input=rnn_input, hidden=hidden)
        output.data *= 10 # temperature for sampling, higher=greedier
        temp_dist = output.softmax()
        temp_dist = temp_dist / temp_dist.sum()
        #m = (temp_dist > np.random.rand()).argmax() # sample from pred
        m = output.data.argmax() # take max of predictio
        c = vocab[m]
        input = Tensor(np.array([m]))
        s += c
    return s

def train(iterations=100):
    for iter in range(iterations):
        total_loss = 0
        n_loss = 0
        hidden = model.init_hidden(batch_size=BATCH_SIZE)
        for batch_i in range(len(input_batches)):
            hidden = Tensor(hidden.data, autograd=True)
            loss = None
            losses = list()
            for t in range(BPTT):
                input = Tensor(input_batches[batch_i][t], autograd=True)
                rnn_input = embed.forward(input=input)
                output, hidden = model.forward(input=rnn_input, hidden=hidden)
                target = Tensor(target_batches[batch_i][t], autograd=True)
                batch_loss = criterion.forward(output, target)
                losses.append(batch_loss)
                if t == 0:
                    loss = batch_loss
                else:
                    loss = loss + batch_loss

            for loss in losses:
                ""
                    
            loss.backward()
            optim.step()
            total_loss += loss.data
            log = "\r Iter:" + str(iter)
            log += " - Batch "+str(batch_i+1)+"/"+str(len(input_batches))
            log += " - Loss:" + str(np.exp(total_loss / (batch_i+1)))
            if batch_i == 0:
                log += " - " + generate_sample(70,'\n').replace("\n"," ")
            if batch_i % 10 == 0 or batch_i-1 == len(input_batches):
                sys.stdout.write(log)
        optim.alpha *= 0.99
        print()

In [40]:
train()

  return e_x / e_x.sum(axis=0)


 Iter:0 - Batch 191/195 - Loss:77.654508708888079hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
 Iter:1 - Batch 191/195 - Loss:19.899136063229825 m  se ththt thethththththththththththththththththththththththththththt
 Iter:2 - Batch 191/195 - Loss:15.189637790997436 m  an the the the the the the the the the the the the the the the the 
 Iter:3 - Batch 191/195 - Loss:13.126814790770563il th the the the the the the the the the the the the the the the the 
 Iter:4 - Batch 191/195 - Loss:11.974559459653904dh the the the the the the the the the the the the the the the the the
 Iter:5 - Batch 191/195 - Loss:11.159960257881073h the the the the the the the the the the the the the the the the the
 Iter:6 - Batch 191/195 - Loss:10.539842860435703h the the the the the the the the the the the the the the the the the
 Iter:7 - Batch 191/195 - Loss:10.049136635870463dh the the the the the the the the the the the the the the the the the
 Iter:8 - Batch 191/195 - Loss:9.6339068

In [41]:
print(generate_sample(n=2000, init_char='\n'))

Bure too, in thee
Tur
Tarr agains; and I will line thou and so your juchave abon was gabe in?

LEY RATho I have my lord, and samose his head
Mear are: out the moundere of the moundere the flace.

CAES:
Th, I and I conflich
the nad speak:
He most the flace.

CAES:
Th, I and I conflich
the nad speak:
He most the flace.

CAES:
Th, I and I conflich
the nad speak:
He most the flace.

CAES:
Th, I and I conflich
the nad speak:
He most the flace.

CAES:
Th, I and I conflich
the nad speak:
He most the flace.

CAES:
Th, I and I conflich
the nad speak:
He most the flace.

CAES:
Th, I and I conflich
the nad speak:
He most the flace.

CAES:
Th, I and I conflich
the nad speak:
He most the flace.

CAES:
Th, I and I conflich
the nad speak:
He most the flace.

CAES:
Th, I and I conflich
the nad speak:
He most the flace.

CAES:
Th, I and I conflich
the nad speak:
He most the flace.

CAES:
Th, I and I conflich
the nad speak:
He most the flace.

CAES:
Th, I and I conflich
the nad speak:
He most the flace.

# SECTION 2: gradients

In [42]:
(sigmoid,relu)=(lambda x:1/(1+np.exp(-x)), lambda x:(x>0).astype(float)*x)
weights = np.array([[1,4],[4,1]])
activation = sigmoid(np.array([1,0.01]))
print("Sigmoid Activations")
activations = list()
for iter in range(10):
    activation = sigmoid(activation.dot(weights))
    activations.append(activation)
    print(activation)

print("\nSigmoid Gradients")
gradient = np.ones_like(activation)
for activation in reversed(activations):
    gradient = (activation * (1 - activation) * gradient)
    gradient = gradient.dot(weights.transpose())
    print(gradient)

print("Activations")
activations = list()
for iter in range(10):
    activation = relu(activation.dot(weights))
    activations.append(activation)
    print(activation)

print("\nGradients")
gradient = np.ones_like(activation)
for activation in reversed(activations):
    gradient = ((activation > 0) * gradient).dot(weights.transpose())
    print(gradient)

Sigmoid Activations
[0.93940638 0.96852968]
[0.9919462  0.99121735]
[0.99301385 0.99302901]
[0.9930713  0.99307098]
[0.99307285 0.99307285]
[0.99307291 0.99307291]
[0.99307291 0.99307291]
[0.99307291 0.99307291]
[0.99307291 0.99307291]
[0.99307291 0.99307291]

Sigmoid Gradients
[0.03439552 0.03439552]
[0.00118305 0.00118305]
[4.06916726e-05 4.06916726e-05]
[1.39961115e-06 1.39961115e-06]
[4.81403643e-08 4.81403637e-08]
[1.65582672e-09 1.65582765e-09]
[5.69682675e-11 5.69667160e-11]
[1.97259346e-12 1.97517920e-12]
[8.45387597e-14 8.02306381e-14]
[1.45938177e-14 2.16938983e-14]
Activations
[4.8135251  4.72615519]
[23.71814585 23.98025559]
[119.63916823 118.852839  ]
[595.05052421 597.40951192]
[2984.68857188 2977.61160877]
[14895.13500696 14916.36589628]
[74560.59859209 74496.90592414]
[372548.22228863 372739.30029248]
[1863505.42345854 1862932.18944699]
[9315234.18124649 9316953.88328115]

Gradients
[5. 5.]
[25. 25.]
[125. 125.]
[625. 625.]
[3125. 3125.]
[15625. 15625.]
[78125. 78125.]


# SECTION 3: LSTM

In [60]:
class Tensor (object):
    def __init__(self,data, autograd=False, creators=None, creation_op=None, id=None):
        self.data = np.array(data)
        self.creators = creators
        self.creation_op = creation_op
        self.grad = None
        self.autograd = autograd
        self.children = {}
        self.is_recurrent = False
        if(id is None):
            id = np.random.randint(0,100000)
        self.id = id
        if(creators is not None):
            for c in creators:
                if(self.id not in c.children):
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1
   
    def all_children_grads_accounted_for(self):
        for id,cnt in self.children.items():
            if(cnt != 0):
                return False
        return True
   
    def backward(self,grad=None, grad_origin=None):
        if(self.autograd):
            
            if(grad is None):
                grad = Tensor(np.ones_like(self.data))
            
            if(grad_origin is not None):
                if (self.children[grad_origin.id] == 0):
                    return
                else:
                    self.children[grad_origin.id] -= 1
            if(self.grad is None):
                self.grad = grad
            else:
                self.grad += grad
                
            if(self.creators is not None and (self.all_children_grads_accounted_for() or grad_origin is None)):
                if(self.creation_op == "add"):
                    self.creators[0].backward(self.grad, self)
                    self.creators[1].backward(self.grad, self)
                if(self.creation_op == "neg"):
                    self.creators[0].backward(self.grad.__neg__())
                if(self.creation_op == "sub"):
                    new = Tensor(self.grad.data)
                    self.creators[0].backward(new, self)
                    new = Tensor(self.grad.__neg__().data)
                    self.creators[1].backward(new, self)
                if(self.creation_op == "mul"):
                    new = self.grad * self.creators[1]
                    self.creators[0].backward(new, self)
                    new = self.grad * self.creators[0]
                    self.creators[1].backward(new, self)
                if(self.creation_op == "mm"):
                    act = self.creators[0] # usually an activation
                    weights = self.creators[1] # usually a weight matrix
                    new = self.grad.mm(weights.transpose())
                    act.backward(new)
                    new = self.grad.transpose().mm(act).transpose()
                    weights.backward(new)
                if(self.creation_op == "transpose"):
                    self.creators[0].backward(self.grad.transpose())
                if("sum" in self.creation_op):
                    dim = int(self.creation_op.split("_")[1])
                    ds = self.creators[0].data.shape[dim]
                    self.creators[0].backward(self.grad.expand(dim,ds))
                if("expand" in self.creation_op):
                    dim = int(self.creation_op.split("_")[1])
                    self.creators[0].backward(self.grad.sum(dim))
                if(self.creation_op == "sigmoid"):
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (self * (ones - self)))
                if(self.creation_op == "tanh"):
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (ones - (self * self)))
                if(self.creation_op == "index_select"):
                    new_grad = np.zeros_like(self.creators[0].data)
                    indices_ = self.index_select_indices.data.flatten()
                    grad_ = grad.data.reshape(len(indices_), -1)
                    for i in range(len(indices_)):
                        new_grad[indices_[i]] += grad_[i]
                    self.creators[0].backward(Tensor(new_grad))
                if(self.creation_op == "cross_entropy"):
                    dx = self.softmax_output - self.target_dist
                    self.creators[0].backward(Tensor(dx))
    
    def __add__(self, other):
        if(self.autograd and other.autograd):
            return Tensor(self.data + other.data, autograd=True, creators=[self,other], creation_op="add")
        return Tensor(self.data + other.data)
   
    def __neg__(self):
        if(self.autograd):
            return Tensor(self.data * -1, autograd=True, creators=[self], creation_op="neg")
        return Tensor(self.data * -1)
    
    def __sub__(self, other):
        if(self.autograd and other.autograd): 
            return Tensor(self.data - other.data, autograd=True, creators=[self,other], creation_op="sub")
        return Tensor(self.data - other.data)

    def __mul__(self, other):
        if(self.autograd and other.autograd):
            return Tensor(self.data * other.data, autograd=True, creators=[self,other], creation_op="mul")
        return Tensor(self.data * other.data)
    
    def __truediv__(self, other):
        if(self.autograd and other.autograd):
            return Tensor(self.data / other.data, autograd=True, creators=[self,other], creation_op="div")
        return Tensor(self.data / other.data)

    def sum(self, dim):
        if(self.autograd):
            return Tensor(self.data.sum(dim), autograd=True, creators=[self], creation_op="sum_"+str(dim))
        return Tensor(self.data.sum(dim))

    def expand(self, dim,copies):
        trans_cmd = list(range(0,len(self.data.shape)))
        trans_cmd.insert(dim,len(self.data.shape))
        new_shape = list(self.data.shape) + [copies]
        new_data = self.data.repeat(copies).reshape(new_shape)
        new_data = new_data.transpose(trans_cmd)
        if(self.autograd):
            return Tensor(new_data, autograd=True, creators=[self], creation_op="expand_"+str(dim))
        return Tensor(new_data)
    
    def transpose(self):
        if(self.autograd):
            return Tensor(self.data.transpose(), autograd=True, creators=[self], creation_op="transpose")
        return Tensor(self.data.transpose())

    def mm(self, x):
        if(self.autograd):
            return Tensor(self.data.dot(x.data), autograd=True, creators=[self,x], creation_op="mm")
        return Tensor(self.data.dot(x.data))
    
    def sigmoid(self):
        if(self.autograd):
            return Tensor(1 / (1 + np.exp(-self.data)),
            autograd=True,
            creators=[self],
            creation_op="sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))

    def tanh(self):
        if(self.autograd):
            return Tensor(np.tanh(self.data),
            autograd=True,
            creators=[self],
            creation_op="tanh")
        return Tensor(np.tanh(self.data))
    
    
    def softmax(self):
        e_x = np.exp(self.data - np.max(self.data))
        return e_x / e_x.sum(axis=0)
    
    def index_select(self, indices):
        if(self.autograd):
            new = Tensor(self.data[indices.data],
            autograd=True,
            creators=[self],
            creation_op="index_select")
            new.index_select_indices = indices
            return new
        return Tensor(self.data[indices.data])
    
    def cross_entropy(self, target_indices):
        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp,
        axis=len(self.data.shape)-1,
        keepdims=True)
        t = target_indices.data.flatten()
        p = softmax_output.reshape(len(t),-1)
    
        target_dist = np.eye(p.shape[1])[t]
        loss = -(np.log(p) * (target_dist)).sum(1).mean()
        if(self.autograd):
            out = Tensor(loss,
            autograd=True,
            creators=[self],
            creation_op="cross_entropy")
            out.softmax_output = softmax_output
            out.target_dist = target_dist
            return out
        return Tensor(loss)
        
    def __repr__(self):
        return str(self.data.__repr__())
    def __str__(self):
        return str(self.data.__str__())

class Layer(object):
    def __init__(self):
        self.parameters = list()
    def get_parameters(self):
        return self.parameters

class Linear(Layer):
    def __init__(self, n_inputs, n_outputs, bias=True):
        super().__init__()
        W = np.random.randn(n_inputs, n_outputs)*np.sqrt(2.0/(n_inputs))
        self.weight = Tensor(W, autograd=True)
        self.parameters.append(self.weight)
        if bias:
            self.bias = Tensor(np.zeros(n_outputs), autograd=True)
            self.parameters.append(self.bias)
        else:
            self.bias = None
    
    def forward(self, input):
        temp = input.mm(self.weight)
        if self.bias:
            temp += self.bias.expand(0,len(input.data))
        return temp
    
class Embedding(Layer):
    def __init__(self, vocab_size, dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.dim = dim
        # this initialiation style is just a convention from word2vec
        weight = (np.random.rand(vocab_size, dim) - 0.5) / dim
        self.weight = Tensor(weight, autograd=True)
        self.parameters.append(self.weight)
    def forward(self, input):
         return self.weight.index_select(input)
    
class CrossEntropyLoss(object):
    def __init__(self):
        super().__init__()
    def forward(self, input, target):
        return input.cross_entropy(target)

class RNNCell(Layer):
    def __init__(self, n_inputs,n_hidden,n_output,activation='sigmoid'):
        super().__init__()
        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output
        if(activation == 'sigmoid'):
            self.activation = Sigmoid()
        elif(activation == 'tanh'):
            self.activation == Tanh()
        else:
            raise Exception("Non-linearity not found")
        self.w_ih = Linear(n_inputs, n_hidden)
        self.w_hh = Linear(n_hidden, n_hidden)
        self.w_ho = Linear(n_hidden, n_output)
        self.parameters += self.w_ih.get_parameters()
        self.parameters += self.w_hh.get_parameters()
        self.parameters += self.w_ho.get_parameters()
        
    def forward(self, input, hidden):
        from_prev_hidden = self.w_hh.forward(hidden)
        combined = self.w_ih.forward(input) + from_prev_hidden
        new_hidden = self.activation.forward(combined)
        output = self.w_ho.forward(new_hidden)
        return output, new_hidden
    def init_hidden(self, batch_size=1):
        return Tensor(np.zeros((batch_size,self.n_hidden)),autograd=True)

class Sigmoid(Layer):
    def __init__(self):
        super().__init__()
    def forward(self, input):
        return input.sigmoid()

class SGD(object):
    def __init__(self, parameters, alpha=0.1):
        self.parameters = parameters
        self.alpha = alpha
    def zero(self):
        for p in self.parameters:
            p.grad.data *= 0
    def step(self, zero=True):
        for p in self.parameters:
            p.data -= p.grad.data * self.alpha
            if(zero):
                p.grad.data *= 0

In [61]:
class LSTMCell(Layer):
    
    
    def __init__(self, n_inputs, n_hidden, n_output):
        super().__init__()
        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output
        self.xf = Linear(n_inputs, n_hidden)
        self.xi = Linear(n_inputs, n_hidden)
        self.xo = Linear(n_inputs, n_hidden)
        self.xc = Linear(n_inputs, n_hidden)
        self.hf = Linear(n_hidden, n_hidden, bias=False)
        self.hi = Linear(n_hidden, n_hidden, bias=False)
        self.ho = Linear(n_hidden, n_hidden, bias=False)
        self.hc = Linear(n_hidden, n_hidden, bias=False)
        self.w_ho = Linear(n_hidden, n_output, bias=False)
        self.parameters += self.xf.get_parameters()
        self.parameters += self.xi.get_parameters()
        self.parameters += self.xo.get_parameters()
        self.parameters += self.xc.get_parameters()
        self.parameters += self.hf.get_parameters()
        self.parameters += self.hi.get_parameters()
        self.parameters += self.ho.get_parameters()
        self.parameters += self.hc.get_parameters()
        self.parameters += self.w_ho.get_parameters()
    
    
    def forward(self, input, hidden):
        prev_hidden = hidden[0]
        prev_cell = hidden[1]
        f = (self.xf.forward(input)+self.hf.forward(prev_hidden)).sigmoid()
        i = (self.xi.forward(input)+self.hi.forward(prev_hidden)).sigmoid()
        o = (self.xo.forward(input)+self.ho.forward(prev_hidden)).sigmoid()
        g = (self.xc.forward(input) +self.hc.forward(prev_hidden)).tanh()
        c = (f * prev_cell) + (i * g)
        h = o * c.tanh()
        h.is_recurrent = True
        output = self.w_ho.forward(h)
        return output, (h, c)
    
    
    def init_hidden(self, batch_size=1):
        h = Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)
        c = Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)
        h.data[:,0] += 1
        c.data[:,0] += 1
        return (h, c)

In [62]:
BATCH_SIZE = 16
N_BATCHES = int(indices.shape[0] / BATCH_SIZE)
BPTT = 25
N_BPTT = int((N_BATCHES - 1) / BPTT)


embed = Embedding(vocab_size=len(vocab), dim=512)
model = LSTMCell(n_inputs=512, n_hidden=512, n_output=len(vocab))
model.w_ho.weight.data *= 0 # this seemed to help training
criterion = CrossEntropyLoss()
optim = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=0.05)

trimmed_indices = indices[0:N_BATCHES*BATCH_SIZE]
batched_indices = trimmed_indices.reshape(BATCH_SIZE, N_BATCHES)
batched_indices = batched_indices.transpose()
input_batched_indices = batched_indices[0:-1]
target_batched_indices = batched_indices[1:]

input_batches = input_batched_indices[:N_BPTT*BPTT]
input_batches = input_batches.reshape(N_BPTT, BPTT, BATCH_SIZE)
target_batches = target_batched_indices[:N_BPTT*BPTT]
target_batches = target_batches.reshape(N_BPTT, BPTT, BATCH_SIZE)
min_loss = 1000

In [63]:
def generate_sample(n=30, init_char=' '):
    s = ""
    hidden = model.init_hidden(batch_size=1)
    input = Tensor(np.array([word2index[init_char]]))
    for i in range(n):
        rnn_input = embed.forward(input)
        output, hidden = model.forward(input=rnn_input, hidden=hidden)
        output.data *= 15
        temp_dist = output.softmax()
        temp_dist /= temp_dist.sum()
        # m = (temp_dist > np.random.rand()).argmax() # sample from pred
        m = output.data.argmax() # take the max prediction
        c = vocab[m]
        input = Tensor(np.array([m]))
        s += c
    return s

In [66]:
def train(iterations=100, min_loss=1000):
    for iter in range(iterations):
        total_loss, n_loss = (0, 0)
        hidden = model.init_hidden(batch_size=BATCH_SIZE)
        batches_to_train = len(input_batches)
        for batch_i in range(batches_to_train):
            hidden = (Tensor(hidden[0].data, autograd=True), Tensor(hidden[1].data, autograd=True))
            losses = list()
            for t in range(BPTT):
                input = Tensor(input_batches[batch_i][t], autograd=True)
                rnn_input = embed.forward(input=input)
                output, hidden = model.forward(input=rnn_input, hidden=hidden)
                target = Tensor(target_batches[batch_i][t], autograd=True)
                batch_loss = criterion.forward(output, target)
                if(t == 0):
                    losses.append(batch_loss)
                else:
                    losses.append(batch_loss + losses[-1])
            loss = losses[-1]

            loss.backward()
            optim.step()
            total_loss += loss.data / BPTT
            epoch_loss = np.exp(total_loss / (batch_i+1))
            if(epoch_loss < min_loss):
                min_loss = epoch_loss
                print()

            log = "\r Iter:" + str(iter)
            log += " - Alpha:" + str(optim.alpha)[0:5]
            log += " - Batch "+str(batch_i+1)+"/"+str(len(input_batches))
            log += " - Min Loss:" + str(min_loss)[0:5]
            log += " - Loss:" + str(epoch_loss)
            if(batch_i % 50 == 0):
                s = generate_sample(n=70, init_char='T').replace("\n"," ")
                log += " - " + s
            sys.stdout.write(log)

        optim.alpha *= 0.99

In [None]:
train(100)


 Iter:0 - Alpha:0.05 - Batch 3/249 - Min Loss:61.69 - Loss:61.748295982332845 -                                                                       
 Iter:0 - Alpha:0.05 - Batch 4/249 - Min Loss:61.60 - Loss:61.60517403961742
 Iter:0 - Alpha:0.05 - Batch 5/249 - Min Loss:61.29 - Loss:61.29442397158086
 Iter:0 - Alpha:0.05 - Batch 6/249 - Min Loss:60.66 - Loss:60.66382011246509
 Iter:0 - Alpha:0.05 - Batch 7/249 - Min Loss:59.58 - Loss:59.58905249944992
 Iter:0 - Alpha:0.05 - Batch 8/249 - Min Loss:57.14 - Loss:57.149968333866674
 Iter:0 - Alpha:0.05 - Batch 9/249 - Min Loss:53.31 - Loss:53.313391207757725
 Iter:0 - Alpha:0.05 - Batch 10/249 - Min Loss:51.61 - Loss:51.616747820150856
 Iter:0 - Alpha:0.05 - Batch 11/249 - Min Loss:50.18 - Loss:50.18998224999764
 Iter:0 - Alpha:0.05 - Batch 12/249 - Min Loss:48.36 - Loss:48.36974307417649
 Iter:0 - Alpha:0.05 - Batch 13/249 - Min Loss:46.02 - Loss:46.02249114726887
 Iter:0 - Alpha:0.05 - Batch 14/249 - Min Loss:45.25 - Loss:45.25822897

 Iter:0 - Alpha:0.05 - Batch 211/249 - Min Loss:19.80 - Loss:19.80500357737236
 Iter:0 - Alpha:0.05 - Batch 212/249 - Min Loss:19.76 - Loss:19.76522124477926
 Iter:0 - Alpha:0.05 - Batch 213/249 - Min Loss:19.73 - Loss:19.734901146085306
 Iter:0 - Alpha:0.05 - Batch 214/249 - Min Loss:19.70 - Loss:19.70392836302221
 Iter:0 - Alpha:0.05 - Batch 215/249 - Min Loss:19.66 - Loss:19.662634289101913
 Iter:0 - Alpha:0.05 - Batch 216/249 - Min Loss:19.62 - Loss:19.62547395883272
 Iter:0 - Alpha:0.05 - Batch 217/249 - Min Loss:19.60 - Loss:19.600982714656038
 Iter:0 - Alpha:0.05 - Batch 218/249 - Min Loss:19.56 - Loss:19.560585333546253
 Iter:0 - Alpha:0.05 - Batch 219/249 - Min Loss:19.51 - Loss:19.510858793259793
 Iter:0 - Alpha:0.05 - Batch 220/249 - Min Loss:19.48 - Loss:19.48979081792196
 Iter:0 - Alpha:0.05 - Batch 221/249 - Min Loss:19.48 - Loss:19.484358637269835
 Iter:0 - Alpha:0.05 - Batch 222/249 - Min Loss:19.45 - Loss:19.457275961644104
 Iter:0 - Alpha:0.05 - Batch 223/249 - Min Lo

 Iter:8 - Alpha:0.046 - Batch 1/249 - Min Loss:10.42 - Loss:10.919361588277546 - her Theest ates, and ther Theest ates, and ther Theest ates, and ther 
 Iter:8 - Alpha:0.046 - Batch 2/249 - Min Loss:10.37 - Loss:10.37168008432074
 Iter:8 - Alpha:0.046 - Batch 4/249 - Min Loss:10.34 - Loss:10.377784596040074
 Iter:9 - Alpha:0.045 - Batch 4/249 - Min Loss:10.25 - Loss:10.375282845952936 - her, and ther ther ther ther ther ther ther ther ther ther ther ther t w
 Iter:10 - Alpha:0.045 - Batch 1/249 - Min Loss:10.21 - Loss:10.457589321108864 - her then then then then then then then then then then then then then tw
 Iter:10 - Alpha:0.045 - Batch 2/249 - Min Loss:10.01 - Loss:10.017504429045356
 Iter:15 - Alpha:0.043 - Batch 1/249 - Min Loss:10.00 - Loss:10.046528196004962 - he seell deer Then men eneer then seer then seer then seer then seer t, 
 Iter:15 - Alpha:0.043 - Batch 249/249 - Min Loss:9.887 - Loss:10.393878428499486 - hat will will will will will will will will will will will will 

Thx