# Learning to Write Like Shakespeare
## Long-Short Term Memory

## Section 14.1

In [1]:
import sys,random,math
from collections import Counter
import numpy as np
import sys
np.random.seed(0)

f = open('datasets/shakespear.txt','r')
# from http://karpathy.github.io/2015/05/21/rnn-effectiveness/
raw = f.read()
f.close()
vocab = list(set(raw))
word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i
indices = np.array(list(map(lambda x:word2index[x], raw)))

## Framework Classes

In [2]:
import numpy as np

class Tensor (object):
    def __init__(self,data, autograd=False, creators=None, creation_op=None, id=None):
        self.data = np.array(data)
        self.creators = creators
        self.creation_op = creation_op
        self.grad = None
        self.autograd = autograd
        self.children = {}
        self.is_recurrent = False
        if(id is None):
            id = np.random.randint(0,100000)
        self.id = id
        if(creators is not None):
            for c in creators:
                if(self.id not in c.children):
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1
   
    def all_children_grads_accounted_for(self):
        for id,cnt in self.children.items():
            if(cnt != 0):
                return False
        return True
   
    def backward(self,grad=None, grad_origin=None):
        if(self.autograd):
            
            if(grad is None):
                grad = Tensor(np.ones_like(self.data))
            
            if(grad_origin is not None):
                if (self.children[grad_origin.id] == 0):
                    return
                else:
                    self.children[grad_origin.id] -= 1
            if(self.grad is None):
                self.grad = grad
            else:
                self.grad += grad
                
            if(self.creators is not None and (self.all_children_grads_accounted_for() or grad_origin is None)):
                if(self.creation_op == "add"):
                    self.creators[0].backward(self.grad, self)
                    self.creators[1].backward(self.grad, self)
                if(self.creation_op == "neg"):
                    self.creators[0].backward(self.grad.__neg__())
                if(self.creation_op == "sub"):
                    new = Tensor(self.grad.data)
                    self.creators[0].backward(new, self)
                    new = Tensor(self.grad.__neg__().data)
                    self.creators[1].backward(new, self)
                if(self.creation_op == "mul"):
                    new = self.grad * self.creators[1]
                    self.creators[0].backward(new, self)
                    new = self.grad * self.creators[0]
                    self.creators[1].backward(new, self)
                if(self.creation_op == "mm"):
                    act = self.creators[0] # usually an activation
                    weights = self.creators[1] # usually a weight matrix
                    new = self.grad.mm(weights.transpose())
                    act.backward(new)
                    new = self.grad.transpose().mm(act).transpose()
                    weights.backward(new)
                if(self.creation_op == "transpose"):
                    self.creators[0].backward(self.grad.transpose())
                if("sum" in self.creation_op):
                    dim = int(self.creation_op.split("_")[1])
                    ds = self.creators[0].data.shape[dim]
                    self.creators[0].backward(self.grad.expand(dim,ds))
                if("expand" in self.creation_op):
                    dim = int(self.creation_op.split("_")[1])
                    self.creators[0].backward(self.grad.sum(dim))
                if(self.creation_op == "sigmoid"):
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (self * (ones - self)))
                if(self.creation_op == "tanh"):
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (ones - (self * self)))
                if(self.creation_op == "index_select"):
                    new_grad = np.zeros_like(self.creators[0].data)
                    indices_ = self.index_select_indices.data.flatten()
                    grad_ = grad.data.reshape(len(indices_), -1)
                    for i in range(len(indices_)):
                        new_grad[indices_[i]] += grad_[i]
                    self.creators[0].backward(Tensor(new_grad))
                if(self.creation_op == "cross_entropy"):
                    dx = self.softmax_output - self.target_dist
                    self.creators[0].backward(Tensor(dx))
    
    def __add__(self, other):
        if(self.autograd and other.autograd):
            return Tensor(self.data + other.data, autograd=True, creators=[self,other], creation_op="add")
        return Tensor(self.data + other.data)
   
    def __neg__(self):
        if(self.autograd):
            return Tensor(self.data * -1, autograd=True, creators=[self], creation_op="neg")
        return Tensor(self.data * -1)
    
    def __sub__(self, other):
        if(self.autograd and other.autograd): 
            return Tensor(self.data - other.data, autograd=True, creators=[self,other], creation_op="sub")
        return Tensor(self.data - other.data)

    def __mul__(self, other):
        if(self.autograd and other.autograd):
            return Tensor(self.data * other.data, autograd=True, creators=[self,other], creation_op="mul")
        return Tensor(self.data * other.data)
    
    def __truediv__(self, other):
        if(self.autograd and other.autograd):
            return Tensor(self.data / other.data, autograd=True, creators=[self,other], creation_op="div")
        return Tensor(self.data / other.data)

    def sum(self, dim):
        if(self.autograd):
            return Tensor(self.data.sum(dim), autograd=True, creators=[self], creation_op="sum_"+str(dim))
        return Tensor(self.data.sum(dim))

    def expand(self, dim,copies):
        trans_cmd = list(range(0,len(self.data.shape)))
        trans_cmd.insert(dim,len(self.data.shape))
        new_shape = list(self.data.shape) + [copies]
        new_data = self.data.repeat(copies).reshape(new_shape)
        new_data = new_data.transpose(trans_cmd)
        if(self.autograd):
            return Tensor(new_data, autograd=True, creators=[self], creation_op="expand_"+str(dim))
        return Tensor(new_data)
    
    def transpose(self):
        if(self.autograd):
            return Tensor(self.data.transpose(), autograd=True, creators=[self], creation_op="transpose")
        return Tensor(self.data.transpose())

    def mm(self, x):
        if(self.autograd):
            return Tensor(self.data.dot(x.data), autograd=True, creators=[self,x], creation_op="mm")
        return Tensor(self.data.dot(x.data))
    
    def sigmoid(self):
        if(self.autograd):
            return Tensor(1 / (1 + np.exp(-self.data)),
            autograd=True,
            creators=[self],
            creation_op="sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))

    def tanh(self):
        if(self.autograd):
            return Tensor(np.tanh(self.data),
            autograd=True,
            creators=[self],
            creation_op="tanh")
        return Tensor(np.tanh(self.data))
    
    
    def softmax(self):
        e_x = np.exp(self.data - np.max(self.data))
        return e_x / e_x.sum(axis=0)
    
    def index_select(self, indices):
        if(self.autograd):
            new = Tensor(self.data[indices.data],
            autograd=True,
            creators=[self],
            creation_op="index_select")
            new.index_select_indices = indices
            return new
        return Tensor(self.data[indices.data])
    
    def cross_entropy(self, target_indices):
        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp,
        axis=len(self.data.shape)-1,
        keepdims=True)
        t = target_indices.data.flatten()
        p = softmax_output.reshape(len(t),-1)
    
        target_dist = np.eye(p.shape[1])[t]
        loss = -(np.log(p) * (target_dist)).sum(1).mean()
        if(self.autograd):
            out = Tensor(loss,
            autograd=True,
            creators=[self],
            creation_op="cross_entropy")
            out.softmax_output = softmax_output
            out.target_dist = target_dist
            return out
        return Tensor(loss)
        
    def __repr__(self):
        return str(self.data.__repr__())
    def __str__(self):
        return str(self.data.__str__())

class Layer(object):
    def __init__(self):
        self.parameters = list()
    def get_parameters(self):
        return self.parameters

class Linear(Layer):
    def __init__(self, n_inputs, n_outputs, bias=True):
        super().__init__()
        W = np.random.randn(n_inputs, n_outputs)*np.sqrt(2.0/(n_inputs))
        self.weight = Tensor(W, autograd=True)
        self.parameters.append(self.weight)
        if bias:
            self.bias = Tensor(np.zeros(n_outputs), autograd=True)
            self.parameters.append(self.bias)
        else:
            self.bias = None
    
    def forward(self, input):
        temp = input.mm(self.weight)
        if self.bias:
            temp += self.bias.expand(0,len(input.data))
        return temp
    
class Embedding(Layer):
    def __init__(self, vocab_size, dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.dim = dim
        # this initialiation style is just a convention from word2vec
        weight = (np.random.rand(vocab_size, dim) - 0.5) / dim
        self.weight = Tensor(weight, autograd=True)
        self.parameters.append(self.weight)
    def forward(self, input):
         return self.weight.index_select(input)
    
class CrossEntropyLoss(object):
    def __init__(self):
        super().__init__()
    def forward(self, input, target):
        return input.cross_entropy(target)

class RNNCell(Layer):
    def __init__(self, n_inputs,n_hidden,n_output,activation='sigmoid'):
        super().__init__()
        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output
        if(activation == 'sigmoid'):
            self.activation = Sigmoid()
        elif(activation == 'tanh'):
            self.activation == Tanh()
        else:
            raise Exception("Non-linearity not found")
        self.w_ih = Linear(n_inputs, n_hidden)
        self.w_hh = Linear(n_hidden, n_hidden)
        self.w_ho = Linear(n_hidden, n_output)
        self.parameters += self.w_ih.get_parameters()
        self.parameters += self.w_hh.get_parameters()
        self.parameters += self.w_ho.get_parameters()
        
    def forward(self, input, hidden):
        from_prev_hidden = self.w_hh.forward(hidden)
        combined = self.w_ih.forward(input) + from_prev_hidden
        new_hidden = self.activation.forward(combined)
        output = self.w_ho.forward(new_hidden)
        return output, new_hidden
    def init_hidden(self, batch_size=1):
        return Tensor(np.zeros((batch_size,self.n_hidden)),autograd=True)

class Sigmoid(Layer):
    def __init__(self):
        super().__init__()
    def forward(self, input):
        return input.sigmoid()

class SGD(object):
    def __init__(self, parameters, alpha=0.1):
        self.parameters = parameters
        self.alpha = alpha
    def zero(self):
        for p in self.parameters:
            p.grad.data *= 0
    def step(self, zero=True):
        for p in self.parameters:
            p.data -= p.grad.data * self.alpha
            if(zero):
                p.grad.data *= 0

In [3]:
embed = Embedding(vocab_size=len(vocab),dim=512)
model = RNNCell(n_inputs=512, n_hidden=512, n_output=len(vocab))
criterion = CrossEntropyLoss()
optim = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=0.05)

## Setion 14.3

In [4]:
batch_size = 32
bptt = 16
n_batches = int((indices.shape[0] / (batch_size)))

In [5]:
trimmed_indices = indices[:n_batches*batch_size]
batched_indices = trimmed_indices.reshape(batch_size, n_batches)
batched_indices = batched_indices.transpose()
input_batched_indices = batched_indices[0:-1]
target_batched_indices = batched_indices[1:]
n_bptt = int(((n_batches-1) / bptt))
input_batches = input_batched_indices[:n_bptt*bptt]
input_batches = input_batches.reshape(n_bptt,bptt,batch_size)
target_batches = target_batched_indices[:n_bptt*bptt]
target_batches = target_batches.reshape(n_bptt, bptt, batch_size)

In [6]:
print(raw[0:5])
print(indices[0:5])

That,
[52 46  3 58 45]


In [7]:
print(batched_indices[0:5])

[[52  9 61 57 51 46 52  6  6  3  3 59 51  6 45 24  6 25 46 61 59  3  6  6
   6 40  6 40 40  6  6  4]
 [46 24 24 57  2 59  3 35 25  4 37 59 37 58  6 24 15 51 51 24 37 14 15 34
  46  4 28  0  4  0  5  6]
 [ 3 24 49 59 59  3 37 12 51  6 37 61 34 46 17 23 59 35 37 24 34 59 56 59
  59 30 51 45  6 51 51 15]
 [58 48 52 56  6 56 15  6 35  4  6  6  9 59 59 16 59 45 25 29 12  6 51 59
  56 37 35 24 19 56  0 51]
 [45 16 40 49 17  6 51 56  6 51 17 41 24  6  6 38  4  6  6 51  9 25 17  2
  45  3 37 16 35 59 59 25]]


In [8]:
print(input_batches[0][0:5])
print(target_batches[0][0:5])

[[52  9 61 57 51 46 52  6  6  3  3 59 51  6 45 24  6 25 46 61 59  3  6  6
   6 40  6 40 40  6  6  4]
 [46 24 24 57  2 59  3 35 25  4 37 59 37 58  6 24 15 51 51 24 37 14 15 34
  46  4 28  0  4  0  5  6]
 [ 3 24 49 59 59  3 37 12 51  6 37 61 34 46 17 23 59 35 37 24 34 59 56 59
  59 30 51 45  6 51 51 15]
 [58 48 52 56  6 56 15  6 35  4  6  6  9 59 59 16 59 45 25 29 12  6 51 59
  56 37 35 24 19 56  0 51]
 [45 16 40 49 17  6 51 56  6 51 17 41 24  6  6 38  4  6  6 51  9 25 17  2
  45  3 37 16 35 59 59 25]]
[[46 24 24 57  2 59  3 35 25  4 37 59 37 58  6 24 15 51 51 24 37 14 15 34
  46  4 28  0  4  0  5  6]
 [ 3 24 49 59 59  3 37 12 51  6 37 61 34 46 17 23 59 35 37 24 34 59 56 59
  59 30 51 45  6 51 51 15]
 [58 48 52 56  6 56 15  6 35  4  6  6  9 59 59 16 59 45 25 29 12  6 51 59
  56 37 35 24 19 56  0 51]
 [45 16 40 49 17  6 51 56  6 51 17 41 24  6  6 38  4  6  6 51  9 25 17  2
  45  3 37 16 35 59 59 25]
 [ 6 42 12 34 40 59 58 59 17  6 59 46 24 34 17  7  6  0 15 12 24 51  6  6
   6 17 34  4 57

## Section 14.4

In [9]:
def generate_sample(n=30, init_char=' '):
    s = ""
    hidden = model.init_hidden(batch_size=1)
    input = Tensor(np.array([word2index[init_char]]))
    for i in range(n):
        rnn_input = embed.forward(input)
        output, hidden = model.forward(input=rnn_input, hidden=hidden)
        output.data *= 10 # temperature for sampling, higher=greedier
        temp_dist = output.softmax()
        temp_dist = temp_dist / temp_dist.sum()
        #m = (temp_dist > np.random.rand()).argmax() # sample from pred
        m = output.data.argmax() # take max of predictio
        c = vocab[m]
        input = Tensor(np.array([m]))
        s += c
    return s

def train(iterations=100):
    for iter in range(iterations):
        total_loss = 0
        n_loss = 0
        hidden = model.init_hidden(batch_size=batch_size)
        for batch_i in range(len(input_batches)):
            hidden = Tensor(hidden.data, autograd=True)
            loss = None
            losses = list()
            for t in range(bptt):
                input = Tensor(input_batches[batch_i][t], autograd=True)
                rnn_input = embed.forward(input=input)
                output, hidden = model.forward(input=rnn_input, hidden=hidden)
                
                target = Tensor(target_batches[batch_i][t], autograd=True)
                batch_loss = criterion.forward(output, target)
                losses.append(batch_loss)
                if(t == 0):
                    loss = batch_loss
                else:
                    loss = loss + batch_loss

            for loss in losses:
                ""
                    
            loss.backward()
            optim.step()
            total_loss += loss.data
            log = "\r Iter:" + str(iter)
            log += " - Batch "+str(batch_i+1)+"/"+str(len(input_batches))
            log += " - Loss:" + str(np.exp(total_loss / (batch_i+1)))
            if(batch_i == 0):
                log += " - " + generate_sample(70,'\n').replace("\n"," ")
            if(batch_i % 10 == 0 or batch_i-1 == len(input_batches)):
                sys.stdout.write(log)
        optim.alpha *= 0.99
        print()

In [10]:
train()

 Iter:0 - Batch 191/195 - Loss:90.449239911586244                                                                      
 Iter:1 - Batch 191/195 - Loss:20.894818315295982 a  th thtt thett tt tt tt tt tt tt tt tt tt tt tt tt tt tt tt tt tt tt
 Iter:2 - Batch 191/195 - Loss:15.613584632136503 a the the the the the the the the the the the the the the the the the 
 Iter:3 - Batch 191/195 - Loss:13.226728456720428d tot the the the the the the the the the the the the the the the the 
 Iter:4 - Batch 191/195 - Loss:11.892883950508265dh thert the the the the the the the the the the the the the the the t
 Iter:5 - Batch 191/195 - Loss:11.026551482998261dh thert the the the the the the the the the the the the the the the t
 Iter:6 - Batch 191/195 - Loss:10.374155134352871dh the the the the the the the the the the the the the the the the the
 Iter:7 - Batch 191/195 - Loss:9.8484359957554789dath the the the the the the the the the the the the the the the the t
 Iter:8 - Batch 191/195 - Loss:9.39489

 Iter:67 - Batch 191/195 - Loss:1.2636609041260494 I have the nammalt fellow, Camy hot comporth, ard, and she with eyed h
 Iter:68 - Batch 191/195 - Loss:1.2388008479948125I am my hear ared himmst Lo my fachain firt, in hemphim that I will si
 Iter:69 - Batch 191/195 - Loss:1.2308689428653058 I have the nabe I nammath, and say would nothere it is, amather but ma
 Iter:70 - Batch 191/195 - Loss:1.2190525630831861 I have them this sto But ke That ever have stringmen I man!  HENRE: Th
 Iter:71 - Batch 191/195 - Loss:1.1952278926978983 am thy tamn the mand?  ARGLAPlook sonithou this sto But ke That ever
 Iter:72 - Batch 191/195 - Loss:1.2030174664801154 I am the mant what makes: She waser, sir, now, but I will suchims ar l
 Iter:73 - Batch 191/195 - Loss:1.1855843008832223 I am the mand?  ARGLAPlET: YORI himy hath ser, my hot and I with ey wa
 Iter:74 - Batch 191/195 - Loss:1.1845803311950502 I am the mant wot in the profets of thy tamn the mand?  ARGLAPlook son
 Iter:75 - Batch 191/195 - 

## Section 14.5

In [11]:
print(generate_sample(n=2000, init_char='\n'))

I all that I will light for the nace she you he would brod man, have neave name, and I conow, to ther
But shakes:
Seak:
That I will light for the nace she you he would brod man, have neave name, and I conow, to ther
But shakes:
Seak:
That I will light for the nace she you he would brod man, have neave name, and I conow, to ther
But shakes:
Seak:
That I will light for the nace she you he would brod man, have neave name, and I conow, to ther
But shakes:
Seak:
That I will light for the nace she you he would brod man, have neave name, and I conow, to ther
But shakes:
Seak:
That I will light for the nace she you he would brod man, have neave name, and I conow, to ther
But shakes:
Seak:
That I will light for the nace she you he would brod man, have neave name, and I conow, to ther
But shakes:
Seak:
That I will light for the nace she you he would brod man, have neave name, and I conow, to ther
But shakes:
Seak:
That I will light for the nace she you he would brod man, have neave name, and I c

## Section 14.7

In [12]:
(sigmoid,relu)=(lambda x:1/(1+np.exp(-x)), lambda x:(x>0).astype(float)*x)
weights = np.array([[1,4],[4,1]])
activation = sigmoid(np.array([1,0.01]))
print("Sigmoid Activations")
activations = list()
for iter in range(10):
    activation = sigmoid(activation.dot(weights))
    activations.append(activation)
    print(activation)

print("\nSigmoid Gradients")
gradient = np.ones_like(activation)
for activation in reversed(activations):
    gradient = (activation * (1 - activation) * gradient)
    gradient = gradient.dot(weights.transpose())
    print(gradient)

print("Activations")
activations = list()
for iter in range(10):
    activation = relu(activation.dot(weights))
    activations.append(activation)
    print(activation)

print("\nGradients")
gradient = np.ones_like(activation)
for activation in reversed(activations):
    gradient = ((activation > 0) * gradient).dot(weights.transpose())
    print(gradient)

Sigmoid Activations
[0.93940638 0.96852968]
[0.9919462  0.99121735]
[0.99301385 0.99302901]
[0.9930713  0.99307098]
[0.99307285 0.99307285]
[0.99307291 0.99307291]
[0.99307291 0.99307291]
[0.99307291 0.99307291]
[0.99307291 0.99307291]
[0.99307291 0.99307291]

Sigmoid Gradients
[0.03439552 0.03439552]
[0.00118305 0.00118305]
[4.06916726e-05 4.06916726e-05]
[1.39961115e-06 1.39961115e-06]
[4.81403643e-08 4.81403637e-08]
[1.65582672e-09 1.65582765e-09]
[5.69682675e-11 5.69667160e-11]
[1.97259346e-12 1.97517920e-12]
[8.45387597e-14 8.02306381e-14]
[1.45938177e-14 2.16938983e-14]
Activations
[4.8135251  4.72615519]
[23.71814585 23.98025559]
[119.63916823 118.852839  ]
[595.05052421 597.40951192]
[2984.68857188 2977.61160877]
[14895.13500696 14916.36589628]
[74560.59859209 74496.90592414]
[372548.22228863 372739.30029248]
[1863505.42345854 1862932.18944699]
[9315234.18124649 9316953.88328115]

Gradients
[5. 5.]
[25. 25.]
[125. 125.]
[625. 625.]
[3125. 3125.]
[15625. 15625.]
[78125. 78125.]


## Section 14.10

In [13]:
class LSTMCell(Layer):
    def __init__(self, n_inputs, n_hidden, n_output):
        super().__init__()
        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output
        self.xf = Linear(n_inputs, n_hidden)
        self.xi = Linear(n_inputs, n_hidden)
        self.xo = Linear(n_inputs, n_hidden)
        self.xc = Linear(n_inputs, n_hidden)
        self.hf = Linear(n_hidden, n_hidden, bias=False)
        self.hi = Linear(n_hidden, n_hidden, bias=False)
        self.ho = Linear(n_hidden, n_hidden, bias=False)
        self.hc = Linear(n_hidden, n_hidden, bias=False)
        self.w_ho = Linear(n_hidden, n_output, bias=False)
        self.parameters += self.xf.get_parameters()
        self.parameters += self.xi.get_parameters()
        self.parameters += self.xo.get_parameters()
        self.parameters += self.xc.get_parameters()
        self.parameters += self.hf.get_parameters()
        self.parameters += self.hi.get_parameters()
        self.parameters += self.ho.get_parameters()
        self.parameters += self.hc.get_parameters()
        self.parameters += self.w_ho.get_parameters()
    def forward(self, input, hidden):
        prev_hidden = hidden[0]
        prev_cell = hidden[1]
        f=(self.xf.forward(input)+self.hf.forward(prev_hidden)).sigmoid()
        i=(self.xi.forward(input)+self.hi.forward(prev_hidden)).sigmoid()
        o=(self.xo.forward(input)+self.ho.forward(prev_hidden)).sigmoid()
        g = (self.xc.forward(input) +self.hc.forward(prev_hidden)).tanh()
        c = (f * prev_cell) + (i * g)
        h = o * c.tanh()
        h.is_recurrent = True
        output = self.w_ho.forward(h)
        return output, (h, c)
    def init_hidden(self, batch_size=1):
        h = Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)
        c = Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)
        h.data[:,0] += 1
        c.data[:,0] += 1
        return (h, c)

## Section 14.11

In [14]:
import sys,random,math
from collections import Counter
import numpy as np
import sys
np.random.seed(0)

f = open('datasets/shakespear.txt','r')
raw = f.read()
f.close()
vocab = list(set(raw))
word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i
indices = np.array(list(map(lambda x:word2index[x], raw)))
embed = Embedding(vocab_size=len(vocab),dim=512)
model = LSTMCell(n_inputs=512, n_hidden=512, n_output=len(vocab))
model.w_ho.weight.data *= 0 # this seemed to help training
criterion = CrossEntropyLoss()
optim = SGD(parameters=model.get_parameters() + embed.get_parameters(),alpha=0.05)
batch_size = 16
bptt = 25
n_batches = int((indices.shape[0] / (batch_size)))
trimmed_indices = indices[:n_batches*batch_size]
batched_indices = trimmed_indices.reshape(batch_size, n_batches)
batched_indices = batched_indices.transpose()
input_batched_indices = batched_indices[0:-1]
target_batched_indices = batched_indices[1:]
n_bptt = int(((n_batches-1) / bptt))
input_batches = input_batched_indices[:n_bptt*bptt]
input_batches = input_batches.reshape(n_bptt,bptt,batch_size)
target_batches = target_batched_indices[:n_bptt*bptt]
target_batches = target_batches.reshape(n_bptt, bptt, batch_size)
min_loss = 1000

## Section 14.12

In [15]:
def generate_sample(n=30, init_char=' '):
    s = ""
    hidden = model.init_hidden(batch_size=1)
    input = Tensor(np.array([word2index[init_char]]))
    for i in range(n):
        rnn_input = embed.forward(input)
        output, hidden = model.forward(input=rnn_input, hidden=hidden)
        output.data *= 15
        temp_dist = output.softmax()
        temp_dist /= temp_dist.sum()
        # m = (temp_dist > np.random.rand()).argmax() # sample from pred
        m = output.data.argmax() # take the max prediction
        c = vocab[m]
        input = Tensor(np.array([m]))
        s += c
    return s


In [16]:
def train(iterations=100, min_loss=1000):
    for iter in range(iterations):
        total_loss, n_loss = (0, 0)
        hidden = model.init_hidden(batch_size=batch_size)
        batches_to_train = len(input_batches)
        for batch_i in range(batches_to_train):
            hidden = (Tensor(hidden[0].data, autograd=True), Tensor(hidden[1].data, autograd=True))
            losses = list()
            for t in range(bptt):
                input = Tensor(input_batches[batch_i][t], autograd=True)
                rnn_input = embed.forward(input=input)
                output, hidden = model.forward(input=rnn_input, hidden=hidden)
                target = Tensor(target_batches[batch_i][t], autograd=True)
                batch_loss = criterion.forward(output, target)
                if(t == 0):
                    losses.append(batch_loss)
                else:
                    losses.append(batch_loss + losses[-1])
            loss = losses[-1]

            loss.backward()
            optim.step()
            total_loss += loss.data / bptt
            epoch_loss = np.exp(total_loss / (batch_i+1))
            if(epoch_loss < min_loss):
                min_loss = epoch_loss
                print()

            log = "\r Iter:" + str(iter)
            log += " - Alpha:" + str(optim.alpha)[0:5]
            log += " - Batch "+str(batch_i+1)+"/"+str(len(input_batches))
            log += " - Min Loss:" + str(min_loss)[0:5]
            log += " - Loss:" + str(epoch_loss)
            if(batch_i % 50 == 0):
                s = generate_sample(n=70, init_char='T').replace("\n"," ")
                log += " - " + s
            sys.stdout.write(log)

        optim.alpha *= 0.99

In [17]:
train(100)


 Iter:0 - Alpha:0.05 - Batch 1/249 - Min Loss:62.00 - Loss:62.000000000000064 -                      eeee  eeee  eeee  eee  eeee  eee  eee  eee  eeee 
 Iter:0 - Alpha:0.05 - Batch 2/249 - Min Loss:61.99 - Loss:61.999195513086285
 Iter:0 - Alpha:0.05 - Batch 3/249 - Min Loss:61.98 - Loss:61.984910016040885
 Iter:0 - Alpha:0.05 - Batch 4/249 - Min Loss:61.95 - Loss:61.95759317523022
 Iter:0 - Alpha:0.05 - Batch 5/249 - Min Loss:61.90 - Loss:61.90392927051005
 Iter:0 - Alpha:0.05 - Batch 6/249 - Min Loss:61.79 - Loss:61.79490938754441
 Iter:0 - Alpha:0.05 - Batch 7/249 - Min Loss:61.60 - Loss:61.60023379161083
 Iter:0 - Alpha:0.05 - Batch 8/249 - Min Loss:61.11 - Loss:61.1156884335555
 Iter:0 - Alpha:0.05 - Batch 9/249 - Min Loss:60.06 - Loss:60.069593271418235
 Iter:0 - Alpha:0.05 - Batch 10/249 - Min Loss:58.60 - Loss:58.60326515002265
 Iter:0 - Alpha:0.05 - Batch 11/249 - Min Loss:56.11 - Loss:56.11448262460074
 Iter:0 - Alpha:0.05 - Batch 12/249 - Min Loss:53.88 - Loss:53.88762998082

 Iter:0 - Alpha:0.05 - Batch 107/249 - Min Loss:26.01 - Loss:26.01443369180936
 Iter:0 - Alpha:0.05 - Batch 108/249 - Min Loss:25.92 - Loss:25.923093969517172
 Iter:0 - Alpha:0.05 - Batch 109/249 - Min Loss:25.84 - Loss:25.840435036424903
 Iter:0 - Alpha:0.05 - Batch 110/249 - Min Loss:25.74 - Loss:25.74601448371527
 Iter:0 - Alpha:0.05 - Batch 111/249 - Min Loss:25.63 - Loss:25.635654470407317
 Iter:0 - Alpha:0.05 - Batch 112/249 - Min Loss:25.53 - Loss:25.539407537322294
 Iter:0 - Alpha:0.05 - Batch 113/249 - Min Loss:25.44 - Loss:25.44783482988351
 Iter:0 - Alpha:0.05 - Batch 114/249 - Min Loss:25.38 - Loss:25.387315404997516
 Iter:0 - Alpha:0.05 - Batch 115/249 - Min Loss:25.36 - Loss:25.36250574321817
 Iter:0 - Alpha:0.05 - Batch 116/249 - Min Loss:25.29 - Loss:25.29250471021678
 Iter:0 - Alpha:0.05 - Batch 117/249 - Min Loss:25.21 - Loss:25.21591590393694
 Iter:0 - Alpha:0.05 - Batch 118/249 - Min Loss:25.12 - Loss:25.120892852664728
 Iter:0 - Alpha:0.05 - Batch 119/249 - Min Los

 Iter:0 - Alpha:0.05 - Batch 210/249 - Min Loss:20.40 - Loss:20.402615723799737
 Iter:0 - Alpha:0.05 - Batch 211/249 - Min Loss:20.34 - Loss:20.34783610156164
 Iter:0 - Alpha:0.05 - Batch 212/249 - Min Loss:20.30 - Loss:20.30214444560546
 Iter:0 - Alpha:0.05 - Batch 213/249 - Min Loss:20.26 - Loss:20.26682238038663
 Iter:0 - Alpha:0.05 - Batch 214/249 - Min Loss:20.23 - Loss:20.232305846644827
 Iter:0 - Alpha:0.05 - Batch 215/249 - Min Loss:20.18 - Loss:20.187011667752035
 Iter:0 - Alpha:0.05 - Batch 216/249 - Min Loss:20.14 - Loss:20.146346829590534
 Iter:0 - Alpha:0.05 - Batch 217/249 - Min Loss:20.11 - Loss:20.11928420597005
 Iter:0 - Alpha:0.05 - Batch 218/249 - Min Loss:20.07 - Loss:20.074548484059285
 Iter:0 - Alpha:0.05 - Batch 219/249 - Min Loss:20.01 - Loss:20.01951378900323
 Iter:0 - Alpha:0.05 - Batch 220/249 - Min Loss:19.99 - Loss:19.991943801041593
 Iter:0 - Alpha:0.05 - Batch 221/249 - Min Loss:19.98 - Loss:19.982170326866118
 Iter:0 - Alpha:0.05 - Batch 222/249 - Min Lo

 Iter:3 - Alpha:0.048 - Batch 213/249 - Min Loss:11.98 - Loss:11.984975117559008
 Iter:3 - Alpha:0.048 - Batch 214/249 - Min Loss:11.98 - Loss:11.98268269133983
 Iter:3 - Alpha:0.048 - Batch 215/249 - Min Loss:11.97 - Loss:11.971769072520187
 Iter:3 - Alpha:0.048 - Batch 217/249 - Min Loss:11.96 - Loss:11.964181720342618
 Iter:3 - Alpha:0.048 - Batch 218/249 - Min Loss:11.95 - Loss:11.957087928976922
 Iter:3 - Alpha:0.048 - Batch 223/249 - Min Loss:11.94 - Loss:11.946467339765988
 Iter:3 - Alpha:0.048 - Batch 224/249 - Min Loss:11.94 - Loss:11.94044622734134
 Iter:3 - Alpha:0.048 - Batch 225/249 - Min Loss:11.93 - Loss:11.936476877724955
 Iter:3 - Alpha:0.048 - Batch 236/249 - Min Loss:11.93 - Loss:11.934189138386955
 Iter:4 - Alpha:0.048 - Batch 2/249 - Min Loss:11.93 - Loss:12.161941588790109 - hend theseres, and theseres, and theseres, and theseres, and theseres,
 Iter:4 - Alpha:0.048 - Batch 3/249 - Min Loss:11.84 - Loss:11.84283678209681
 Iter:4 - Alpha:0.048 - Batch 4/249 - Min L

 Iter:8 - Alpha:0.046 - Batch 241/249 - Min Loss:10.38 - Loss:10.38366921872776
 Iter:8 - Alpha:0.046 - Batch 242/249 - Min Loss:10.38 - Loss:10.382002119118807
 Iter:9 - Alpha:0.045 - Batch 3/249 - Min Loss:10.37 - Loss:10.456776943391993 - heres, and seerer Theat ther ther ther ther ther ther ther ther ther t
 Iter:9 - Alpha:0.045 - Batch 4/249 - Min Loss:10.35 - Loss:10.35865455757589
 Iter:9 - Alpha:0.045 - Batch 226/249 - Min Loss:10.33 - Loss:10.336671017005118- hat wille wille ife, wille ife, wille ife, wille ife, wille ife, willeW
 Iter:9 - Alpha:0.045 - Batch 227/249 - Min Loss:10.33 - Loss:10.330250209152277
 Iter:9 - Alpha:0.045 - Batch 230/249 - Min Loss:10.32 - Loss:10.330106131007579
 Iter:9 - Alpha:0.045 - Batch 233/249 - Min Loss:10.32 - Loss:10.331929677786029
 Iter:9 - Alpha:0.045 - Batch 234/249 - Min Loss:10.32 - Loss:10.32551844199101
 Iter:9 - Alpha:0.045 - Batch 235/249 - Min Loss:10.32 - Loss:10.320058567110847
 Iter:9 - Alpha:0.045 - Batch 236/249 - Min Loss:10

In [18]:
print(generate_sample(n=500, init_char='\n'))

And will of will o will lo will  of in the with ir wiove lould will lou did, will oull will lo will po with to lord;
And will in to love or will o will lood in the willl o will o will will o will not will lould will o will por trumpos all doul of in to loud will lou in the would bo with to will lood Kor will do will lo doud I do and will loullould bre will not pos of to with the wor will lo dou in of ir, with will lo did will not will look you will look you will look you will lould will o will p
