## Homework III Frederik Chettouh

In [0]:
import torch
from torch import nn
import numpy as np
import torch.nn.functional as F
from torch import optim

from google.colab import files


In [2]:
text=files.upload()

Saving anna.txt to anna.txt


In [0]:
with open ('anna.txt') as file:
    text=file.read()

In [0]:
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

In [0]:
encoded=np.array([char2int[char] for char in text])

In [0]:
def one_hot_encode(encoded,length):
    start_array=np.zeros((encoded.size,length),dtype=np.float32)
    start_array[np.arange(start_array.shape[0]),encoded.flatten()]=1.
    start_array=start_array.reshape((*encoded.shape,length))
    return start_array

In [0]:
sparse_matrix=one_hot_encode(encoded,83)

In [8]:
encoded.shape

(1985223,)

In [0]:
def get_batches(arr, batch_size, seq_length):
#     number of total chars=N*M*K-->K=Chars/N*M
    chars_per_batch=batch_size*seq_length
    n_batches=arr.shape[0]//chars_per_batch
    chars_to_keep=chars_per_batch*n_batches
    
    arr=arr[:chars_to_keep]
    arr=arr.reshape(batch_size,-1)
    
    for n in range(0, arr.shape[1], seq_length):
        x = arr[:, n:n+seq_length]
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y


In [0]:
batches = get_batches(encoded, 8, 50)
x, y = next(batches)

In [12]:
one_hot_encode(x, 83).shape

(8, 50, 83)

## Defining the network

In [13]:
# check if GPU is available
train_on_gpu = torch.cuda.is_available()
if(train_on_gpu):
    print('Training on GPU!')
else: 
    print('No GPU available, training on CPU; consider making n_epochs very small.')

Training on GPU!


The LSTM architecture is as follows:
- every character is passed to a two layer hidden Neural Network 
- The two layers receive the character and a hidden state (previous output before being passed to the softmax)--> this is why in the architecture we have to return the hidden state
- it will serve as the output at the next iteration 
- Hidden state has always the same shape since it is not passed to the softmax
- At every stage there is an output


### Open Question:
Explain the dimensions of the hidden state
why are they different from the output?
Does "creating" new hidden/cell state mean that we only utilize the information for one batch ?

In [0]:
class LSTM(nn.Module):
    """Tokens is the number unique characters in the corpus"""
    
    def __init__(self,
                 tokens,
                 hidden_size=512,
                 num_layers=2,
                 dropout=0.2,
                 lr=0.001
                
                ):
        super().__init__()
        
        self.chars=tokens
        self.input_size=len(tokens)
        self.hidden_size=hidden_size
        self.num_layers=num_layers
        self.dropout=dropout
        
        
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        self.lstm=nn.LSTM(self.input_size,
                          self.hidden_size,
                          self.num_layers,
                          dropout=self.dropout,
                          batch_first=True)
        self.dropout=nn.Dropout(p=0.2)
        self.fc1=nn.Linear(self.hidden_size, self.input_size)
        
        
    def forward(self,x, hidden):
      x,self.hidden=self.lstm(x,hidden)

      # shape of x is (batchsize,seq lenght and 2)--> why two?
      # 

#         output has the same shape as hidden
#         since the next layer is a linear fully connected layer the input has to be an array
#         therefore the output has to be stacked
      x=self.dropout(x)
      x=x.contiguous().view(-1, self.hidden_size)
      output=self.fc1(x)
        
      return output, hidden
  
#     This comes straight from the pytorch documentation and seems to be pretty standard
    def init_hidden(self, batch_size):
      weight=next(self.parameters()).data
      if (train_on_gpu):
        # hidden has both the long and short term memory
        hidden = (weight.new(self.num_layers, batch_size, self.hidden_size).zero_().cuda(),
                  weight.new(self.num_layers, batch_size, self.hidden_size).zero_().cuda())
      else:
          hidden = (weight.new(self.num_layers, batch_size, self.hidden_size).zero_(),
                    weight.new(self.num_layers, batch_size, self.hidden_size).zero_())
        
      return hidden
      

Creating a function that saves the model parameters--> I will need this for the optional parts

In [0]:
def save_model(current_epoch, model,batch_size, seq_length):
  model_name = 'rnn_' + str(current_epoch) + '_epoch.net'
  checkpoint = {'hidden_size': model.hidden_size,
              'n_layers': model.num_layers,
              'batch_size': batch_size,
              'sequence_length': seq_length,
              'state_dict': model.state_dict(),
              'chars': model.chars}
 
  with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)
    
  print('successfully saved model')

Next we train the network--> lookout for the cellstate hidden state variables. 

In [0]:
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=100, save=False):
    val_loss_tracker = []

    net.train()
    
    optimizer=optim.Adam(net.parameters(), lr=lr)
#     not having used softmax on the output we can use crossentropyloss here 
    criterion=nn.CrossEntropyLoss()
    
    train_size=int(len(data)*(1-val_frac))
    train_data,vali_data=data[:train_size],data[train_size:]
    
    
    if(train_on_gpu):
        net.cuda()
        
    counter=0
    n_chars=len(net.chars) #how many different characters are there
    
    for e in range(epochs):
        hidden=net.init_hidden(batch_size)
        
#         we first get the batches of the characters
#         then we one hot encode them
        for x,y in get_batches(train_data, batch_size, seq_length):
          counter+=1
            
          x=one_hot_encode(x,n_chars)
          inputs, targets= torch.from_numpy(x), torch.from_numpy(y)
            
          if(train_on_gpu):
            inputs, targets = inputs.cuda(), targets.cuda()
#           detaching hidden state
          hidden=tuple([var.data for var in hidden])
    
          net.zero_grad()
        

#         here the entire sequence is processed in one go--> I believe
          output,hidden=net(inputs, hidden)
        
          loss=criterion(output, targets.view(batch_size*seq_length))
          loss.backward()
            
          nn.utils.clip_grad_norm_(net.parameters(),clip)
          optimizer.step()
        else:
          if save:
            save_model(e, net, seq_length, batch_size)

          hidden_val = net.init_hidden(batch_size)
          val_losses = []
          net.eval()

          for x,y in get_batches(vali_data, batch_size, seq_length):

            x=one_hot_encode(x,n_chars)
            inputs, targets= torch.from_numpy(x), torch.from_numpy(y)
            hidden_val=tuple([var.data for var in hidden_val])
                # send data to GPU
            if(train_on_gpu):
              inputs, targets = inputs.cuda(), targets.cuda()

            output,hidden_val=net(inputs, hidden_val)    
            val_loss=criterion(output, targets.view(batch_size*seq_length))
            val_losses.append(val_loss.item())

          net.train()
          
          print("Epoch: {}/{}...".format(e+1, epochs),
                "Step: {}...".format(counter),
                "Loss: {:.4f}...".format(loss.item()),
                "Val Loss: {:.4f}".format(np.mean(val_losses)))
          val_loss_tracker.append(np.mean(val_losses))
          if len(val_loss_tracker)>2 and val_loss_tracker[-1]>(val_loss_tracker[-2]+val_loss_tracker[-3])/2:
            print('Stopping due to early stopping criterion')
            print(f'load model rnn_{str(e)}_epoch.net')
            return e



In [102]:
#setting a few hyperparamters
n_hidden=1024
n_layers=3
net=LSTM(chars,n_hidden,n_layers)
print(net)

LSTM(
  (lstm): LSTM(83, 1024, num_layers=3, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=1024, out_features=83, bias=True)
)


In [105]:
n_epochs=10
batch_size=128
seq_length=100
model_to_loadtrain=train(net, encoded, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=100, save=True)


successfully saved model
Epoch: 1/80... Step: 139... Loss: 3.0881... Val Loss: 3.0643
successfully saved model
Epoch: 2/80... Step: 278... Loss: 2.1528... Val Loss: 2.1658
successfully saved model
Epoch: 3/80... Step: 417... Loss: 1.7579... Val Loss: 1.7662
successfully saved model
Epoch: 4/80... Step: 556... Loss: 1.5607... Val Loss: 1.5763
successfully saved model
Epoch: 5/80... Step: 695... Loss: 1.4504... Val Loss: 1.4764
successfully saved model
Epoch: 6/80... Step: 834... Loss: 1.3834... Val Loss: 1.4193
successfully saved model
Epoch: 7/80... Step: 973... Loss: 1.3270... Val Loss: 1.3781
successfully saved model
Epoch: 8/80... Step: 1112... Loss: 1.2953... Val Loss: 1.3498
successfully saved model
Epoch: 9/80... Step: 1251... Loss: 1.2633... Val Loss: 1.3275
successfully saved model
Epoch: 10/80... Step: 1390... Loss: 1.2396... Val Loss: 1.3181
successfully saved model
Epoch: 11/80... Step: 1529... Loss: 1.2149... Val Loss: 1.3065
successfully saved model
Epoch: 12/80... Step: 1

## Predicting new characters

We Take all possible characters in and start with a hidden state of None
Then we predict the most likely character
At the next step, we predict the most likely character given the one before/


In [0]:
def predict_char(net,char,hidden=None,top_k=None):
  # character gets encoded with its numerical value
  # next we create the one vector for this one char
  inputs=np.array([[net.char2int[char]]])
  inputs=one_hot_encode(inputs, len(net.chars))
  inputs=torch.from_numpy(inputs)
  if train_on_gpu:
    inputs=inputs.cuda()
  
  hidden=tuple([var.data for var in hidden])

  output,hidden=net(inputs,hidden)
  probability=F.softmax(output,dim=1).data
  if(train_on_gpu):
    probability = probability.cpu() 
  if top_k is None:
    # getting the top probabilities and the top characters not all of them
    top_ch=np.arange(len(net.chars))
  else:
    probability,top_ch=probability.topk(top_k)
    # turning tensor into numpy array
    top_ch=top_ch.numpy().squeeze()

  probability=probability.numpy().squeeze()
  char=np.random.choice(top_ch, p=probability/probability.sum())
    # p is the rescalded proability --> unchanged if top_k=None
  return net.int2char[char], hidden

In [0]:
def sample(net, size, prime='The', top_k=None):
  if(train_on_gpu):
    net.cuda()
  else:
    net.cpu()
  net.eval() # eval mode

  chars = [ch for ch in prime]
  hidden = net.init_hidden(1)
  for ch in prime:
    char, hidden = predict_char(net, ch, hidden, top_k=top_k)

  chars.append(char)

  for ii in range(size):
    char, hidden = predict_char(net, chars[-1], hidden, top_k=top_k)
    chars.append(char)

  return ''.join(chars)

In [108]:
print(sample(net, 1000, prime='Anna', top_k=10))

Annan, tethed, s in ting,"Saledrataphanssored.
Aleas witeevillofore andove hethenends saig sasoveshe ashed f otingeshin heryorerase t t whig crid foon as t br,"Wheay hes s aserean omo owat sitha thit thest omelasevetond ofand sht he ten co he wing an tedirorain iteved aimpad tonot sinsh, as fa f t she hay..
ounean frirono cait as ad

aite can wing tone ovede t hend ttath t he t womer feleret fallerouns and as atanofon in to helly t heron comiot ofofonorthay tererimaimen ourse thaidontevid tisthindovendang
tirowheve ane aind alyedisind s a ont h a stheen sthas thinghathisa s hant, arely, het fasiotindithimyore ced thof thed bled she htt toug, war sass wit alles bofatheseland, bund funtiof iof orond ion hiouped in t tas th hindoonoumotathan ore bure othowist iout he s ape f chtin say sus seranot allye herovowe s f berd inthe outy oro hitin itofediche the ce aliryoutinge athaine at heantord fulealon f a helighe an at hrad heralor ss benetot ano thiritone t ay thedeareell ste be we wal t h

### Loading the best model



In [0]:
with open(f'rnn_{model_to_load}_epoch.net', 'rb') as f:
    checkpoint = torch.load(f)
 
loaded = CharLSTM_extended(batch_size=checkpoint['batch_size'],seq_length=checkpoint['seq_length'], 
                           chars=checkpoint['chars'],
                           hidden_dim=checkpoint['hidden_dim'], 
                           n_layers=checkpoint['n_layers'])
loaded.load_state_dict(checkpoint['state_dict'])