In [1]:
!pip install youtokentome
!pip install prettytable
!pip install tqdm

You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [6]:
### Cell 1: ###
from collections import Counter, defaultdict
import numpy as np; np.random.seed(42)
import youtokentome as yttm
from copy import deepcopy
from tqdm import tqdm
import subprocess
import itertools
import operator
import os, sys
import random
import shutil
import json
import time
import re
import gc


### Cell 2: ###
amazon_movies = 'amazon_movies.txt'
corpus_small = 'corpus_small.csv'
corpus_small_annotated = 'corpus_small_annotated.csv'
csa_json = 'corpus_small_annotated.json'
sentences = 'sentences.txt'
bpe_model = 'bpe.model'
bpe_model_small = 'bpe10K.model'
bpe_model_medium = 'bpe20K.model'


### Cell 3: ###
def save_to_path(path, extension_of_old='.txt', extension_of_new='.csv') -> str:
    """
    Function checks if some file in path already exists and if so, it adds an
    index before the extension.
    
    Function has 2 further parameters:
        extension_of_old --> defines the extension of the original file
        extension_of_new --> defines the extension of the new file to be saved
        
    returns the new save path
    """
    
    m = re.match(f'(.+){extension_of_old}?', path)
    save_path = m.group(1) + f'{extension_of_new}'    
    j = 1
    while os.path.isfile(save_path):       
        save_path = m.group(1) + f'{j}{extension_of_new}'
        j += 1
       
    return save_path


### Cell 4: ###
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence

# check if GPU is available
train_on_gpu = torch.cuda.is_available()
#train_on_gpu = False
if train_on_gpu:
    print('Training on GPU!')
else: 
    print('No GPU available, training on CPU!')


### Cell 5: ###
with open(csa_json) as f:
    dic = json.load(f)
    sents_by_reviews = dic['sents_by_reviews']
    meta_by_reviews = dic['meta_by_reviews']
    del dic

corpus_len = len(sents_by_reviews)
array = np.arange(start=0, stop=corpus_len) # sentence indices range   

rng = np.random.default_rng(42)
train_size = int(0.98*corpus_len)
test_size = corpus_len - train_size
                 
train_set_idx = rng.choice(array, size=train_size, replace=False)

test_set_idx = np.setdiff1d(array, train_set_idx) # Unique values in array1 that are not in train_set_idx

print(f'Training set: {train_size} reviews\nTest set: {test_size} reviews')

# Some sanity checks:
assert len(train_set_idx) == train_size
assert len(test_set_idx) == test_size


### Cell 6: ###
# Getting metadata for every subword in a review:
def format_metadata(encoded, metadata):
    metadata_bysw = []
    for i, review in enumerate(encoded):
        metadata_bysw.append([])
        for j, sent in enumerate(review):
            for sw in sent:
                metadata_bysw[i].append(metadata[i][j])
                
    return metadata_bysw

# Dividing the list of sentences into train and test sets:
sents_list_train = [sents_by_reviews[idx] for idx in train_set_idx]
sents_list_test = [sents_by_reviews[idx] for idx in test_set_idx]

# Dividing the list with the metadata into train and test sets:
metadata_train = [meta_by_reviews[idx] for idx in train_set_idx]
metadata_test = [meta_by_reviews[idx] for idx in test_set_idx]

# Loading our Byte Pair Encoding model and initialising its vocabulary:
bpe = yttm.BPE(model=bpe_model)
subwords = bpe.vocab()

# Encoding the sentences in the training set: 
encoded_train = [bpe.encode(sents, output_type=yttm.OutputType.ID, eos=True) for sents in sents_list_train]
metadata_train = format_metadata(encoded_train, metadata_train)

# Flattening the list for each subword in every sentence of every review because after the metadata is collected 
# we don't need sentence boundaries represented in the list structure any more:  
encoded_train = [[sw for sent in review for sw in sent] for review in encoded_train]

# Doing the same on the test set: 
encoded_test = [bpe.encode(sents, output_type=yttm.OutputType.ID, eos=True) for sents in sents_list_test]
metadata_test = format_metadata(encoded_test, metadata_test)
encoded_test = [[sw for sent in review for sw in sent] for review in encoded_test]

print('Byte Pair Encoding for the 1st review:\n', encoded_train[0])
print('\nMetadata for the first 10 subwords of the 1st review:\n', metadata_train[0][:10])
print(f'\nSubwords: {subwords[:10]},\n{len(subwords)} subwords in total\n')


### Cell 7: ###
def get_batches(li, li_meta, batch_size=16, seq_length=160, shuffle=False):
    """
    Create a generator that returns batches of size
    batch_size x seq_length from li
    seq_length -> length of the longest sequence in batch
       
    Arguments
    ---------
    li:         List you want to make batches from
    li_meta:    List with metadata annotations of the same length as li
    batch_size: Batch size, the number of sequences per batch
    shuffle:    Whether to shuffle li, li_meta <- is not recommended because of 
                            the unresolved memory issues while padding  
    
    Output
    ---------
    x:      Inputs
    x_meta: Metadata for inputs
    y:      Targets
    """
    li_len, meta_len = len(li), len(li_meta)
    assert li_len == meta_len
    li = deepcopy(li)
    li_meta = deepcopy(li_meta)
    if shuffle:
        rng = np.random.default_rng(random.randint(0, li_len))
        set_idx = np.arange(start=0, stop=li_len)
        rng.shuffle(set_idx)
        li = [li[idx] for idx in set_idx]
        li_meta = [li_meta[idx] for idx in set_idx]
    else:        
        sort_key = lambda x: len(x)
        li.sort(key=sort_key, reverse=True)
        li_meta.sort(key=sort_key, reverse=True)
        
    total_num_revs = li_len-li_len%batch_size
    li, li_meta = li[:total_num_revs], li_meta[:total_num_revs]
    
    for bs in range(0, total_num_revs, batch_size):
        batch_revs = li[bs:bs+batch_size]
        batch_meta = li_meta[bs:bs+batch_size]
        
        #seq_lengths_revs = torch.LongTensor(list(map(len, batch_revs)))
        #seq_lengths_meta = torch.LongTensor(list(map(len, batch_meta)))
        #print(seq_lengths_revs, seq_lengths_meta)
        #return 
        
        batch_revs = [torch.tensor(rev) for rev in batch_revs] # => we get a list of L tensors of torch.Size([*])
        batch_meta = [torch.tensor(rev) for rev in batch_meta]
        batch_revs = pad_sequence(batch_revs, batch_first=True) # -> torch.Size([batch_size, longest_review])
        batch_meta = pad_sequence(batch_meta, batch_first=True)
        longest = max(batch_revs.size())

        try:
            assert batch_meta.size()[:-1] == batch_revs.size()
        except AssertionError:
            print(f'Meta: {batch_meta.size()}, Rev: {batch_revs.size()}', file=sys.stderr)
            sys.exit()
 
        if longest <= seq_length:
            a = batch_revs[:,:-1]
            b = batch_meta[:,:-1]
            c = batch_revs[:,1:]
            a_lens = (a != 0).sum(dim=1)
            yield a, b, c, a_lens
        else:
            last = longest-longest%seq_length # 160*n, 160*n - 1, idx_last = 160*n-1
            for idx in range(0, last-seq_length, seq_length):
                target_idx = idx + 1 # last idx = longest-1-seq_length
                a = batch_revs[:,idx:idx+seq_length] #.numpy()
                b = batch_meta[:,idx:idx+seq_length]
                c = batch_revs[:,target_idx:target_idx+seq_length]
                #a_lens = torch.count_nonzero(a, dim=1) pytorch 1.7
                # it is better to clamp the output later to have real sizes to fill those unclamped outputs of the LSTM hidden layer with 0s
                a_lens = (a != 0).sum(dim=1) #.clamp(min=1, max=seq_length) 
                yield a, b, c, a_lens

                    
batches = get_batches(encoded_train[:64], metadata_train[:64], batch_size=16, shuffle=False)

for e in range(1): # Making sure that batch sizes are right 
    for x, x_meta, y, bs in batches:
        s1, s2, s3 = x.size()[-1], x_meta.size()[1], y.size()[-1]
        if s1 != s2 or s2 != s3 or s1 != s3:
            print(s1, s2, s3)

# Printing out the first 10 items in a sequence:
print(f'X of {x.shape}:\n', x[:8, :10])
print(f'\nY of {y.shape}:\n', y[:8, :10])

# Printing out the first 10 metadata arrays:
print(f'\nX_meta of {x_meta.shape}:\n', x_meta[0, :10])

# Printing out the batch sizes for the last batch:
print(f'\nBatch sizes of {bs.shape}:\n', bs)


### Cell 8: ###
def predict(net, subword, metadata, h=None, temp=0.9, top_k=None, allowed_idx=None):
    """ 
    Given a subword and its metadata predict the next subword.
    Returns the predicted subword and the hidden state.
    """
    # tensor inputs:
    subword = torch.tensor([[subword]])

    if train_on_gpu:
        subword, metadata = subword.to(device='cuda', dtype=torch.long), \
                            metadata.to(device='cuda', dtype=torch.long)

    # detach hidden state from history:
    h = tuple([each.data for each in h])
    
    # get the output of the model:
    out, h = net(subword, metadata, torch.tensor([1]), h) # Batch_size = 1
    
    # get the subwords probabilities
    # apply softmax to get p probabilities for the likely next subword giving x:  
    out = out[:, allowed_idx] # size: [1, len(vocab)-len(allowed_idx)]
    p = F.softmax(out/temp, dim=1).data 
    if train_on_gpu:
        p = p.cpu() # move to cpu

    # get top subwords
    # considering the k most probable subwords with topk method:
    if top_k is None:
        top_sw = np.arange(p.size()[-1])
    else:
        p, top_sw = p.topk(top_k)
        top_sw = top_sw.numpy().squeeze()

    # select the likely next subword with some element of randomness:
    p = p.numpy().squeeze()
    subword = np.random.choice(top_sw, p=p/p.sum())

    # return the encoded value of the predicted subword and the hidden state:
    return subword, h
  

def sample(net, size=300, num_sents=5, prime='this movie', temp=0.9,
           metadata=None, top_k=None, indices_to_ignore=None):
    
    if metadata is None:
        metadata = [1, 5, 8, 11, 13, 17]
    if train_on_gpu:
        net.cuda()
    else:
        net.cpu()

    vocab_size = len(net.subwords)
    filtered_vocab_map, allowed_idx = {}, []
    if indices_to_ignore is not None:
        idx_filtered = 0        
        for idx in range(vocab_size):
            if idx not in indices_to_ignore:
                allowed_idx.append(idx)
                filtered_vocab_map[idx_filtered] = idx
                idx_filtered += 1
        allowed_idx = torch.tensor(allowed_idx)
    else:
        allowed_idx = torch.tensor(list(range(vocab_size)))
        filtered_vocab_map = {i:i for i in range(vocab_size)}
    filtered_vocab_map_inverse = {v: k for k, v in filtered_vocab_map.items()}

    # First off, run through the prime subwords:
    subwords = net.bpe.encode(prime, output_type=yttm.OutputType.ID)
    metadata = torch.tensor([[metadata]]) # Reshaping to (1, 1, 6) so that the dims correspond to those from get_batches()
    
    h = net.init_hidden(1)
    for sw in subwords:
        subword, h = predict(net, sw, metadata, h, temp=temp, top_k=top_k, allowed_idx=allowed_idx)
        subword = filtered_vocab_map[subword]
    subwords.append(subword) # appending the subword that comes after the last prime subword
    
    # Now pass in the previous subword and get a new one
    i = 0
    eos_idx = filtered_vocab_map_inverse[3] # 3 is the index of '<EOS>'
    while (subwords.count(eos_idx) < num_sents) and (i < size): # We predict 'num_sents' until 'size' is not exceeded  
        subword, h = predict(net, subwords[-1], metadata, h, temp=temp, top_k=top_k, allowed_idx=allowed_idx)
        subword = filtered_vocab_map[subword]
        subwords.append(subword)
        i += 1 
        
    #subwords = list(filter(lambda a: a != 3, subwords)) # remove all occurrences of '<EOS>' from the final output
    return net.bpe.decode(subwords)[0]


### Cell 9: ###
class SubwordRNN(nn.Module):
    
    def __init__(self, subwords=None, bpe=None, embedding_dims=None, \
                 n_hidden=1024, n_layers=2, drop_prob=0.5):
        
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        
        # saving the bpe-model to be able to use its encoding/decoding functions when generating some text:
        self.bpe = bpe
        self.subwords = subwords
        
        # defining 2 Embedding layers: one for subwords and another for metadata:
        if embedding_dims is None:
            embedding_dims = [256, 20]
            
        subword_emb_dim, metadata_emb_dim = embedding_dims
        
        self.sw_emb_layer = nn.Embedding(len(self.subwords), subword_emb_dim)
        self.md_emb_layer = nn.Embedding(19, metadata_emb_dim) # 18 - total number of metadata features + 1 for padded sequences
        
        embedding_dim = subword_emb_dim + 6*metadata_emb_dim
        # defining the LSTM:
        self.lstm = nn.LSTM(embedding_dim, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # define a dropout layer:
        self.dropout = nn.Dropout(drop_prob)
        
        # define the final, fully-connected output layer:
        self.fc = nn.Linear(n_hidden, len(self.subwords))
      
    
    def forward(self, x, x_meta, batch_lengths, hidden):
        """ 
        Forward pass through the network
        """
        # Creating an embedding by concatenating a subword embedding and a metadata embedding:
        subword_embeds = self.sw_emb_layer(x)
        meta_embeds = self.md_emb_layer(x_meta).reshape(*x.size(), -1)
    
        embeds = torch.cat((subword_embeds, meta_embeds), dim=2)
        
        # Clamping because pack_padded_sequence doesn't support empty batches
        batch_lengths_clamped = batch_lengths.clamp(min=1, max=batch_lengths.max().item())
        
        # Packing the sequence to avoid many senseless computations:
        packed = pack_padded_sequence(embeds, batch_lengths_clamped, batch_first=True)
        
        # Get the outputs and the new hidden state from the lstm:
        r_output_packed, hidden = self.lstm(packed, hidden)
        
        # Masking hidden, where not clamped batch_size = 0: 
        mask = (batch_lengths == 0).view(-1, 1)
        if train_on_gpu:
            mask = mask.to(device='cuda') #.to(device='cuda') #.cuda()
        hidden[0].masked_fill_(mask, 0) # Masking hidden but not the cell state! # Sure that this has to be done?
        
        # Unpack the output:
        r_output, _ = pad_packed_sequence(r_output_packed, batch_first=True)
        
        # pass through a dropout layer:
        out = self.dropout(r_output)
        
        # Stack up LSTM outputs using view:
        out = out.contiguous().view(-1, self.n_hidden)
        
        # put x through the fully-connected layer:
        out = self.fc(out)       
        
        # return the final output and the hidden state:
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        """ 
        Initializes hidden state 
        """
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM:
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device='cuda'),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device='cuda'))
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden


### Cell 10: ###
def train(net, train_data, train_metadata, epochs=10, batch_size=16, 
          seq_length=160, lr=0.0001, clip=5, val_frac=0.1, print_every=1, 
          current_epoch=1, save_every=2, shuffle=False):
    """ 
    Training a network 
    
    Arguments
    ---------
        
    net: SubwordRNN network
    data: text data to train the network
    epochs: Number of epochs to train
    batch_size: Number of mini-sequences per mini-batch, aka batch size
    seq_length: Number of character steps per mini-batch
    lr: learning rate
    clip: gradient clipping
    val_frac: Fraction of data to hold out for validation
    print_every: Number of steps for printing training and validation loss   
    shuffle: Whether to shuffle the training data after each iteration
    """
    net.train()
    
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    
    # Create training and validation data:
    corpus_len = len(train_data)
    array = np.arange(start=0, stop=corpus_len) # review indices range   

    rng = np.random.default_rng(42)
    train_size = int((1-val_frac)*corpus_len)
    val_size = corpus_len - train_size
    train_set_idx = rng.choice(array, size=train_size, replace=False)

    val_set_idx = np.setdiff1d(array, train_set_idx) # Unique values in array1 that are not in train_set_idx
    
    data = deepcopy([train_data[idx] for idx in train_set_idx])
    metadata = deepcopy([train_metadata[idx] for idx in train_set_idx])
    val_data = deepcopy([train_data[idx] for idx in val_set_idx])    
    val_metadata = deepcopy([train_metadata[idx] for idx in val_set_idx])
    
    if train_on_gpu:
        net.to(device='cuda')
    
    counter = 0
    n_subwords = len(net.subwords)
    for e in range(current_epoch, epochs+current_epoch):
        # Initialize hidden state:
        h = net.init_hidden(batch_size)
        losses = []
        
        for inputs, meta, targets, batch_sizes in get_batches(data, metadata, batch_size, seq_length, shuffle):
            counter += 1
            
            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history:
            h = tuple([each.data for each in h])
            
            if train_on_gpu:
                inputs = inputs.to(device='cuda', dtype=torch.long)
                meta = meta.to(device='cuda', dtype=torch.long)
                targets = targets.to(device='cuda', dtype=torch.long)
                
            seq_length_real_train = targets.size()[-1]

            # Zero accumulated gradients:
            net.zero_grad()
            
            # Get the output from the model:
            output, h = net(inputs, meta, batch_sizes, h)

            # Calculate the loss and perform backprop:
            loss = criterion(output, targets.reshape(batch_size*seq_length_real_train))
            losses.append(loss.item())
            loss.backward()
            
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs:
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()

        # Loss stats:
        if e % print_every == 0:
            # Get validation loss:
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for x, x_meta, y, bs in get_batches(val_data, val_metadata, batch_size, seq_length, shuffle):                
                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])
                
                if train_on_gpu:
                    inputs = x.to(device='cuda', dtype=torch.long)
                    meta = x_meta.to(device='cuda', dtype=torch.long)
                    targets = y.to(device='cuda', dtype=torch.long)
                    
                seq_length_real_val = targets.size()[-1]
                output, val_h = net(inputs, meta, bs, val_h)
                val_loss = criterion(output, targets.reshape(batch_size*seq_length_real_val))
            
                val_losses.append(val_loss.item())
            
            net.train() # reset to train mode after iterating through validation data
            mean_val_loss = np.mean(val_losses)
            
            # Printing out some training statistics:
            print(f'Epoch: {e:2}/{epochs}...',
                  f'Step: {counter:6}...',
                  f'Mean_Loss: {np.mean(losses):.3f}...',
                  f'Val_Loss: {mean_val_loss:.3f}')
            
            # Generating text:
            generated = sample(net, size=300, prime='this movie', metadata=[2, 5, 7, 10, 12, 18], 
                               top_k=None, indices_to_ignore=[0, 1]) # generating text without '<PAD>', '<UNK>'          
            generated = f'Generated at epoch {e}:\n{generated}\n\n' # generated text with a header
            
            save_dir = 'Generated Text' # directory where the generated text at epoch e will be saved
            if not os.path.isdir(save_dir): # if there is no such directory, one will be created
                os.mkdir(save_dir) 
            with open(f'{save_dir}/generated_text.txt', 'a', encoding='utf-8', errors='replace') as f:
                f.write(generated)
                
        # Saving our model:        
        if e % save_every == 0 or e == epochs:
            net.eval() # VERY IMPORTANT
            
            if e >= 1 and e < 10:
                model = f'lstm_0{e}_epoch.pt'
            else:
                model = f'lstm_{e}_epoch.pt'
            
            checkpoint = {'epoch': e,
                          'n_hidden': net.n_hidden,
                          'n_layers': net.n_layers,
                          'model_state_dict': net.state_dict(),
                          'opt_state_dict': opt.state_dict(),
                          'loss': loss}

            with open(model, 'wb') as f:
                torch.save(checkpoint, f)
                
            net.train() # VERY IMPORTANT   


### Cell 11: ###
# Defining and printing out the NN properties:
embedding_dims = [256, 20] #[128, 10]
drop_prob = 0.5 #0.5
n_hidden= 1024 #1024
n_layers=3 #2
lr=0.0001 #0.001 seems to overfit

net = SubwordRNN(subwords=subwords, bpe=bpe, embedding_dims=embedding_dims, 
                 n_hidden=n_hidden, n_layers=n_layers, drop_prob=drop_prob)
print(net)

batch_size = 64 #16 
seq_length = 160
n_epochs = 10 #2

# Training the model:
#train(net, encoded_train, metadata_train, epochs=n_epochs, 
      #batch_size=batch_size, seq_length=seq_length, lr=lr, 
      #print_every=1, save_every=5, current_epoch=1)

Training on GPU!
Training set: 97998 reviews
Test set: 2000 reviews
Byte Pair Encoding for the 1st review:
 [217, 2855, 482, 126, 170, 4118, 29044, 86, 1253, 767, 115, 340, 898, 186, 482, 126, 170, 1304, 142, 84, 1018, 1627, 86, 764, 1738, 1527, 505, 3, 87, 2237, 800, 950, 593, 305, 5367, 126, 3810, 186, 86, 6560, 120, 276, 867, 86, 530, 5713, 3974, 109, 4156, 2666, 329, 4065, 4266, 3, 86, 170, 192, 17095, 448, 3, 310, 7066, 305, 170, 87, 4120, 120, 87, 510, 1021, 689, 493, 1650, 6225, 1943, 35, 3777, 87, 14839, 2159, 11504, 24, 245, 92, 456, 354, 109, 980, 26, 3, 21504, 6457, 19121, 87, 9748, 27, 2061, 2078, 16363, 3, 86, 4122, 319, 120, 5681, 1112, 138, 444, 109, 14173, 1233, 4821, 297, 1988, 3, 315, 84, 292, 4587, 87, 3623, 113, 87, 1954, 3185, 493, 1552, 3185, 170, 26510, 7196, 3]

Metadata for the first 10 subwords of the 1st review:
 [[1, 5, 8, 11, 12, 17], [1, 5, 8, 11, 12, 17], [1, 5, 8, 11, 12, 17], [1, 5, 8, 11, 12, 17], [1, 5, 8, 11, 12, 17], [1, 5, 8, 11, 12, 17], [1, 5, 8,

In [7]:
path_to_model = 'lstm_10_epoch_lr0001.pt'

checkpoint = torch.load(path_to_model)
net.load_state_dict(checkpoint['model_state_dict'])

print(net)
#train(net, encoded_train, metadata_train, epochs=n_epochs, batch_size=batch_size, 
      #seq_length=seq_length, lr=lr, print_every=1, save_every=1, current_epoch=7)

SubwordRNN(
  (sw_emb_layer): Embedding(30469, 256)
  (md_emb_layer): Embedding(19, 20)
  (lstm): LSTM(376, 1024, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=1024, out_features=30469, bias=True)
)


In [8]:
from prettytable import PrettyTable

def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad: continue
        param = parameter.numel()
        table.add_row([name, param])
        total_params+=param
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params
    
count_parameters(net)

+---------------------+------------+
|       Modules       | Parameters |
+---------------------+------------+
| sw_emb_layer.weight |  7800064   |
| md_emb_layer.weight |    380     |
|  lstm.weight_ih_l0  |  1540096   |
|  lstm.weight_hh_l0  |  4194304   |
|   lstm.bias_ih_l0   |    4096    |
|   lstm.bias_hh_l0   |    4096    |
|  lstm.weight_ih_l1  |  4194304   |
|  lstm.weight_hh_l1  |  4194304   |
|   lstm.bias_ih_l1   |    4096    |
|   lstm.bias_hh_l1   |    4096    |
|  lstm.weight_ih_l2  |  4194304   |
|  lstm.weight_hh_l2  |  4194304   |
|   lstm.bias_ih_l2   |    4096    |
|   lstm.bias_hh_l2   |    4096    |
|      fc.weight      |  31200256  |
|       fc.bias       |   30469    |
+---------------------+------------+
Total Trainable Params: 61567361


61567361

In [9]:
metadata_experiments = [[2, 5, 8, 11, 12, 17], 
                        [1, 5, 8, 11, 12, 17], 
                        [2, 3, 8, 11, 12, 17], 
                        [2, 5, 6, 11, 12, 17], 
                        [2, 5, 8, 10, 12, 17], 
                        [2, 5, 8, 11, 15, 17], 
                        [2, 5, 8, 11, 12, 18]]

for cur_meta in metadata_experiments:
    generated = sample(net, size=300, num_sents=2, prime='this movie', temp=0.6,
                       metadata=cur_meta, top_k=None)
    generated = f'Metadata: {" ".join([str(el) for el in cur_meta])}\n{generated}\n\n'
    with open(f'experiments.txt', 'a', encoding='utf-8', errors='replace') as f:
        f.write(generated)

In [5]:
def perplexity(net, encoded_test, metadata_test, batch_size=16):
    
    #criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
    
    if train_on_gpu:
        net.cuda()
    else:
        net.cpu()
        
    net.eval()    
    h = net.init_hidden(batch_size) 
    logs_sum, N = 0, 0
    
    for x, x_meta, y, bs in get_batches(encoded_test, metadata_test):                

        h = tuple([each.data for each in h])

        if train_on_gpu:
            inputs = x.to(device='cuda', dtype=torch.long)
            meta = x_meta.to(device='cuda', dtype=torch.long)
            targets = y.to(device='cuda', dtype=torch.long)

        seq_length_real_val = targets.size()[-1]
        N += bs.sum().item()
        output, h = net(inputs, meta, bs, h)
        loss = F.cross_entropy(output, targets.reshape(batch_size*seq_length_real_val), 
                               ignore_index=0, reduction='sum')
        logs_sum += loss.item()
        
    prplxt = np.exp(logs_sum/N)
    
    return prplxt

print(f'Perplexity of the trained model: {perplexity(net, encoded_test, metadata_test):.2f}')

Perplexity of the trained model: 501.74
