In [1]:
!pip install youtokentome
!pip install prettytable
!pip install tqdm

You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
### Cell 1: ###
from collections import Counter, defaultdict
import numpy as np; np.random.seed(42)
import youtokentome as yttm
from copy import deepcopy
from tqdm import tqdm
import pandas as pd
import subprocess
import itertools
import operator
import os, sys
import random
import shutil
import json
import time
import re
import gc


### Cell 2: ###
# File names used by the notebook. Only `corpus_small_annotated` (read with
# pd.read_csv as a tab-separated file) and `bpe_model` (loaded with yttm.BPE)
# are referenced by the cells below; the remaining names are presumably used
# by earlier preprocessing steps -- TODO confirm.
amazon_movies = 'amazon_movies.txt'
corpus_small = 'corpus_small.csv'
corpus_small_annotated = 'corpus_small_annotated.csv'  # annotated sentence corpus (input of Cell 5)
csa_json = 'corpus_small_annotated.json'
sentences = 'sentences.txt'
bpe_model = 'bpe.model'  # Byte Pair Encoding model loaded in Cell 6
bpe_model_small = 'bpe10K.model'
bpe_model_medium = 'bpe20K.model'


### Cell 3: ###
def save_to_path(path, extension_of_old='.txt', extension_of_new='.csv') -> str:
    """
    Build a save path that does not collide with an existing file.

    The old extension (if `path` ends with it) is replaced by the new one.
    If a file with the resulting name already exists, an increasing index
    is inserted before the new extension until a free name is found.

    Arguments
    ---------
    path: path of the original file
    extension_of_old: extension of the original file; it is stripped only
                      when `path` literally ends with it
    extension_of_new: extension of the file to be saved

    Returns the new save path.
    """
    # The extension is compared as a literal suffix. A regular expression
    # built from it would treat '.' as a wildcard and could match in the
    # middle of the name (e.g. 'my_txt_file.dat' would be cut down to 'my').
    has_old_extension = (bool(extension_of_old)
                         and path.endswith(extension_of_old)
                         and len(path) > len(extension_of_old))
    stem = path[:-len(extension_of_old)] if has_old_extension else path

    save_path = f'{stem}{extension_of_new}'
    j = 1
    while os.path.isfile(save_path):
        save_path = f'{stem}{j}{extension_of_new}'
        j += 1

    return save_path


### Cell 4: ###
import torch
from torch import nn
import torch.nn.functional as F

# Detect CUDA once; the functions defined below branch on this flag.
train_on_gpu = torch.cuda.is_available()
#train_on_gpu = False
print('Training on GPU!' if train_on_gpu else 'No GPU available, training on CPU!')


### Cell 5: ###
# Load the annotated corpus (tab-separated, first row is the header).
# NOTE: the former `warn_bad_lines=True` argument was dropped on purpose. It
# only had an effect together with `error_bad_lines=False` (malformed lines
# raised an error either way), and the keyword was removed in pandas 2.0,
# where passing it raises a TypeError.
sentences_df = pd.read_csv(corpus_small_annotated, sep='\t', header=0, encoding='utf-8')
sentences_df['Sentence'] = sentences_df['Sentence'].astype(str).str.lower()
# All columns from the third one onwards are kept as sentence metadata:
sentences_meta = sentences_df.iloc[:,2:]

sents_list = sentences_df['Sentence'].tolist() # Converting the column with sentences into a list of strings

print(sentences_df.head(3))

corpus_len = len(sents_list)
array = np.arange(start=0, stop=corpus_len) # sentence indices range   

# Seeded generator so that the train/test split is reproducible:
rng = np.random.default_rng(42)
train_size = int(0.98*corpus_len)
test_size = corpus_len - train_size

# Sampling without replacement -> every training index is unique:
train_set_idx = rng.choice(array, size=train_size, replace=False)

test_set_idx = np.setdiff1d(array, train_set_idx) # Sorted unique values of `array` that are not in train_set_idx

print(f'Training set: {train_size} sents\nTest set: {test_size} sents')

# Some sanity checks:
assert len(train_set_idx) == train_size
assert len(test_set_idx) == test_size


### Cell 6: ###
# Split the sentences into a training and a test list using the sampled indices:
sents_list_train = [sents_list[idx] for idx in train_set_idx]
sents_list_test = [sents_list[idx] for idx in test_set_idx]

# One metadata row (list of values) per sentence:
metadata = sentences_meta.values.tolist()

# Same split for the metadata, so row i still belongs to sentence i:
metadata_train = [metadata[idx] for idx in train_set_idx]
metadata_test = [metadata[idx] for idx in test_set_idx]

# Load the Byte Pair Encoding model and fetch its vocabulary:
bpe = yttm.BPE(model=bpe_model)
subwords = bpe.vocab()

# Training set: encode every sentence to subword ids (with an EOS id appended) ...
encoded_train = bpe.encode(sents_list_train, output_type=yttm.OutputType.ID, eos=True)
# ... repeat each sentence's metadata row once per subword of that sentence ...
metadata_train = np.array([meta_row
                           for meta_row, sent_ids in zip(metadata_train, encoded_train)
                           for _ in sent_ids])
# ... and flatten the per-sentence id lists into one long array:
encoded_train = np.array(list(itertools.chain.from_iterable(encoded_train)))

# Test set: exactly the same three steps.
encoded_test = bpe.encode(sents_list_test, output_type=yttm.OutputType.ID, eos=True)
metadata_test = np.array([meta_row
                          for meta_row, sent_ids in zip(metadata_test, encoded_test)
                          for _ in sent_ids])
encoded_test = np.array(list(itertools.chain.from_iterable(encoded_test)))

print('Byte Pair Encoding:', encoded_train[:10])
print(f'\nSubwords: {subwords[:10]},\n{len(subwords)} subwords in total\n')

print('Metadata for every subword:\n', metadata_train[:10])


### Cell 7: ###
def get_batches(arr, arr_meta, batch_size, seq_length):
    """
    Create a generator that returns batches of size
    batch_size x seq_length from arr.

    Arguments
    ---------
    arr: 1-D array you want to make batches from
    arr_meta: Array with metadata annotations of the same length as arr;
              any trailing dimensions (e.g. the number of metadata
              features per subword) are preserved
    batch_size: Batch size, the number of sequences per batch
    seq_length: Number of encoded subwords in a sequence

    Yields
    ------
    x: inputs of shape (batch_size, seq_length)
    x_meta: metadata of shape (batch_size, seq_length, *arr_meta.shape[1:])
    y: targets, i.e. x shifted by one position; the last target of the very
       last batch wraps around to the first column of the reshaped array
    """

    assert len(arr) == len(arr_meta)

    batch_size_total = batch_size * seq_length
    # total number of batches we can make, // integer division, round down:
    n_batches = len(arr)//batch_size_total
    n_keep = n_batches * batch_size_total

    # Keep only enough subwords to make full batches and reshape into
    # batch_size rows; the row length is inferred:
    arr = arr[:n_keep].reshape((batch_size, -1))

    # Doing the same with the metadata. The feature dimension is taken from
    # the input instead of being hard-coded, so any number of metadata
    # features per subword works:
    meta_shape = (batch_size, -1) + tuple(arr_meta.shape[1:])
    arr_meta = arr_meta[:n_keep].reshape(meta_shape)

    row_length = arr.shape[1]
    # iterate through the array, one sequence at a time:
    for n in range(0, row_length, seq_length):
        # The features:
        x = arr[:, n:n+seq_length]
        # Metadata:
        x_meta = arr_meta[:, n:n+seq_length]
        # The targets, shifted by one:
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]

        # The last target is the subword right after the window; for the
        # final window there is none, so wrap around to the first column:
        next_col = n + seq_length
        y[:, -1] = arr[:, next_col] if next_col < row_length else arr[:, 0]
        yield x, x_meta, y
        
# Smoke test: pull a single batch of 8 sequences x 50 subwords from the training data.
batches = get_batches(encoded_train, metadata_train, 8, 50)
x, x_meta, y = next(batches)

# Printing out the first 10 items of every sequence in the batch:
print(f'X of shape {x.shape}:\n', x[:8, :10])
print(f'\nY of shape {y.shape}:\n', y[:8, :10])

# Printing out the first 10 metadata arrays of the first sequence:
print(f'\nX_meta of shape {x_meta.shape}:\n', x_meta[0, :10])

# Sanity check of the reshape: the first sequence of the first batch must hold
# the first 50 metadata rows of the flat metadata array, in the same order.
print('\nFirst 50 elements in the 1st batch are equal to original values:', np.array_equal(x_meta[0, :50], metadata_train[:50]))


### Cell 8: ###
def predict(net, subword, metadata, h=None, temp=0.9, top_k=None):
    """ 
    Given a subword and its metadata predict the next subword.
    Returns the predicted subword and the hidden state.

    Arguments
    ---------
    net: network exposing `subwords`, `init_hidden(batch_size)` and a forward
         pass `net(inputs, meta, hidden)` that returns (logits, hidden)
    subword: id of the current subword
    metadata: integer array with the metadata ids of the current subword
    h: hidden state of the previous step; when None a fresh state is
       created with `net.init_hidden(1)`
    temp: softmax temperature (the logits are divided by it)
    top_k: if given, sample only among the `top_k` most probable subwords
    """

    # Run on whatever device the network lives on, and always feed int64
    # indices (embedding lookups need them on CPU as well as on GPU):
    device = next(net.parameters()).device
    inputs = torch.as_tensor(np.array([[subword]]), dtype=torch.long, device=device)
    meta = torch.as_tensor(np.asarray(metadata), dtype=torch.long, device=device)

    # start from a fresh hidden state if the caller did not provide one:
    if h is None:
        h = net.init_hidden(inputs.size(0))

    # detach hidden state from history:
    h = tuple(each.detach() for each in h)

    # Inference only, so no autograd graph is needed:
    with torch.no_grad():
        # get the output of the model:
        out, h = net(inputs, meta, h)

        # apply softmax to get p probabilities for the likely next subword giving x:
        p = F.softmax(out/temp, dim=1).cpu()

    # get top subwords
    # considering the k most probable subwords with topk method:
    if top_k is None:
        top_sw = np.arange(len(net.subwords))
    else:
        p, top_sw = p.topk(top_k)
        # reshape(-1) instead of squeeze(): squeeze() turns a single
        # candidate (top_k=1) into a 0-d array, which np.random.choice rejects
        top_sw = top_sw.numpy().reshape(-1)

    # select the likely next subword with some element of randomness:
    p = p.numpy().reshape(-1)
    subword = np.random.choice(top_sw, p=p/p.sum())

    # return the encoded value of the predicted subword and the hidden state:
    return subword, h
  

def sample(net, size=300, num_sents=5, prime='this movie', temp=0.9,
           metadata=[1, 5, 8, 11, 13, 17], top_k=None):
    """
    Generate text with the network, starting from a prime string.

    Arguments
    ---------
    net: trained network; its `bpe` attribute is used for encoding/decoding
    size: maximum number of subwords generated after the prime
    num_sents: generation stops once this many '<EOS>' ids were produced
    prime: text used to warm up the hidden state (must not encode to nothing)
    temp: softmax temperature passed on to predict()
    metadata: metadata ids applied to every generated subword
              (the default list is only read, never modified)
    top_k: passed on to predict(); restricts sampling to the k best subwords

    Returns the decoded text with all '<EOS>' markers removed.
    The network is put into eval mode while sampling, so that dropout does
    not disturb the predictions; its previous mode is restored afterwards.
    """
    eos_id = 3 # 3 is the index of '<EOS>'

    if(train_on_gpu):
        net.cuda()
    else:
        net.cpu()

    # Remember the current mode: sample() is also called in the middle of training.
    was_training = net.training
    net.eval()
    try:
        # First off, run through the prime subwords:
        subwords = list(net.bpe.encode(prime, output_type=yttm.OutputType.ID))
        if not subwords:
            raise ValueError('`prime` must encode to at least one subword')
        metadata = np.array([[metadata]]) # Reshaping to (1, 1, n_features) so that the dims correspond to those from get_batches()

        h = net.init_hidden(1)
        with torch.no_grad():
            for sw in subwords:
                subword, h = predict(net, sw, metadata, h, temp=temp, top_k=top_k)

            # appending the subword that comes after the last prime subword
            # (stored as a plain int, like the ids produced by the encoder):
            subwords.append(int(subword))

            # Now pass in the previous subword and get a new one
            i = 0
            while (subwords.count(eos_id) < num_sents) and (i < size):
                subword, h = predict(net, subwords[-1], metadata, h, temp=temp, top_k=top_k)
                subwords.append(int(subword))
                i += 1
    finally:
        net.train(was_training)

    # remove all occurrences of '<EOS>' from the final output:
    subwords = [sw for sw in subwords if sw != eos_id]
    return net.bpe.decode(subwords)[0]


### Cell 9: ###
class SubwordRNN(nn.Module):
    """
    LSTM language model over BPE subwords, conditioned on metadata.

    Every input step is the concatenation of one subword embedding and the
    embeddings of all metadata features of that subword. The LSTM output is
    passed through dropout and a linear layer that scores every subword of
    the vocabulary.

    Arguments
    ---------
    subwords: vocabulary (list of subwords); its length fixes the size of
              the embedding table and of the output layer
    bpe: BPE model, stored only so that text generation can encode/decode
    embedding_dims: [subword embedding size, embedding size per metadata
                    feature]; defaults to [256, 20]
    n_hidden: LSTM hidden size
    n_layers: number of stacked LSTM layers
    drop_prob: dropout probability (between LSTM layers and before the
               output layer)
    n_meta_values: number of rows of the metadata embedding table, i.e.
                   one more than the largest metadata id
    n_meta_features: number of metadata ids attached to every subword
    """
    
    def __init__(self, subwords=None, bpe=None, embedding_dims=None, \
                 n_hidden=1024, n_layers=2, drop_prob=0.5,
                 n_meta_values=19, n_meta_features=6):
        
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.n_meta_features = n_meta_features
        
        # saving the bpe-model to be able to use its encoding/decoding functions when generating some text:
        self.bpe = bpe
        self.subwords = subwords
        
        # defining 2 Embedding layers: one for subwords and another for metadata:
        if embedding_dims is None:
            embedding_dims = [256, 20]
            
        subword_emb_dim, metadata_emb_dim = embedding_dims
        
        self.subword_embeds = nn.Embedding(len(self.subwords), subword_emb_dim)
        # One shared table for all metadata ids; the default of 19 rows covers ids 0..18:
        self.meta_embeds = nn.Embedding(n_meta_values, metadata_emb_dim)
        
        # LSTM input = subword embedding + one embedding per metadata feature:
        embedding_dim = subword_emb_dim + n_meta_features*metadata_emb_dim
        # defining the LSTM:
        self.lstm = nn.LSTM(embedding_dim, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # define a dropout layer:
        self.dropout = nn.Dropout(drop_prob)
        
        # define the final, fully-connected output layer:
        self.fc = nn.Linear(n_hidden, len(self.subwords))
        
    
    def forward(self, x, x_meta, hidden):
        """ 
        Forward pass through the network. 

        x: subword ids of shape (batch, seq)
        x_meta: metadata ids of shape (batch, seq, n_meta_features)
        hidden: tuple with the hidden and the cell state of the LSTM

        Returns the scores of shape (batch*seq, vocabulary size) and the
        new hidden state.
        """
        # Creating an embedding by concatenating a subword embedding and a metadata embedding:
        subword_embeds = self.subword_embeds(x)
        # (batch, seq, features, dim) -> (batch, seq, features*dim):
        meta_embeds = self.meta_embeds(x_meta).view(*x.size(), -1)
    
        embeds = torch.cat((subword_embeds, meta_embeds), dim=2)
        
        # Get the outputs and the new hidden state from the lstm:
        r_output, hidden = self.lstm(embeds, hidden)
        
        # pass through a dropout layer:
        out = self.dropout(r_output)
        
        # Stack up LSTM outputs using view:
        out = out.contiguous().view(-1, self.n_hidden)
        
        # put x through the fully-connected layer:
        out = self.fc(out)       
        
        # return the final output and the hidden state:
        return out, hidden    
    
    def init_hidden(self, batch_size):
        """ 
        Initializes hidden state 

        Returns a tuple of two zero tensors (hidden state, cell state), each
        of shape n_layers x batch_size x n_hidden.
        """
        # new_zeros() creates the tensors with the dtype and on the device of
        # the model parameters, so the state always matches the network no
        # matter where it was moved to:
        weight = next(self.parameters())
        
        hidden = (weight.new_zeros(self.n_layers, batch_size, self.n_hidden),
                  weight.new_zeros(self.n_layers, batch_size, self.n_hidden))
        
        return hidden


### Cell 10: ###
def train(net, data, metadata, epochs=10, batch_size=10, seq_length=50, lr=0.001, 
          clip=5, val_frac=0.1, print_every=1, current_epoch=1, save_every=10):
    """ 
    Training a network 
    
    Arguments
    ---------
        
    net: SubwordRNN network
    data: encoded text data (array of subword ids) to train the network
    metadata: array with the metadata ids of every subword in data
    epochs: Number of epochs to train
    batch_size: Number of mini-sequences per mini-batch, aka batch size
    seq_length: Number of subword steps per mini-batch
    lr: learning rate
    clip: gradient clipping
    val_frac: Fraction of data (taken from the end) to hold out for validation
    print_every: Number of epochs between printing training and validation
                 loss; a text sample is generated and appended to
                 'Generated Text/generated_text.txt' at the same time
    current_epoch: number of the first epoch (use a value > 1 when the
                   training of a loaded model is continued)
    save_every: Number of epochs between saving checkpoints
                ('lstm_<epoch>_epoch.pt' in the working directory)
    """
    net.train()
    
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # Create training and validation data:
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]
    metadata, val_metadata = metadata[:val_idx], metadata[val_idx:]
    
    device = torch.device('cuda' if train_on_gpu else 'cpu')
    if train_on_gpu:
        net.to(device='cuda')
    
    counter = 0
    # number of the last epoch of this run (differs from `epochs` when resuming):
    last_epoch = current_epoch + epochs - 1
    for e in range(current_epoch, epochs+current_epoch):
        # Initialize hidden state:
        h = net.init_hidden(batch_size)
        losses = []
        
        for x, x_meta, y in get_batches(data, metadata, batch_size, seq_length):
            counter += 1
            
            # Convert our data to Torch tensors; the ids are always cast to
            # int64, which the embedding layers and the loss expect:
            inputs = torch.from_numpy(x).to(device=device, dtype=torch.long)
            meta = torch.from_numpy(x_meta).to(device=device, dtype=torch.long)
            targets = torch.from_numpy(y).to(device=device, dtype=torch.long)
        
            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history:
            h = tuple([each.data for each in h])

            # Zero accumulated gradients:
            net.zero_grad()
            
            # Get the output from the model:
            output, h = net(inputs, meta, h)
            
            # Calculate the loss and perform backprop:           
            loss = criterion(output, targets.reshape(batch_size*seq_length))
            losses.append(loss.item())
            loss.backward()
            
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs:
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()

        # Loss stats:
        if e % print_every == 0:
            # Get validation loss:
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            # No gradients are needed for validation; without no_grad() every
            # forward pass would build and keep an autograd graph:
            with torch.no_grad():
                for x, x_meta, y in get_batches(val_data, val_metadata, batch_size, seq_length):
                    inputs = torch.from_numpy(x).to(device=device, dtype=torch.long)
                    meta = torch.from_numpy(x_meta).to(device=device, dtype=torch.long)
                    targets = torch.from_numpy(y).to(device=device, dtype=torch.long)

                    val_h = tuple([each.data for each in val_h])

                    output, val_h = net(inputs, meta, val_h)
                    val_loss = criterion(output, targets.reshape(batch_size*seq_length))

                    val_losses.append(val_loss.item())
            
            net.train() # reset to train mode after iterating through validation data
            mean_val_loss = np.mean(val_losses)
            
            # Printing out some training statistics:
            print(f'Epoch: {e:2}/{last_epoch}...',
                  f'Step: {counter:6}...',
                  f'Mean_Loss: {np.mean(losses):.3f}...',
                  f'Val_Loss: {mean_val_loss:.3f}')
            
            # Generating text:
            generated = sample(net, size=300, prime='this movie', temp=0.9,
                               metadata=[2, 5, 7, 10, 12, 18], top_k=None) # generating text            
            generated = f'Generated at epoch {e}:\n{generated}\n\n' # generated text with a header
            
            save_dir = 'Generated Text' # directory where the generated text at epoch e will be saved
            os.makedirs(save_dir, exist_ok=True) # created on first use, no error if it already exists
            with open(f'{save_dir}/generated_text.txt', 'a', encoding='utf-8', errors='replace') as f:
                f.write(generated)
                
        # Saving our model:        
        if e % save_every == 0:
            net.eval() # VERY IMPORTANT
            
            # single-digit epochs get a leading zero in the file name:
            if e >= 1 and e < 10:
                model = f'lstm_0{e}_epoch.pt'
            else:
                model = f'lstm_{e}_epoch.pt'
            
            # `loss` is the loss of the last training batch of this epoch:
            checkpoint = {'epoch': e,
                          'n_hidden': net.n_hidden,
                          'n_layers': net.n_layers,
                          'model_state_dict': net.state_dict(),
                          'opt_state_dict': opt.state_dict(),
                          'loss': loss}

            with open(model, 'wb') as f:
                torch.save(checkpoint, f)
                
            net.train() # VERY IMPORTANT   


### Cell 11: ###
# Defining and printing out the NN properties
# (trailing comments record alternative / previously tried values):
embedding_dims = [256, 20] #[128, 10]  -> [subword embedding size, embedding size per metadata feature]
drop_prob = 0.5 #0.5
n_hidden= 1024 #1024
n_layers=3 #3
lr=0.0001 #0.001 seems to overfit

# The vocabulary and the BPE model come from Cell 6:
net = SubwordRNN(subwords=subwords, bpe=bpe, embedding_dims=embedding_dims, 
                 n_hidden=n_hidden, n_layers=n_layers, drop_prob=drop_prob)
print(net)

# Training-loop settings:
batch_size = 32 #64 
seq_length = 160 #160 
n_epochs = 10 #10

# Training the model (left commented out; a saved checkpoint is loaded in a later cell instead): 
# use encoded_train[:100000], metadata_train[:100000] for debugging!
#train(net, encoded_train, metadata_train, epochs=n_epochs, batch_size=batch_size, 
      #seq_length=seq_length, lr=lr, print_every=1, save_every=1, current_epoch=1) # trained in Google Colaboratory

Training on GPU!
   Review                                           Sentence  Professional  \
0       0  i have the doo wop 00 and 00 dvds, and was anx...             1   
1       0  from the first video, which featured the crme ...             1   
2       0  better lighting, better stage layout, and bett...             1   

   Sentiment  Length  Personal  Theme  Descriptive  
0          5       8        11     16           17  
1          5       8        10     14           17  
2          5       7        10     15           18  
Training set: 817378 sents
Test set: 16682 sents
Byte Pair Encoding: [ 276  198 2080  113 3625  577 8717   27  109   87]

Subwords: ['<PAD>', '<UNK>', '<BOS>', '<EOS>', '▁', 'e', 't', 'a', 'i', 'o'],
30469 subwords in total

Metadata for every subword:
 [[ 2  5  7 10 12 17]
 [ 2  5  7 10 12 17]
 [ 2  5  7 10 12 17]
 [ 2  5  7 10 12 17]
 [ 2  5  7 10 12 17]
 [ 2  5  7 10 12 17]
 [ 2  5  7 10 12 17]
 [ 2  5  7 10 12 17]
 [ 2  5  7 10 12 17]
 [ 2  5  7 10 1

In [3]:
path_to_model = 'lstm_10_epoch.pt'

# map_location remaps the stored tensors to the device used in this session.
# Without it a checkpoint that was saved on a GPU cannot be loaded on a
# machine that has no CUDA device.
checkpoint = torch.load(path_to_model,
                        map_location=torch.device('cuda' if train_on_gpu else 'cpu'))
net.load_state_dict(checkpoint['model_state_dict'])

print(net)
#train(net, encoded_train, metadata_train, epochs=n_epochs, batch_size=batch_size, 
      #seq_length=seq_length, lr=lr, print_every=1, save_every=1, current_epoch=7)

SubwordRNN(
  (subword_embeds): Embedding(30469, 256)
  (meta_embeds): Embedding(19, 20)
  (lstm): LSTM(376, 1024, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=1024, out_features=30469, bias=True)
)


In [4]:
from prettytable import PrettyTable

def count_parameters(model):
    """
    Print a table with the size of every trainable parameter tensor of
    `model`, followed by the grand total, and return that total.

    Parameters with requires_grad == False are left out.
    """
    trainable = [(name, parameter.numel())
                 for name, parameter in model.named_parameters()
                 if parameter.requires_grad]

    table = PrettyTable(["Modules", "Parameters"])
    for name, n_elements in trainable:
        table.add_row([name, n_elements])

    total_params = sum(n_elements for _, n_elements in trainable)
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params
    
count_parameters(net)

+-----------------------+------------+
|        Modules        | Parameters |
+-----------------------+------------+
| subword_embeds.weight |  7800064   |
|   meta_embeds.weight  |    380     |
|   lstm.weight_ih_l0   |  1540096   |
|   lstm.weight_hh_l0   |  4194304   |
|    lstm.bias_ih_l0    |    4096    |
|    lstm.bias_hh_l0    |    4096    |
|   lstm.weight_ih_l1   |  4194304   |
|   lstm.weight_hh_l1   |  4194304   |
|    lstm.bias_ih_l1    |    4096    |
|    lstm.bias_hh_l1    |    4096    |
|   lstm.weight_ih_l2   |  4194304   |
|   lstm.weight_hh_l2   |  4194304   |
|    lstm.bias_ih_l2    |    4096    |
|    lstm.bias_hh_l2    |    4096    |
|       fc.weight       |  31200256  |
|        fc.bias        |   30469    |
+-----------------------+------------+
Total Trainable Params: 61567361


61567361

In [6]:
# The first row is the baseline; every following row differs from it in
# exactly one metadata position.
metadata_experiments = [
    [2, 5, 8, 11, 12, 17],
    [1, 5, 8, 11, 12, 17],
    [2, 3, 8, 11, 12, 17],
    [2, 5, 6, 11, 12, 17],
    [2, 5, 8, 10, 12, 17],
    [2, 5, 8, 11, 15, 17],
    [2, 5, 8, 11, 12, 18],
]

# Generate one short text per metadata vector and append it, together with a
# header naming the vector, to experiments.txt:
for cur_meta in metadata_experiments:
    generated = sample(net, size=300, num_sents=2, prime='this movie', temp=0.6,
                       metadata=cur_meta, top_k=None)
    generated = f'Metadata: {" ".join(map(str, cur_meta))}\n{generated}\n\n'
    with open('experiments.txt', 'a', encoding='utf-8', errors='replace') as f:
        f.write(generated)

In [13]:
def perplexity(net, encoded_test, metadata_test, batch_size=16, seq_length=160):
    """
    Compute the perplexity of the network on an encoded test set.

    Arguments
    ---------
    net: trained network
    encoded_test: array of subword ids
    metadata_test: array with the metadata ids of every subword
    batch_size: number of sequences per batch
    seq_length: number of subwords per sequence

    Returns exp(summed cross-entropy / number of scored targets). Targets
    equal to 0 ('<PAD>') are ignored by the loss and are not counted either.
    Raises ValueError if the data is too short to form a single batch.
    """
    pad_id = 0 # index of '<PAD>', excluded from the loss
    
    if train_on_gpu:
        net.cuda()
    else:
        net.cpu()
    device = next(net.parameters()).device
        
    net.eval()    
    h = net.init_hidden(batch_size) 
    logs_sum, N = 0.0, 0
    
    # Evaluation only, so no autograd graph is needed:
    with torch.no_grad():
        # use the batch geometry requested by the caller:
        for x, x_meta, y in get_batches(encoded_test, metadata_test, batch_size, seq_length):

            h = tuple([each.data for each in h])

            # The tensors are created on every device, not only on the GPU:
            inputs = torch.from_numpy(x).to(device=device, dtype=torch.long)
            meta = torch.from_numpy(x_meta).to(device=device, dtype=torch.long)
            targets = torch.from_numpy(y).to(device=device, dtype=torch.long).reshape(-1)

            output, h = net(inputs, meta, h)
            loss = F.cross_entropy(output, targets, 
                                   ignore_index=pad_id, reduction='sum')
            logs_sum += loss.item()
            # count only the targets that actually contributed to the loss:
            N += int((targets != pad_id).sum().item())
        
    if N == 0:
        raise ValueError('not enough data to form a single batch of '
                         f'{batch_size} x {seq_length} subwords')

    prplxt = np.exp(logs_sum/N)
    
    return prplxt

print(f'Perplexity of the trained model: {perplexity(net, encoded_test, metadata_test):.2f}')

Perplexity of the trained model: 161.68
