In [1]:
!python -m nltk.downloader punkt

[nltk_data] Downloading package punkt to /Users/dvdblk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Imports

import nltk
import re
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from argparse import Namespace



### References

* https://iksinc.online/tag/continuous-bag-of-words-cbow/
* http://mccormickml.com/assets/word2vec/Alex_Minnaar_Word2Vec_Tutorial_Part_II_The_Continuous_Bag-of-Words_Model.pdf
* https://stackoverflow.com/questions/48479915/what-is-the-preferred-ratio-between-the-vocabulary-size-and-embedding-dimension
* https://github.com/FraLotito/pytorch-continuous-bag-of-words/blob/master/cbow.py
* https://stackoverflow.com/questions/50792316/what-does-1-mean-in-pytorch-view
* https://www.tensorflow.org/tutorials/text/word_embeddings
* https://pytorch.org/docs/stable/nn.html
* https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html
* https://github.com/ChristophAlt/embedding_vectorizer/blob/master/embedding_vectorizer.py
* https://pytorch.org/tutorials/beginner/saving_loading_models.html

## Vocabulary

In [3]:
import nltk
class Vocabulary():
    def __init__(self, add_unk=True):
        super(Vocabulary, self).__init__()
        
        self._token_to_ids = {}
        self._ids_to_token = {}
        
        if add_unk:
            self.unk_index = self.add_token("<UNK>") 

    
    def vocabulary_set(self):
        """this function returns a list of unique tokens"""
        return(list(set(self.tokens)))
    
    def make_dicts(self):
        unique_tokens = list(set(self.tokens))
        tok_to_ix = {}
        ix_to_tok = {}
        for i in range(len(unique_tokens)):
            tok_to_ix.update({unique_tokens[i]: i})
            ix_to_tok.update({i: unique_tokens[i]})
        return tok_to_ix, ix_to_tok
    
    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_ids:
            index = self._token_to_ids[token]
        else:
            index = len(self._token_to_ids)
            self._token_to_ids[token] = index
            self._ids_to_token[index] = token
        return index
    
    def lookup_token(self, token):
        """Retrieve the index associated with the token 
          or the UNK index if token isn't present.
        
        Args:
            token (str): the token to look up 
        Returns:
            index (int): the index corresponding to the token
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary) 
              for the UNK functionality 
        """
        if self.unk_index >= 0:
            return self._token_to_ids.get(token, self.unk_index)
        else:
            return self._token_to_ids[token]

    def lookup_index(self, index):
        """Return the token associated with the index
        
        Args: 
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._ids_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._ids_to_token[index]

    def __len__(self):
        return len(self._token_to_ids)
        

## Vectorizer

In [4]:
class Vectorizer(object):
    def __init__(self, vocabulary):
        self.vocab = vocabulary
        
    @classmethod
    def from_dataframe(cls, cbow_df):
        """Instantiate the vectorizer from the dataset dataframe
        
        Args:
            cbow_df (pandas.DataFrame): the target dataset
        Returns:
            an instance of the Vectorizer
        """
        vocabulary = Vocabulary()
        for index, row in cbow_df.iterrows():
            # add each context word (token) to the vocabulary
            for token in row.context:
                vocabulary.add_token(token)
                
            # add the target word as well
            vocabulary.add_token(row.target)
            
        return cls(vocabulary)
    
    def vectorize(self, context_words):
        context_ids = [self.vocab.lookup_token(w) for w in context_words]
        return torch.tensor(context_ids, dtype=torch.long)


## Dataset

In [5]:
class ShakespeareDataset(Dataset):
    def __init__(self, cbow_df):
        """
        Args:
            cbow_df (pandas.DataFrame): the dataset
        """
        # 98/1/1% split
        self.train_df, self.val_df, self.test_df = \
          np.split(cbow_df, [int(.98*len(cbow_df)), int(.99*len(cbow_df))])

        self._lookup_dict = {'train': self.train_df,
                             'val': self.val_df,
                             'test': self.test_df}

        self.set_split()
        self._vectorizer = Vectorizer.from_dataframe(self.train_df)

    @classmethod
    def load_and_create_dataset(cls, filepath, context_size, frac=1.0):
        """Load and preprocess the dataset
        
        Args:
            filepath (str): location of the dataset
            context_size (int): size of the context before/after the target word
            frac (float, optional): fraction of the data to use (default 1.0)
        Returns:
            an instance of ShakespeareDataset
        """
        # load the file
        lines = ShakespeareDataset._load_file(filepath)
        # consider the fraction param and throw away the rest
        lines = lines[:int(len(lines)*frac)]
        
        # Preprocess
        tokens = ShakespeareDataset._preprocess_and_split_lines(lines)
        
        # Create DataFrame
        dataframe_data = ShakespeareDataset._create_context_data(
            tokens, 
            context_size
        )
        cbow_df = pd.DataFrame(dataframe_data, columns=['context', 'target'])
        
        # Create an instance 
        return cls(cbow_df)
    
    @staticmethod
    def _load_file(filepath):
        """Load the dataset file into lines"""
        with open(filepath) as file:
            lines = file.readlines()
            file.close()
            return lines
    
    @staticmethod
    def _preprocess_and_split_lines(lines):
        """
        
        Args:
            lines (list): a list of lines of the dataset
        Returns:
            a list of tokens
        """
        
        # Regex
        lines = lines[134:164924] #these numbers are only valid for the full corpus
        text = ''.join(lines)
        text = re.sub(r'\d+', '', text)
        text = re.sub('SCENE \S', '', text)
        text = re.sub('(\[_).*(_\])', '', text)
        text = re.sub(r'[\\[#$%*+—/<=>?{}|~@]+_', '', text)
        text = text.lower()
        
        # Tokenize
        tokens = nltk.tokenize.word_tokenize(text)
        #tokens = text.split()
        
        return tokens
    
    @staticmethod
    def _create_context_data(tokens, context_size):
        data = []
        for i in range(context_size, len(tokens) - context_size):
            # Context before w_i
            context_before_w = tokens[i - context_size: i]

            # Context after w_i
            context_after_w = tokens[i + 1: i + context_size + 1]

            # Put them together
            context_window = context_before_w + context_after_w

            # Target = w_i
            target = tokens[i]

            # Append in the correct format
            data.append([context_window, target])
        return data

    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer
        
    def set_split(self, split="train"):
        """ selects the splits in the dataset using a column in the dataframe """
        self._target_df = self._lookup_dict[split]

    def __len__(self):
        return len(self._target_df)

    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
        
        Args:
            index (int): the index to the data point 
        Returns:
            a dictionary holding the data point's features (x_data) and label (y_target)
        """
        row = self._target_df.iloc[index]

        context_vector = self._vectorizer.vectorize(row.context)
        target_index = self._vectorizer.vocab.lookup_token(row.target)

        return {'x_data': context_vector,
                'y_target': target_index}

    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset
        
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size
    
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
    """
    A generator function which wraps the PyTorch DataLoader. It will 
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

## CBOW

In [6]:
class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size, nr_hidden_neurons=128):
        super(CBOW, self).__init__()
        self._context_window_size = context_size * 2
        
        # Embedding/input layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        # Hidden layer
        self.linear1 = nn.Linear(embedding_dim, nr_hidden_neurons) 

        # Output layer 
        self.linear2 = nn.Linear(nr_hidden_neurons, vocab_size)

        
    def forward(self, inputs):
        # shape = (WINDOW_SIZE, EMBEDDING_DIM) -> (EMBEDDING_DIM)
        embeds = self.embeddings(inputs).sum(dim=1)
        
        # finally compute the hidden layer weighted sum (a.k.a. output before using the activation function)
        # ... and don't forget to divide by the number of input vectors
        h =  self.linear1(embeds) / self._context_window_size
        
        # output of the hidden layer
        out =  F.relu(h)
         
        # output
        # also note that we don't compute softmax here because Cross Entropy is used as a loss function
        out = F.relu(self.linear2(out))
        return out

---
## Training

In [7]:
class TrainState:

    def __init__(self, filename):
        self.epoch_index = 0
        self.train_loss = []
        self.val_loss = []
        self.model_filename = filename


    def update(self, model):
        """Handle the training state updates.

        model (nn.Module): model to save
        """
        # Save one model at least once
        if self.epoch_index == 0:
            torch.save(model.state_dict(), self.model_filename)

        # Save model if performance improved
        else:
            loss_prev, loss_cur = self.val_loss[-2:]

            # compare current loss with the previous one
            if loss_cur <= loss_prev:
              # save if needed
              torch.save(model.state_dict(), self.model_filename)

In [20]:
args = Namespace(
    # Data and Path information
    shakespeare_csv_filepath="shakespeare-corpus.txt",
    model_state_file="shakespeare_model.pth",
    # Model hyper parameters
    context_size=2,
    num_neurons=128,
    embedding_dim=50,
    # Training hyper parameters
    seed=1337,
    num_epochs=40,
    learning_rate=0.001,
    batch_size=32,
    # Runtime options
    cuda=True
)

# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))


# Set seed for reproducibility
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed_all(args.seed)

class CBOWTrainingRoutine:

    def create_new_classifier(self, vocab_len, embedding_dim, context_size,
                              nr_hidden_neurons, device, learning_rate,
                              filepath):
      # Classifier
      self.loss_func = nn.CrossEntropyLoss()
      classifier = CBOW(
          vocab_len, 
          embedding_dim, 
          context_size, 
          nr_hidden_neurons)
      self.classifier = classifier.to(device)
      self.optimizer = optim.Adam(classifier.parameters(), lr=learning_rate)

      filename = str(nr_hidden_neurons) + "_" + str(learning_rate) + "_" + filepath
      self.train_state = TrainState(filename)


    def train(self, dataset, num_epochs, batch_size, device):
      for epoch_index in tqdm(range(num_epochs)):
          self.train_state.epoch_index = epoch_index

          # Iterate over training dataset

          # setup: batch generator, set loss to 0, set train mode on

          dataset.set_split('train')
          batch_generator = generate_batches(dataset, 
                                            batch_size=batch_size, 
                                            device=device)
          running_loss = 0.0
          self.classifier.train()

          for batch_index, batch_dict in enumerate(batch_generator):
              # the training routine is these 5 steps:

              # --------------------------------------
              # step 1. zero the gradients
              self.optimizer.zero_grad()

              # step 2. compute the output
              y_pred = self.classifier(batch_dict['x_data'])

              # step 3. compute the loss
              loss = self.loss_func(y_pred, batch_dict['y_target'])
              loss_t = loss.item()
              running_loss += (loss_t - running_loss) / (batch_index + 1)

              # step 4. use loss to produce gradients
              loss.backward()

              # step 5. use optimizer to take gradient step
              self.optimizer.step()
              # -----------------------------------------

          self.train_state.train_loss.append(running_loss)

          # Iterate over val dataset

          # setup: batch generator, set loss to 0; set eval mode on
          dataset.set_split('val')
          batch_generator = generate_batches(dataset, 
                                            batch_size=batch_size, 
                                            device=device)
          running_loss = 0.0
          self.classifier.eval()

          for batch_index, batch_dict in enumerate(batch_generator):

              # compute the output
              y_pred =  self.classifier(batch_dict['x_data'])

              # compute the loss
              loss = self.loss_func(y_pred, batch_dict['y_target'])
              loss_t = loss.item()
              running_loss += (loss_t - running_loss) / (batch_index + 1)

          self.train_state.val_loss.append(running_loss)

          self.train_state.update(model=self.classifier)
      

Using CUDA: False


In [21]:
# Dataset
dataset = ShakespeareDataset.load_and_create_dataset(
    args.shakespeare_csv_filepath,
    args.context_size,
    0.01
)

vectorizer = dataset.get_vectorizer()

training_routine = CBOWTrainingRoutine()

In [22]:
import itertools

grid_search_params = {
      "lr": [0.001, 0.01],
      "nr_hidden_neurons": [2, 15, 30, 50, 100]
}


values = [lists for _, lists in grid_search_params.items()]
classifiers = []

for lr, nr_hidden in itertools.product(*values):
    print("Training " + str(nr_hidden) + " " + str(lr))
    training_routine.create_new_classifier(
      len(vectorizer.vocab), args.embedding_dim, 
      args.context_size, nr_hidden, 
      args.device, lr, args.model_state_file
    )

    training_routine.train(
      dataset, 
      args.num_epochs,
      args.batch_size,
      args.device,
    )

    classifiers.append((training_routine.classifier, lr))




  0%|          | 0/40 [00:00<?, ?it/s][A[A[A

Training 2 0.001





  2%|▎         | 1/40 [00:02<01:53,  2.90s/it][A[A[A


  5%|▌         | 2/40 [00:06<01:54,  3.02s/it][A[A[A


  8%|▊         | 3/40 [00:09<01:49,  2.97s/it][A[A[A


 10%|█         | 4/40 [00:11<01:43,  2.89s/it][A[A[A


 12%|█▎        | 5/40 [00:14<01:41,  2.90s/it][A[A[A


 15%|█▌        | 6/40 [00:17<01:38,  2.90s/it][A[A[A


 18%|█▊        | 7/40 [00:20<01:33,  2.85s/it][A[A[A


 20%|██        | 8/40 [00:23<01:33,  2.91s/it][A[A[A


 22%|██▎       | 9/40 [00:26<01:33,  3.02s/it][A[A[A


 25%|██▌       | 10/40 [00:29<01:30,  3.00s/it][A[A[A


 28%|██▊       | 11/40 [00:32<01:24,  2.92s/it][A[A[A


 30%|███       | 12/40 [00:35<01:23,  3.00s/it][A[A[A


 32%|███▎      | 13/40 [00:38<01:21,  3.04s/it][A[A[A


 35%|███▌      | 14/40 [00:41<01:19,  3.05s/it][A[A[A


 38%|███▊      | 15/40 [00:44<01:16,  3.06s/it][A[A[A


 40%|████      | 16/40 [00:47<01:13,  3.07s/it][A[A[A


 42%|████▎     | 17/40 [00:50<01:10,  3.08s/it][A[A[A


 45

Training 15 0.001





  2%|▎         | 1/40 [00:02<01:50,  2.83s/it][A[A[A


  5%|▌         | 2/40 [00:05<01:47,  2.83s/it][A[A[A


  8%|▊         | 3/40 [00:08<01:45,  2.85s/it][A[A[A


 10%|█         | 4/40 [00:11<01:43,  2.86s/it][A[A[A


 12%|█▎        | 5/40 [00:14<01:41,  2.89s/it][A[A[A


 15%|█▌        | 6/40 [00:17<01:38,  2.91s/it][A[A[A


 18%|█▊        | 7/40 [00:20<01:36,  2.92s/it][A[A[A


 20%|██        | 8/40 [00:23<01:33,  2.94s/it][A[A[A


 22%|██▎       | 9/40 [00:26<01:31,  2.94s/it][A[A[A


 25%|██▌       | 10/40 [00:29<01:28,  2.95s/it][A[A[A


 28%|██▊       | 11/40 [00:32<01:25,  2.96s/it][A[A[A


 30%|███       | 12/40 [00:35<01:22,  2.95s/it][A[A[A


 32%|███▎      | 13/40 [00:38<01:19,  2.94s/it][A[A[A


 35%|███▌      | 14/40 [00:41<01:17,  2.96s/it][A[A[A


 38%|███▊      | 15/40 [00:44<01:14,  2.96s/it][A[A[A


 40%|████      | 16/40 [00:46<01:11,  2.96s/it][A[A[A


 42%|████▎     | 17/40 [00:49<01:08,  2.96s/it][A[A[A


 45

Training 30 0.001





  2%|▎         | 1/40 [00:02<01:51,  2.85s/it][A[A[A


  5%|▌         | 2/40 [00:05<01:48,  2.86s/it][A[A[A


  8%|▊         | 3/40 [00:08<01:47,  2.90s/it][A[A[A


 10%|█         | 4/40 [00:11<01:47,  2.98s/it][A[A[A


 12%|█▎        | 5/40 [00:15<01:52,  3.21s/it][A[A[A


 15%|█▌        | 6/40 [00:19<01:54,  3.36s/it][A[A[A


 18%|█▊        | 7/40 [00:22<01:52,  3.41s/it][A[A[A


 20%|██        | 8/40 [00:26<01:49,  3.44s/it][A[A[A


 22%|██▎       | 9/40 [00:30<01:52,  3.63s/it][A[A[A


 25%|██▌       | 10/40 [00:34<01:48,  3.62s/it][A[A[A


 28%|██▊       | 11/40 [00:37<01:41,  3.51s/it][A[A[A


 30%|███       | 12/40 [00:40<01:36,  3.44s/it][A[A[A


 32%|███▎      | 13/40 [00:43<01:29,  3.33s/it][A[A[A


 35%|███▌      | 14/40 [00:46<01:25,  3.27s/it][A[A[A


 38%|███▊      | 15/40 [00:49<01:21,  3.25s/it][A[A[A


 40%|████      | 16/40 [00:53<01:17,  3.22s/it][A[A[A


 42%|████▎     | 17/40 [00:56<01:16,  3.32s/it][A[A[A


 45

Training 50 0.001





  2%|▎         | 1/40 [00:02<01:53,  2.91s/it][A[A[A


  5%|▌         | 2/40 [00:05<01:51,  2.94s/it][A[A[A


  8%|▊         | 3/40 [00:09<01:53,  3.06s/it][A[A[A


 10%|█         | 4/40 [00:12<01:54,  3.18s/it][A[A[A


 12%|█▎        | 5/40 [00:16<01:54,  3.26s/it][A[A[A


 15%|█▌        | 6/40 [00:19<01:52,  3.31s/it][A[A[A


 18%|█▊        | 7/40 [00:22<01:49,  3.33s/it][A[A[A


 20%|██        | 8/40 [00:26<01:47,  3.35s/it][A[A[A


 22%|██▎       | 9/40 [00:29<01:44,  3.37s/it][A[A[A


 25%|██▌       | 10/40 [00:33<01:41,  3.39s/it][A[A[A


 28%|██▊       | 11/40 [00:37<01:42,  3.52s/it][A[A[A


 30%|███       | 12/40 [00:40<01:41,  3.63s/it][A[A[A


 32%|███▎      | 13/40 [00:44<01:38,  3.66s/it][A[A[A


 35%|███▌      | 14/40 [00:48<01:36,  3.70s/it][A[A[A


 38%|███▊      | 15/40 [00:51<01:30,  3.63s/it][A[A[A


 40%|████      | 16/40 [00:55<01:27,  3.65s/it][A[A[A


 42%|████▎     | 17/40 [00:59<01:24,  3.67s/it][A[A[A


 45

Training 100 0.001





  2%|▎         | 1/40 [00:03<02:06,  3.25s/it][A[A[A


  5%|▌         | 2/40 [00:06<02:04,  3.27s/it][A[A[A


  8%|▊         | 3/40 [00:10<02:11,  3.54s/it][A[A[A


 10%|█         | 4/40 [00:14<02:13,  3.71s/it][A[A[A


 12%|█▎        | 5/40 [00:18<02:13,  3.82s/it][A[A[A


 15%|█▌        | 6/40 [00:22<02:12,  3.89s/it][A[A[A


 18%|█▊        | 7/40 [00:26<02:09,  3.92s/it][A[A[A


 20%|██        | 8/40 [00:31<02:06,  3.97s/it][A[A[A


 22%|██▎       | 9/40 [00:35<02:11,  4.26s/it][A[A[A


 25%|██▌       | 10/40 [00:41<02:15,  4.50s/it][A[A[A


 28%|██▊       | 11/40 [00:45<02:12,  4.55s/it][A[A[A


 30%|███       | 12/40 [00:50<02:07,  4.57s/it][A[A[A


 32%|███▎      | 13/40 [00:54<01:58,  4.41s/it][A[A[A


 35%|███▌      | 14/40 [00:58<01:51,  4.30s/it][A[A[A


 38%|███▊      | 15/40 [01:02<01:46,  4.26s/it][A[A[A


 40%|████      | 16/40 [01:06<01:43,  4.30s/it][A[A[A


 42%|████▎     | 17/40 [01:11<01:38,  4.29s/it][A[A[A


 45

Training 2 0.01





  2%|▎         | 1/40 [00:02<01:42,  2.62s/it][A[A[A


  5%|▌         | 2/40 [00:06<01:48,  2.87s/it][A[A[A


  8%|▊         | 3/40 [00:09<01:47,  2.92s/it][A[A[A


 10%|█         | 4/40 [00:12<01:54,  3.17s/it][A[A[A


 12%|█▎        | 5/40 [00:16<01:59,  3.42s/it][A[A[A


 15%|█▌        | 6/40 [00:20<01:54,  3.36s/it][A[A[A


 18%|█▊        | 7/40 [00:23<01:47,  3.25s/it][A[A[A


 20%|██        | 8/40 [00:26<01:42,  3.19s/it][A[A[A


 22%|██▎       | 9/40 [00:29<01:42,  3.30s/it][A[A[A


 25%|██▌       | 10/40 [00:32<01:36,  3.22s/it][A[A[A


 28%|██▊       | 11/40 [00:35<01:31,  3.17s/it][A[A[A


 30%|███       | 12/40 [00:40<01:37,  3.49s/it][A[A[A


 32%|███▎      | 13/40 [00:42<01:29,  3.33s/it][A[A[A


 35%|███▌      | 14/40 [00:46<01:28,  3.40s/it][A[A[A


 38%|███▊      | 15/40 [00:49<01:20,  3.21s/it][A[A[A


 40%|████      | 16/40 [00:52<01:19,  3.30s/it][A[A[A


 42%|████▎     | 17/40 [00:56<01:17,  3.38s/it][A[A[A


 45

Training 15 0.01





  2%|▎         | 1/40 [00:02<01:53,  2.92s/it][A[A[A


  5%|▌         | 2/40 [00:05<01:50,  2.92s/it][A[A[A


  8%|▊         | 3/40 [00:08<01:49,  2.95s/it][A[A[A


 10%|█         | 4/40 [00:11<01:47,  2.99s/it][A[A[A


 12%|█▎        | 5/40 [00:14<01:45,  3.01s/it][A[A[A


 15%|█▌        | 6/40 [00:17<01:42,  3.00s/it][A[A[A


 18%|█▊        | 7/40 [00:21<01:39,  3.01s/it][A[A[A


 20%|██        | 8/40 [00:24<01:36,  3.03s/it][A[A[A


 22%|██▎       | 9/40 [00:27<01:33,  3.03s/it][A[A[A


 25%|██▌       | 10/40 [00:30<01:30,  3.02s/it][A[A[A


 28%|██▊       | 11/40 [00:33<01:27,  3.02s/it][A[A[A


 30%|███       | 12/40 [00:36<01:24,  3.02s/it][A[A[A


 32%|███▎      | 13/40 [00:39<01:20,  2.99s/it][A[A[A


 35%|███▌      | 14/40 [00:41<01:16,  2.96s/it][A[A[A


 38%|███▊      | 15/40 [00:44<01:13,  2.95s/it][A[A[A


 40%|████      | 16/40 [00:47<01:10,  2.93s/it][A[A[A


 42%|████▎     | 17/40 [00:50<01:07,  2.93s/it][A[A[A


 45

Training 30 0.01





  2%|▎         | 1/40 [00:02<01:51,  2.85s/it][A[A[A


  5%|▌         | 2/40 [00:05<01:48,  2.85s/it][A[A[A


  8%|▊         | 3/40 [00:08<01:48,  2.94s/it][A[A[A


 10%|█         | 4/40 [00:11<01:47,  2.99s/it][A[A[A


 12%|█▎        | 5/40 [00:15<01:45,  3.02s/it][A[A[A


 15%|█▌        | 6/40 [00:18<01:43,  3.05s/it][A[A[A


 18%|█▊        | 7/40 [00:21<01:41,  3.07s/it][A[A[A


 20%|██        | 8/40 [00:24<01:38,  3.08s/it][A[A[A


 22%|██▎       | 9/40 [00:27<01:35,  3.08s/it][A[A[A


 25%|██▌       | 10/40 [00:30<01:32,  3.08s/it][A[A[A


 28%|██▊       | 11/40 [00:33<01:29,  3.08s/it][A[A[A


 30%|███       | 12/40 [00:36<01:26,  3.09s/it][A[A[A


 32%|███▎      | 13/40 [00:39<01:23,  3.09s/it][A[A[A


 35%|███▌      | 14/40 [00:42<01:20,  3.09s/it][A[A[A


 38%|███▊      | 15/40 [00:45<01:17,  3.09s/it][A[A[A


 40%|████      | 16/40 [00:49<01:13,  3.08s/it][A[A[A


 42%|████▎     | 17/40 [00:52<01:10,  3.08s/it][A[A[A


 45

Training 50 0.01





  2%|▎         | 1/40 [00:02<01:53,  2.90s/it][A[A[A


  5%|▌         | 2/40 [00:05<01:50,  2.92s/it][A[A[A


  8%|▊         | 3/40 [00:09<01:53,  3.07s/it][A[A[A


 10%|█         | 4/40 [00:12<01:53,  3.15s/it][A[A[A


 12%|█▎        | 5/40 [00:15<01:52,  3.21s/it][A[A[A


 15%|█▌        | 6/40 [00:19<01:50,  3.24s/it][A[A[A


 18%|█▊        | 7/40 [00:22<01:48,  3.27s/it][A[A[A


 20%|██        | 8/40 [00:25<01:45,  3.28s/it][A[A[A


 22%|██▎       | 9/40 [00:29<01:41,  3.28s/it][A[A[A


 25%|██▌       | 10/40 [00:32<01:38,  3.29s/it][A[A[A


 28%|██▊       | 11/40 [00:36<01:37,  3.37s/it][A[A[A


 30%|███       | 12/40 [00:39<01:38,  3.52s/it][A[A[A


 32%|███▎      | 13/40 [00:43<01:38,  3.65s/it][A[A[A


 35%|███▌      | 14/40 [00:47<01:36,  3.72s/it][A[A[A


 38%|███▊      | 15/40 [00:51<01:30,  3.62s/it][A[A[A


 40%|████      | 16/40 [00:54<01:24,  3.51s/it][A[A[A


 42%|████▎     | 17/40 [00:57<01:19,  3.44s/it][A[A[A


 45

Training 100 0.01





  2%|▎         | 1/40 [00:03<01:57,  3.00s/it][A[A[A


  5%|▌         | 2/40 [00:06<01:55,  3.03s/it][A[A[A


  8%|▊         | 3/40 [00:10<02:02,  3.31s/it][A[A[A


 10%|█         | 4/40 [00:13<02:04,  3.45s/it][A[A[A


 12%|█▎        | 5/40 [00:17<02:04,  3.56s/it][A[A[A


 15%|█▌        | 6/40 [00:21<02:03,  3.63s/it][A[A[A


 18%|█▊        | 7/40 [00:25<02:01,  3.69s/it][A[A[A


 20%|██        | 8/40 [00:29<01:59,  3.73s/it][A[A[A


 22%|██▎       | 9/40 [00:32<01:56,  3.76s/it][A[A[A


 25%|██▌       | 10/40 [00:36<01:53,  3.78s/it][A[A[A


 28%|██▊       | 11/40 [00:40<01:49,  3.79s/it][A[A[A


 30%|███       | 12/40 [00:44<01:46,  3.80s/it][A[A[A


 32%|███▎      | 13/40 [00:48<01:42,  3.80s/it][A[A[A


 35%|███▌      | 14/40 [00:52<01:39,  3.81s/it][A[A[A


 38%|███▊      | 15/40 [00:55<01:35,  3.81s/it][A[A[A


 40%|████      | 16/40 [00:59<01:31,  3.82s/it][A[A[A


 42%|████▎     | 17/40 [01:03<01:28,  3.84s/it][A[A[A


 45

---

# Part 2 - Test your embeddings

## Loading of pretrained models (classifiers)

In [54]:
import os

classifiers_loaded = []

for file in os.listdir("."):
    if file.endswith(".pth"):
        # get the number of neurons from filename
        str_hidden, str_lr, *rest = file.split("_")
        # init the classifier
        classifier = CBOW(len(vectorizer.vocab), args.embedding_dim, args.context_size, int(str_hidden))
        # load the weights / embeddings
        classifier.load_state_dict(torch.load(file))
        # set to eval mode
        classifier.eval()
        # add to the list of loaded classifiers
        classifiers_loaded.append((classifier, float(str_lr)))
        
classifiers_loaded

[(CBOW(
    (embeddings): Embedding(2129, 50)
    (linear1): Linear(in_features=50, out_features=50, bias=True)
    (linear2): Linear(in_features=50, out_features=2129, bias=True)
  ), 0.01), (CBOW(
    (embeddings): Embedding(2129, 50)
    (linear1): Linear(in_features=50, out_features=100, bias=True)
    (linear2): Linear(in_features=100, out_features=2129, bias=True)
  ), 0.01), (CBOW(
    (embeddings): Embedding(2129, 50)
    (linear1): Linear(in_features=50, out_features=15, bias=True)
    (linear2): Linear(in_features=15, out_features=2129, bias=True)
  ), 0.001), (CBOW(
    (embeddings): Embedding(2129, 50)
    (linear1): Linear(in_features=50, out_features=100, bias=True)
    (linear2): Linear(in_features=100, out_features=2129, bias=True)
  ), 0.001), (CBOW(
    (embeddings): Embedding(2129, 50)
    (linear1): Linear(in_features=50, out_features=30, bias=True)
    (linear2): Linear(in_features=30, out_features=2129, bias=True)
  ), 0.001), (CBOW(
    (embeddings): Embedding(21

## Similarity Measure

In [63]:
def get_closest_word_pwd(classifier, word, topn=5):
    word_distance = []
    emb = classifier.embeddings
    test_vocab = dataset.get_vectorizer().vocab
    pdist = nn.PairwiseDistance()
    i = test_vocab.lookup_token(word)
    lookup_tensor_i = torch.tensor([i], dtype=torch.long).to(args.device)
    v_i = emb(lookup_tensor_i)
    for j in range(len(test_vocab)): 
        if j != i:
            lookup_tensor_j = torch.tensor([j], dtype=torch.long).to(args.device)
            v_j = emb(lookup_tensor_j) 
            word_distance.append((test_vocab.lookup_index(j), float(pdist(v_i, v_j))))
    word_distance.sort(key=lambda x: x[1])
    return word_distance[:topn]

def get_closest_word_cs(classifier, word, topn=5):
    word_distance = []
    emb = classifier.embeddings
    test_vocab = dataset.get_vectorizer().vocab
    pdist = nn.CosineSimilarity()
    i = test_vocab.lookup_token(word)
    lookup_tensor_i = torch.tensor([i], dtype=torch.long).to(args.device)
    v_i = emb(lookup_tensor_i)
    for j in range(len(test_vocab)): 
        if j != i:
            lookup_tensor_j = torch.tensor([j], dtype=torch.long).to(args.device)
            v_j = emb(lookup_tensor_j) 
            word_distance.append((test_vocab.lookup_index(j), float(pdist(v_i, v_j))))
    word_distance.sort(key=lambda x: x[1])
    return word_distance[::-1][:topn]

In [67]:
target_classifiers = classifiers if not classifiers else classifiers_loaded

def pretty_print(results):
    """
    Pretty print embedding results.
    """
    for item in results:
        print ("...[%.2f] - %s"%(item[1], item[0]))

for classifier, lr in target_classifiers:
    word = 'king'
    
    print("=" * 50)
    print("Classifier (LR: " + str(lr) + "): " + str(classifier) + "\n")
    print("===Pairwise Distance===")
    pretty_print(get_closest_word_pwd(classifier, word))
    print("===Cosine Similarit")
    pretty_print(get_closest_word_cs(classifier, word))

Classifier (LR: 0.01): CBOW(
  (embeddings): Embedding(2129, 50)
  (linear1): Linear(in_features=50, out_features=50, bias=True)
  (linear2): Linear(in_features=50, out_features=2129, bias=True)
)

...[9.79] - sweet-seasoned
...[10.35] - stealing
...[10.35] - measured
...[10.54] - himself
...[10.59] - tires
...[0.55] - measured
...[0.51] - sweet-seasoned
...[0.51] - pearl
...[0.48] - sins
...[0.47] - quicker
Classifier (LR: 0.01): CBOW(
  (embeddings): Embedding(2129, 50)
  (linear1): Linear(in_features=50, out_features=100, bias=True)
  (linear2): Linear(in_features=100, out_features=2129, bias=True)
)

...[8.78] - delight
...[9.04] - ’
...[9.06] - bell
...[9.22] - quick
...[9.30] - faith
...[0.58] - delight
...[0.54] - imprint
...[0.46] - lover
...[0.45] - ’
...[0.45] - quick
Classifier (LR: 0.001): CBOW(
  (embeddings): Embedding(2129, 50)
  (linear1): Linear(in_features=50, out_features=15, bias=True)
  (linear2): Linear(in_features=15, out_features=2129, bias=True)
)

...[7.63] - 

In [None]:
nr_examples = len(data)
pred_sum = 0 # softmax check
acc_sum = 0 # accuracy

for i in range(nr_examples):
    ids = vectorizer.vectorize(data[i][0])
    target = test_vocab.tok_to_ids[data[i][1]]
    pred = model(ids) # prediction
    pred_sum += pred.squeeze().sum().item() 
    
    _, pred_index = pred.max(dim=1) # prediction index
    n_correct = torch.eq(pred_index, target)
    acc_sum += n_correct.item()
    
    print("Prediction: " + str(pred_index.item()), "| Target: " + str(target))
    
print(acc_sum / nr_examples)
print(pred_sum / nr_examples)

## Preprocessing

In [None]:
stringo = "here is an [_exit_]"
stringo = re.sub('(\[_).*(_\])', '', stringo)
print(stringo)

In [None]:
#finis is 164924
#beginngin is line 134 --> just keep what's in between those lines


In [None]:
filename = 'shakespeare-corpus.txt'
file = open(filename)
lines = file.readlines()
lines = lines[134:164924]


In [None]:
def mytext(lines):
    corpus = ''
    for line in lines:
        text = re.sub(r'\d+', '', line)
        text = re.sub('SCENE \S', '', text)
        text = re.sub('(\[_).*(_\])', '', text)
        text = re.sub(r'[\\[#$%*+—/<=>?{}|~@]+_', '', text)
        text = text.lower()
        corpus += text
    return corpus

%time len(mytext(lines))

In [None]:
def mytext2(lines):
    text = ''.join(lines)
    text = re.sub(r'\d+', '', text)
    text = re.sub('SCENE \S', '', text)
    text = re.sub('(\[_).*(_\])', '', text)
    text = re.sub(r'[\\[#$%*+—/<=>?{}|~@]+_', '', text)
    text = text.lower()
    return text

%time len(mytext2(lines))
