In [1]:
!python -m nltk.downloader punkt

[nltk_data] Downloading package punkt to /Users/debora/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Imports

import nltk
import re
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from argparse import Namespace



### References

* https://iksinc.online/tag/continuous-bag-of-words-cbow/
* http://mccormickml.com/assets/word2vec/Alex_Minnaar_Word2Vec_Tutorial_Part_II_The_Continuous_Bag-of-Words_Model.pdf
* https://stackoverflow.com/questions/48479915/what-is-the-preferred-ratio-between-the-vocabulary-size-and-embedding-dimension
* https://github.com/FraLotito/pytorch-continuous-bag-of-words/blob/master/cbow.py
* https://stackoverflow.com/questions/50792316/what-does-1-mean-in-pytorch-view
* https://www.tensorflow.org/tutorials/text/word_embeddings
* https://pytorch.org/docs/stable/nn.html
* https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html
* https://github.com/ChristophAlt/embedding_vectorizer/blob/master/embedding_vectorizer.py
* https://pytorch.org/tutorials/beginner/saving_loading_models.html

## Vocabulary

In [3]:
import nltk
class Vocabulary():
    def __init__(self, add_unk=True):
        super(Vocabulary, self).__init__()
        
        self._token_to_ids = {}
        self._ids_to_token = {}
        
        if add_unk:
            self.unk_index = self.add_token("<UNK>") 

    
    def vocabulary_set(self):
        """this function returns a list of unique tokens"""
        return(list(set(self.tokens)))
    
    def make_dicts(self):
        unique_tokens = list(set(self.tokens))
        tok_to_ix = {}
        ix_to_tok = {}
        for i in range(len(unique_tokens)):
            tok_to_ix.update({unique_tokens[i]: i})
            ix_to_tok.update({i: unique_tokens[i]})
        return tok_to_ix, ix_to_tok
    
    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_ids:
            index = self._token_to_ids[token]
        else:
            index = len(self._token_to_ids)
            self._token_to_ids[token] = index
            self._ids_to_token[index] = token
        return index
    
    def lookup_token(self, token):
        """Retrieve the index associated with the token 
          or the UNK index if token isn't present.
        
        Args:
            token (str): the token to look up 
        Returns:
            index (int): the index corresponding to the token
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary) 
              for the UNK functionality 
        """
        if self.unk_index >= 0:
            return self._token_to_ids.get(token, self.unk_index)
        else:
            return self._token_to_ids[token]

    def lookup_index(self, index):
        """Return the token associated with the index
        
        Args: 
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._ids_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._ids_to_token[index]

    def __len__(self):
        return len(self._token_to_ids)
        

## Vectorizer

In [4]:
class Vectorizer(object):
    def __init__(self, vocabulary):
        self.vocab = vocabulary
        
    @classmethod
    def from_dataframe(cls, cbow_df):
        """Instantiate the vectorizer from the dataset dataframe
        
        Args:
            cbow_df (pandas.DataFrame): the target dataset
        Returns:
            an instance of the Vectorizer
        """
        vocabulary = Vocabulary()
        for index, row in cbow_df.iterrows():
            # add each context word (token) to the vocabulary
            for token in row.context:
                vocabulary.add_token(token)
                
            # add the target word as well
            vocabulary.add_token(row.target)
            
        return cls(vocabulary)
    
    def vectorize(self, context_words):
        context_ids = [self.vocab.lookup_token(w) for w in context_words]
        return torch.tensor(context_ids, dtype=torch.long)


## Dataset

In [5]:
class ShakespeareDataset(Dataset):
    def __init__(self, cbow_df):
        """
        Args:
            cbow_df (pandas.DataFrame): the dataset
        """
        # 98/1/1% split
        self.train_df, self.val_df, self.test_df = \
          np.split(cbow_df, [int(.98*len(cbow_df)), int(.99*len(cbow_df))])

        self._lookup_dict = {'train': self.train_df,
                             'val': self.val_df,
                             'test': self.test_df}

        self.set_split()
        self._vectorizer = Vectorizer.from_dataframe(self.train_df)

    @classmethod
    def load_and_create_dataset(cls, filepath, context_size, frac=1.0):
        """Load and preprocess the dataset
        
        Args:
            filepath (str): location of the dataset
            context_size (int): size of the context before/after the target word
            frac (float, optional): fraction of the data to use (default 1.0)
        Returns:
            an instance of ShakespeareDataset
        """
        # load the file
        lines = ShakespeareDataset._load_file(filepath)
        # consider the fraction param and throw away the rest
        lines = lines[:int(len(lines)*frac)]
        
        # Preprocess
        tokens = ShakespeareDataset._preprocess_and_split_lines(lines)
        
        # Create DataFrame
        dataframe_data = ShakespeareDataset._create_context_data(
            tokens, 
            context_size
        )
        cbow_df = pd.DataFrame(dataframe_data, columns=['context', 'target'])
        
        # Create an instance 
        return cls(cbow_df)
    
    @staticmethod
    def _load_file(filepath):
        """Load the dataset file into lines"""
        with open(filepath) as file:
            lines = file.readlines()
            file.close()
            return lines
    
    @staticmethod
    def _preprocess_and_split_lines(lines):
        """
        
        Args:
            lines (list): a list of lines of the dataset
        Returns:
            a list of tokens
        """
        
        # Regex
        lines = lines[134:164924] #these numbers are only valid for the full corpus
        text = ''.join(lines)
        text = re.sub(r'\d+', '', text)
        text = re.sub('SCENE \S', '', text)
        text = re.sub('(\[_).*(_\])', '', text)
        text = re.sub(r'[\\[#$%*+—/<=>?{}|~@]+_', '', text)
        text = text.lower()
        
        # Tokenize
        tokens = nltk.tokenize.word_tokenize(text)
        #tokens = text.split()
        
        return tokens
    
    @staticmethod
    def _create_context_data(tokens, context_size):
        data = []
        for i in range(context_size, len(tokens) - context_size):
            # Context before w_i
            context_before_w = tokens[i - context_size: i]

            # Context after w_i
            context_after_w = tokens[i + 1: i + context_size + 1]

            # Put them together
            context_window = context_before_w + context_after_w

            # Target = w_i
            target = tokens[i]

            # Append in the correct format
            data.append([context_window, target])
        return data

    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer
        
    def set_split(self, split="train"):
        """ selects the splits in the dataset using a column in the dataframe """
        self._target_df = self._lookup_dict[split]

    def __len__(self):
        return len(self._target_df)

    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
        
        Args:
            index (int): the index to the data point 
        Returns:
            a dictionary holding the data point's features (x_data) and label (y_target)
        """
        row = self._target_df.iloc[index]

        context_vector = self._vectorizer.vectorize(row.context)
        target_index = self._vectorizer.vocab.lookup_token(row.target)

        return {'x_data': context_vector,
                'y_target': target_index}

    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset
        
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size
    
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
    """
    A generator function which wraps the PyTorch DataLoader. It will 
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

## CBOW

In [6]:
class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size, nr_hidden_neurons=128):
        super(CBOW, self).__init__()
        self._context_window_size = context_size * 2
        
        # Embedding/input layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        # Hidden layer
        self.linear1 = nn.Linear(embedding_dim, nr_hidden_neurons) 

        # Output layer 
        self.linear2 = nn.Linear(nr_hidden_neurons, vocab_size)

        
    def forward(self, inputs):
        # shape = (WINDOW_SIZE, EMBEDDING_DIM) -> (EMBEDDING_DIM)
        embeds = self.embeddings(inputs).sum(dim=1)
        
        # finally compute the hidden layer weighted sum (a.k.a. output before using the activation function)
        # ... and don't forget to divide by the number of input vectors
        h =  self.linear1(embeds) / self._context_window_size
        
        # output of the hidden layer
        out =  F.relu(h)
         
        # output
        # also note that we don't compute softmax here because Cross Entropy is used as a loss function
        out = F.relu(self.linear2(out))
        return out

---
## Training

In [7]:
class TrainState:

    def __init__(self, filename):
        self.epoch_index = 0
        self.train_loss = []
        self.val_loss = []
        self.model_filename = filename


    def update(self, model):
        """Handle the training state updates.

        model (nn.Module): model to save
        """
        # Save one model at least once
        if self.epoch_index == 0:
            torch.save(model.state_dict(), self.model_filename)

        # Save model if performance improved
        else:
            loss_prev, loss_cur = self.val_loss[-2:]

            # compare current loss with the previous one
            if loss_cur <= loss_prev:
              # save if needed
              torch.save(model.state_dict(), self.model_filename)

In [8]:
args = Namespace(
    # Data and Path information
    shakespeare_csv_filepath="shakespeare-corpus.txt",
    model_state_file="shakespeare_model.pth",
    model_state_dir="models/",
    # Model hyper parameters
    context_size=2,
    num_neurons=128,
    embedding_dim=50,
    # Training hyper parameters
    seed=1337,
    num_epochs=100,
    learning_rate=0.001,
    batch_size=32,
    # Runtime options
    cuda=True
)

# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))


# Set seed for reproducibility
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed_all(args.seed)

class CBOWTrainingRoutine:

    def create_new_classifier(self, vocab_len, embedding_dim, context_size,
                              nr_hidden_neurons, device, learning_rate,
                              filedir, filepath):
      # Classifier
      self.loss_func = nn.CrossEntropyLoss()
      classifier = CBOW(
          vocab_len, 
          embedding_dim, 
          context_size, 
          nr_hidden_neurons)
      self.classifier = classifier.to(device)
      self.optimizer = optim.Adam(classifier.parameters(), lr=learning_rate)

      filename = str(nr_hidden_neurons) + "_" + str(learning_rate) + "_" + filepath
      self.train_state = TrainState(filedir + filename)


    def train(self, dataset, num_epochs, batch_size, device):
      for epoch_index in tqdm(range(num_epochs)):
          self.train_state.epoch_index = epoch_index

          # Iterate over training dataset

          # setup: batch generator, set loss to 0, set train mode on

          dataset.set_split('train')
          batch_generator = generate_batches(dataset, 
                                            batch_size=batch_size, 
                                            device=device)
          running_loss = 0.0
          self.classifier.train()

          for batch_index, batch_dict in enumerate(batch_generator):
              # the training routine is these 5 steps:

              # --------------------------------------
              # step 1. zero the gradients
              self.optimizer.zero_grad()

              # step 2. compute the output
              y_pred = self.classifier(batch_dict['x_data'])

              # step 3. compute the loss
              loss = self.loss_func(y_pred, batch_dict['y_target'])
              loss_t = loss.item()
              running_loss += (loss_t - running_loss) / (batch_index + 1)

              # step 4. use loss to produce gradients
              loss.backward()

              # step 5. use optimizer to take gradient step
              self.optimizer.step()
              # -----------------------------------------

          self.train_state.train_loss.append(running_loss)

          # Iterate over val dataset

          # setup: batch generator, set loss to 0; set eval mode on
          dataset.set_split('val')
          batch_generator = generate_batches(dataset, 
                                            batch_size=batch_size, 
                                            device=device)
          running_loss = 0.0
          self.classifier.eval()

          for batch_index, batch_dict in enumerate(batch_generator):

              # compute the output
              y_pred =  self.classifier(batch_dict['x_data'])

              # compute the loss
              loss = self.loss_func(y_pred, batch_dict['y_target'])
              loss_t = loss.item()
              running_loss += (loss_t - running_loss) / (batch_index + 1)

          self.train_state.val_loss.append(running_loss)

          self.train_state.update(model=self.classifier)
      

Using CUDA: False


In [9]:
# Dataset
dataset = ShakespeareDataset.load_and_create_dataset(
    args.shakespeare_csv_filepath,
    args.context_size,
    0.005
)

vectorizer = dataset.get_vectorizer()

training_routine = CBOWTrainingRoutine()

In [10]:
import itertools
import time, os

grid_search_params = {
      "lr": [0.001],
      "nr_hidden_neurons": [2, 15]
}


values = [lists for _, lists in grid_search_params.items()]
classifiers = []

timestr = time.strftime("%Y%m%d-%H%M%S")
model_gridsearch_dir = args.model_state_dir + "gridsearch/"
model_dir = model_gridsearch_dir + timestr + "_"  + str(args.num_epochs) + "/"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

for lr, nr_hidden in itertools.product(*values):
    training_routine.create_new_classifier(
      len(vectorizer.vocab), args.embedding_dim, 
      args.context_size, nr_hidden, 
      args.device, lr, model_dir, args.model_state_file
    )

    training_routine.train(
      dataset, 
      args.num_epochs,
      args.batch_size,
      args.device,
    )

    classifiers.append((training_routine.classifier, lr))

HBox(children=(IntProgress(value=0), HTML(value='')))




KeyboardInterrupt: 

---

# Part 2 - Test your embeddings

## Loading of pretrained models (classifiers)

In [28]:
# This cell loads the last performd gridsearch classifiers

import os

grid_search_directories = os.listdir(model_gridsearch_dir)
grid_search_directories.sort()
last_grid_search_dir = grid_search_directories[-1]
target_dir = model_gridsearch_dir + last_grid_search_dir

classifiers_loaded = []

for file in os.listdir(target_dir):
    if file.endswith(".pth"):
        # get the number of neurons from filename
        str_hidden, str_lr, *rest = file.split("_")
        # init the classifier
        classifier = CBOW(len(vectorizer.vocab), args.embedding_dim, args.context_size, int(str_hidden))
        # load the weights / embeddings
        classifier.load_state_dict(torch.load(os.path.join(target_dir, file)))
        # set to eval mode
        classifier.eval()
        # add to the list of loaded classifiers
        classifiers_loaded.append((classifier, float(str_lr)))
        
        
# classifiers from the last grid_search
classifiers_loaded

[(CBOW(
    (embeddings): Embedding(1318, 50)
    (linear1): Linear(in_features=50, out_features=2, bias=True)
    (linear2): Linear(in_features=2, out_features=1318, bias=True)
  ), 0.001)]

## Similarity Measure

In [29]:
def get_closest_word_pwd(classifier, word, topn=5):
    word_distance = []
    emb = classifier.embeddings
    test_vocab = dataset.get_vectorizer().vocab
    pdist = nn.PairwiseDistance()
    i = test_vocab.lookup_token(word)
    lookup_tensor_i = torch.tensor([i], dtype=torch.long).to(args.device)
    v_i = emb(lookup_tensor_i)
    for j in range(len(test_vocab)): 
        if j != i:
            lookup_tensor_j = torch.tensor([j], dtype=torch.long).to(args.device)
            v_j = emb(lookup_tensor_j) 
            word_distance.append((test_vocab.lookup_index(j), float(pdist(v_i, v_j))))
    word_distance.sort(key=lambda x: x[1])
    return word_distance[:topn]

def get_closest_word_cs(classifier, word, topn=5):
    word_distance = []
    emb = classifier.embeddings
    test_vocab = dataset.get_vectorizer().vocab
    pdist = nn.CosineSimilarity()
    i = test_vocab.lookup_token(word)
    lookup_tensor_i = torch.tensor([i], dtype=torch.long).to(args.device)
    v_i = emb(lookup_tensor_i)
    for j in range(len(test_vocab)): 
        if j != i:
            lookup_tensor_j = torch.tensor([j], dtype=torch.long).to(args.device)
            v_j = emb(lookup_tensor_j) 
            word_distance.append((test_vocab.lookup_index(j), float(pdist(v_i, v_j))))
    word_distance.sort(key=lambda x: x[1])
    return word_distance[::-1][:topn]

In [30]:
print(classifiers_loaded)

[(CBOW(
  (embeddings): Embedding(1318, 50)
  (linear1): Linear(in_features=50, out_features=2, bias=True)
  (linear2): Linear(in_features=2, out_features=1318, bias=True)
), 0.001)]


In [31]:
target_classifiers = classifiers_loaded if not classifiers else classifiers

def pretty_print(results):
    """
    Pretty print embedding results.
    """
    for item in results:
        print ("...[%.2f] - %s"%(item[1], item[0]))

for classifier, lr in target_classifiers:
    word = 'king'
    
    print("=" * 50)
    print("Classifier (LR: " + str(lr) + "): " + str(classifier) + "\n")
    print("===Pairwise Distance (lower better)===")
    pretty_print(get_closest_word_pwd(classifier, word))
    print("===Cosine Similarity (higher better)===")
    pretty_print(get_closest_word_cs(classifier, word))

Classifier (LR: 0.001): CBOW(
  (embeddings): Embedding(1318, 50)
  (linear1): Linear(in_features=50, out_features=2, bias=True)
  (linear2): Linear(in_features=2, out_features=1318, bias=True)
)

===Pairwise Distance (lower better)===
...[6.75] - canopy
...[6.93] - object
...[7.29] - pen
...[7.43] - youthful
...[7.44] - brood
===Cosine Similarity (higher better)===
...[0.46] - pen
...[0.45] - canopy
...[0.42] - youthful
...[0.39] - reigns
...[0.38] - pearl


In [32]:
def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

def accuracy_check(classifier):
    pred_sum = 0 # softmax check
    acc_sum = 0 # accuracy
    
        
    dataset.set_split('test')
    batch_generator = generate_batches(dataset, 
                                       batch_size=args.batch_size, 
                                       device=args.device)
    loss_func = nn.CrossEntropyLoss()
    running_loss = 0.
    running_acc = 0.
    classifier.eval()
    
    for batch_index, batch_dict in enumerate(batch_generator):
        # compute the output
        y_pred =  classifier(batch_dict['x_data'])

        # compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'])
        loss_t = loss.item()
        print(batch_index)

        # compute the accuracy
        acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_t - running_acc) / (batch_index + 1)
        print(acc_t / args.batch_size)
        print()


In [33]:
for classifier, lr in target_classifiers:
    print("Classifier (LR: " + str(lr) + "): " + str(classifier) + "\n")
    accuracy_check(classifier)
    

Classifier (LR: 0.001): CBOW(
  (embeddings): Embedding(1318, 50)
  (linear1): Linear(in_features=50, out_features=2, bias=True)
  (linear2): Linear(in_features=2, out_features=1318, bias=True)
)

0
0.29296875



## Preprocessing

In [16]:
stringo = "here is an [_exit_]"
stringo = re.sub('(\[_).*(_\])', '', stringo)
print(stringo)

here is an 


In [17]:
#finis is 164924
#beginngin is line 134 --> just keep what's in between those lines


In [18]:
filename = 'shakespeare-corpus.txt'
file = open(filename)
lines = file.readlines()
lines = lines[134:164924]


In [19]:
def mytext(lines):
    corpus = ''
    for line in lines:
        text = re.sub(r'\d+', '', line)
        text = re.sub('SCENE \S', '', text)
        text = re.sub('(\[_).*(_\])', '', text)
        text = re.sub(r'[\\[#$%*+—/<=>?{}|~@]+_', '', text)
        text = text.lower()
        corpus += text
    return corpus

%time len(mytext(lines))

CPU times: user 1.01 s, sys: 15.5 ms, total: 1.03 s
Wall time: 1.12 s


5521081

In [20]:
def mytext2(lines):
    text = ''.join(lines)
    text = re.sub(r'\d+', '', text)
    text = re.sub('SCENE \S', '', text)
    text = re.sub('(\[_).*(_\])', '', text)
    text = re.sub(r'[\\[#$%*+—/<=>?{}|~@]+_', '', text)
    text = text.lower()
    return text

%time len(mytext2(lines))


CPU times: user 295 ms, sys: 29.2 ms, total: 324 ms
Wall time: 379 ms


5521081

In [23]:
#MOST COMMON WORDS
filename = 'shakespeare-corpus.txt'
file = open(filename)
lines = file.readlines()
lines = lines[134:164924] #these numbers are only valid for the full corpus
text = ''.join(lines)
text = re.sub(r'\d+', '', text)
text = re.sub('SCENE \S', '', text)
text = re.sub('(\[_).*(_\])', '', text)
text = re.sub(r'[\\[#$%*+—/<=>?{}|~@]+_', '', text)
text = text.lower()
        
# Tokenize
tokens = nltk.tokenize.word_tokenize(text)
#tokens = text.split()

In [27]:
from collections import Counter 
  
def most_frequent(List): 
    occurence_count = Counter(List) 
    return occurence_count.most_common(200)[100:]

for i in most_frequent(tokens):
    print(i)

('must', 1638)
('had', 1555)
('see', 1554)
('why', 1539)
('such', 1528)
(']', 1499)
('where', 1472)
('out', 1452)
('some', 1434)
('who', 1429)
('give', 1424)
('these', 1411)
('first', 1353)
('[', 1342)
('ll', 1341)
('too', 1338)
('take', 1288)
('mine', 1261)
('most', 1237)
('speak', 1207)
('duke', 1157)
('time', 1156)
("'ll", 1142)
('up', 1139)
('never', 1135)
('tell', 1116)
('heart', 1113)
('father', 1105)
('much', 1091)
('doth', 1081)
('think', 1073)
('nor', 1057)
('th', 1043)
('queen', 1023)
('men', 1012)
('lady', 994)
('art', 993)
('great', 966)
('look', 958)
('death', 957)
('life', 946)
('before', 938)
('hear', 914)
('god', 909)
('away', 903)
('made', 899)
('hand', 898)
('master', 861)
('sweet', 856)
('very', 851)
('true', 849)
('fair', 840)
('thus', 830)
("'t", 820)
('tis', 816)
('own', 809)
('prince', 805)
('eyes', 800)
('day', 797)
('again', 795)
('pray', 793)
('ay', 773)
('call', 762)
('any', 761)
('two', 755)
('being', 749)
('honour', 747)
('old', 743)
('other', 743)
('night'

In [None]:
#verbs:is, be, have, will, 'd, 's, do, shall, come, would, was, let, would
#nouns: lord, king, sir, love, man, time, heart, father
#adjectives: good, first, great, sweet, own, old, other