In [1]:
!python -m nltk.downloader punkt
!pip install ray filelock
#!pip uninstall -y pyarrow

[nltk_data] Downloading package punkt to /Users/debora/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Collecting ray
[?25l  Downloading https://files.pythonhosted.org/packages/eb/6d/6625e213e3e011c8d4ab870195972f7761ed600c2761316248db00164c61/ray-0.7.6-cp37-cp37m-macosx_10_6_intel.whl (50.6MB)
[K     |████████████████████████████████| 50.6MB 296kB/s eta 0:00:01     |████████████████████            | 31.7MB 395kB/s eta 0:00:48
[?25hCollecting filelock
  Downloading https://files.pythonhosted.org/packages/93/83/71a2ee6158bb9f39a90c0dea1637f81d5eef866e188e1971a1b1ab01a35a/filelock-3.0.12-py3-none-any.whl
Collecting pytest
[?25l  Downloading https://files.pythonhosted.org/packages/93/16/f6dec5178f5f4141e80dfc4812a9aba88f5f29ca881f174ab1851181d016/pytest-5.2.2-py3-none-any.whl (227kB)
[K     |████████████████████████████████| 235kB 5.5MB/s eta 0:00:01
Collecting funcsigs
  Downloading https://files.pythonhosted.org/packages/69/cb/f5be453359271714c01b9bd06126eaf2e368f1f

In [2]:
# Imports

import os
import re
from argparse import Namespace

import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from ray import tune
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm





### References

* https://iksinc.online/tag/continuous-bag-of-words-cbow/
* http://mccormickml.com/assets/word2vec/Alex_Minnaar_Word2Vec_Tutorial_Part_II_The_Continuous_Bag-of-Words_Model.pdf
* https://stackoverflow.com/questions/48479915/what-is-the-preferred-ratio-between-the-vocabulary-size-and-embedding-dimension
* https://github.com/FraLotito/pytorch-continuous-bag-of-words/blob/master/cbow.py
* https://stackoverflow.com/questions/50792316/what-does-1-mean-in-pytorch-view
* https://www.tensorflow.org/tutorials/text/word_embeddings
* https://pytorch.org/docs/stable/nn.html
* https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html
* https://github.com/ChristophAlt/embedding_vectorizer/blob/master/embedding_vectorizer.py
* https://pytorch.org/tutorials/beginner/saving_loading_models.html

## Vocabulary

In [3]:
import nltk
class Vocabulary():
    """Bidirectional mapping between tokens and consecutive integer ids.

    Ids are handed out in insertion order, starting at 0. When `add_unk` is
    true the special token "<UNK>" is inserted first and its id is used as the
    fallback for unknown tokens in `lookup_token`.
    """

    def __init__(self, add_unk=True):
        """
        Args:
            add_unk (bool): whether to register the "<UNK>" token
        """
        super(Vocabulary, self).__init__()

        self._token_to_ids = {}
        self._ids_to_token = {}

        # -1 is the "no UNK token" sentinel that lookup_token checks for;
        # without it, add_unk=False left the attribute undefined.
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token("<UNK>")

    def vocabulary_set(self):
        """Return a list of the unique tokens, in insertion order."""
        return list(self._token_to_ids)

    def make_dicts(self):
        """Return copies of the two mappings.

        Returns:
            tok_to_ix (dict): token -> index
            ix_to_tok (dict): index -> token
        """
        # Copies, so callers cannot corrupt the internal state by accident.
        return dict(self._token_to_ids), dict(self._ids_to_token)

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_ids:
            return self._token_to_ids[token]

        index = len(self._token_to_ids)
        self._token_to_ids[token] = index
        self._ids_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Retrieve the index associated with the token
          or the UNK index if token isn't present.

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        Raises:
            KeyError: if the token is unknown and no UNK token was registered
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary)
              for the UNK functionality
        """
        if self.unk_index >= 0:
            return self._token_to_ids.get(token, self.unk_index)
        return self._token_to_ids[token]

    def lookup_index(self, index):
        """Return the token associated with the index

        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._ids_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._ids_to_token[index]

    def __len__(self):
        """Number of distinct tokens (including "<UNK>" when registered)."""
        return len(self._token_to_ids)
        

## Vectorizer

In [4]:
class Vectorizer(object):
    """Turns lists of context words into tensors of vocabulary ids."""

    def __init__(self, vocabulary):
        # Object providing lookup_token(); used by vectorize().
        self.vocab = vocabulary

    @classmethod
    def from_dataframe(cls, cbow_df):
        """Build a Vectorizer whose vocabulary covers every token in the frame.

        Args:
            cbow_df (pandas.DataFrame): frame with a `context` column (iterable
                of tokens per row) and a `target` column (one token per row)
        Returns:
            an instance of the Vectorizer
        """
        vocab = Vocabulary()
        for context_tokens, target_token in zip(cbow_df["context"], cbow_df["target"]):
            # Context words first, then the target, row by row: the same order
            # in which ids are assigned when walking the frame with iterrows.
            for word in context_tokens:
                vocab.add_token(word)
            vocab.add_token(target_token)

        return cls(vocab)

    def vectorize(self, context_words):
        """Map each context word to its vocabulary id.

        Args:
            context_words (iterable of str): the words to encode
        Returns:
            torch.Tensor: 1-D tensor of dtype torch.long, one id per word
        """
        lookup = self.vocab.lookup_token
        return torch.tensor([lookup(word) for word in context_words],
                            dtype=torch.long)


## Dataset

In [5]:
class ShakespeareDataset(Dataset):
    """CBOW (context, target) examples built from a Shakespeare text file.

    The examples are split 98/1/1% in row order into train/val/test, and the
    vectorizer (vocabulary) is built from the training split only.
    """

    def __init__(self, context_size, cbow_df):
        """
        Args:
            context_size (int): size of the context before/after the target word
            cbow_df (pandas.DataFrame): the dataset, with `context` and
                `target` columns
        """
        self.context_size = context_size

        # 98/1/1% split, by position. Plain iloc slicing yields the same three
        # frames as np.split(cbow_df, [...]) without routing a DataFrame through
        # NumPy (which triggers deprecation warnings on recent pandas).
        n_rows = len(cbow_df)
        train_end = int(.98 * n_rows)
        val_end = int(.99 * n_rows)
        self.train_df = cbow_df.iloc[:train_end]
        self.val_df = cbow_df.iloc[train_end:val_end]
        self.test_df = cbow_df.iloc[val_end:]

        self._lookup_dict = {'train': self.train_df,
                             'val': self.val_df,
                             'test': self.test_df}

        self.set_split()
        self._vectorizer = Vectorizer.from_dataframe(self.train_df)

    @classmethod
    def load_and_create_dataset(cls, filepath, context_size, frac=1.0):
        """Load and preprocess the dataset

        Args:
            filepath (str): location of the dataset
            context_size (int): size of the context before/after the target word
            frac (float, optional): fraction of the data to use (default 1.0)
        Returns:
            an instance of ShakespeareDataset
        """
        # load the file
        lines = cls._load_file(filepath)
        # consider the fraction param and throw away the rest
        lines = lines[:int(len(lines) * frac)]

        # Preprocess
        tokens = cls._preprocess_and_split_lines(lines)

        # Create DataFrame
        dataframe_data = cls._create_context_data(tokens, context_size)
        cbow_df = pd.DataFrame(dataframe_data, columns=['context', 'target'])

        # Create an instance
        return cls(context_size, cbow_df)

    @staticmethod
    def _load_file(filepath):
        """Load the dataset file into a list of lines (newlines kept)."""
        # The context manager closes the file; no explicit close() is needed.
        with open(filepath) as file:
            return file.readlines()

    @staticmethod
    def _preprocess_and_split_lines(lines, start_line=134, end_line=164924):
        """Clean the raw lines and tokenize them.

        Args:
            lines (list): a list of lines of the dataset
            start_line (int, optional): index of the first line to keep
            end_line (int, optional): index one past the last line to keep.
                The defaults are only valid for the full corpus.
        Returns:
            a list of tokens
        """
        lines = lines[start_line:end_line]
        text = ''.join(lines)

        # Regex clean-up (raw strings avoid invalid-escape warnings)
        text = re.sub(r'\d+', '', text)                 # digit runs
        text = re.sub(r'SCENE \S', '', text)            # "SCENE " + one non-space char
        # "[_ ... _]" spans. Non-greedy, so two spans on the same line do not
        # also swallow the text between them.
        text = re.sub(r'(\[_).*?(_\])', '', text)
        # A run of the listed symbols immediately followed by an underscore.
        text = re.sub(r'[\\[#$%*+—/<=>?{}|~@]+_', '', text)
        text = text.lower()

        # Tokenize
        tokens = nltk.tokenize.word_tokenize(text)

        return tokens

    @staticmethod
    def _create_context_data(tokens, context_size):
        """Slide a window over `tokens` and emit [context, target] pairs.

        Args:
            tokens (list): the token sequence
            context_size (int): number of tokens taken on each side
        Returns:
            list of [context_window, target], where context_window holds the
            `context_size` tokens before and after the target (target excluded).
            Positions without a full window on both sides are skipped.
        """
        data = []
        for i in range(context_size, len(tokens) - context_size):
            context_before_w = tokens[i - context_size: i]
            context_after_w = tokens[i + 1: i + context_size + 1]
            data.append([context_before_w + context_after_w, tokens[i]])
        return data

    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer

    def set_split(self, split="train"):
        """Select which split ('train', 'val' or 'test') the dataset serves.

        Raises:
            KeyError: if `split` is not one of the known split names
        """
        self._target_df = self._lookup_dict[split]

    def __len__(self):
        """Number of examples in the currently selected split."""
        return len(self._target_df)

    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets

        Args:
            index (int): the index to the data point
        Returns:
            a dictionary holding the data point's features (x_data) and label (y_target)
        """
        row = self._target_df.iloc[index]

        context_vector = self._vectorizer.vectorize(row.context)
        target_index = self._vectorizer.vocab.lookup_token(row.target)

        return {'x_data': context_vector,
                'y_target': target_index}

    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset

        Args:
            batch_size (int)
        Returns:
            number of full batches in the currently selected split
        """
        return len(self) // batch_size
    
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """
    Generator wrapping a PyTorch DataLoader: every batch is a dict, and each
      of its values is moved to `device` before the batch is yielded.
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        yield {key: value.to(device) for key, value in batch.items()}

## CBOW

In [6]:
class CBOW(nn.Module):
    """Continuous bag-of-words model: embeddings -> hidden layer -> vocab logits."""

    def __init__(self, vocab_size, embedding_dim, context_size, nr_hidden_neurons=128):
        """
        Args:
            vocab_size (int): number of rows in the embedding table and number
                of output logits
            embedding_dim (int): size of each embedding vector
            context_size (int): number of context words on EACH side of the target
            nr_hidden_neurons (int): width of the hidden layer
        """
        super(CBOW, self).__init__()
        # Total number of context words fed per example (before + after).
        self._context_window_size = context_size * 2

        # Embedding/input layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Hidden layer
        self.linear1 = nn.Linear(embedding_dim, nr_hidden_neurons)

        # Output layer
        self.linear2 = nn.Linear(nr_hidden_neurons, vocab_size)

    def forward(self, inputs):
        """Compute unnormalized scores over the vocabulary.

        Args:
            inputs (torch.LongTensor): token ids; the sum over dim=1 below
                implies a leading batch axis, i.e. presumably
                (batch, 2 * context_size) -- confirm against the caller
        Returns:
            torch.Tensor: raw logits whose last dimension is vocab_size
        """
        # Sum the context embeddings along the window axis (dim=1).
        embeds = self.embeddings(inputs).sum(dim=1)

        # Hidden pre-activation, scaled by the number of context vectors.
        h = self.linear1(embeds) / self._context_window_size

        # output of the hidden layer
        hidden = F.relu(h)

        # Return raw logits: nn.CrossEntropyLoss applies log-softmax itself, and
        # a ReLU here would clamp every negative logit to zero.
        return self.linear2(hidden)

---
## Training

In [7]:
class TrainState:
    """Bookkeeping for one training run: current epoch, loss history, target file."""

    def __init__(self, filename):
        """
        Args:
            filename (str): path the model would be saved to
        """
        self.model_filename = filename
        self.epoch_index = 0
        # One entry is appended per epoch by the training loop.
        self.train_loss = []
        self.val_loss = []

    def update(self, model):
        """Handle the training state updates.

        Saving is currently switched off (the torch.save calls are disabled),
        so this only inspects the last two validation losses.

        model (nn.Module): model to save
        """
        if self.epoch_index == 0:
            # First epoch: the model would be saved unconditionally.
            #torch.save(model.state_dict(), self.model_filename)
            return

        # Needs at least two recorded validation losses from here on.
        previous_loss, current_loss = self.val_loss[-2:]
        if current_loss <= previous_loss:
            # Validation loss did not get worse: the model would be saved.
            #torch.save(model.state_dict(), self.model_filename)
            pass

In [8]:
# Central configuration for the notebook.
args = Namespace(
    # --- data / paths ---
    shakespeare_csv_filepath="shakespeare-corpus.txt",
    model_state_file="shakespeare_model.pth",
    # --- model hyper parameters ---
    context_size=2,
    num_neurons=128,
    embedding_dim=50,
    # --- training hyper parameters ---
    seed=1337,
    num_epochs=40,
    learning_rate=0.001,
    batch_size=32,
    # --- runtime options ---
    cuda=True
)

# Only keep CUDA switched on when a device is actually available.
args.cuda = args.cuda and torch.cuda.is_available()
args.device = torch.device("cuda") if args.cuda else torch.device("cpu")
print("Using CUDA: {}".format(args.cuda))

# Seed every RNG in use so that runs are reproducible.
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed_all(args.seed)

class CBOWTrainingRoutine:
    """Owns a CBOW classifier, its loss and optimizer, and runs the train/val loop."""

    def create_new_classifier(self, vocab_len, embedding_dim, context_size,
                              nr_hidden_neurons, device, learning_rate,
                              filename):
        """Create a fresh model, loss, optimizer and training state.

        Args:
            vocab_len (int): vocabulary size
            embedding_dim (int): size of the embedding vectors
            context_size (int): context words on each side of the target
            nr_hidden_neurons (int): width of the hidden layer
            device (torch.device or str): where the model should live
            learning_rate (float): Adam learning rate
            filename (str): model file name recorded in the TrainState
        """
        self.loss_func = nn.CrossEntropyLoss()
        classifier = CBOW(
            vocab_len,
            embedding_dim,
            context_size,
            nr_hidden_neurons)
        self.classifier = classifier.to(device)
        self.optimizer = optim.Adam(self.classifier.parameters(), lr=learning_rate)

        self.train_state = TrainState(filename)

    def train(self, dataset, num_epochs, batch_size=32, device="cpu"):
        """Run `num_epochs` epochs of training followed by validation.

        Args:
            dataset: object with set_split() that generate_batches can consume
            num_epochs (int): number of epochs to run
            batch_size (int): examples per batch
            device (torch.device or str): device the batches are moved to
        Returns:
            TrainState: the state holding per-epoch train and val losses
        """
        for epoch_index in tqdm(range(num_epochs)):
            self.train_state.epoch_index = epoch_index

            train_loss = self._run_split(dataset, 'train', batch_size, device,
                                         training=True)
            self.train_state.train_loss.append(train_loss)

            val_loss = self._run_split(dataset, 'val', batch_size, device,
                                       training=False)
            self.train_state.val_loss.append(val_loss)

            self.train_state.update(model=self.classifier)
        return self.train_state

    def _run_split(self, dataset, split, batch_size, device, training):
        """One pass over `split`; returns the running mean of the batch losses."""
        dataset.set_split(split)
        batch_generator = generate_batches(dataset,
                                           batch_size=batch_size,
                                           device=device)
        running_loss = 0.0
        if training:
            self.classifier.train()
        else:
            self.classifier.eval()

        # No autograd graph is needed while validating.
        with torch.set_grad_enabled(training):
            for batch_index, batch_dict in enumerate(batch_generator):
                if training:
                    # step 1. zero the gradients
                    self.optimizer.zero_grad()

                # step 2. compute the output
                y_pred = self.classifier(batch_dict['x_data'])

                # step 3. compute the loss and fold it into the running mean
                loss = self.loss_func(y_pred, batch_dict['y_target'])
                loss_t = loss.item()
                running_loss += (loss_t - running_loss) / (batch_index + 1)

                if training:
                    # step 4. use loss to produce gradients
                    loss.backward()
                    # step 5. use optimizer to take gradient step
                    self.optimizer.step()

        return running_loss
      

Using CUDA: False


In [9]:
class CBOWTrainable(tune.Trainable):
    """Ray Tune trainable that wraps a CBOWTrainingRoutine.

    Expected `config` keys: "dataset", "training_routine", "nr_hidden_neurons"
    and "lr". Everything else is read from the notebook-level `args`.
    """

    def _save(self, checkpoint_dir):
        """Write the classifier weights into `checkpoint_dir`; return the file path."""
        checkpoint_path = os.path.join(checkpoint_dir, args.model_state_file)
        # The model lives on the training routine, not on the trainable itself.
        torch.save(self.training_routine.classifier.state_dict(), checkpoint_path)
        return checkpoint_path

    def _restore(self, checkpoint_path):
        """Load classifier weights previously written by `_save`."""
        self.training_routine.classifier.load_state_dict(torch.load(checkpoint_path))

    def _setup(self, config):
        """Build a fresh classifier for this trial from `config`."""
        self.dataset = config.get("dataset")
        # Use the dataset handed in through the config rather than a
        # notebook-level global.
        vectorizer = self.dataset.get_vectorizer()
        training_routine = config.get("training_routine")
        training_routine.create_new_classifier(
            len(vectorizer.vocab), args.embedding_dim,
            self.dataset.context_size, config.get("nr_hidden_neurons"),
            args.device, config.get("lr"), args.model_state_file
        )
        self.training_routine = training_routine

    def _train(self):
        """Run the training routine and report the last validation loss."""
        train_state = self.training_routine.train(
            self.dataset,
            args.num_epochs,
            args.batch_size,
            args.device,
        )
        return {'loss': train_state.val_loss[-1]}

In [10]:
# Dataset
# Build the dataset from the first 20% of the corpus lines.
dataset = ShakespeareDataset.load_and_create_dataset(
    filepath=args.shakespeare_csv_filepath,
    context_size=args.context_size,
    frac=0.2,
)

# The routine is handed to every Tune trial through the config.
training_routine = CBOWTrainingRoutine()

In [None]:
# Grid search over learning rate x hidden-layer width (2 x 4 trials).
analysis = tune.run(
    CBOWTrainable,
    config=dict(
        lr=tune.grid_search([0.001, 0.01]),
        nr_hidden_neurons=tune.grid_search([10, 30, 50, 100]),
        training_routine=training_routine,
        dataset=dataset,
    ),
    local_dir=".",
    # One GPU per trial only when CUDA is in use; always two CPUs.
    resources_per_trial=dict(gpu=int(args.cuda), cpu=2),
)

2019-11-02 14:21:32,468	INFO resource_spec.py:205 -- Starting Ray with 1.46 GiB memory available for workers and up to 0.74 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).


== Status ==
Using FIFO scheduling algorithm.
Resources requested: 0/4 CPUs, 0/0 GPUs, 0.0/1.46 GiB heap, 0.0/0.49 GiB objects
Memory usage on this node: 5.8/8.0 GiB



---

# Part 2 - Test your embeddings

In [None]:
# Display the object returned by tune.run above.
analysis

In [None]:
# Part2 supplied function
def get_closest_word(word, topn=5):
    """Return the `topn` vocabulary tokens whose embeddings are closest to `word`.

    Distances come from nn.PairwiseDistance with its default settings; the
    query word itself is excluded. Unknown words are resolved by the
    vocabulary's lookup_token.

    NOTE(review): reads the notebook-level names `classifier`, `dataset` and
    `args`; `classifier` is not assigned in any visible cell -- confirm where
    the trained model is expected to come from.

    Args:
        word (str): the query word
        topn (int): how many neighbours to return
    Returns:
        list of (token, distance) tuples, sorted by ascending distance
    """
    embedding = classifier.embeddings
    vocab = dataset.get_vectorizer().vocab
    distance_fn = nn.PairwiseDistance()

    def embed(index):
        # Look up a single id as a one-row batch on the configured device.
        ids = torch.tensor([index], dtype=torch.long).to(args.device)
        return embedding(ids)

    query_index = vocab.lookup_token(word)
    query_vector = embed(query_index)

    neighbours = [
        (vocab.lookup_index(other),
         float(distance_fn(query_vector, embed(other))))
        for other in range(len(vocab))
        if other != query_index
    ]
    return sorted(neighbours, key=lambda pair: pair[1])[:topn]

# Example query: nearest neighbours of 'desire' (default topn=5).
get_closest_word('desire')

In [None]:
# Per-example evaluation loop: accumulates the summed model outputs and the
# number of correct arg-max predictions, then prints both averages.
#
# NOTE(review): `data`, `vectorizer`, `model` and `test_vocab` are not defined
# at notebook level in any visible cell (`test_vocab` only exists as a local
# inside get_closest_word), and the Vocabulary class stores its mapping in
# `_token_to_ids`, not `tok_to_ids`. This cell looks like a leftover from an
# earlier version -- confirm before running.
nr_examples = len(data)
pred_sum = 0 # softmax check
acc_sum = 0 # accuracy

for i in range(nr_examples):
    # data[i] is indexed as (context words, target word)
    ids = vectorizer.vectorize(data[i][0])
    target = test_vocab.tok_to_ids[data[i][1]]
    pred = model(ids) # prediction
    pred_sum += pred.squeeze().sum().item() 
    
    _, pred_index = pred.max(dim=1) # prediction index
    n_correct = torch.eq(pred_index, target)
    acc_sum += n_correct.item()
    
    print("Prediction: " + str(pred_index.item()), "| Target: " + str(target))
    
# mean accuracy, then mean of the summed outputs
print(acc_sum / nr_examples)
print(pred_sum / nr_examples)

## Preprocessing

In [None]:
# Quick check of the "[_ ... _]" removal pattern. The pattern is a raw string
# (so \[ and \] are not treated as string escapes) and non-greedy (so two
# bracketed spans on one line are removed separately).
stringo = "here is an [_exit_]"
stringo = re.sub(r'(\[_).*?(_\])', '', stringo)
print(stringo)

In [None]:
# finis is at line 164924
# the beginning is at line 134 --> just keep what's in between those lines


In [None]:
filename = 'shakespeare-corpus.txt'
# The context manager closes the file handle once the lines are read.
with open(filename) as file:
    lines = file.readlines()
# Keep only lines 134..164923 (the same slice the dataset preprocessing uses).
lines = lines[134:164924]


In [None]:
def mytext(lines):
    """Clean the corpus line by line and return it as one lower-cased string.

    Each line has, in order: digit runs removed, "SCENE " plus one
    non-whitespace character removed, "[_ ... _]" spans removed, and runs of
    the listed symbols that are immediately followed by "_" removed.

    Args:
        lines (list of str): raw lines, newlines included
    Returns:
        str: the cleaned, lower-cased text
    """
    cleaned = []
    for line in lines:
        text = re.sub(r'\d+', '', line)
        text = re.sub(r'SCENE \S', '', text)
        # Non-greedy: keep the text between two bracketed spans on one line.
        text = re.sub(r'(\[_).*?(_\])', '', text)
        text = re.sub(r'[\\[#$%*+—/<=>?{}|~@]+_', '', text)
        cleaned.append(text.lower())
    return ''.join(cleaned)

# Time the line-by-line cleaning variant on `lines`.
%time len(mytext(lines))

In [None]:
def mytext2(lines):
    """Join the lines first, then clean the whole text in one pass per pattern.

    Applies, in order: removal of digit runs, of "SCENE " plus one
    non-whitespace character, of "[_ ... _]" spans, and of runs of the listed
    symbols immediately followed by "_"; finally lower-cases the text.

    Args:
        lines (list of str): raw lines, newlines included
    Returns:
        str: the cleaned, lower-cased text
    """
    text = ''.join(lines)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'SCENE \S', '', text)
    # Non-greedy: keep the text between two bracketed spans on one line.
    text = re.sub(r'(\[_).*?(_\])', '', text)
    text = re.sub(r'[\\[#$%*+—/<=>?{}|~@]+_', '', text)
    return text.lower()

# Time the join-then-clean variant on the same `lines` for comparison.
%time len(mytext2(lines))
