In [224]:
# Imports

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from tqdm import tqdm
import nltk
import re


torch.manual_seed(1)

<torch._C.Generator at 0x11443a350>

### References

* https://iksinc.online/tag/continuous-bag-of-words-cbow/
* http://mccormickml.com/assets/word2vec/Alex_Minnaar_Word2Vec_Tutorial_Part_II_The_Continuous_Bag-of-Words_Model.pdf
* https://stackoverflow.com/questions/48479915/what-is-the-preferred-ratio-between-the-vocabulary-size-and-embedding-dimension
* https://github.com/FraLotito/pytorch-continuous-bag-of-words/blob/master/cbow.py
* https://stackoverflow.com/questions/50792316/what-does-1-mean-in-pytorch-view
* https://www.tensorflow.org/tutorials/text/word_embeddings
* https://pytorch.org/docs/stable/nn.html
* https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html
* https://github.com/ChristophAlt/embedding_vectorizer/blob/master/embedding_vectorizer.py

## Vocabulary

In [225]:
import nltk
class Vocabulary():
    def __init__(self, filepath):
        super(Vocabulary, self).__init__()
        self.filepath = filepath
        self.tokens = self.nltk_tokenize()
        self.tok_to_ids, self.ids_to_tok = self.make_dicts()
    
    def readfile(self):
        """this function opens the file and returns the text in a string"""
        file = open(self.filepath)
        lines = file.readlines()
        #lines = lines[134:164924] #these numbers are only valid for the full corpus
        text = ''.join(lines)
        text = re.sub(r'\d+', '', text)
        text = re.sub('SCENE \S', '', text)
        text = re.sub('(\[_).*(_\])', '', text)
        text = re.sub(r'[\\[#$%*+—/<=>?{}|~@]+_', '', text)
        text = text.lower()
        file.close()
        return text
    
    def nltk_tokenize(self):
        """this function tokenizes the text and returns a list of tokens as strings"""
        text = self.readfile()
        tokens = nltk.tokenize.word_tokenize(text)
        return tokens
    
    def vocabulary_set(self):
        """this function returns a list of unique tokens"""
        return(list(set(self.tokens)))
    
    def make_dicts(self):
        unique_tokens = list(set(self.tokens))
        tok_to_ix = {}
        ix_to_tok = {}
        for i in range(len(unique_tokens)):
            tok_to_ix.update({unique_tokens[i]: i})
            ix_to_tok.update({i: unique_tokens[i]})
        return tok_to_ix, ix_to_tok

    def __len__(self):
        return len(self.tok_to_ids)
        

## Vectorizer

In [226]:
class Vectorizer(object):
    def __init__(self, vocabulary):
        self.vocab = vocabulary
    
    def vectorize(self, context_words):
        context_ids = [self.vocab.tok_to_ids[w] for w in context_words]
        return torch.tensor(context_ids, dtype=torch.long)


## Dataset

In [227]:
class ShakespeareDataset(Dataset):
    def __init__(self, cbow_df, vectorizer):
        """
        Args:
            cbow_df (pandas.DataFrame): the dataset
            vectorizer (Vectorizer): vectorizer instantiated from dataset
        """
        self.cbow_df = cbow_df
        self._vectorizer = vectorizer
        
        measure_len = lambda context: len(context.split(" "))
        self._max_seq_length = max(map(measure_len, cbow_df.context))
        
        self.train_df = self.cbow_df[self.cbow_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.cbow_df[self.cbow_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.cbow_df[self.cbow_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, cbow_csv):
        """Load dataset and make a new vectorizer from scratch
        
        Args:
            cbow_csv (str): location of the dataset
        Returns:
            an instance of CBOWDataset
        """
        cbow_df = pd.read_csv(cbow_csv)
        train_cbow_df = cbow_df[cbow_df.split=='train']
        return cls(cbow_df, CBOWVectorizer.from_dataframe(train_cbow_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, cbow_csv, vectorizer_filepath):
        """Load dataset and the corresponding vectorizer. 
        Used in the case in the vectorizer has been cached for re-use
        
        Args:
            cbow_csv (str): location of the dataset
            vectorizer_filepath (str): location of the saved vectorizer
        Returns:
            an instance of CBOWDataset
        """
        cbow_df = pd.read_csv(cbow_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(cbow_df, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        """a static method for loading the vectorizer from file
        
        Args:
            vectorizer_filepath (str): the location of the serialized vectorizer
        Returns:
            an instance of CBOWVectorizer
        """
        with open(vectorizer_filepath) as fp:
            return CBOWVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        """saves the vectorizer to disk using json
        
        Args:
            vectorizer_filepath (str): the location to save the vectorizer
        """
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer
        
    def set_split(self, split="train"):
        """ selects the splits in the dataset using a column in the dataframe """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
        
        Args:
            index (int): the index to the data point 
        Returns:
            a dictionary holding the data point's features (x_data) and label (y_target)
        """
        row = self._target_df.iloc[index]

        context_vector = \
            self._vectorizer.vectorize(row.context, self._max_seq_length)
        target_index = self._vectorizer.cbow_vocab.lookup_token(row.target)

        return {'x_data': context_vector,
                'y_target': target_index}

    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset
        
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size
    
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
    """
    A generator function which wraps the PyTorch DataLoader. It will 
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [228]:
filepath = 'test_corpus.txt'
test_vocab = Vocabulary(filepath)
vectorizer = Vectorizer(test_vocab)

# Size of the context windows, 2 and 5 are supposed to be used in ex02...
# range \in [2, 1/2 * document_length - 1]
CONTEXT_SIZE = 2

# let's stick with this notation for now ;)
CONTEXT_WINDOW_SIZE = CONTEXT_SIZE * 2


# Data creation - get context around the target word
data = []
tokens = test_vocab.tokens
for i in range(CONTEXT_SIZE, len(tokens) - CONTEXT_SIZE):
    # Context before w_i
    context_before_w = tokens[i - CONTEXT_SIZE: i]
    
    # Context after w_i
    context_after_w = tokens[i + 1: i + CONTEXT_SIZE + 1]
    
    # Put them together
    context_window = context_before_w + context_after_w
    
    # Target = w_i
    target = tokens[i]
    
    # Append in the correct format
    data.append((context_window, target))



## CBOW

In [229]:
class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_window_size, nr_hidden_neurons=128):
        super(CBOW, self).__init__()
        self.context_window_size = context_window_size
        
        # Embedding layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        # note: this probably doesn't deal with 'UNK' words
        self.linear1 = nn.Linear(embedding_dim, nr_hidden_neurons)  
        
        # output layer
        self.linear2 = nn.Linear(nr_hidden_neurons, vocab_size)

        
    def forward(self, inputs):
        # shape = (WINDOW_SIZE, EMBEDDING_DIM) -> (EMBEDDING_DIM)
        embeds = sum(self.embeddings(inputs))

        # shape = (1, EMBEDDING_DIM)
        # -1 param in view() ... "the actual value for this dimension will be inferred so that the number of elements in the view matches the original number of elements."
        embeds_2D = embeds.view(1, -1)
        
        # finally compute the hidden layer weighted sum (a.k.a. output before using the activation function)
        # ... and don't forget to divide by the number of input vectors
        h =  self.linear1(embeds_2D) / self.context_window_size
        
        # output of the hidden layer
        out =  F.relu(h) 
         
        out = self.linear2(out)
        log_probs = F.softmax(out, dim=-1)
        return log_probs

## Training

In [230]:
NUM_ITERATIONS = 100
NUM_NEURONS = 100
EMBEDDING_DIM = 50

losses = []
loss_function = nn.CrossEntropyLoss()
model = CBOW(len(test_vocab), EMBEDDING_DIM, CONTEXT_WINDOW_SIZE, NUM_NEURONS)

optimizer = optim.Adam(model.parameters(), lr=0.01)

print(model.embeddings.weight)

for epoch in tqdm(range(NUM_ITERATIONS)):
    total_loss = 0
    for context, target in data:
        # Step1. Create input vector 
        context_vector_ids = vectorizer.vectorize(context)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        softmax = model(context_vector_ids)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        target = torch.tensor(vectorizer.vocab.tok_to_ids[target], dtype=torch.long).view(1)
        loss = loss_function(softmax, target)
        
        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
    
print(losses)
print(model.embeddings.weight)


  0%|          | 0/100 [00:00<?, ?it/s][A

Parameter containing:
tensor([[-1.5256, -0.7502, -0.6540,  ..., -0.6298, -0.9274,  0.5451],
        [ 0.0663, -0.4370,  0.7626,  ...,  1.1899,  0.8165, -0.9135],
        [ 1.3851, -0.8138, -0.9276,  ...,  0.6419,  0.4730, -0.4286],
        ...,
        [ 0.2124,  0.9873, -0.2969,  ..., -2.1730,  0.1277, -1.1812],
        [ 0.0054, -0.3642,  0.4567,  ..., -1.5041, -0.7924,  0.0683],
        [ 1.0057,  0.0652,  1.9921,  ...,  0.4940,  1.0178,  0.2038]],
       requires_grad=True)



  1%|          | 1/100 [00:00<00:35,  2.76it/s][A
  2%|▏         | 2/100 [00:00<00:35,  2.78it/s][A
  3%|▎         | 3/100 [00:01<00:36,  2.69it/s][A
  4%|▍         | 4/100 [00:01<00:35,  2.74it/s][A
  5%|▌         | 5/100 [00:01<00:33,  2.81it/s][A
  6%|▌         | 6/100 [00:02<00:34,  2.73it/s][A
  7%|▋         | 7/100 [00:02<00:33,  2.76it/s][A
  8%|▊         | 8/100 [00:02<00:34,  2.67it/s][A
  9%|▉         | 9/100 [00:03<00:34,  2.67it/s][A
 10%|█         | 10/100 [00:03<00:33,  2.71it/s][A
 11%|█         | 11/100 [00:04<00:33,  2.64it/s][A
 12%|█▏        | 12/100 [00:04<00:32,  2.67it/s][A
 13%|█▎        | 13/100 [00:04<00:32,  2.69it/s][A
 14%|█▍        | 14/100 [00:05<00:33,  2.60it/s][A
 15%|█▌        | 15/100 [00:05<00:32,  2.61it/s][A
 16%|█▌        | 16/100 [00:06<00:32,  2.55it/s][A
 17%|█▋        | 17/100 [00:06<00:31,  2.62it/s][A
 18%|█▊        | 18/100 [00:06<00:31,  2.62it/s][A
 19%|█▉        | 19/100 [00:07<00:31,  2.55it/s][A
 20%|██        | 20/

[2137.0678267478943, 2124.274205684662, 2121.040813446045, 2111.0573649406433, 2107.6739778518677, 2108.5109457969666, 2108.5132751464844, 2102.539351463318, 2106.4591636657715, 2102.3065090179443, 2105.758240699768, 2104.2444472312927, 2096.718550682068, 2093.114233970642, 2091.3732018470764, 2094.2937202453613, 2092.539632797241, 2092.5157675743103, 2091.380917072296, 2093.526613712311, 2095.473023414612, 2091.632371902466, 2088.8245248794556, 2094.1868958473206, 2087.564106941223, 2087.538487434387, 2086.2816591262817, 2087.015751361847, 2086.5449895858765, 2086.552490711212, 2085.7277884483337, 2088.77752161026, 2089.54544878006, 2089.5604429244995, 2089.5518498420715, 2089.559072494507, 2089.5588278770447, 2089.5594487190247, 2090.5582280158997, 2088.5690383911133, 2089.957133769989, 2088.552963733673, 2088.552481651306, 2089.4736919403076, 2089.5250549316406, 2088.1021733283997, 2087.542254447937, 2086.548861503601, 2086.528482913971, 2086.5507249832153, 2086.502411842346, 2086.5

---
## OOP Training

In [182]:
#shakespeare_csv_filepath = 'test_corpus.txt'
#dataset = ShakespeareDataset.load_dataset_and_make_vectorizer(shakespeare_csv_filepath)
#dataset.save_vectorizer(args.vectorizer_file)
    
#vectorizer = dataset.get_vectorizer()

#classifier = CBOWClassifier(vocabulary_size=len(vectorizer.cbow_vocab), embedding_size=args.embedding_size)

---

# Part 2 - Test your embeddings

In [190]:
# Part2 supplied function
def get_closest_word(word, topn=5):
    word_distance = []
    emb = model.embeddings
    pdist = nn.PairwiseDistance()
    i = test_vocab.tok_to_ids[word]
    lookup_tensor_i = torch.tensor([i], dtype=torch.long) 
    v_i = emb(lookup_tensor_i)
    for j in range(len(test_vocab)): 
        if j != i:
            lookup_tensor_j = torch.tensor([j], dtype=torch.long)
            v_j = emb(lookup_tensor_j) 
            word_distance.append((test_vocab.ids_to_tok[j], float(pdist(v_i, v_j))))
    word_distance.sort(key=lambda x: x[1]) 
    return word_distance[:topn]

get_closest_word('desire')

[('uneared', 8.883015632629395),
 ('gaudy', 9.166439056396484),
 ('by', 9.197699546813965),
 ('viewest', 9.315560340881348),
 ('another', 9.614222526550293)]

In [184]:
nr_examples = len(data)
pred_sum = 0 # softmax check
acc_sum = 0 # accuracy

for i in range(nr_examples):
    ids = vectorizer.vectorize(data[i][0])
    target = test_vocab.tok_to_ids[data[i][1]]
    pred = model(ids) # prediction
    pred_sum += pred.squeeze().sum().item() 
    
    _, pred_indices = pred.max(dim=1) # prediction index
    n_correct = torch.eq(pred_indices, target)
    acc_sum += n_correct.item()
    
print(acc_sum / nr_examples)
print(pred_sum / nr_examples)

0.1330049261083744
0.9999999994127621


In [164]:
stringo = "here is an [_exit_]"
stringo = re.sub('(\[_).*(_\])', '', stringo)
print(stringo)

here is an 


In [None]:
#finis is 164924
#beginngin is line 134 --> just keep what's in between those lines


In [200]:
filename = 'shakespeare-corpus.txt'
file = open(filename)
lines = file.readlines()
lines = lines[134:164924]


In [221]:
def mytext(lines):
    corpus = ''
    for line in lines:
        text = re.sub(r'\d+', '', line)
        text = re.sub('SCENE \S', '', text)
        text = re.sub('(\[_).*(_\])', '', text)
        text = re.sub(r'[\\[#$%*+—/<=>?{}|~@]+_', '', text)
        text = text.lower()
        corpus += text
    return corpus

%time len(mytext(lines))

CPU times: user 1.01 s, sys: 17.7 ms, total: 1.03 s
Wall time: 1.08 s


5521081

In [222]:
def mytext2(lines):
    text = ''.join(lines)
    text = re.sub(r'\d+', '', text)
    text = re.sub('SCENE \S', '', text)
    text = re.sub('(\[_).*(_\])', '', text)
    text = re.sub(r'[\\[#$%*+—/<=>?{}|~@]+_', '', text)
    text = text.lower()
    return text

%time len(mytext2(lines))


CPU times: user 294 ms, sys: 20.7 ms, total: 315 ms
Wall time: 330 ms


5521081