In [None]:
#referring NLP pytorch oreilly page no. 125
import numpy as np
import pandas as pd
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

In [None]:
from numpy.lib.shape_base import split

class CBOWDataset(Dataset):
  def __init__(self, cbow_df, vectorizer):
    """
    Args:
    review_df (pandas.DataFrame): the dataset
    vectorizer (ReviewVectorizer): vectorizer instantiated from dataset
    """
    self.cbow_df = cbow_df
    self._vectorizer = vectorizer

    measure_len = lambda context : len(context.split(" "))
    self._max_seq_length = max(map(measure_len, cbow_df.context))

    self.train_df = self.cbow_df[self.cbow_df.split=='train']
    self.train_size = len(self.train_df)
    
    self.val_df = self.cbow_df[self.cbow_df.split=='val']
    self.validation_size = len(self.val_df)
    
    self.test_df = self.cbow_df[self.cbow_df.split=='test']
    self.test_size = len(self.test_df)
    
    self._lookup_dict = {'train': (self.train_df, self.train_size),
                        'val': (self.val_df, self.validation_size),
                        'test': (self.test_df, self.test_size)}
    self.set_split('train')

  @classmethod
  def load_dataset_and_make_vectorizer(cls,cbow_csv):
    '''
    Args : Location of dataset
    Return : an instance of CBOWDataset

    '''
    cbow_df = pd.read_csv(cbow_csv)
    train_cbow_df = cbow_df[cbow_df.split=='train'][:50]
    print()
    print('we are in dataset class ')
    print('from Dataset class it goes to Vectorizer ')
    return cls(cbow_df,CBOWVectorizer.from_dataframe(train_cbow_df))
  
  def get_vectorizer(self):
    """ returns the vectorizer """
    return self._vectorizer
 
  def save_vectorizer(self, vectorizer_filepath):
      """saves the vectorizer to disk using json
      
      Args:
          vectorizer_filepath (str): the location to save the vectorizer
      """
      with open(vectorizer_filepath, "w") as fp:
          json.dump(self._vectorizer.to_serializable(), fp) 

  def set_split(self,split='train'):
    self._target_split = split
    self._target_df,self._target_size = self._lookup_dict[split]
    print('set split function is called and traget_df is set')
            
  def __len__(self):
    return self._target_size

  def __getitem__(self,index):
    '''
    Args :
        index: index of datapoitn
    Return : a dict with xdata nd label

    '''
    print('__getitem is getting called now.')
    row = self._target_df.iloc[index]
    context_vector = self._vectorizer.vectorize(row.context,self._max_seq_length)
    target_index = self._vectorizer.cbow_vocab.lookup_token(row.target)

    return {'x_data':context_vector,'y_target':target_index}

  def get_num_batches(self,batch_size):
   
    return len(self)//batch_size

def generate_batches(dataset, batch_size, shuffle=True,
                    drop_last=True, device="cpu"): 
  """
  A generator function which wraps the PyTorch DataLoader. It will 
    ensure each tensor is on the write device location.
  """
  dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                          shuffle=shuffle, drop_last=drop_last)

  for data_dict in dataloader:
      out_data_dict = {}
      for name, tensor in data_dict.items():
          out_data_dict[name] = data_dict[name].to(device)
      yield out_data_dict

**Vocabulary**

In [None]:
class Vocabulary(object):
    """Class to process text and extract vocabulary for mapping"""

    def __init__(self, token_to_idx=None, mask_token="<MASK>", add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
            mask_token (str): the MASK token to add into the Vocabulary; indicates
                a position that will not be used in updating the model's parameters
            add_unk (bool): a flag that indicates whether to add the UNK token
            unk_token (str): the UNK token to add into the Vocabulary
            
        """
        print('initializing all variables.')
        if token_to_idx is None:
          print('first time token_to_idx is None and hence it is {}')
          token_to_idx = {}
        self._token_to_idx = token_to_idx
        print(f'token_to_idx  {self._token_to_idx}')

        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}
        
        self._add_unk = add_unk
        self._unk_token = unk_token
        self._mask_token = mask_token

        print(f'mask_token  {self._mask_token}')
        self.mask_index = self.add_token(self._mask_token)
        print(f'mask_index  {self.mask_index }')
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token) 
        
    def to_serializable(self):
        """ returns a dictionary that can be serialized """
        return {'token_to_idx': self._token_to_idx, 
                'add_unk': self._add_unk, 
                'unk_token': self._unk_token, 
                'mask_token': self._mask_token}

    @classmethod
    def from_serializable(cls, contents):
        """ instantiates the Vocabulary from a serialized dictionary """
        return cls(**contents)

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        print(f' before adding token {token} , curret  token_to_id is {self._token_to_idx} ')
        if token in self._token_to_idx:
          print('looks like token is already in dic')
          index = self._token_to_idx[token]
        else:
          print(f'--token not in idx')
          index = len(self._token_to_idx)
          self._token_to_idx[token] = index
          self._idx_to_token[index] = token
        return index
            
    def add_many(self, tokens):
        """Add a list of tokens into the Vocabulary
        
        Args:
            tokens (list): a list of string tokens
        Returns:
            indices (list): a list of indices corresponding to the tokens
        """
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        """Retrieve the index associated with the token 
          or the UNK index if token isn't present.
        
        Args:
            token (str): the token to look up 
        Returns:
            index (int): the index corresponding to the token
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary) 
              for the UNK functionality 
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index
        
        Args: 
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

In [None]:
class CBOWVectorizer(object):
    """ The Vectorizer which coordinates the Vocabularies and puts them to use"""    
    def __init__(self, cbow_vocab):
        """
        Args:
            cbow_vocab (Vocabulary): maps words to integers
        """
        self.cbow_vocab = cbow_vocab

    def vectorize(self, context, vector_length=-1):
        """
        Args:
            context (str): the string of words separated by a space
            vector_length (int): an argument for forcing the length of index vector
        """

        indices = [self.cbow_vocab.lookup_token(token) for token in context.split(' ')]
        if vector_length < 0:
            vector_length = len(indices)

        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.cbow_vocab.mask_index

        return out_vector
    
    @classmethod
    def from_dataframe(cls, cbow_df):
        """Instantiate the vectorizer from the dataset dataframe
        
        Args:
            cbow_df (pandas.DataFrame): the target dataset
        Returns:
            an instance of the CBOWVectorizer
        """
        print('Vocab class instance is instatntiated, it goes to Vocab class')
        cbow_vocab = Vocabulary()
        print('In Vectorizer class adding tokens')
        print("---->")
        for index, row in cbow_df.iterrows():
            for token in row.context.split(' '):
              print(f'for each token in row we call ad_token {token}')
              print('-->goes to add_token function in Vocab')
              cbow_vocab.add_token(token)
            print('adding target token {row.target}')  
            cbow_vocab.add_token(row.target)
            
        return cls(cbow_vocab)

    @classmethod
    def from_serializable(cls, contents):
        cbow_vocab = \
            Vocabulary.from_serializable(contents['cbow_vocab'])
        return cls(cbow_vocab=cbow_vocab)

    def to_serializable(self):
        return {'cbow_vocab': self.cbow_vocab.to_serializable()}

In [None]:
# ind = [1,2,3]
# vl = len(ind)
# ov = np.zeros(vl,dtype=np.int64)
# print(ov)
# ov[:len(ind)] =ind
# print(ov)

[0 0 0]
[1 2 3]


**The Model : CBOW**

In this example, x is a tensor of shape (batch_size, seq_length) containing the word IDs for each input sequence in the batch. We pass x through the embedding layer to obtain a tensor of shape (batch_size, seq_length, embedding_dim) containing the dense embeddings for each word. We then sum the embeddings along the sequence dimension (dim=1) to obtain a tensor of shape (batch_size, embedding_dim) representing the entire input sequence. Finally, we pass this tensor through a linear layer with a single output node (self.fc1) and a sigmoid activation function to obtain the predicted sentiment score (y_out).

In [None]:
class CBOWClassifier(nn.Module):
  def __init__(self,vocab_size,emb_size,padding_idx = 0):

    '''
    Args :
        vocab_size : no of vocbulary
        emb_size : size of embeddings
        padding_idx = default = 0: Embedding will not use this index
    '''

    super(CBOWClassifier,self).__init__()
    self.embedding = nn.Embedding(num_embeddings = vocab_size,embedding_dim = emb_size,padding_idx = padding_idx)

    self.fc1 = nn.Linear(in_features=emb_size,out_features=vocab_size)


  def forward(self,x_in,apply_softmax = False):
    x_embed = self.embedding(x_in)
    x_embed_sum = x_embed.sum(dim=1)
    x_dropout = F.dropout(x_embed_sum,0.3)
    y_out = self.fc1(x_dropout)
    
    if apply_softmax:
            y_out = F.softmax(y_out, dim=1)
            
    return y_out

**Helper Function**

In [None]:
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):
    """Handle the training state updates.

    Components:
     - Early Stopping: Prevent overfitting.
     - Model Checkpoint: Model is saved if the model is better

    :param args: main arguments
    :param model: model to train
    :param train_state: a dictionary representing the training state values
    :returns:
        a new train_state
    """

    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # If loss worsened
        if loss_t >= train_state['early_stopping_best_val']:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [None]:
import os
from argparse import Namespace
from collections import Counter
import json
import re
import string

In [None]:

def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

In [None]:
args = Namespace(
    # Data and Path information
    cbow_csv="/content/frankenstein_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="/content/drive/MyDrive/pytorch/model_storage/ch5/cbow",
    # Model hyper parameters
    embedding_size=50,
    # Training hyper parameters
    seed=1337,
    num_epochs=1,
    learning_rate=0.0001,
    batch_size=32,
    early_stopping_criteria=3,
    # Runtime options
    cuda=True,
    catch_keyboard_interrupt=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
)

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))
    

# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")
    
print("Using CUDA: {}".format(args.cuda))


# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	/content/drive/MyDrive/pytorch/model_storage/ch5/cbow/vectorizer.json
	/content/drive/MyDrive/pytorch/model_storage/ch5/cbow/model.pth
Using CUDA: False


**Inititalizations**

In [None]:
print("Loading dataset and creating vectorizer")
dataset = CBOWDataset.load_dataset_and_make_vectorizer(args.cbow_csv)
dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()
print()
classifier = CBOWClassifier(vocab_size=len(vectorizer.cbow_vocab), 
                            emb_size=args.embedding_size)
                          

Loading dataset and creating vectorizer

we are in dataset class 
from Dataset class it goes to Vectorizer 
Vocab class instance is instatntiated, it goes to Vocab class
initializing all variables.
first time token_to_idx is None and hence it is {}
token_to_idx  {}
mask_token  <MASK>
 before adding token <MASK> , curret  token_to_id is {} 
--token not in idx
mask_index  0
 before adding token <UNK> , curret  token_to_id is {'<MASK>': 0} 
--token not in idx
In Vectorizer class adding tokens
---->
for each token in row we call ad_token ,
-->goes to add_token function in Vocab
 before adding token , , curret  token_to_id is {'<MASK>': 0, '<UNK>': 1} 
--token not in idx
for each token in row we call ad_token or
-->goes to add_token function in Vocab
 before adding token or , curret  token_to_id is {'<MASK>': 0, '<UNK>': 1, ',': 2} 
--token not in idx
for each token in row we call ad_token the
-->goes to add_token function in Vocab
 before adding token the , curret  token_to_id is {'<MASK>'

In [None]:
print(len(vectorizer.cbow_vocab))

44


In [None]:
dataset.set_split('train')

set split function is called and traget_df is set


In [None]:
pd.read_csv("/content/frankenstein_with_splits.csv")

Unnamed: 0,context,target,split
0,", or the",frankenstein,train
1,frankenstein or the modern,",",train
2,"frankenstein , the modern prometheus",or,train
3,"frankenstein , or modern prometheus by",the,train
4,", or the prometheus by mary",modern,train
...,...,...,...
90693,newsletter to hear new ebooks .,about,test
90694,to hear about ebooks .,new,test
90695,hear about new .,ebooks,test
90696,about new ebooks,.,test


In [None]:
classifier = classifier.to(args.device)
    
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 mode='min', factor=0.5,
                                                 patience=1)
print('train_staet is intiliaaaaazws')
train_state = make_train_state(args)

epoch_bar = tqdm_notebook(desc='training routine', 
                          total=args.num_epochs,
                          position=0)
print('-------dataset is plit into train and ready for bacths')
dataset.set_split('train')
train_bar = tqdm_notebook(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
dataset.set_split('val')
val_bar = tqdm_notebook(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)

try:
    for epoch_index in range(args.num_epochs):
      
      train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on

      dataset.set_split('train')
      print('-------dataset is plit into train and ready for bacths')
      batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
      print(f'batch_generator {list(batch_generator)}')
      running_loss = 0.0
      running_acc = 0.0
      classifier.train()

#       for batch_index, batch_dict in enumerate(batch_generator):
#           # the training routine is these 5 steps:

#           # --------------------------------------
#           # step 1. zero the gradients
#           optimizer.zero_grad()

#           # step 2. compute the output
#           y_pred = classifier(x_in=batch_dict['x_data'])

#           # step 3. compute the loss
#           loss = loss_func(y_pred, batch_dict['y_target'])
#           loss_t = loss.item()
#           running_loss += (loss_t - running_loss) / (batch_index + 1)

#           # step 4. use loss to produce gradients
#           loss.backward()

#           # step 5. use optimizer to take gradient step
#           optimizer.step()
#           # -----------------------------------------
#           # compute the accuracy
#           acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
#           running_acc += (acc_t - running_acc) / (batch_index + 1)

#           # update bar
#           train_bar.set_postfix(loss=running_loss, acc=running_acc, 
#                           epoch=epoch_index)
#           train_bar.update()

#       train_state['train_loss'].append(running_loss)
#       train_state['train_acc'].append(running_acc)

#       # Iterate over val dataset

#       # setup: batch generator, set loss and acc to 0; set eval mode on
#       dataset.set_split('val')
#       batch_generator = generate_batches(dataset, 
#                                           batch_size=args.batch_size, 
#                                           device=args.device)
#       running_loss = 0.
#       running_acc = 0.
#       classifier.eval()

#       for batch_index, batch_dict in enumerate(batch_generator):

#           # compute the output
#           y_pred =  classifier(x_in=batch_dict['x_data'])

#           # step 3. compute the loss
#           loss = loss_func(y_pred, batch_dict['y_target'])
#           loss_t = loss.item()
#           running_loss += (loss_t - running_loss) / (batch_index + 1)

#           # compute the accuracy
#           acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
#           running_acc += (acc_t - running_acc) / (batch_index + 1)
#           val_bar.set_postfix(loss=running_loss, acc=running_acc, 
#                           epoch=epoch_index)
#           val_bar.update()

#       train_state['val_loss'].append(running_loss)
#       train_state['val_acc'].append(running_acc)

#       train_state = update_train_state(args=args, model=classifier,
#                                         train_state=train_state)

#       scheduler.step(train_state['val_loss'][-1])

#       if train_state['stop_early']:
#           break

#       train_bar.n = 0
#       val_bar.n = 0
#       epoch_bar.update()
except KeyboardInterrupt:
   print("Exiting loop")


train_staet is intiliaaaaazws


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  epoch_bar = tqdm_notebook(desc='training routine',


training routine:   0%|          | 0/1 [00:00<?, ?it/s]

-------dataset is plit into train and ready for bacths
set split function is called and traget_df is set


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  train_bar = tqdm_notebook(desc='split=train',


split=train:   0%|          | 0/1984 [00:00<?, ?it/s]

set split function is called and traget_df is set


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  val_bar = tqdm_notebook(desc='split=val',


split=val:   0%|          | 0/425 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        [ 1, 33,  1,  1,  1,  1],
        [ 1,  8, 36,  1,  1,  4],
        [ 1, 33,  4,  2,  1,  4],
        [ 1,  1,  1, 39,  1,  1],
        [ 1,  1,  1,  1, 43,  0],
        [ 1,  1,  1,  1,  1,  1],
        [ 1,  1,  1,  1,  1,  1]]), 'y_target': tensor([ 1,  1,  1,  1,  1, 15,  4,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  2,
         1,  1,  1,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])}, {'x_data': tensor([[ 1,  1, 19,  1,  0,  0],
        [ 1,  2,  1,  4,  1,  1],
        [ 1,  2, 19, 36,  1,  1],
        [ 1,  1,  1, 19,  1, 23],
        [ 1,  1,  1,  1,  4,  1],
        [ 1,  1, 15,  0,  0,  0],
        [ 1,  1,  1, 27,  4,  1],
        [ 1,  1,  1,  1,  1, 15],
        [ 1,  1,  4,  1, 15, 43],
        [ 1,  1,  1,  1,  2,  1],
        [ 1,  1,  1,  1, 33,  4],
        [ 1,  1,  1, 43,  0,  0],
        [ 2,  4,  1, 27,  1,  1],
        [ 1,  1,  2,  3,  2,  1],
        [ 1,  1, 33,  1,  1,  1],
        [ 1,  1

In [None]:
# compute the loss & accuracy on the test set using the best available model

classifier.load_state_dict(torch.load(train_state['model_filename']))
classifier = classifier.to(args.device)
loss_func = nn.CrossEntropyLoss()

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred =  classifier(x_in=batch_dict['x_data'])
    
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem is getting called now.
__getitem i

In [None]:
print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

Test loss: 7.7635532704521655;
Test Accuracy: 10.338235294117647


In [None]:
def pretty_print(results):
    """
    Pretty print embedding results.
    """
    for item in results:
        print ("...[%.2f] - %s"%(item[1], item[0]))

def get_closest(target_word, word_to_idx, embeddings, n=5):
    """
    Get the n closest
    words to your word.
    """

    # Calculate distances to all other words
    
    word_embedding = embeddings[word_to_idx[target_word.lower()]]
    distances = []
    for word, index in word_to_idx.items():
        if word == "<MASK>" or word == target_word:
            continue
        distances.append((word, torch.dist(word_embedding, embeddings[index])))
    
    results = sorted(distances, key=lambda x: x[1])[1:n+2]
    return results


In [None]:
word = input('Enter a word: ')
embeddings = classifier.embedding.weight.data  #44,50
word_to_idx = vectorizer.cbow_vocab._token_to_idx
pretty_print(get_closest(word, word_to_idx, embeddings, n=5))

Enter a word: ,
...[8.93] - letter
...[9.07] - which
...[9.16] - that
...[9.20] - such
...[9.25] - of
...[9.37] - st


In [None]:
vectorizer.cbow_vocab._token_to_idx

{'<MASK>': 0,
 '<UNK>': 1,
 ',': 2,
 'or': 3,
 'the': 4,
 'frankenstein': 5,
 'modern': 6,
 'prometheus': 7,
 'by': 8,
 'mary': 9,
 'wollstonecraft': 10,
 'godwin': 11,
 'shelley': 12,
 'letter': 13,
 'st': 14,
 '.': 15,
 'petersburgh': 16,
 'dec': 17,
 'th': 18,
 'to': 19,
 'mrs': 20,
 'saville': 21,
 'england': 22,
 'you': 23,
 'will': 24,
 'rejoice': 25,
 'hear': 26,
 'that': 27,
 'no': 28,
 'disaster': 29,
 'has': 30,
 'accompanied': 31,
 'commencement': 32,
 'of': 33,
 'an': 34,
 'enterprise': 35,
 'which': 36,
 'have': 37,
 'regarded': 38,
 'with': 39,
 'such': 40,
 'evil': 41,
 'forebodings': 42,
 '': 43}

In [None]:
classifier.embedding.weight.data.shape

torch.Size([44, 50])

In [None]:
target_words = ['frankenstein', 'monster', 'science', 'sickness', 'lonely', 'happy']

embeddings = classifier.embedding.weight.data
word_to_idx = vectorizer.cbow_vocab._token_to_idx

for target_word in target_words: 
    print(f"======={target_word}=======")
    if target_word not in word_to_idx:
        print("Not in vocabulary")
        continue
    pretty_print(get_closest(target_word, word_to_idx, embeddings, n=5))

...[7.45] - irradiated
...[7.79] - shrivelled
...[7.82] - gush
...[7.83] - enslaved
...[7.84] - men
...[7.85] - liable
...[7.83] - saw
...[7.91] - kid
...[7.94] - cares
...[7.95] - ultimately
...[7.95] - truly
...[7.98] - confused
...[7.06] - impression
...[7.08] - mutual
...[7.14] - mist
...[7.17] - darkened
...[7.25] - swelling
...[7.37] - tempted
...[6.30] - while
...[6.56] - literally
...[6.61] - probabilities
...[6.62] - foundations
...[6.65] - awoke
...[6.69] - consoles
...[6.92] - moonlight
...[6.94] - unveiled
...[7.16] - ought
...[7.24] - heartily
...[7.25] - bed
...[7.30] - orb
...[6.42] - bottom
...[6.50] - injury
...[6.60] - chimney
...[6.62] - chivalry
...[6.62] - evening
...[6.63] - lingered
