In [1]:
!cat ../README.md

# NLP with PyTorch

Learning from:

D. Rao & B. McMahan (2019) _Natural Language Processing with PyTorch_ (O'Reilly)

<a href="https://github.com/joosthub/PyTorchNLPBook">Book Repo</a>.


In [1]:
from   argparse import Namespace
from   collections import Counter
import json
import os
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from   torch.optim import Adam
from   torch.utils.data import Dataset, DataLoader

In [2]:
class Vocabulary:
    '''Class to process text and extract Vocabulary for mapping'''
    
    def __init__(
            self, token_to_idx=None, add_unk=True, unk_token='<UNK>'):
        '''
        Args:
          token_to_idx (dict): a pre-existing map of tokens to indices 
          add_unk (bool): flag indicating whether to add the UNK token
          unk_token (str): the UNK token to add to the Vocabulary
        '''
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {
            idx: token for token, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)
            
    def to_serializable(self):
        '''Returns a dict that can be serialized'''
        return {'token_to_idx': self._token_to_idx,
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}
    
    @classmethod
    def from_serializable(cls, contents):
        '''Instantiates the Vocabulary from a serialized dict'''
        return cls(**contents)
    
    def add_token(self, token):
        '''
        Update mapping dicts base on the token
        Args:
          token (str): the item to add to the Vocabulary
        Returns:
          index (int): the int corresponding to the token
        '''
        try:
            index = self._token_to_idx[token]
        except KeyError:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
    
    def add_many(self, tokens):
        '''
        Add a list of tokens int the Vocabulary
        Args:
          tokens (list<str>)
        Returns:
          indices (list<int>): indices for input tokens
        '''
        return [self.add_token(token) for token in tokens]
    
    def lookup_token(self, token):
        '''
        Retrieve the index associated with the token or the UNK index if
        token not present
        Args:
          token (str): the token to look up
        Returns:
          index (int): the index corresponding to the token
        Notes:
          <unk_index> must be >= 0 (having been added into the Vocabulary)
          for UNK functionality
        '''
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        return self._token_to_idx[token]
    
    def lookup_index(self, index):
        '''
        Return the token associated with the index
        Args:
          index (int): the index to look up
        Returns:
          token (str): the token corresponding to the index
        Raises:
          KeyError: if index not in Vocabulary
        '''
        if index not in self._idx_to_token:
            raise KeyError(f'the index {index} is not in the Vocabulary')
        return self._idx_to_token[index]
    
    def __str__(self):
        return f'<Vocabulary(size={len(self)})>'
    
    def __len__(self):
        return len(self._token_to_idx)

In [3]:
class SurnameVectorizer:
    '''
    The Vectorizer which coordinates the Vocabularies and puts them to use
    '''
    
    def __init__(self, surname_vocab, nationality_vocab):
        '''
        Args:
          surname_vocab (Vocabulary): maps characters to integers
          nationality_vocab (Vocabulary): maps nationalities to integers
        '''
        self.surname_vocab = surname_vocab
        self.nationality_vocab = nationality_vocab
        
    def vectorize(self, surname):
        '''
        Vectorize the provided surname
        Args:
          surname (str): surname
        Returns:
          one_hot (np.ndarray): a collapsed one-hot encoding
        '''
        vocab = self.surname_vocab
        one_hot = np.zeros(len(vocab), dtype=np.float32)
        for token in surname:
            one_hot[vocab.lookup_token(token)] = 1
        return one_hot
    
    @classmethod
    def from_dataframe(cls, surname_df):
        '''
        Instantiate the vectorizer from the data set DataFrame
        Args:
          surname_df (pandas.DataFrame): surnames data set
        Returns:
          instance of the SurnameVectorizer
        '''
        surname_vocab = Vocabulary(unk_token='@')
        nationality_vocab = Vocabulary(add_unk=False)
        for index, row in surname_df.iterrows():
            for letter in row.surname:
                surname_vocab.add_token(letter)
            nationality_vocab.add_token(row.nationality)
        return cls(surname_vocab, nationality_vocab)
    
    @classmethod
    def from_serializable(cls, contents):
        surname_vocab = Vocabulary.from_serializable(
            contents['surname_vocab'])
        nationality_vocab = Vocabulary.from_serializable(
            contents['nationality_vocab'])
        return cls(surname_vocab, nationality_vocab)
    
    def to_serializable(self):
        return {
            'surname_vocab': self.surname_vocab.to_serializable(),
            'nationality_vocab': self.nationality_vocab.to_serializable()}

In [29]:
### HERE ###
class SurnameDataset(Dataset):
    def __init__(self, surname_df, vectorizer):
        '''
        Args:
          surname_df (pandas.DataFrame): the data
          vectorizer (SurnameVectorizer): vectorizer instantiated from 
            data set
        '''
        self.surname_df = surname_df
        self._vectorizer = vectorizer
        self.train_df = self.surname_df[self.surname_df.split == 'train']
        self.val_df   = self.surname_df[self.surname_df.split == 'val']
        self.test_df  = self.surname_df[self.surname_df.split == 'test']
        self.train_size      = len(self.train_df)
        self.validation_size = len(self.val_df)
        self.test_size       = len(self.test_df)
        self._lookup_dict = {
            'train': (self.train_df, self.train_size),
            'val':   (self.val_df,   self.validation_size),
            'test':  (self.test_df,  self.test_size)}
        self.set_split('train')
        # Class weights
        class_counts = surname_df.nationality.value_counts().to_dict()
        
        def sort_key(item):
            return self._vectorizer.nationality_vocab.lookup_token(item[0])
        
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1. / torch.tensor(frequencies, 
                                               dtype=torch.float32)
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls, surname_csv):
        '''
        Load data set and make a new vectorizer form scratch
        Args:
          surname_csv (str): location of data set
        Returns:
          an instance of SurnameDataset
        '''
        surname_df = pd.read_csv(surname_csv)
        train_surname_df = surname_df[surname_df.split == 'train']
        return cls(surname_df, 
                   SurnameVectorizer.from_dataframe(train_surname_df))
    
    @classmethod
    def load_dataset_and_load_vectorizer(
            cls, surname_csv, vectorizer_filepath):
        '''
        Load data set and the corresponding vectorizer.  Used in the case
        that vectorizer has been cached for re-use
        Args:
          surname_csv (str): location of data set
          vectorizer_filepath (str): location of saved vectorizer
        Returns:
          an instance of SurnameDataset
        '''
        surname_df = pd.read_csv(surname_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(surname_df, vectorizer)
        
    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        '''
        Static method for loading the vectorizer from file
        Args:
          vectorizer_filepath (str): location of the serialized vectorizer
        Returns:
          an instance of SurnameVectorizer
        '''
        with open(vectorizer_file_path) as fp:
            return SurnameVectorizer.from_serializable(json.load(fp))
        
    def save_vectorizer(self, vectorizer_filepath):
        '''Saves the vectorizer to disk using json.
        Args:
          vectorizer_filepath (str): location to save vectorizer
        '''
        with open(vectorizer_filepath, 'w') as fp:
            json.dump(self._vectorizer.to_serializable(), fp)
            
    def get_vectorizer(self):
        return self._vectorizer
    
    def set_split(self, split='train'):
        '''Selects the splits in the data set using a column in the DF'''
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
        
    def __len__(self):
        return self._target_size
            
    def __getitem__(self, idx):
        row = self._target_df.iloc[idx]
        surname_vector = self._vectorizer.vectorize(row.surname)
        nationality_idx = self._vectorizer.nationality_vocab.lookup_token(
            row.nationality)
        return {'x_surname': surname_vector, 
                'y_nationality': nationality_idx}
    
    def get_num_batches(self, batch_size):
        '''Given a batch size, return number of batches in data set'''
        return len(self) // batch_size    

In [6]:
x_input = torch.rand(BATCH, INPUT)
describe(x_input)

Type: torch.FloatTensor
Shape: torch.Size([2, 3])
Values:
tensor([[2.4515e-01, 8.9136e-01, 3.9119e-04],
        [6.7195e-01, 5.7999e-01, 1.0281e-01]])


In [7]:
y_output = mlp(x_input, apply_softmax=False)
describe(y_output)

Type: torch.FloatTensor
Shape: torch.Size([2, 4])
Values:
tensor([[-0.0720,  0.1462, -0.3504, -0.0369],
        [-0.1236,  0.1536, -0.3663,  0.1226]], grad_fn=<AddmmBackward>)


In [8]:
y_output = mlp(x_input, apply_softmax=True)
describe(y_output)

Type: torch.FloatTensor
Shape: torch.Size([2, 4])
Values:
tensor([[0.2477, 0.3081, 0.1875, 0.2566],
        [0.2281, 0.3010, 0.1790, 0.2918]], grad_fn=<SoftmaxBackward>)


In [9]:
y_output.sum()

tensor(2., grad_fn=<SumBackward0>)

In [12]:
def generate_batches(
        dataset, batch_size, shuffle=True, drop_last=True, device='cpu'):
    '''
    Generator function that wraps the PyTorch DataLoader. Ensures each 
    tensor is on the write device location
    '''
    dataloader = DataLoader(dataset=dataset, 
                            batchsize=batch_size, 
                            shuffle=shuffle, 
                            drop_last=drop_last)
    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [31]:
class SurnameClassifier(nn.Module):
    '''A 2-layer multilayer perceptron for classifying surnames'''
    def __init__(self, input_dim, hidden_dim, output_dim):
        '''
        Args:
          input_dim (int): size of input vectors
          hiddden_dim (int): output size of first Linear layer
          output_dim (int): output size of second Linear layer
        '''
        super(SurnameClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x_in, apply_softmax=False):
        '''
        The forward pass
        Args:
          x_in (torch.Tensor): an input data tensor (batch, input_dim)
          apply_softmax (bool): should be False if used with the cross-
            entropy losses
        Returns:
          the resulting tensor (batch, output_dim)
        '''
        intermediate_vector = F.relu(self.fc1(x_in))
        prediction_vector = self.fc2(intermediate_vector)
        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector, dim=1)
        return prediction_vector
            

In [15]:
DATA_DIR = '../../data/surnames'

In [16]:
ls ../../data/surnames/

surnames.csv              surnames_with_splits.csv


In [20]:
#!cat ../../data/surnames/surnames.csv

In [18]:
args = Namespace(
    # Data and path info
    surname_csv = f'{DATA_DIR}/surnames_with_splits.csv',
    vectorizer_file = 'vectorizer.json',
    model_state_file = 'model.pth',
    save_dir = 'model_storage/surname_mlp',
    
    # Model hyperparams
    seed = 12345,
    epochs = 100,
    early_stopping_criteria = 5,
    learning_rate = 0.001,
    batch = 64
    
    # Additional runtime options...
)

In [35]:
dataset = SurnameDataset.load_dataset_and_make_vectorizer(
    args.surname_csv)
vectorizer = dataset.get_vectorizer()
classifier = SurnameClassifier(
    input_dim=len(vectorizer.surname_vocab),
    hidden_dim=HIDDEN,
    output_dim=len(vectorizer.nationality_vocab))
#classifier = classifier.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = Adam(classifier.parameters(), lr=args.learning_rate)

In [36]:
optimizer.zero_grad()
y_pred = classifier(batch_dict['x_surname'])
loss = loss_func(y_pred, batch_dict['y_nationality'])
loss_batch = lass.to('cpu').item()
running_loss += (loss_batch - running_loss) / (batch_index + 1)
loss.backward()
optimizer.step()

NameError: name 'batch_dict' is not defined