In [1]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import string
from argparse import Namespace
import pandas as pd
import torch.optim as optim

# Vectorize the data

In [2]:
# Vocabulary class, identical to the one used in chapter 3.

class Vocabulary(object):
    """ Class to process text and extract Vocabulary for mapping """

    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
            add_unk (book): a flag that indicates whether to add the UNK token
            unk_token (attr): the UNK token to add into the vocabulary
        """

        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx: token for token,idx in self._token_to_idx.items()}

        self._add_unk = add_unk
        self._unk_token = unk_token

        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def to_serializable(self):
        """ returns a dictionary that can be serialized """
        return {
            'token_to_idx':self._token_to_idx,
            'add_unk':self._add_unk,
            'unk_token':self._unk_token
        }
    @classmethod
    def from_serializable(cls, contents):
        """ instantiates the Vocabulary from a serialized dictionary """
        return cls(**contents)

    def add_token(self, token):
        """ Update mapping dicts based on the token
        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self,token):
        """ Retrieve the index associated with the token
            or the UNK index if token isn't present.

        Args:
            token (str): the token to look up
        Returns: 
            index (int): the index corresponding to the token
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary) 
            for the UNK functionality)   
        """
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]
        
    def lookup_index(self, index):
        """ Return the token associated with the index

        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary".format(index))
        return self._idx_to_token[index]
    
    def __str__(self):
        return "<Vocabulary(size=%d)>".format(len(self))
    
    def __len__(self):
        return len(self._token_to_idx)


In [3]:
class SurnameVectorizer(object):
    """ The Vectorizer which coordinates the Vocabularies and puts them to use"""
    def __init__(self,surname_vocab, nationality_vocab):
        """
        Args:
            surname_vocab (Vocabulary): maps words to integers
            rating_vocab (Vocabulary): maps class labels to integers
        """
        self.surname_vocab = surname_vocab
        self.nationality_vocab = nationality_vocab

    def vectorize(self,surname):
        """ Vectorize the provided surname

        Args:
            surname (str): the surname
        Returns:
            one_hot_matrix (np.ndarray): a matrix of one-hot vectors
        """
        one_hot_matrix_size = (len(self.character_vocab), self.max_surname_length)
        one_hot_matrix = np.zeros(one_hot_matrix_size, dtype=np.float32)
        
        for position_index, character in enumerate(surname):
            character_index = self.character_vocab.lookup_token(character)
            one_hot_matrix[character_index][position_index] = 1
        
        print("ONE HOT MATRIX ;)")
        print(one_hot_matrix)
        
        return one_hot_matrix
    
    @classmethod
    def from_dataframe(cls, surname_df):
        """ Instantiate the vectorizer from the dataset dataframe

        Args:
            surname_df (pandas.DataFrame): the review dataset
        Returns:
            an instance of the SurnameVectorizer
        """
        character_vocab = Vocabulary(unk_token="@")
        nationality_vocab = Vocabulary(add_unk=False)
        max_surname_length = 0
        
        for index, row in surname_df.iterrows():
            max_surname_length = max(max_surname_length, len(row.surname))
            for letter in row.surname:
                surname_vocab.add_token(letter)
            nationality_vocab.add_token(row.nationality)
        
        return cls(surname_vocab, nationality_vocab, max_surname_length)

    def to_serializable(self):
        """ Create the serializable dictionary for caching

        Returns:
            contents (dict): the serializable dictionary
        """
        return {
            'review_vocab': self.review_vocab.to_serializable(),
            'rating_vocab': self.rating_vocab.to_serializable()
        }

In [4]:
class SurnameDataset(Dataset):
    def __init__(self,surname_df,vectorizer):
        """
        Args:
            surname_df (pandas.DataFrame): the dataset
            vectorizer (ReviewVectorizer): vectorizer instantiated from dataset
        """
        self.surname_df = surname_df
        self.surname_vectorizer = vectorizer

        self.train_df = self.surname_df[self.surname_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.surname_df[self.surname_df.split=='val']
        self.val_size = len(self.val_df)

        self.test_df = self.surname_df[self.surname_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {
            'train': (self.train_df,self.train_size),
            'val': (self.val_df,self.val_size),
            'test': (self.test_df,self.test_size)
        }

        self.set_split('train')
        
        # Class weights
        class_counts = surname_df.nationality.value_counts().to_dict()
        def sort_key(item):
            return self.surname_vectorizer.nationality_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)

    @classmethod
    def load_dataset_and_make_vectorizer(cls, surname_csv):
        """
        Load dataset and make a new vectorizer from scratch

        Args:
            surname_csv (str): location of the dataset
        Returns:
            an instance of ReviewDataset
        """
        surname_df = pd.read_csv(surname_csv)
        return cls(surname_df,SurnameVectorizer.from_dataframe(surname_df))

    def get_vectorizer(self):
        """ returns the vectorizer """
        return self.surname_vectorizer

    def set_split(self, split="train"):
        """ selects the splits in the dataset using a solumn in the dataframe

        Args:
            split (str): one of "train","val", or "test"
        """

        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self,index):
        """the primary entry point method for PyTorch datasets

        Args:
            index (int): the index to the data point
        Returns:
            a dict of the data point's features (x_data) and label (y_targets)
        """
        row = self._target_df.iloc[index]

        surname_vector = \
            self.surname_vectorizer.vectorize(row.surname, self._max_seq_length)

        nationality_index = \
            self.surname_vectorizer.nationality_vocab.lookup_token(row.nationality)
        
        return {'x_surname':surname_vector,
                'y_nationality':nationality_index}

    def get_num_batches(self,batch_size):
        """Given a batch size, return the number of batches in the dataset

        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size

def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    """ A generator functino which wraps the PyTorch DataLoader. It will ensure
        each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

# Classifier

In [5]:
class SurnameClassifier(nn.Module):
    """ A 2-layer multilayer perceptron for classifying surnames """
    def __init__(self, initial_num_channels, num_classes, num_channels):
        """
        Args:
            input_dim (int): the size of the input vectors
            hidden_dim (int): the output size of the first Linear layer
            output_dim (int): the output size of the second Linear layer
        """
        super(SurnameClassifier, self).__init__()
        
        self.convnet = nn.Sequential(
            nn.Conv1d(in_channels=initial_num_channels, out_channels=num_channels, kernel_size=3),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, kernel_size=3),
            nn.ELU()
        )
        
        self.fc = nn.Linear(num_channels, num_classes)
        
    def forward(self, x_surname, apply_softmax=False):
        """
        
        Args:
            x_surname (torch.Tensor): an input data tensor
                x_surname.shape should be (batch,initial_num_channels, max_surname_length)
            apply_softmax (bool): a flag for the softmax activation
                should be false if used with the cross-entropy losses
        Returns:
            the resulting tensor. tensor.shape should be (batch, num_classes)
        
        """
        features = self.convnet(x_surname)
        #.squeeze(dim=2)
        prediction_vector = self.fc(features)
        
        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector, dim=1)
        
        return prediction_vector

# Training

In [6]:
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

In [7]:
def update_train_state(args, model, train_state):
    """Handle the training state updates.

    Components:
     - Early Stopping: Prevent overfitting.
     - Model Checkpoint: Model is saved if the model is better

    :param args: main arguments
    :param model: model to train
    :param train_state: a dictionary representing the training state values
    :returns:
        a new train_state
    """

    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # If loss worsened
        if loss_t >= train_state['early_stopping_best_val']:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

In [8]:
def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [9]:
args = Namespace(
    # Data and path information
    surname_csv="project_data/surnames/surnames_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="model_storage/ch4/surname_mlp",
    # Model hyper parameters
    hidden_dim = 300,
    # Training hyper parameters
    seed=1337,
    num_epochs=100,
    early_stopping_criteria=5,
    learning_rate=0.001,
    batch_size=64,
    # Runtime options omitted for space
)
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")

# The Training Loop

In [10]:
for epoch_index in range(args.num_epochs):
    train_state['epoch_index'] = epoch_index

    # Iterate over training dataset

    # setup: batch generator, set loss and acc to 0, set train mode on
    dataset.set_split('train')
    batch_generator = generate_batches(dataset,batch_size=args.batch_size,device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()

    for batch_index, batch_dict in enumerate(batch_generator):
        # The training routine in 5 steps:

        # ------------------------
        # step 1. zero the gradients
        optimizer.zero_grad()

        # step 2. compute the output
        print(batch_dict['x_surname'])
        y_pred = classifier(batch_dict['x_surname'])
        

        # step 3. compute the loss
        loss = loss_func(y_pred, batch_dict['y_nationality'])
        loss_batch = loss.to("cpu").item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)

        # step 4. use loss to produce gradients
        loss.backward()

        # step 5. use optimizer to take gradient step
        optimizer.step()
        
        # ------------------------------------
        # Compute the accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict['y_nationality'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)
    
    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)
    
    #Iterate over val dataset

    # setup: batch generator, set loss and acc to -, set eval mode on
    dataset.set_split('val')
    batch_generator = generate_batches(dataset, batch_size=args.batch_size,device=args.device)
    running_loss = 0.
    running_acc = 0.
    classifier.eval()

    for batch_index, batch_dict in enumerate(batch_generator):

        #step 1. Compute the output
        y_pred = classifier(x_in=batch_dict["x_surname"].float())

        # Step 2. compute the loss
        loss = loss_func(y_pred, batch_dict['y_nationality'])
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)

        # Step 3. compute the accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict['y_nationality'])
        running_acc += (acc_batch - running_acc) / (batch_index  + 1)

    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)
print("Training Complete")

NameError: name 'train_state' is not defined

# Test the Model

In [None]:
dataset.set_split('test')
batch_generator = generate_batches(dataset,batch_size=args.batch_size,device=args.device)
running_loss = 0.0
running_acc = 0.0
classifier.train()

for batch_index, batch_dict in enumerate(batch_generator):

    # compute the output
    y_pred = classifier(x_in=batch_dict['x_surname'].float())

    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_nationality'])
    loss_batch = loss.item()
    running_loss += (loss_batch - running_loss) / (batch_index + 1)

    # Compute the accuracy
    acc_batch = compute_accuracy(y_pred, batch_dict['y_nationality'])
    running_acc += (acc_batch - running_acc) / (batch_index + 1)

train_state['test_loss'].append(running_loss)
train_state['test_acc'].append(running_acc)

print(train_state['test_loss'])
print(train_state['test_acc'])
print("HD {} = {:.2f} and {:.2f}".format(args.hidden_dim,train_state['test_loss'][0],train_state['test_acc'][0]))