In [27]:
from argparse import Namespace

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset

In [2]:
class MultilayerPerceptron(nn.Module):
    '''A perceptron with one hidden layer: Linear -> ReLU -> Linear.'''

    def __init__(self, input_dim, hidden_dim, output_dim):
        '''
        Args:
          input_dim (int): width of each input vector
          hidden_dim (int): number of units in the single hidden layer
          output_dim (int): number of output units
        '''
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x_in, apply_softmax=False):
        '''
        Run the forward pass.
        Args:
          x_in (torch.Tensor): input of shape (batch, input_dim)
          apply_softmax (bool): when True, normalize the outputs with a
            softmax over dim 1; leave False when feeding a cross-entropy
            loss, which expects unnormalized scores
        Returns:
          tensor of shape (batch, output_dim)
        '''
        hidden = F.relu(self.fc1(x_in))
        scores = self.fc2(hidden)
        if not apply_softmax:
            return scores
        return F.softmax(scores, dim=1)

In [3]:
# Dimensions for a toy forward pass: batch size, input width, hidden width
# and number of outputs.
BATCH, INPUT, HIDDEN, OUTPUT = 2, 3, 100, 4

mlp = MultilayerPerceptron(input_dim=INPUT,
                           hidden_dim=HIDDEN,
                           output_dim=OUTPUT)
print(mlp)

MultilayerPerceptron(
  (fc1): Linear(in_features=3, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=4, bias=True)
)


In [4]:
def describe(x):
    '''Print the type string, the shape and the values of tensor `x`.'''
    print(f'Type: {x.type()}',
          f'Shape: {x.shape}',
          f'Values:\n{x}',
          sep='\n')

In [5]:
# A random demo batch with BATCH rows and INPUT columns.
input_shape = (BATCH, INPUT)
x_input = torch.rand(input_shape)
describe(x_input)

Type: torch.FloatTensor
Shape: torch.Size([2, 3])
Values:
tensor([[0.4828, 0.5813, 0.1405],
        [0.1064, 0.2033, 0.3411]])


In [6]:
# Forward pass without softmax: the values are the raw outputs of the second
# Linear layer, so they may be negative and rows need not sum to 1.
y_output = mlp(x_input, apply_softmax=False)
describe(y_output)

Type: torch.FloatTensor
Shape: torch.Size([2, 4])
Values:
tensor([[ 0.0281,  0.1006, -0.2076, -0.2045],
        [-0.0256,  0.0311, -0.2647, -0.1446]], grad_fn=<ThAddmmBackward>)


In [7]:
# Same forward pass with softmax applied over dim 1, which normalizes each
# row of the output into a probability distribution.
y_output = mlp(x_input, apply_softmax=True)
describe(y_output)

Type: torch.FloatTensor
Shape: torch.Size([2, 4])
Values:
tensor([[0.2734, 0.2940, 0.2160, 0.2167],
        [0.2679, 0.2835, 0.2109, 0.2378]], grad_fn=<SoftmaxBackward>)


In [8]:
# Each softmax row sums to 1, so the grand total equals the number of rows.
torch.sum(y_output)

tensor(2., grad_fn=<SumBackward0>)

In [9]:
class SurnameDataset(Dataset):
    def __getitem__(self, idx):
        '''
        Fetch one data point by position.
        Args:
          idx (int): positional index into the target DataFrame
        Returns:
          dict with the vectorized surname under 'x_surname' and the
          nationality index under 'y_nationality'
        '''
        record = self._target_df.iloc[idx]
        vectorizer = self._vectorizer
        features = vectorizer.vectorize(record.surname)
        label = vectorizer.nationality_vocab.lookup_token(record.nationality)
        return {'x_surname': features, 'y_nationality': label}

In [10]:
class Vocabulary(object):
    '''Class to process text and extract Vocabulary for mapping'''
    def __init__(self, token_to_idx=None, add_unk=True, unk_token='<UNK>'):
        '''
        Args:
          token_to_idx (dict): a pre-existing map of tokens to indices
          add_unk (bool): flag indicating whether to add the UNK token
          unk_token (str): the UNK token to add to the Vocabulary
        '''
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        # Reverse map, kept in sync with _token_to_idx by add_token.
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token
        # -1 means "no UNK token"; replaced by a real index when add_unk is set.
        self.unk_index = -1
        if add_unk:
            # add_token returns the existing index if the UNK token is
            # already in a pre-populated map (e.g. after deserialization).
            self.unk_index = self.add_token(unk_token)

    def to_serializable(self):
        '''Returns a dict that can be serialized'''
        return {'token_to_idx': self._token_to_idx,
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        '''
        Instantiates the Vocabulary from a serialized dict
        Args:
          contents (dict): keyword arguments for the constructor, as
            produced by to_serializable
        '''
        return cls(**contents)

    def add_token(self, token):
        '''
        Update mapping dicts based on the token
        Args:
          token (str): the item to add to the Vocabulary
        Returns:
          index (int): the int corresponding to the token
        '''
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            # New tokens get the next free index.
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        '''
        Retrieve the index associated with the token or the UNK index if
        token not present
        Args:
          token (str): the token to look up
        Returns:
          index (int): the index corresponding to the token
        Raises:
          KeyError: if the token is absent and the Vocabulary was built
            without an UNK token
        '''
        # The flag is stored as the private attribute `_add_unk`; there is
        # no public `add_unk` attribute on the instance.
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        return self._token_to_idx[token]

    def lookup_index(self, index):
        '''
        Return the token associated with the index
        Args:
          index (int): the index to look up
        Returns:
          token (str): the token corresponding to the index
        Raises:
          KeyError: if index not in Vocabulary
        '''
        if index not in self._idx_to_token:
            raise KeyError(f'the index {index} is not in the Vocabulary')
        return self._idx_to_token[index]

    def __str__(self):
        return f'<Vocabulary(size={len(self)})>'

    def __len__(self):
        return len(self._token_to_idx)

In [11]:
class SurnameVectorizer(object):
    '''
    The Vectorizer which coordinates the Vocabularies and puts them to use
    '''
    def __init__(self, surname_vocab, nationality_vocab):
        '''
        Args:
          surname_vocab (Vocabulary): maps surname characters to indices
          nationality_vocab (Vocabulary): maps nationalities to indices
        '''
        self.surname_vocab = surname_vocab
        self.nationality_vocab = nationality_vocab

    def vectorize(self, surname):
        '''
        Vectorize the provided surname
        Args:
          surname (str): surname
        Returns:
          one_hot (np.ndarray): a collapsed one-hot encoding; a float32
            vector of length len(surname_vocab) holding 1 at the index of
            every character that occurs in the surname (repeats and
            character order are not represented)
        '''
        vocab = self.surname_vocab
        one_hot = np.zeros(len(vocab), dtype=np.float32)
        for token in surname:
            one_hot[vocab.lookup_token(token)] = 1
        return one_hot

    @classmethod
    def from_dataframe(cls, surname_df):
        '''
        Instantiate the vectorizer from the data set DataFrame
        Args:
          surname_df (pandas.DataFrame): surnames data set; its `surname`
            and `nationality` columns are read
        Returns:
          instance of the SurnameVectorizer
        '''
        surname_vocab = Vocabulary(unk_token='@')
        nationality_vocab = Vocabulary(add_unk=False)
        # Walk the two columns in lockstep instead of materializing a
        # Series per row with iterrows(); tokens are added in the same order.
        for surname, nationality in zip(surname_df['surname'],
                                        surname_df['nationality']):
            for letter in surname:
                surname_vocab.add_token(letter)
            nationality_vocab.add_token(nationality)
        return cls(surname_vocab, nationality_vocab)

In [12]:
class SurnameClassifier(nn.Module):
    '''A 2-layer multilayer perceptron for classifying surnames'''
    def __init__(self, input_dim, hidden_dim, output_dim):
        '''
        Args:
          input_dim (int): size of input vectors
          hidden_dim (int): output size of first Linear layer
          output_dim (int): output size of second Linear layer
        '''
        super(SurnameClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x_in, apply_softmax=False):
        '''
        The forward pass
        Args:
          x_in (torch.Tensor): an input data tensor (batch, input_dim)
          apply_softmax (bool): should be False if used with the cross-
            entropy losses
        Returns:
          the resulting tensor (batch, output_dim)
        '''
        intermediate_vector = F.relu(self.fc1(x_in))
        prediction_vector = self.fc2(intermediate_vector)
        if apply_softmax:
            # Normalize across the class dimension so each row sums to 1.
            prediction_vector = F.softmax(prediction_vector, dim=1)
        return prediction_vector
            

In [18]:
# Directory holding the surnames CSV files, relative to the notebook's
# working directory.
DATA_DIR = '../../data/surnames'

In [20]:
ls ../../data/surnames/

surnames.csv              surnames_with_splits.csv


In [28]:
# Experiment configuration gathered in one place so later cells can read
# settings as attributes (args.learning_rate, args.batch, ...).
args = Namespace(
    # Data and path info
    surname_csv=f'{DATA_DIR}/surnames_with_splits.csv',
    vectorizer_file='vectorizer.json',
    model_state_file='model.pth',
    save_dir='model_storage/surname_mlp',

    # Model hyperparams
    seed=12345,
    epochs=100,
    early_stopping_criteria=5,
    learning_rate=0.001,
    batch=64,

    # Additional runtime options...
)