In [1]:
import random
import os
import sys
import logging

import numpy as np


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tnrange

%matplotlib inline
from IPython.core.display import display, HTML, Image
# inject a CSS rule that sets elements with class "container" to 100% width
# (presumably to widen the classic notebook page — confirm it still applies in the front end in use)
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
class DataLoader(object):
    """
    Loads the vocabulary and tag map from data_dir, converts tokenised sentences and their tags to
    index lists, and serves them as padded mini-batches of torch tensors.
    """

    def __init__(self, data_dir, args):
        """
        Args:
            data_dir: (string) directory containing 'words.txt' and 'tags.txt' (one entry per line)
            args: (dict) must provide 'unk_word' and 'pad_word'; both must be entries of words.txt,
                  otherwise a KeyError is raised
        """
        # loading vocab (we require this to map words to their indices)
        self.vocab = self._load_index_map(os.path.join(data_dir, 'words.txt'))

        # setting the indices for UNKnown words and PADding symbols
        self.unk_ind = self.vocab[args['unk_word']]
        self.pad_ind = self.vocab[args['pad_word']]

        # loading tags (we require this to map tags to their indices)
        self.tag_map = self._load_index_map(os.path.join(data_dir, 'tags.txt'))

    @staticmethod
    def _load_index_map(path):
        """Reads a utf-8 file with one entry per line and maps each entry to its line index."""
        with open(path, encoding='utf-8') as f:
            return {entry: i for i, entry in enumerate(f.read().splitlines())}

    def load_sentences_labels(self, sentences_file, labels_file, d):
        """
        Loads sentences and labels from their corresponding files. Maps tokens and tags to their indices and stores
        them in the provided dict d under the keys 'data', 'labels' and 'size'.

        Args:
            sentences_file: (string) file with one sentence per line, tokens space-separated
            labels_file: (string) file with the space-separated tags for the sentences in sentences_file
            d: (dict) a dictionary in which the loaded data is stored

        Raises:
            KeyError: if a tag is not present in the tag map
            AssertionError: if the two files disagree on the number of sentences or tokens
        """
        with open(sentences_file, encoding='utf-8') as f:
            # replace each token by its index if it is in vocab, else use the index of the UNK word
            sentences = [[self.vocab.get(token, self.unk_ind) for token in line.split(' ')]
                         for line in f.read().splitlines()]

        with open(labels_file, encoding='utf-8') as f:
            # replace each label by its index
            labels = [[self.tag_map[tag] for tag in line.split(' ')]
                      for line in f.read().splitlines()]

        # checks to ensure there is a tag for each token
        assert len(labels) == len(sentences), "sentences and labels files differ in number of lines"
        for i in range(len(labels)):
            assert len(labels[i]) == len(sentences[i]), \
                "sentence {} has a different number of tokens and tags".format(i)

        # storing sentences and labels in dict d
        d['data'] = sentences
        d['labels'] = labels
        d['size'] = len(sentences)

    def load_data(self, types, data_dir):
        """
        Loads the data for each type in types from data_dir.

        Args:
            types: (list) has one or more of 'train', 'val', 'test' depending on which data is required
            data_dir: (string) directory containing the dataset; each split is read from
                      <data_dir>/<split>/sentences.txt and <data_dir>/<split>/labels.txt

        Returns:
            data: (dict) contains the data with labels for each type in types

        """
        data = {}

        for split in ['train', 'val', 'test']:
            if split in types:
                sentences_file = os.path.join(data_dir, split, "sentences.txt")
                labels_file = os.path.join(data_dir, split, "labels.txt")
                data[split] = {}
                self.load_sentences_labels(sentences_file, labels_file, data[split])

        return data

    def data_iterator(self, data, args, shuffle=False):
        """
        Returns a generator that yields batches of data with labels. Batch size is args['batch_size']. Expires
        after one pass over the data.

        Args:
            data: (dict) contains data which has keys 'data', 'labels' and 'size'
            args: (dict) hyperparameters of the training process; 'batch_size' and 'cuda' are read here
            shuffle: (bool) whether the data should be shuffled. The shuffle uses a fixed seed (230), so the
                     order is the same on every call.

        Yields:
            batch_data: LongTensor of dimension batch_size x seq_len with the sentence data, padded with pad_ind
            batch_labels: LongTensor of dimension batch_size x seq_len with the corresponding labels,
                          padded with -1

        """

        # make a list that decides the order in which we go over the data- this avoids explicit shuffling of data
        order = list(range(data['size']))
        if shuffle:
            # a private generator yields the same permutation as random.seed(230) + random.shuffle,
            # without resetting the state of the global random module as a side effect
            random.Random(230).shuffle(order)

        batch_size = args['batch_size']

        # one pass over data
        for i in range((data['size'] + 1) // batch_size):
            batch_idx = order[i * batch_size:(i + 1) * batch_size]

            # with batch_size == 1 the range above schedules one batch more than there is data;
            # stop instead of failing on max() of an empty batch
            if not batch_idx:
                break

            # fetch sentences and tags
            batch_sentences = [data['data'][idx] for idx in batch_idx]
            batch_tags = [data['labels'][idx] for idx in batch_idx]

            # compute length of longest sentence in batch
            batch_max_len = max(len(s) for s in batch_sentences)

            # prepare integer arrays, initialising the data with pad_ind and all labels with -1
            # initialising labels to -1 differentiates tokens with tags from PADding tokens
            shape = (len(batch_sentences), batch_max_len)
            batch_data = np.full(shape, self.pad_ind, dtype=np.int64)
            batch_labels = np.full(shape, -1, dtype=np.int64)

            # copy the data to the numpy array
            for j, (sentence, tags) in enumerate(zip(batch_sentences, batch_tags)):
                cur_len = len(sentence)
                batch_data[j, :cur_len] = sentence
                batch_labels[j, :cur_len] = tags

            # since all data are int64 indices, from_numpy gives torch LongTensors
            batch_data, batch_labels = torch.from_numpy(batch_data), torch.from_numpy(batch_labels)

            # shift tensors to GPU if available
            if args['cuda']:
                batch_data, batch_labels = batch_data.cuda(), batch_labels.cuda()

            yield batch_data, batch_labels

In [4]:
class NER(nn.Module):
    """Token tagger: embedding layer -> single LSTM (batch first) -> linear layer -> log softmax."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, out_dim):
        """
        Args:
            vocab_size: number of rows of the embedding table
            embedding_dim: size of each token embedding (also the LSTM input size)
            hidden_dim: LSTM hidden size (also the input size of the linear layer)
            out_dim: number of scores produced per token
        """
        super().__init__()

        # token index -> dense vector
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # batch_first=True: inputs and outputs are batch_size x seq_len x features
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        # per-token projection from the LSTM hidden state to the output scores
        self.fc = nn.Linear(hidden_dim, out_dim)

    def forward(self, seq):
        """
        Args:
            seq: a batch of sentences of dimension batch_size x seq_len, each element being a token index.
                 Sentences shorter than seq_len are expected to be filled up with PADding tokens.

        Returns:
            out: dimension batch_size*seq_len x out_dim, the log softmax over the output scores of every
                 token of every sentence.
        """
        # batch_size x seq_len -> batch_size x seq_len x embedding_dim
        embedded = self.embedding(seq)

        # batch_size x seq_len x hidden_dim (the final hidden/cell states are not needed)
        hidden_states, _ = self.lstm(embedded)

        # flatten so that each row is one token: batch_size*seq_len x hidden_dim
        # (view requires contiguous memory, hence the contiguous() call)
        hidden_size = hidden_states.shape[2]
        per_token = hidden_states.contiguous().view(-1, hidden_size)

        # batch_size*seq_len x out_dim, raw scores
        scores = self.fc(per_token)

        # log softmax across the scores of each token
        return F.log_softmax(scores, dim=1)

In [5]:
def loss_fn(outputs, labels):
    """
    Cross entropy loss averaged over the real (non-PADding) tokens of a batch.

    Args:
        outputs: dimension batch_size*seq_len x num_tags - log softmax output of the model
        labels: dimension batch_size x seq_len where each element is either a label in [0, 1, ... num_tag-1],
                or -1 in case it is a PADding token.

    Returns:
        loss: scalar tensor, the negative log likelihood summed over real tokens and divided by their count
    """
    num_rows, num_tags = outputs.shape[0], outputs.shape[1]

    # one label per row of outputs
    flat_labels = labels.view(-1)

    # 1.0 for real tokens, 0.0 for PADding tokens (label -1)
    keep = (flat_labels >= 0).float()

    # -1 cannot be used as a column index; the modulo maps it to a valid column. The value picked
    # for those rows is irrelevant because it is multiplied by 0 below.
    safe_labels = flat_labels % num_tags

    # log probability the model assigned to the target tag of every row
    picked = outputs[range(num_rows), safe_labels]

    token_count = int(torch.sum(keep).item())
    masked_total = torch.sum(picked * keep)

    return -masked_total / token_count

In [6]:
def accuracy(outputs, labels):
    """
    Fraction of real (non-PADding) tokens whose highest-scoring tag equals the label.

    Args:
        outputs: (np.ndarray) dimension batch_size*seq_len x num_tags - log softmax output of the model
        labels: (np.ndarray) dimension batch_size x seq_len where each element is either a label in
                [0, 1, ... num_tag-1], or -1 in case it is a PADding token.

    Returns: (float) accuracy in [0,1]
    """
    # one label per row of outputs
    flat_labels = labels.ravel()

    # number of real tokens; PADding tokens carry label -1
    real_token_count = float(np.sum(flat_labels >= 0))

    # predicted class of each token
    predicted = np.argmax(outputs, axis=1)

    # a prediction is never negative, so PADding positions (label -1) can never count as correct
    correct_count = np.sum(predicted == flat_labels)

    return correct_count / real_token_count

In [7]:
# registry of metric name -> metric function; the training and evaluation loops call every entry
# with (outputs, labels) as numpy arrays
# (more metrics, such as accuracy for each token type, could be registered here)
metrics = dict(accuracy=accuracy)
class RunningAverage():
    """
    Keeps the running mean of the values passed to update().

    Calling the instance returns total / steps; it raises ZeroDivisionError before the first update.
    """

    def __init__(self):
        # sum of all values seen so far and how many there were
        self.total = 0
        self.steps = 0

    def update(self, val):
        """Add one more value to the average."""
        self.total += val
        self.steps += 1

    def __call__(self):
        """Return the mean of all values seen so far."""
        count = float(self.steps)
        return self.total / count

In [8]:
def train(model, optimizer, loss_fn, data_iterator, metrics, args, num_steps):
    """
    Run num_steps optimisation steps, pulling one batch per step from data_iterator.

    Args:
        model: (torch.nn.Module) the network; it is switched to training mode
        optimizer: optimizer holding the parameters of model
        loss_fn: function taking (output_batch, labels_batch) and returning the loss
        data_iterator: generator yielding (batch_data, batch_labels); must provide at least num_steps batches
        metrics: (dict) metric name -> function of (outputs, labels) as numpy arrays
        args: (dict) hyperparameters; 'save_summary_steps' is read here
        num_steps: (int) number of batches to train on

    The mean of the collected summaries is reported through logging.info; nothing is returned.
    """
    model.train()

    # per-batch summaries (collected every save_summary_steps steps) and running mean of the loss
    summaries = []
    running_loss = RunningAverage()

    # tqdm progress bar over the steps
    progress = tnrange(num_steps)
    for step in progress:
        inputs, targets = next(data_iterator)

        # forward pass
        predictions = model(inputs)
        loss = loss_fn(predictions, targets)

        # backward pass: reset gradients, backpropagate, update parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # metrics are only computed once in a while
        if step % args['save_summary_steps'] == 0:
            # move to cpu and convert to numpy for the metric functions
            predictions_np = predictions.data.cpu().numpy()
            targets_np = targets.data.cpu().numpy()

            step_summary = {}
            for name in metrics:
                step_summary[name] = metrics[name](predictions_np, targets_np)
            step_summary['loss'] = loss.item()
            summaries.append(step_summary)

        # show the running mean of the loss next to the progress bar
        running_loss.update(loss.item())
        progress.set_postfix(loss='{:05.3f}'.format(running_loss()))

    # average every entry of the summaries and log the result
    metrics_mean = {}
    for name in summaries[0]:
        metrics_mean[name] = np.mean([summary[name] for summary in summaries])
    metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) for k, v in metrics_mean.items())
    logging.info("- Train metrics: " + metrics_string)

In [9]:
def evaluate(model, loss_fn, data_iterator, metrics, args, num_steps):
    """
    Evaluate the model on num_steps batches taken from data_iterator.

    Args:
        model: (torch.nn.Module) the network; it is switched to evaluation mode
        loss_fn: function taking (output_batch, labels_batch) and returning the loss
        data_iterator: generator yielding (batch_data, batch_labels); must provide at least num_steps batches
        metrics: (dict) metric name -> function of (outputs, labels) as numpy arrays
        args: (dict) hyperparameters (not read by this function)
        num_steps: (int) number of batches to evaluate on

    Returns:
        metrics_mean: (dict) mean over the batches of every metric in metrics, plus 'loss'

    Raises:
        IndexError: if num_steps is 0, since there is no summary to average
    """

    # set model to evaluation mode
    model.eval()

    # summary for current eval loop
    summ = []

    # compute metrics over the dataset
    for _ in range(num_steps):
        # fetch the next evaluation batch
        data_batch, labels_batch = next(data_iterator)

        # no parameter update happens here, so skip building the autograd graph for the forward pass
        with torch.no_grad():
            # compute model output
            output_batch = model(data_batch)
            loss = loss_fn(output_batch, labels_batch)

        # move to cpu, convert to numpy arrays
        output_batch = output_batch.data.cpu().numpy()
        labels_batch = labels_batch.data.cpu().numpy()

        # compute all metrics on this batch
        summary_batch = {metric: metrics[metric](output_batch, labels_batch)
                         for metric in metrics}
        summary_batch['loss'] = loss.item()
        summ.append(summary_batch)

    # compute mean of all metrics in summary
    metrics_mean = {metric: np.mean([x[metric] for x in summ]) for metric in summ[0]}
    metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) for k, v in metrics_mean.items())
    logging.info("- Eval metrics : " + metrics_string)
    return metrics_mean

In [10]:
def train_and_evaluate(model, train_data, val_data, optimizer, loss_fn, metrics, args):
    """
    Train the model for args['num_epochs'] epochs, evaluating on the validation set after each one.

    Batches are produced by the module-level `data_loader`, which therefore has to exist before
    this function is called.

    Args:
        model: (torch.nn.Module) the network
        train_data: (dict) training data with keys 'data', 'labels' and 'size'
        val_data: (dict) validation data with keys 'data', 'labels' and 'size'
        optimizer: optimizer holding the parameters of model
        loss_fn: function taking (output_batch, labels_batch) and returning the loss
        metrics: (dict) metric name -> function; must contain 'accuracy'
        args: (dict) hyperparameters; 'num_epochs', 'batch_size', 'train_size' and 'val_size' are read here
    """

    def steps_per_epoch(size):
        # Same count as before, (size + 1) // batch_size, but never more than the number of
        # non-empty batches: with batch_size == 1 the old formula asked for one batch too many.
        batch_size = args['batch_size']
        return min((size + 1) // batch_size, -(-size // batch_size))

    best_val_acc = 0.0

    for epoch in tnrange(args['num_epochs']):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, args['num_epochs']))

        # one pass over the training set
        num_steps = steps_per_epoch(args['train_size'])
        train_data_iterator = data_loader.data_iterator(train_data, args, shuffle=True)
        train(model, optimizer, loss_fn, train_data_iterator, metrics, args, num_steps)

        # Evaluate for one epoch on validation set
        num_steps = steps_per_epoch(args['val_size'])
        val_data_iterator = data_loader.data_iterator(val_data, args, shuffle=False)
        val_metrics = evaluate(model, loss_fn, val_data_iterator, metrics, args, num_steps)

        val_acc = val_metrics['accuracy']

        # ties count as a new best
        is_best = val_acc >= best_val_acc

        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

In [11]:
# Hyperparameters and dataset settings, defined inline
args = {
    'data_dir': 'data/kaggle',
    'model_dir': 'experiments/base_model',
    
    "learning_rate": 1e-3,
    "batch_size": 5,
    "num_epochs": 3, # change this to 10 or more

    "hidden_dim": 50,
    "embedding_dim": 50,

    "save_summary_steps": 100,
    
    # 'train_size' is overwritten (and 'val_size' added) below from the data actually loaded
    "train_size": 33570,
    "dev_size": 7194,
    "test_size": 7194,
    "vocab_size": 35180,
    "number_of_tags": 17,
    "pad_word": "<pad>",
    "pad_tag": "O",
    "unk_word": "UNK"
}

# train() and evaluate() report their metrics through logging.info; the root logger only lets
# WARNING and above through by default, so raise it to INFO or those lines are dropped.
# basicConfig attaches a handler when none exists; setLevel covers the case where one already does.
logging.basicConfig(level=logging.INFO, format='%(message)s')
logging.getLogger().setLevel(logging.INFO)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# use GPU if available
args['cuda'] = torch.cuda.is_available()
    
# Set the random seed for reproducible experiments
torch.manual_seed(6677)
if args['cuda']: torch.cuda.manual_seed(6677)

# load data
data_loader = DataLoader(args['data_dir'], args)
data = data_loader.load_data(['train', 'val'], args['data_dir'])
train_data = data['train']
val_data = data['val']

# specify the train and val dataset sizes
args['train_size'] = train_data['size']
args['val_size'] = val_data['size']

# Define the model and optimizer
model = NER(args['vocab_size'], args['embedding_dim'], args['hidden_dim'], args['number_of_tags']).to(device)
optimizer = optim.Adam(model.parameters(), lr=args['learning_rate'])

# loss_fn and metrics are the objects defined in the cells above and are used as they are


In [12]:
# bare expression as the last line of the cell: the notebook shows the module's repr
# (embedding, lstm and fc layers with their sizes) as the cell output
model

NER(
  (embedding): Embedding(35180, 50)
  (lstm): LSTM(50, 50, batch_first=True)
  (fc): Linear(in_features=50, out_features=17, bias=True)
)

In [13]:
# run the training/validation loop for args['num_epochs'] epochs using the objects built in the setup cell
train_and_evaluate(model, train_data, val_data, optimizer, loss_fn, metrics, args)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6714), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6714), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6714), HTML(value='')))


