 # Logistic Regression with Pytorch

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
import random
import os
from collections import namedtuple

In [None]:
# Seed the NumPy and PyTorch random number generators with a fixed value
# so that repeated runs of the notebook start from the same state
np.random.seed(42)
torch.manual_seed(42)

## Brief Pytorch API summary
- The interface is very similar to numpy, operations are based on Tensors, which are roughly similar to numpy ndarrays.
- Most operators return a new tensor after computing, but some can be applied 'in-place', meaning that the operations are performed over the object. These functions generally have an underscore in their name.   
- Simple operations, such as addition and product, are overloaded into Python syntax.
- More complex functions are accessible using the API, having [pytorch.org/docs](http://pytorch.org/docs) in the background when developing is always recommended.  

In [None]:
# Build a 2x2 tensor from a nested Python list,
# then show its contents and its shape
x = torch.Tensor([[1,2], [3,4]])
print(x)
print(x.size())

In [None]:
# `x + 1` builds a new tensor; `x` itself is left unchanged
print(x+1)
# `add_` (note the trailing underscore) updates `x` in place
x.add_(1)
print(x)

- Tensors can be transposed, reshaped and manipulated according to our needs. This is mainly accomplished using the `transpose()` and `view()` functions. 
- These reshapings are generally needed when we use the Pytorch API for neural nets, which makes certain assumptions about the shape of the input data. 

In [None]:
print(x)
# swap dimensions 0 and 1
print(x.transpose(0, 1))
# flatten into one dimension; -1 means "infer this size"
print(x.view(-1))

- the `torch.Tensor` object implicitly represents one node in the computational graph, meaning that all operations we perform over `Tensor` objects will be recorded, unless we state otherwise. In this way,  we can later go through the graph created implicitly and use it for backpropagation. 
- Every `Tensor` object contains a `.grad` attribute holding the current value of the gradient, if any. 

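- By simply calling the `.to(device)` function on any Tensor or Module, we can easily move our objects to the GPU/CPU. Note that for a Tensor, `.to()` returns the moved tensor rather than modifying the original in place, so the result should be assigned to a variable.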

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise,
# so this cell also runs on machines without CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# for tensors `.to()` returns the moved tensor instead of modifying `x`
# in place, so the result has to be assigned back
x = x.to(device=device)
x

- Nodes in the graph should implement both the `forward()` and `backward()` functions, making them suitable to use when training models using backprop.

## Loading and understanding the SemEval 2018 Task 3 Dataset

In [None]:
# download the SemEval 2018 Task 3 Dataset
# the task is binary classification: ironic or not ironic tweet
! wget https://raw.githubusercontent.com/Cyvhee/SemEval2018-Task3/master/datasets/train/SemEval2018-T3-train-taskA.txt
! wget https://raw.githubusercontent.com/Cyvhee/SemEval2018-Task3/master/datasets/goldtest_TaskA/SemEval2018-T3_gold_test_taskA_emoji.txt

In [None]:
# let's set some parameters

# dataset files, expected in the working directory
train_path = 'SemEval2018-T3-train-taskA.txt'
test_path = 'SemEval2018-T3_gold_test_taskA_emoji.txt'

batch_size = 32  # number of examples per mini-batch
max_len = 300    # keep at most this many tokens per tweet
min_count = 0    # token frequency threshold
# prefer the GPU, but fall back to the CPU when CUDA is not available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Load the dataset
- 3834 train tweets
- 784 test tweets

In [None]:
Sentence = namedtuple('Sentence', ['index', 'tokens', 'label'])


def read_semeval_2018_task_3_dataset(dataset_file_path):
    """
    Read a SemEval 2018 Task 3 file into a list of `Sentence` tuples.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line must hold three tab-separated fields:
    index, label and text.

    :param dataset_file_path: Path to the tab-separated dataset file.

    :return: A list of `Sentence(index, tokens, label)` tuples. `index`
             and `label` are kept as strings; `tokens` is the text split
             on whitespace.

    Raises a ValueError if a non-blank line has fewer than three fields.
    """
    sentences = []

    # tweets contain emoji and other non-ASCII characters, so decode
    # explicitly as UTF-8 instead of relying on the platform default
    with open(dataset_file_path, encoding='utf-8') as f:
        # skip header
        f.readline()
        for line in f:
            line = line.strip()
            # a raw line always holds at least '\n', so blank lines can
            # only be detected after stripping
            if not line:
                continue
            # split on the first two tabs only, so that a tab inside
            # the tweet text does not break the unpacking
            index, label, text = line.split('\t', 2)
            sentences.append(Sentence(index, text.split(), label))

    return sentences

In [None]:
# Parse both splits into lists of `Sentence` tuples
train_examples = read_semeval_2018_task_3_dataset(train_path)
test_examples = read_semeval_2018_task_3_dataset(test_path)

# sanity check: number of examples in each split
print(len(train_examples))
print(len(test_examples))

## Mapping our words to unique identifiers: the Vocabulary object
- We will create an object to manage a mapping between words (or more generally tokens) and unique indices. 
- There are a few special symbols that we will be adding to handle special cases.
  - The first key special case is the `UNK` token, which will represent all tokens that we do not have in our vocabulary. This is needed as we will build our vocabulary only using the training examples, and during validation or testing (or if we deploy our model in production) we may encounter new words that also need to be represented somehow.
  - The `PAD` token, which we will use to create even-sized batches of sentences of different length when using RNNs. 
  - The beginning-of-sentence or `BOS` token, which we may use to denote the beginning of a sentence in some special cases
  - The end-of-sentence or `EOS` token, which as in the previous case is useful for certain tasks.
  

In [None]:
# String forms of the special tokens used by the vocabulary
UNK = '<UNK>'  # stands in for tokens that are not in the vocabulary
PAD = '<PAD>'  # padding symbol
BOS = '<BOS>'  # beginning-of-sentence marker
EOS = '<EOS>'  # end-of-sentence marker


class VocabItem:
    """
    A single vocabulary entry: the token text, how often it was seen,
    and its index in the vocabulary mapping.
    """

    def __init__(self, string, hash=None):
        """
        Create an entry for `string` with a frequency count of zero.

        :param string: The token text.

        :param hash: The index of the token in the vocabulary mapping,
                     or `None` while no index has been assigned.
        """
        self.string, self.count, self.hash = string, 0, hash

    def __str__(self):
        """
        Readable form of the entry, e.g. `VocabItem(cat)`.
        """
        template = 'VocabItem({})'
        return template.format(self.string)

    def __repr__(self):
        """
        Same text as `__str__`, so entries also print nicely in lists.
        """
        return str(self)


class Vocab:
    """
    A two-way mapping between tokens and integer indices.

    Tokens are first counted with `add_tokens()`. Calling `finish()`
    then freezes the vocabulary and assigns the final indices, after
    which `tokens2indices()` and `indices2tokens()` can be used.
    """

    def __init__(
        self,
        min_count=0,
        no_unk=False,
        add_padding=False,
        add_bos=False,
        add_eos=False,
        unk=None
    ):

        """
        :param min_count: Frequency threshold: tokens seen `min_count`
                          times or fewer are folded into `UNK`. Only
                          used when `no_unk` is False and the `unk`
                          parameter is None.

        :param add_padding: If we should add the special `PAD` token.

        :param add_bos: If we should add the special `BOS` token.

        :param add_eos: If we should add the special `EOS` token.

        :param no_unk: If we should not add the `UNK` token to our Vocab.

        :param unk: A string with the unknown token, in case our
                    sentences have already been processed for this,
                    or `None` to use our default `UNK` token.
        """

        self.no_unk = no_unk
        # every distinct token seen so far, in first-seen order
        self.vocab_items = []
        # token string -> position in `vocab_items`
        self.vocab_hash = {}
        # total number of tokens counted, repetitions included
        self.word_count = 0
        self.special_tokens = []
        self.min_count = min_count
        self.add_padding = add_padding
        self.add_bos = add_bos
        self.add_eos = add_eos
        self.unk = unk

        # VocabItem objects of the special tokens, set by `finish()`
        self.UNK = None
        self.PAD = None
        self.BOS = None
        self.EOS = None

        # the final mapping, filled in by `finish()`
        self.index2token = []
        self.token2index = {}

        self.finished = False

    def add_tokens(self, tokens):
        """
        Count every token in `tokens`, registering unseen ones.

        :param tokens: An iterable of token strings.

        Raises a RuntimeError if the vocabulary is already finished.
        """
        if self.finished:
            raise RuntimeError('Vocabulary is finished')

        for token in tokens:
            if token not in self.vocab_hash:
                self.vocab_hash[token] = len(self.vocab_items)
                self.vocab_items.append(VocabItem(token))

            self.vocab_items[self.vocab_hash[token]].count += 1
            self.word_count += 1

    def finish(self):
        """
        Freeze the vocabulary and build the token <-> index mapping.

        The resulting index layout is: `UNK` at index 0 (unless
        `no_unk`), then the regular tokens by decreasing frequency,
        then `BOS`, `EOS` and `PAD` (each only if requested).

        Raises a RuntimeError if the vocabulary is already finished.
        """
        if self.finished:
            # running the steps below a second time would append every
            # item to the mapping again and corrupt the indices
            raise RuntimeError('Vocabulary is finished')

        regular = []

        if not self.no_unk:

            # we add/handle the special `UNK` token
            # and set it to have index 0 in our mapping
            if self.unk:
                self.UNK = VocabItem(self.unk, hash=0)
                # the custom unknown string may never have occurred in
                # the data; in that case its count simply stays at 0
                position = self.vocab_hash.get(self.unk)
                if position is not None:
                    self.UNK.count = self.vocab_items[position].count

                regular = [
                    token for token in self.vocab_items
                    if token.string != self.unk
                ]

            else:
                self.UNK = VocabItem(UNK, hash=0)

                # tokens at or below the threshold are not kept;
                # their counts are accumulated into `UNK`
                for token in self.vocab_items:
                    if token.count <= self.min_count:
                        self.UNK.count += token.count
                    else:
                        regular.append(token)

            self.index2token.append(self.UNK)
            self.special_tokens.append(self.UNK)
        else:
            regular = list(self.vocab_items)

        # we sort our vocab. items by frequency
        # so for the same corpus, the indices of our words
        # are always the same (the sort is stable, so ties
        # keep their first-seen order)
        regular.sort(key=lambda token: token.count, reverse=True)

        # we always add our additional special tokens
        # at the end of our mapping
        if self.add_bos:
            self.BOS = VocabItem(BOS)
            regular.append(self.BOS)
            self.special_tokens.append(self.BOS)

        if self.add_eos:
            self.EOS = VocabItem(EOS)
            regular.append(self.EOS)
            self.special_tokens.append(self.EOS)

        if self.add_padding:
            self.PAD = VocabItem(PAD)
            regular.append(self.PAD)
            self.special_tokens.append(self.PAD)

        self.index2token.extend(regular)

        # every item learns its final index, and the
        # string -> index lookup is filled in
        for i, token in enumerate(self.index2token):
            self.token2index[token.string] = i
            token.hash = i

        if not self.no_unk:
            print('Unknown vocab size:', self.UNK.count)

        print('Vocab size: %d' % len(self))

        self.finished = True

    def __getitem__(self, i):
        """Return the `VocabItem` stored at index `i`."""
        return self.index2token[i]

    def __len__(self):
        """Return the number of entries in the mapping."""
        return len(self.index2token)

    def __iter__(self):
        """Iterate over the `VocabItem` entries in index order."""
        return iter(self.index2token)

    def __contains__(self, key):
        """Return True if the token string `key` has an index."""
        return key in self.token2index

    def tokens2indices(self, tokens, add_bos=False, add_eos=False):
        """
        Returns a list of mapping indices for the given tokens,
        defaulting to our special `UNK` token whenever we find an
        unseen term.

        :param tokens: An iterable of token strings to map into our
                       vocabulary.

        :param add_bos: If we should add the `BOS` at the beginning.
                        The vocabulary must have been built with
                        `add_bos=True`.

        :param add_eos: If we should add the `EOS` at the end.
                        The vocabulary must have been built with
                        `add_eos=True`.

        :return: A list of ints, with the index of each given token.

        Will raise a KeyError for an unseen token when the vocabulary
        was built with `no_unk=True`.
        """
        indices = []
        if add_bos:
            indices.append(self.BOS.hash)
        if self.no_unk:
            indices.extend(self.token2index[token] for token in tokens)
        else:
            indices.extend(
                self.token2index.get(token, self.UNK.hash)
                for token in tokens
            )
        if add_eos:
            indices.append(self.EOS.hash)
        return indices

    def indices2tokens(self, indices, ignore_ids=()):
        """
        Returns a list of strings by mapping back every index to our
        vocabulary.

        :param indices: A list of ints.

        :param ignore_ids: An iterable with indices to ignore, meaning
                           that we will not look for them in our mapping.

        :return: A list of strings.

        Will raise an IndexError whenever we pass an index that is out
        of range of our mapping, unless it is listed in `ignore_ids`.

        """
        return [
            self.index2token[idx].string
            for idx in indices
            if idx not in ignore_ids
        ]

- Now we can instance our vocabulary objects and add the data.
- We will use one vocabulary for the input data (the sentences), and another vocabulary object for the output data, the class labels. In this way our code is generic and should work out-of-the-box for any number of output labels.

In [None]:
# for the input vocabulary
# we keep the `UNK` special token to handle unseen data
# we do not need padding, so we skip it (more on this below)
src_vocab = Vocab(min_count=0, add_padding=False)

# for the output vocabulary
# we do not need the `UNK` token (we do not want an UNK class)
# we do not need padding either (more on this below)
tgt_vocab = Vocab(no_unk=True, add_padding=False)

In [None]:
# Count tokens and labels over the training examples only
for sentence in train_examples:
    # only the first `max_len` tokens of each sentence are counted
    src_vocab.add_tokens(sentence.tokens[:max_len])
    tgt_vocab.add_tokens([sentence.label])

# freeze both vocabularies and assign the final indices
src_vocab.finish()
tgt_vocab.finish()

In [None]:
# quick check: map a whitespace-tokenised sentence to vocabulary indices
src_vocab.tokens2indices('the movie was bad'.split())

In [None]:
# Bundle the input (`src`) and output (`tgt`) vocabularies into one object
Vocabs = namedtuple('Vocabs', ['src', 'tgt'])
vocabs = Vocabs(src_vocab, tgt_vocab)

## Representing words using one-hot vectors
- The building block for classic NLP in terms of representing words is the one-hot vector. 
- One-hot vectors are sparse vectors whose dimension is equal to the size of the vocabulary. To create the vector representation of a word we start with a vector of zeros and simply put a 1 at the index corresponding to that word, according to our vocabulary.

In [None]:
def one_hot(labels, num_classes):
    """
    Build a one-hot matrix with one row per entry of `labels`.

    :param labels: A sequence of integer indices, each a valid column
                   index for a row of width `num_classes`. May be empty.

    :param num_classes: The width of every one-hot row.

    :return: A float32 numpy array of shape `(len(labels), num_classes)`
             where row `i` is all zeros except for a 1 at column
             `labels[i]`.
    """
    # force an integer dtype: an empty list would otherwise turn into a
    # float64 array, which numpy refuses to use as an index
    labels = np.asarray(labels, dtype=np.int64)
    input_size = len(labels)
    matrix = np.zeros((input_size, num_classes), dtype=np.float32)
    matrix[np.arange(input_size), labels] = 1
    return matrix

## The Batch object
 - To easily access all the data in a batch, let's create a special Batch object that will give us access to all the information we may require during training. 
 - This object will work like a dictionary, but it will also allow us to access each component using an attribute with the same name.
  The main principle is that this dictionary-like batch will hold `numpy` objects as values, and that after calling the `to_torch_()` function, they will be turned into `pytorch` objects and moved to the corresponding provided device. In this way, we know that all our elements inside the batch object are in the right place.

In [None]:
class Batch(dict):
    """
    A dictionary whose entries can also be accessed as attributes.

    Values are expected to start out as numpy arrays (or plain Python
    objects); `to_torch_()` converts the numpy arrays to PyTorch tensors
    on the requested device.
    """

    def __init__(self, *args, **kwargs):
        super(Batch, self).__init__(*args, **kwargs)
        # using the dict itself as the attribute namespace makes
        # `batch.key` and `batch['key']` interchangeable
        self.__dict__ = self
        # because of the aliasing above, this flag is also stored
        # as the '_is_torch' key of the dictionary
        self._is_torch = False

    def to_torch_(self, device):
        """
        Convert, in place, every numpy array value into a PyTorch tensor
        placed on `device`. Values of any other type are left untouched.

        :param device: The `torch.device` to move the tensors to.
        """
        # iterate over a snapshot, since values are reassigned in the loop
        for key, value in list(self.items()):
            if isinstance(value, np.ndarray):
                self[key] = torch.from_numpy(value).to(device)
        # record that the conversion has taken place
        self._is_torch = True

## The BatchBuilder object
- Finally, let's create an object to help us transform our text data into tensors with information that can be fed into our model. This object will do all the heavy-lifting, turning our string examples into our batch objects, which PyTorch can later handle.
- We will combine this object with the `DataLoader` util from `pytorch`, passing it as the [`collate_fn` parameter](https://pytorch.org/docs/stable/data.html#working-with-collate-fn), which allows us to provide a custom function for assembling each batch. In our case, this is achieved by implementing the `__call__` function in the `BatchBuilder` object, which will essentially turn the [instance into a function](https://docs.python.org/3/reference/datamodel.html#emulating-callable-objects).
- In this case, to represent each input sentence in a batch we will use the **sum of its one-hot vectors**, which is the de facto input for logistic regressions. There is no need for padding since each sentence will be compressed into a vector that is the same size.

In [None]:
class LogisticRegressionBatchBuilder(object):
    """
    Callable that turns a list of `Sentence` examples into a `Batch`.

    The `DataLoader` calls its `collate_fn` with a single argument (the
    list of examples), so everything else needed to build a batch is
    handed over through the constructor.
    """

    def __init__(self, vocabs, max_len=None):
        """
        :param vocabs: An object exposing the input vocabulary as `src`
                       and the label vocabulary as `tgt`.

        :param max_len: Maximum number of tokens kept per sentence,
                        or `None` to keep them all.
        """
        self.vocabs = vocabs
        self.max_len = max_len

    def __call__(self, examples):
        """
        Build one batch; this is the function the `DataLoader` invokes.

        :param examples: A list of `Sentence` tuples.

        :return: A `Batch` with `indices` (list of int sentence ids),
                 `src` (one row per sentence holding the sum of its
                 one-hot token vectors) and `tgt` (int64 label indices).
        """
        # sentence ids, converted from their string form
        ids_batch = []
        for sentence in examples:
            ids_batch.append(int(sentence.index))

        # token indices of every (possibly truncated) sentence
        token_ids_per_sentence = []
        for sentence in examples:
            kept_tokens = sentence.tokens[: self.max_len]
            token_ids_per_sentence.append(
                self.vocabs.src.tokens2indices(kept_tokens)
            )

        # label index of every sentence
        label_ids = []
        for sentence in examples:
            label_ids.append(self.vocabs.tgt.token2index[sentence.label])

        # each sentence collapses into a single vector:
        # the sum over the one-hot rows of its tokens
        summed_rows = []
        for token_ids in token_ids_per_sentence:
            one_hot_matrix = one_hot(token_ids, len(self.vocabs.src))
            summed_rows.append(one_hot_matrix.sum(0))

        src_batch = np.vstack(summed_rows)
        tgt_batch = np.asarray(label_ids, dtype=np.int64)

        # we return our Batch "custom object"
        return Batch(indices=ids_batch, src=src_batch, tgt=tgt_batch)

- Now we can just instantiate our `LogisticRegressionBatchBuilder` and provide it to the `DataLoader` as the `collate_fn` parameter.

In [None]:
# a single builder instance is shared by both loaders
batch_builder = LogisticRegressionBatchBuilder(
    vocabs,
    max_len=max_len
)

# training examples are shuffled
train_batches = DataLoader(
    train_examples,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=batch_builder,
)

# test examples keep their original order
test_batches = DataLoader(
    test_examples,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    collate_fn=batch_builder,
)

## The Pytorch Model

### Logistic Regression
- Let's start by setting the hyper parameters of our yet-to-define model.

In [None]:
# hyper-parameters
input_size = len(src_vocab)   # one input feature per vocabulary entry
num_classes = len(tgt_vocab)  # one output per label
epochs = 20
learning_rate = 0.5
log_interval = 100
# prefer the GPU, but fall back to the CPU when CUDA is not available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

- Our next step is to define the model. To do so, we can extend the `torch.nn.Module` class, which will allow us to reuse some of the internal structure that Pytorch has prepared. 
- We need to define the `__init__()` and `forward()` functions which will take care of initializing the parameters of our model and computing the outputs given an example, respectively.
- As long as we use Pytorch objects and operations, we do not have to define the `backward()` function ourselves!

In [None]:
# model
class LogisticRegression(nn.Module):
    """
    Logistic regression expressed as a single linear layer.

    The layer outputs one raw score per class; no softmax is applied
    inside the model.
    """

    def __init__(self, input_size, num_classes):
        """
        :param input_size: Number of input features per example.

        :param num_classes: Number of output classes.
        """
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(
            in_features=input_size,
            out_features=num_classes,
        )

    def forward(self, x):
        """
        Apply the linear layer to `x` and return the class scores.
        """
        return self.linear(x)

In [None]:
nn.Linear?

- `nn.Linear()` offers us a shortcut for defining single layer neural networks of the form $y = Ax  + b$ , following all the good practices of parameter initialization.
- Note that the API for `nn.Linear()` expects a tensor of size `(N_examples, N_features)`, where the leading `N_examples` dimension means that we can pass a whole *mini-batch* of examples to the model at the same time. 

- Next, let's instantiate our model and explore it.

In [None]:
# instantiate the model and move it to the selected device
model = LogisticRegression(input_size, num_classes)
model = model.to(device=device)

# show the model
print(model)

Note that the parameters created by our Model object are just regular PyTorch Tensors wrapped in the `Parameter` object.

In [None]:
# weight and bias of the linear layer
print(model.linear.weight)
print(model.linear.bias)

Now we are going to set an adequate loss function for our problem, and prepare to use stochastic gradient descent as our training algorithm.

In [None]:
# cross-entropy loss, computed from the raw model outputs and the labels
loss_function = nn.CrossEntropyLoss()  
# stochastic gradient descent over all the parameters of the model
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)  

In [None]:
nn.CrossEntropyLoss?

- `nn.CrossEntropyLoss()` is PyTorch's efficient implementation of the cross entropy, which is the loss function of preference for multi-class classification. Note that it expects two inputs: a tensor of size `(N_examples, N_classes)` containing the logits (unnormalized scores) for each class on each instance, and the hard labels for the batch, of shape `(N_examples)`.
- The function `model.parameters()` returns an iterable over the parameters of the model, which are simply tensors wrapped in the special `nn.Parameter()` object.
- The `torch.optim.SGD()` object receives our model parameters and the learning rate, and is in charge of simply updating these using the gradients and the update rule that we are familiar with: $\theta \leftarrow \theta - \alpha \nabla_\theta L$, where $\alpha$ is our learning rate, $\theta$ symbolizes our model parameters and $L$ is the loss.

- Now we can train our model using all the components we've built above.

In [None]:
# train the model, evaluating on the test set after every epoch

for epoch in range(epochs):

    epoch_correct = 0
    epoch_total = 0
    epoch_loss = 0

    # put the model in training mode
    model.train()

    for train_batch in train_batches:

        # we move our data to PyTorch
        # and to the GPU if necessary
        train_batch.to_torch_(device)

        # make sure our gradients are 0 to start
        optimizer.zero_grad()

        # call forward() to compute the
        # outputs of the model given our examples
        outputs = model(train_batch.src)

        # compute the loss and call backward()
        # to compute gradients
        loss = loss_function(outputs, train_batch.tgt)
        loss.backward()

        # apply our learning rule using the gradients
        # stored in the parameters
        optimizer.step()

        # the predicted class is the one with the highest score
        _, predictions = outputs.max(1)

        correct = (predictions == train_batch.tgt).long().sum()
        total = train_batch.tgt.size(0)
        epoch_correct += correct.item()
        epoch_total += total
        epoch_loss += loss.item()

    accuracy = 100 * epoch_correct / epoch_total

    print('Epoch {}'.format(epoch))
    # the reported loss is the mean of the per-batch losses
    print('Train Loss: {}'.format(epoch_loss / len(train_batches)))
    print('Train Accuracy: {}'.format(accuracy))

    test_epoch_correct = 0
    test_epoch_total = 0
    test_epoch_loss = 0

    # put the model in evaluation mode
    model.eval()

    # no parameter update happens during evaluation, so we switch off
    # gradient tracking to avoid building the graph for nothing
    with torch.no_grad():
        for test_batch in test_batches:

            # we move our data to PyTorch
            # and to the GPU if necessary
            test_batch.to_torch_(device)

            # call forward() to compute the
            # outputs of the model given our examples
            outputs = model(test_batch.src)

            loss = loss_function(outputs, test_batch.tgt)

            _, predictions = outputs.max(1)

            correct = (predictions == test_batch.tgt).long().sum()
            total = test_batch.tgt.size(0)
            test_epoch_correct += correct.item()
            test_epoch_total += total
            test_epoch_loss += loss.item()

    test_accuracy = 100 * test_epoch_correct / test_epoch_total

    print('\n---------------------')
    print('Test Loss: {}'.format(test_epoch_loss / len(test_batches)))
    print('Test Accuracy: {}'.format(test_accuracy))
    print('---------------------\n')

In [None]:
# save the trained model (a.k.a. model parameters)
# only the state dict of the model is written to 'model.pth'
torch.save(model.state_dict(), 'model.pth')