In [1]:
import torch
from torchtext import data
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn import metrics
from spacy.lang.en.stop_words import STOP_WORDS

# settings to make results more reproducible:
# fix the PyTorch random seed and request deterministic cuDNN behaviour
torch.manual_seed(416)
torch.backends.cudnn.deterministic = True

In [None]:
# specify column properties for text and labels:
# TEXT is tokenized with spaCy and lower-cased; include_lengths=True makes each
# batch carry the per-document lengths alongside the padded token ids
TEXT = data.Field(tokenize='spacy', 
                  batch_first=True, 
                  include_lengths=True, 
                  lower=True)
LABELS = data.LabelField(batch_first=True)

# only load the submission text and the corresponding judgement for this
# project; each entry maps a CSV column name to (attribute name, field)
fields = {'selftext': ('text', TEXT), 'link_flair_text': ('label', LABELS)}

# load the train / validation / test splits from the CSV files in /content/
train, valid, test = data.TabularDataset.splits(path='/content/', 
                                                train='train.csv', 
                                                validation='valid.csv', 
                                                test='test.csv',
                                                format='csv', 
                                                fields=fields)
# view the first example to make sure it was loaded and processed correctly
print(vars(train.examples[0]))

In [None]:
# build vocab for the text data, including UNK and padding tokens;
# only the training split is used and min_freq=3 is passed as the frequency
# cut-off for tokens
# use pre-trained GloVe embeddings (50-d) to reduce overfitting
TEXT.build_vocab(train, min_freq=3, vectors="glove.6B.50d")

# build label vocab (also from the training split only)
LABELS.build_vocab(train)

# info on size of vocab and classes
print("Vocab size:",len(TEXT.vocab))
print("Number of classes:",len(LABELS.vocab))

# show the vocab dict and class labels
# NOTE(review): this prints the whole token -> index mapping, which can be
# very long; consider printing only a slice when sharing the notebook
print(TEXT.vocab.stoi)
print(LABELS.vocab.itos)

In [4]:
# use the GPU when cuda is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# choose batch size
BATCH_SIZE = 64

# create iterators for each split; the same settings are passed for
# train, validation and test.
# sort_key / sort_within_batch order the examples by text length, which is
# convenient for packing padded sequences in the model
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train, valid, test),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    shuffle=True,
    device=device)

In [5]:
class AITAClassifier(nn.Module):
    """Bidirectional LSTM classifier
    Parameters:
        embedding_dim: dimensionality of the word embeddings
        hidden_dim: dimensionality of the hidden state of the LSTM
        vocab_size: number of unique tokens in the input data
        num_classes: the number of classes
        num_layers: the number of recurrent layers
        dropout: dropout probability applied between recurrent layers
    
    Attributes:
        word_embeddings: word embeddings for all words in the vocabulary
        lstm: bidirectional LSTM network
        hidden2tag: map from the hidden state of lstm to the tag space
    """
    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_classes,
                 num_layers, dropout):
        super(AITAClassifier, self).__init__()

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, 
                            hidden_dim, 
                            num_layers=num_layers, 
                            batch_first=True,
                            bidirectional=True,
                            dropout=dropout)
        
        # hidden_dim * 2 since the network is bidirectional
        self.hidden2tag = nn.Linear(hidden_dim * 2, num_classes)
    
    def forward(self, texts, text_lengths):
        """Carries out the forward pass of the input texts
        Args:
            texts: docs represented as a (batch, seq_len) tensor of word IDs
            text_lengths: length in words of each doc, for pad/packing;
                may be a list or a tensor on any device
        Returns:
            The log probability distribution for each class for each doc,
            with shape (batch, num_classes)
        """
        embeds = self.word_embeddings(texts)

        # pack_padded_sequence requires the lengths to live on the CPU, even
        # when the batch itself is on the GPU
        cpu_lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()

        # enforce_sorted=False lets PyTorch sort (and later un-sort) the batch
        # itself, so callers are not required to pre-sort docs by length
        packed_embeds = nn.utils.rnn.pack_padded_sequence(
            embeds,
            cpu_lengths,
            batch_first=True,
            enforce_sorted=False)
        _, (hidden, _) = self.lstm(packed_embeds)
        # dimensions of hidden: 
        #    (num_layers * num_directions, batch size, hidden dim)

        # get the final forward and backward hidden states of the top layer.
        # in a bidirectional network these are the last two entries of hidden
        hidden_forward = hidden[-2, :, :]
        hidden_backward = hidden[-1, :, :]

        # concatenate them -> (batch size, hidden dim * 2)
        doc_repr = torch.cat((hidden_forward, hidden_backward), dim=1)

        class_space = self.hidden2tag(doc_repr)

        # log-probabilities, to be paired with nn.NLLLoss
        class_scores = F.log_softmax(class_space, dim=1)

        return class_scores

In [None]:
# hyperparameters
# EMBEDDING_DIM has to agree with the 50-d GloVe vectors loaded into the vocab
EMBEDDING_DIM = 50
HIDDEN_DIM = 32
VOCAB_SIZE = len(TEXT.vocab)
NUM_CLASSES = len(LABELS.vocab)
NUM_LAYERS = 1
DROPOUT = 0
NUM_EPOCHS = 5

# instantiate the model
model = AITAClassifier(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, NUM_CLASSES,
                       NUM_LAYERS, DROPOUT)

# initialize the embedding layer with the pretrained vectors from the vocab
pretrained_embeddings = TEXT.vocab.vectors
model.word_embeddings.weight.data.copy_(pretrained_embeddings)

# define the loss function and optimizer
# (NLLLoss is used because the model outputs log-probabilities)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters())

# push to cuda if available
model.to(device)
loss_function.to(device)

# view the model architecture
print(model)

def print_metrics(y_pred, y_true):
    """Prints a summary of precision, recall, and F1 for each class and the
        macro average of all classes
    Args:
        y_pred: list of class indices predicted by the model
        y_true: list of gold standard class indices
    """
    class_names = LABELS.vocab.itos

    # pass every class index explicitly: without `labels`, scikit-learn infers
    # the classes from y_true/y_pred and raises a ValueError when a class is
    # missing from both (its count then disagrees with target_names)
    mdict = metrics.classification_report(
        y_true, y_pred,
        labels=list(range(len(class_names))),
        target_names=class_names,
        output_dict=True,
        zero_division=0)

    for label in class_names:
        print(f"    {label:>15}  " 
              f"{mdict[label]['precision']:>9.2f}  " 
              f"{mdict[label]['recall']:>6.2f}  " 
              f"{mdict[label]['f1-score']:>8.2f}  " 
              f"{mdict[label]['support']:>7}")
    print(f"          macro-avg  " 
          f"{mdict['macro avg']['precision']:>9.2f}  " 
          f"{mdict['macro avg']['recall']:>6.2f}  " 
          f"{mdict['macro avg']['f1-score']:>8.2f}  "
          f"{mdict['macro avg']['support']:>7}")

In [None]:
def get_train_loss(model, train_iter, optimizer, loss_function):
    """Run one training epoch and report the loss and metrics
    Args:
        model: neural net model to train
        train_iter: iterator for the training data
        optimizer: optimizer used to update the model parameters
        loss_function: loss function to use
    Returns:
        The average per-batch loss for the training data
    """
    # predictions and gold standard collected over the epoch for the metrics
    predictions, gold = [], []
    running_loss = 0

    # training mode
    model.train()

    for batch in train_iter:
        # start every batch from clean gradients
        model.zero_grad()

        # the text field yields (token ids, lengths)
        texts, text_lengths = batch.text
        class_scores = model(texts, text_lengths)

        # forward loss, backprop, parameter update
        batch_loss = loss_function(class_scores, batch.label)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()

        # most likely class per doc, stored as plain ints
        predictions.extend(class_scores.argmax(dim=1).tolist())
        gold.extend(batch.label.tolist())

    # average the loss across batches
    average_loss = running_loss / len(train_iter)

    # print report on loss, precision, recall, and F1
    print(f'  TRAIN loss={average_loss:.3f}' 
          '   precision  recall  f1-score  support')
    print_metrics(predictions, gold)

    return average_loss

def get_valid_loss(model, valid_iter, loss_function):
    """Do a forward pass over the validation data and get the loss
    Args:
        model: neural net model to evaluate
        valid_iter: iterator for the validation data
        loss_function: loss function to use
    Returns:
        The average per-batch loss for the validation data
    """
    # initialize loss to 0
    epoch_loss = 0

    # initialize lists for predictions and gold standard for metrics purposes
    y_pred = []
    y_true = []

    # set model to eval
    model.eval()

    # don't calculate gradients
    with torch.no_grad():
        for batch in valid_iter:
            # get texts and corresponding lengths
            texts, text_lengths = batch.text

            # get predictions and gold standard for batch; a single argmax
            # over the class dimension avoids one host sync per document
            class_scores = model(texts, text_lengths)
            y_pred.extend(class_scores.argmax(dim=1).tolist())
            y_true.extend(batch.label.tolist())

            # compute the loss
            batch_loss = loss_function(class_scores, batch.label)
            epoch_loss += batch_loss.item()
    
    # average the loss across the *validation* batches (not the training
    # iterator, which has a different number of batches)
    average_loss = epoch_loss / len(valid_iter)

    # print report on loss, precision, recall, and F1
    print(f'\n  VALID loss={average_loss:.3f}' 
          '   precision  recall  f1-score  support')
    print_metrics(y_pred, y_true)

    return average_loss

def train_model(model, train_iter, valid_iter, optimizer, loss_function, 
                num_epochs):
    """Train the model, checkpointing whenever validation loss improves
    Args:
        model: neural net model to train
        train_iter: iterator for the training data
        valid_iter: iterator for the validation data
        optimizer: optimizer to use
        loss_function: loss function to use
        num_epochs: number of epochs to train for
    """
    # any real loss beats infinity, so the first epoch is always saved
    lowest_valid_loss = float("inf")

    for epoch_number in range(1, num_epochs + 1):
        print('---------------------------------------------------------')
        print(f'EPOCH {epoch_number}\n')

        # one pass of training followed by one pass of validation
        get_train_loss(model, train_iter, optimizer, loss_function)
        current_valid_loss = get_valid_loss(model, valid_iter, loss_function)

        # keep the weights only when the validation loss went down
        if current_valid_loss < lowest_valid_loss:
            lowest_valid_loss = current_valid_loss
            torch.save(model.state_dict(), 'model_weights.pt')

# train the model for NUM_EPOCHS epochs; the weights with the lowest
# validation loss are written to model_weights.pt
train_model(model, train_iter, valid_iter, optimizer, loss_function, NUM_EPOCHS)

In [None]:
def evaluate(model, test_iter):
    """Evaluate the model on the test set and print a classification report
    Args:
        model: neural net model to predict with
        test_iter: iterator for the test data
    """
    # initialize lists for predictions and gold standard for metrics purposes
    y_pred = []
    y_true = []

    # set model to eval
    model.eval()

    # don't calculate gradients
    with torch.no_grad():
        for batch in test_iter:
            # get texts and corresponding lengths
            texts, text_lengths = batch.text

            # get predictions and gold standard for batch; a single argmax
            # over the class dimension avoids one host sync per document
            class_scores = model(texts, text_lengths)
            y_pred.extend(class_scores.argmax(dim=1).tolist())
            y_true.extend(batch.label.tolist())
    
    class_names = LABELS.vocab.itos

    # print the classification report for the data.
    # `labels` lists every class index so the report still works when a class
    # is absent from both the gold labels and the predictions
    print(metrics.classification_report(y_true, y_pred, 
                                        labels=list(range(len(class_names))),
                                        target_names=class_names,
                                        zero_division=0))

# load the saved model parameters from the same relative path that
# train_model() wrote them to; map_location keeps the load working when the
# checkpoint was saved on a different device than the current one
model.load_state_dict(torch.load('model_weights.pt', map_location=device))

# evaluate the model
evaluate(model, test_iter)