In [1]:
# Load Data
import torch
import torch.nn as nn
from torch import optim
import numpy as np
import random
from datasets import load_dataset
import gensim.downloader as api
import torchmetrics


  from .autonotebook import tqdm as notebook_tqdm


We want to use the whole dataset to do multilabel classification.

Multilabel classification is distinct from multiclass classification.

In multiclass classification, we want to choose one label from a set of possibilities.

In multilabel classification, we might have multiple 'true' labels for a single example.

A post can be toxic, sarcastic, and obscene.

# Getting the data

In [2]:
def form_input(ex, word_embeddings):
    """
    Turns a raw text example into a fixed-size feature vector by averaging word embeddings.

    The text is lowercased and split on whitespace. Every token is looked up in word_embeddings;
    tokens that are not in the embeddings dictionary contribute a zero vector to the average.

    :param ex: the example text (string)
    :param word_embeddings: mapping from token to numpy vector that raises KeyError for unknown
    tokens and exposes a vector_size attribute
    :return: a float tensor of size [word_embeddings.vector_size]; all zeros if ex has no tokens
    """
    # split() with no arguments already ignores leading/trailing whitespace and never yields ''
    tokens = ex.lower().split()

    if not tokens:
        # np.mean over an empty list is a nan scalar, which torch.from_numpy cannot convert.
        # Fall back to the zero vector, the same representation used for unknown words.
        return torch.zeros(word_embeddings.vector_size, dtype=torch.float)

    vecs = []
    for word in tokens:
        try:
            vec = word_embeddings[word]
        except KeyError:  # this token is not in our embeddings dictionary
            vec = np.zeros(word_embeddings.vector_size)
        vecs.append(vec)

    centroid = np.mean(vecs, axis=0)

    # we need torch form which is a tensor, not a numpy array
    return torch.from_numpy(centroid).float()


def form_outputs(examples):
    """
    Builds the multilabel target vector for every example.

    :param examples: dataset object providing select_columns(); iterating over the selection
    yields one mapping of label name to value per example
    :return: list of float tensors, one per example, holding that example's label values
    """
    label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    label_rows = examples.select_columns(label_columns)
    return [torch.tensor(list(row.values()), dtype=torch.float) for row in label_rows]

In [3]:
# Load the Jigsaw toxicity dataset through the Hugging Face `datasets` loader.
# data_dir is a relative path to a local copy of the challenge files, so this cell
# depends on the notebook's working directory.
dataset = load_dataset("jigsaw_toxicity_pred", data_dir='../data/jigsaw-toxic-comment-classification-challenge/')


Found cached dataset jigsaw_toxicity_pred (/Users/gabriellachronis/.cache/huggingface/datasets/jigsaw_toxicity_pred/default-e7673256ded46692/1.1.0/9cf096ac4341c35839bc8a9f6a19d93e18e5ad3d84cf05f690d2bc6f7384af85)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 214.52it/s]


In [4]:
# Sanity check: show the label vectors built for the first ten training examples.
form_outputs(dataset['train'])[:10]

[tensor([0., 0., 0., 0., 0., 0.]),
 tensor([0., 0., 0., 0., 0., 0.]),
 tensor([0., 0., 0., 0., 0., 0.]),
 tensor([0., 0., 0., 0., 0., 0.]),
 tensor([0., 0., 0., 0., 0., 0.]),
 tensor([0., 0., 0., 0., 0., 0.]),
 tensor([1., 1., 1., 0., 1., 0.]),
 tensor([0., 0., 0., 0., 0., 0.]),
 tensor([0., 0., 0., 0., 0., 0.]),
 tensor([0., 0., 0., 0., 0., 0.])]

In [5]:
# Download the pretrained embedding model and return it as an object ready for use.
# Every later cell that builds inputs or the model reads `word_embeddings`, so this cell
# must run first.
# Alternative model, kept for reference:
# word_embeddings = api.load("glove-twitter-25")
word_embeddings = api.load("glove-twitter-100")



# Building the Pytorch Model

Here is an example FFNN

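Here is our old model from binary classification, adapted to multilabel output (one sigmoid per label, trained with binary cross-entropy). The training loop is written in the lab section below.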

In [6]:
class FFNN(nn.Module):
    """
    Defines the core neural network for doing multilabel classification over a single datapoint at a time. This
    consists of matrix multiplication, ReLU nonlinearity, another matrix multiplication, and then an element-wise
    sigmoid, so every output is an independent probability that the corresponding label applies. The outputs do
    not sum to one: several labels (or none) can be predicted for the same example.

    The matching loss is stored on the module as self.loss (binary cross-entropy, nn.BCELoss), which expects
    probabilities, which is why forward() applies the sigmoid itself.

    The forward() function does the important computation. The backward() method is inherited from nn.Module and
    handles backpropagation.
    """
    def __init__(self, word_embeddings, inp, hid, out):
        """
        Constructs the computation graph by instantiating the various layers and initializing weights.

        :param word_embeddings: embeddings object handed to form_input() by predict(); it is not used by
        forward(), so it may be None when only forward() is called on ready-made vectors
        :param inp: size of input (integer)
        :param hid: size of hidden layer(integer)
        :param out: size of output (integer), which should be the number of labels
        """
        super().__init__()
        self.V = nn.Linear(inp, hid)
        self.g = nn.ReLU()
        self.W = nn.Linear(hid, out)

        self.sigmoid = nn.Sigmoid()

        # Initialize weights according to a formula due to Xavier Glorot.
        nn.init.xavier_uniform_(self.V.weight)
        nn.init.xavier_uniform_(self.W.weight)

        self.num_classes = out
        self.loss = nn.BCELoss()
        self.word_embeddings = word_embeddings

    def forward(self, x):
        """
        Runs the neural network on the given data and returns the probability of each label.

        :param x: a [inp]-sized tensor of input data
        :return: an [out]-sized tensor of probabilities, one per label, each strictly between 0 and 1
        """
        return self.sigmoid(self.W(self.g(self.V(x))))

    def predict(self, example) -> torch.Tensor:
        """
        Predicts label probabilities for one raw text example.

        :param example: the example text (string), converted to a vector with form_input()
        :return: an [out]-sized tensor of label probabilities, detached from the autograd graph
        """
        x = form_input(example, self.word_embeddings)
        # Inference only: skip building an autograd graph for every evaluated example.
        with torch.no_grad():
            return self.forward(x)
    
    

# Evaluate

We use AUROC as our primary evaluation metric. Accuracy is not a faithful metric for data as imbalanced as ours, but we still report on it. 

In [7]:
from torchmetrics.classification import MultilabelAUROC
from torchmetrics.classification import MultilabelAccuracy

def print_evaluation(golds, predictions):
    """
    Prints evaluation statistics comparing golds and predictions for multilabel classification: the macro-averaged
    AUROC over labels and the accuracy obtained by thresholding the predicted probabilities at 0.5.

    :param golds: gold labels, integer tensor of 0/1 values with shape [num_examples, num_labels]
    :param predictions: predicted probabilities, float tensor with the same shape as golds
    :return: tuple (metric, auroc) of the MultilabelAUROC metric object and the AUROC value it computed
    """
    # Take the number of labels from the data rather than hard-coding it, so the function
    # keeps working if the label set changes.
    num_labels = golds.shape[1]

    # torchmetrics takes (preds, target), i.e. the reverse of this function's argument order.
    accuracy = MultilabelAccuracy(num_labels=num_labels, threshold=0.5)
    acc = accuracy(predictions, golds)

    metric = MultilabelAUROC(num_labels=num_labels, average="macro", thresholds=None)
    auroc = metric(predictions, golds)

    output_str = ""
    output_str += ";\nAUROC: %f;\n" % auroc
    output_str += ";\nAccuracy: %f;\n" % acc
    print(output_str)
    return metric, auroc



# Lab: Coding the Training Loop

Your task will be to write the training loop for training the feed forward neural network.


The next cell constructs the training data and test data and sets hyperparameters. Feel free to alter hyperparameters to try improving the score!


In [8]:
# Pick the data splits.
# NOTE(review): dev and test are both the 'test' split, so scores reported on dev during
# training are also test scores.
train_exs = dataset['train']
print(train_exs)
dev_exs = dataset['test']
test_exs = dataset['test']
print(" / ".join(repr(len(split)) for split in (train_exs, dev_exs, test_exs)) + " train/dev/test examples")


# Training inputs are embedded once up front; dev inputs stay as raw text because
# FFNN.predict() embeds each example itself.
train_xs = [form_input(text, word_embeddings) for text in train_exs['comment_text']]
train_ys = form_outputs(train_exs)

print(train_ys[0])

dev_xs = dev_exs['comment_text']
dev_ys = form_outputs(dev_exs)

# Hyperparameters (feel free to tune)
num_epochs = 10
hidden_size = 200
initial_learning_rate = 0.001


Dataset({
    features: ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
    num_rows: 159571
})
159571 / 63978 / 63978 train/dev/test examples


NameError: name 'word_embeddings' is not defined

In [None]:
## training loop for toxicity classification

num_classes = 6

ffnn = FFNN(word_embeddings, word_embeddings.vector_size, hidden_size, num_classes)
optimizer = optim.Adam(ffnn.parameters(), lr=initial_learning_rate)

# The gold dev labels never change, so stack them once instead of once per epoch.
dev_golds = torch.stack(dev_ys).type(torch.int)

for epoch in range(0, num_epochs):
    # Visit the training examples in a fresh random order every epoch.
    ex_indices = list(range(len(train_xs)))
    random.shuffle(ex_indices)
    total_loss = 0.0

    ffnn.train()
    for idx in ex_indices:
        x = train_xs[idx]
        y = train_ys[idx]

        # Zero out the gradients from the FFNN object. *THIS IS VERY IMPORTANT TO DO BEFORE CALLING BACKWARD()*
        ffnn.zero_grad()
        y_hat = ffnn.forward(x)

        # Binary cross-entropy between the predicted label probabilities and the 0/1 gold labels
        loss = ffnn.loss(y_hat, y)
        # .item() extracts a plain float; adding the tensor itself would keep every step's
        # autograd history referenced for the whole epoch.
        total_loss += loss.item()

        # Computes the gradient and takes the optimizer step
        loss.backward()
        optimizer.step()
    print("Total loss on epoch: %f" % (total_loss))

    # Evaluate on the dev set; no gradients are needed for inference.
    ffnn.eval()
    with torch.no_grad():
        dev_y_hats = [ffnn.predict(ex) for ex in dev_xs]
    print_evaluation(dev_golds, torch.stack(dev_y_hats))



In [None]:
# Debug check: display the tensor type of the first gold dev label vector
# (the commented line does the same for the first prediction).
dev_ys[0].type()
#dev_y_hats[0].type()

# Improving the Model with Features

Does the model do better with the features that you used in the binary classification task?

In [None]:
"""

"""

# Appendix: Example Training Loop

Here is an example training loop for learning the XOR function

In [None]:
# NOTE: this cell reuses the names train_xs, train_ys, num_classes, num_epochs, ffnn,
# initial_learning_rate and optimizer, so running it replaces the toxicity data and model above.

# MAKE THE DATA
# Synthetic data for XOR: y = x1 XOR x2
train_xs = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
train_ys = np.array([0, 1, 1, 0], dtype=np.float32)

# Define some constants
# Inputs are of size 2
feat_vec_size = 2

# Let's use 4 hidden units
embedding_size = 4

# We're using 2 classes. What's presented here is multi-class code that can scale to more classes, though
# slightly more compact code for the binary case is possible.
num_classes = 2


# set hyperparameters
num_epochs = 100
# FFNN's first argument is the embedding table, which only predict() uses. The XOR inputs are
# already numeric vectors fed straight to forward(), so None is passed.
ffnn = FFNN(None, feat_vec_size, embedding_size, num_classes)
initial_learning_rate = 0.1
optimizer = optim.Adam(ffnn.parameters(), lr=initial_learning_rate)


# RUN TRAINING
for epoch in range(0, num_epochs):

    ex_indices = list(range(len(train_xs)))
    random.shuffle(ex_indices)
    total_loss = 0.0

    for idx in ex_indices:
        x = torch.from_numpy(train_xs[idx]).float()
        y = train_ys[idx]
        # Build one-hot representation of y. Instead of the label 0 or 1, y_onehot is either [1, 0] or [0, 1],
        # i.e. one 0/1 target per output unit.
        y_onehot = torch.zeros(num_classes)
        y_onehot[int(y)] = 1.0

        # Zero out the gradients from the FFNN object. *THIS IS VERY IMPORTANT TO DO BEFORE CALLING BACKWARD()*
        ffnn.zero_grad()
        probs = ffnn.forward(x)

        # FFNN outputs sigmoid probabilities (not log probabilities), so use the model's own
        # binary cross-entropy loss against the one-hot target.
        loss = ffnn.loss(probs, y_onehot)

        # .item() keeps the running total a plain float instead of a graph-attached tensor
        total_loss += loss.item()
        # Computes the gradient and takes the optimizer step
        loss.backward()
        optimizer.step()
    print("Total loss on epoch %i: %f" % (epoch, total_loss))
    

# Evaluate on the train set

train_correct = 0
with torch.no_grad():  # inference only
    for idx in range(0, len(train_xs)):
        # The XOR inputs are numeric arrays, not text, so convert them directly instead of
        # going through form_input (which expects a string and an embeddings object).
        x = torch.from_numpy(train_xs[idx]).float()
        y = train_ys[idx]
        probs = ffnn.forward(x)
        prediction = torch.argmax(probs)
        # Compare as plain Python ints rather than numpy float vs. tensor
        if int(y) == prediction.item():
            train_correct += 1
        print("Example " + repr(train_xs[idx]) + "; gold = " + repr(train_ys[idx]) + "; pred = " +\
              repr(prediction) + " with probs " + repr(probs))
print(repr(train_correct) + "/" + repr(len(train_ys)) + " correct after training")