In [1]:
import torch
from torchtext.datasets import SST2

# Use GPU if available
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Split SST2 dataset into training and evaluation using pre-built markers.
train_data = SST2(split="train")
eval_data = SST2(split="dev")


### Tokenize dataset 

In [2]:
from transformers import BertTokenizer

# Using a pre-built tokenizer from HuggingFace
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# SST2 comes as pairs of sentences and sentiment labels, split these up.
sentences = []
labels = []
for text, label in train_data:
    sentences.append(text)
    labels.append(label)

# Apply tokenizer to split sentences into tokens, most commonly individual words.
tokenized_texts = [tokenizer.tokenize(sentence) for sentence in sentences]

# Set a max length for tensor input into model.
MAX_LEN = 128
# Convert tokens into numeric values "input_ids".
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
# Create fill the first values of the input tensor with the word values.
input_ids = torch.tensor(
    [ids[:MAX_LEN] + [0] * (MAX_LEN - len(ids)) for ids in input_ids])

# Create attention mask to distinguish word values from blank padding values, 
# in the case that sentence is shorter than 128 tokens.  
attention_masks = torch.tensor(
    [[1 if token_id > 0 else 0 for token_id in ids] for ids in input_ids])

# Convert labels to a tensor for ingestion into model as well.
labels = torch.tensor(labels)
len(input_ids)

  from .autonotebook import tqdm as notebook_tqdm


67349

In [13]:
len(tokenizer)

30522

### Repeat process for evaluation dataset

In [3]:
eval_sentences = []
eval_labels = []

for text, label in eval_data:
    eval_sentences.append(text)
    eval_labels.append(label)

eval_tokenized_texts = [tokenizer.tokenize(
    sentence) for sentence in eval_sentences]

MAX_LEN = 128
eval_input_ids = [tokenizer.convert_tokens_to_ids(
    x) for x in eval_tokenized_texts]
eval_input_ids = torch.tensor(
    [ids[:MAX_LEN] + [0] * (MAX_LEN - len(ids)) for ids in eval_input_ids])

eval_attention_masks = torch.tensor(
    [[1 if token_id > 0 else 0 for token_id in ids] for ids in eval_input_ids])
eval_labels = torch.tensor(eval_labels)

len(eval_input_ids)

872

### Create dataloaders to feed data to model in batches

In [4]:
from torch.utils.data import TensorDataset, DataLoader

# PyTorch's TensorDataset combines all relavent items into a single tensor.
train_dataset = TensorDataset(input_ids, attention_masks, labels)
eval_dataset = TensorDataset(eval_input_ids, eval_attention_masks, eval_labels)

BATCH_SIZE = 16

# A dataloader feeds the data from the TensorDataset into the model in batches.
train_dataloader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=True)

### Define neural network (NN) architecture

In [9]:
from torch import nn

'''class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, attention_mask):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.lstm(embedded)
        masked_output = output * attention_mask.unsqueeze(-1)
        attention_weights = torch.softmax(masked_output, dim=1)
        attention_output = torch.sum(attention_weights * output, dim=1)
        output = self.fc(attention_output)
        return output.squeeze()'''


class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size=30522, embed_dim=512, hidden_dim=256, num_classes=1, num_lstm_layers=4, dropout_prob=0.2):
        super(TextClassificationModel, self).__init__()
        # Embedding layers convert word values into more complicated vectors.
        # This is a type of feature extraction or definition.
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # Long short-term memory layer. An abstracted layer that contains 
        # several gates that perform different tasks on sequential data.
        # Common in natural language processing tasks. 
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_lstm_layers, bidirectional=True, dropout=dropout_prob)
        
        # Fully connected layer, maps NN output thusfar to final classification.
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        
        # Initialize the NN with random weights and biases.
        self.init_weights()

    def init_weights(self):
        # Sets weights and biases of NN layers to random values to start.
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, attention_mask):
        # Defines the way data moves through the NN. Utilizes the attention mask
        # that was developed for each sentence in the tokenizing step.

        # Pass text through the embedding layer.
        embedded = self.embedding(text)
        
        # Pass text through lstm layers.
        output, (hidden, cell) = self.lstm(embedded)

        # Apply attention mask to output of embedding and lstm layers.
        # Output has dimension of 512, so unsqueeze to match this.
        masked_output = output * attention_mask.unsqueeze(-1)
        attention_output = torch.sum(masked_output * output, dim=1)

        # Pass attention weighted output through fc layer to produce final logits.
        output = self.fc(attention_output)
        return output.squeeze()

### Define model hyperparameters and initialize model, criterion, and optimizer

In [10]:
# Learning rate, the rate at which NN parameters are updated.
LR = 0.001
# The number of nodes in each embedded layer.
EMBED_DIM = 512
# The number of nodes in each hidden layer.
HIDDEN_DIM = 256
# The number of classes for final classification. Sentiment analysis is a 
# binary classification, so just 1 class.
NUM_CLASSES = 1

# Init model and send to GPU if possible.
model = TextClassificationModel()
model.to(DEVICE)

# Criterion, or loss function, measures how close the NN's predictions are to 
# to the labels.
criterion = nn.BCEWithLogitsLoss() .to(DEVICE)
# Optimizer updates the NN's weights and biases to reduce loss as calculated by 
# criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

### Function to compare model output to SST2 labels

In [7]:
def calculate_accuracy(output, label):
    # Squeeze output into range(0, 1), then round to either 0 or 1
    rounded_output = torch.round(torch.sigmoid(output))
    # Compare rounded output values to labels
    correct = (rounded_output == label).float()  # convert into float for division
    accuracy = correct.sum() / len(labels)
    return accuracy

### Training loop

In [11]:
num_epochs = 10
epoch_accuracy = []

# Run the model through the training dataset num_epoch times.
for epoch in range(num_epochs):
    # Place the model in training mode, allowing it to adjust parameters.
    model.train()
    total_accuracy = 0
    
    # Grab each batch from the dataloader.
    for index, (input_ids, attention_masks, labels) in enumerate(train_dataloader):
        # Send all data elements to GPU.
        input_ids = input_ids.to(DEVICE)
        attention_masks = attention_masks.to(DEVICE)
        labels = labels.to(DEVICE)

        # This clears the optimizers gradient values before adjusting parameters 
        # based on the loss performance of the current batch.
        optimizer.zero_grad()

        # Run the data through the model.
        output = model(input_ids, attention_masks)

        # Calculate loss.
        loss = criterion(output.squeeze(), labels.float())

        # Calculate accuracy (just for visual reporting)
        accuracy = calculate_accuracy(output, labels)

        # This aquires the relationship between the NN's parameters and the loss
        # that was calculated previously.
        loss.backward()
        
        # Adjust parameters to reduce loss, based on learning rate. 
        optimizer.step()

        # Accumulate accuracy calculation for this batch.
        total_accuracy += accuracy.item()

        # Print nice report for each batch's performance.
        print(f"Epoch {epoch+1}/{num_epochs} | Batch {index}/{len(train_dataloader)} | Accuracy: {accuracy.item()*100:.2f}%")

    # Track accuracy rating for each epoch.
    epoch_accuracy.append(f'{total_accuracy / len(train_dataloader)}')


for i in range(len(epoch_accuracy)):
    print(f'Accuracy for Epoch {i}: {epoch_accuracy[i]}')

Epoch 1/10 | Batch 0/4210 | Accuracy: 62.50%
Epoch 1/10 | Batch 1/4210 | Accuracy: 25.00%
Epoch 1/10 | Batch 2/4210 | Accuracy: 56.25%
Epoch 1/10 | Batch 3/4210 | Accuracy: 68.75%
Epoch 1/10 | Batch 4/4210 | Accuracy: 50.00%
Epoch 1/10 | Batch 5/4210 | Accuracy: 31.25%
Epoch 1/10 | Batch 6/4210 | Accuracy: 37.50%
Epoch 1/10 | Batch 7/4210 | Accuracy: 43.75%
Epoch 1/10 | Batch 8/4210 | Accuracy: 62.50%
Epoch 1/10 | Batch 9/4210 | Accuracy: 43.75%
Epoch 1/10 | Batch 10/4210 | Accuracy: 56.25%
Epoch 1/10 | Batch 11/4210 | Accuracy: 37.50%
Epoch 1/10 | Batch 12/4210 | Accuracy: 68.75%
Epoch 1/10 | Batch 13/4210 | Accuracy: 37.50%
Epoch 1/10 | Batch 14/4210 | Accuracy: 50.00%
Epoch 1/10 | Batch 15/4210 | Accuracy: 81.25%
Epoch 1/10 | Batch 16/4210 | Accuracy: 37.50%
Epoch 1/10 | Batch 17/4210 | Accuracy: 68.75%
Epoch 1/10 | Batch 18/4210 | Accuracy: 31.25%
Epoch 1/10 | Batch 19/4210 | Accuracy: 50.00%
Epoch 1/10 | Batch 20/4210 | Accuracy: 18.75%
Epoch 1/10 | Batch 21/4210 | Accuracy: 50.00

In [12]:
# Set model into evaluation mode, not allowing parameter changes.
model.eval()

# Track correct classifications for total dataset accuracy calculation.
corrects = 0
total = 0

for input_ids, attention_masks, labels in eval_dataloader:
    # Send all data elements to GPU. Call model on indidual batches.
    input_ids = input_ids.to(DEVICE)
    attention_masks = attention_masks.to(DEVICE)
    labels = labels.to(DEVICE)
    output = model(input_ids, attention_masks)

    # Round output logits to either 0 or 1 to indicate negative or positive sentiment.
    rounded_output = torch.round(torch.sigmoid(output))
    
    # Compare model output to labels and accumulate correct answers.
    corrects += (rounded_output == labels).sum().item()
    total += len(labels)

accuracy = corrects / total
accuracy

0.7763761467889908

In [13]:
# Set model into evaluation mode, not allowing parameter changes.
model.eval()

# Track correct classifications for total dataset accuracy calculation.
corrects = 0
total = 0

for input_ids, attention_masks, labels in train_dataloader:
    # Send all data elements to GPU. Call model on indidual batches.
    input_ids = input_ids.to(DEVICE)
    attention_masks = attention_masks.to(DEVICE)
    labels = labels.to(DEVICE)
    output = model(input_ids, attention_masks)

    # Round output logits to either 0 or 1 to indicate negative or positive sentiment.
    rounded_output = torch.round(torch.sigmoid(output))
    
    # Compare model output to labels and accumulate correct answers.
    corrects += (rounded_output == labels).sum().item()
    total += len(labels)

accuracy = corrects / total

corrects, total, accuracy

(64438, 67349, 0.9567773834800813)

In [14]:
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
formatted_num_params = "{:,}".format(num_params)
formatted_num_params

'21,935,617'

In [None]:
# Save model for use in application later.
torch.save(model.state_dict(), 'modelCheckpoint.pt')