# LAB 9: Sentiment analysis using Deep Learning

In [1]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

# Register tqdm's pandas integration (adds `progress_apply`-style methods to
# pandas objects). None of the cells visible in this notebook call them.
tqdm.pandas()

### Set-up

In [2]:
import time
from collections import Counter

import torch
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader
from torchtext.vocab import Vocab

Connect to the GPU (fine-tuning a transformer without a GPU is very slow)

In [3]:
# Prefer the GPU, but fall back to the CPU instead of raising when no CUDA
# device is present (training will then be very slow).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Last expression of the cell: show which device was selected.
torch.cuda.get_device_name(0) if device.type == "cuda" else "cpu (no CUDA device found)"

'NVIDIA Tesla T4'

Load data

In [4]:
# Load the sentiment dataset from S3; `anon=True` requests unauthenticated
# access. Later cells read its "sentiment" (label) and "text" columns.
df = pd.read_parquet("s3://ling583/sentiment.parquet", storage_options={"anon": True})

In [5]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits, and the fixed `random_state`
# makes the split reproducible.
train, test = train_test_split(
    df, test_size=0.2, stratify=df["sentiment"], random_state=619
)

### Training loop

This training loop is very similar to the one we used in the previous notebook, but with small changes to work with Hugging Face models.

In [6]:
def collate_batch(batch):
    """Convert a list of ``(label, text)`` pairs into tensors for the model.

    Relies on the notebook-level ``tokenizer`` and ``label_vocab``.

    Returns
    -------
    tuple
        ``(labels, input_ids, attention_mask)`` where ``labels`` is an int64
        tensor of label indices and the other two entries are taken from the
        tokenizer output (requested with truncation, padding and PyTorch
        tensors).
    """
    raw_labels, raw_texts = zip(*batch)
    encoded = tokenizer(
        list(raw_texts), truncation=True, padding=True, return_tensors="pt"
    )
    # Map each label to its integer index in the label vocabulary.
    label_ids = [label_vocab[name] for name in raw_labels]
    return (
        torch.tensor(label_ids, dtype=torch.int64),
        encoded["input_ids"],
        encoded["attention_mask"],
    )

In [7]:
def decision_function(dataloader):
    """Run the notebook-level ``model`` over ``dataloader`` and collect logits.

    The model is switched to eval mode, and inference runs without gradient
    tracking under CUDA autocast. Labels yielded by the dataloader are ignored.

    Returns
    -------
    list
        One entry per example: the rows of each batch's ``logits`` tensor,
        in dataloader order.
    """
    model.eval()
    scores = []
    with torch.no_grad(), torch.cuda.amp.autocast(enabled=True):
        for _labels, ids, mask in dataloader:
            logits = model(ids.to(device), mask.to(device)).logits
            # Extending with a 2-D tensor appends its rows one by one.
            scores.extend(logits)
    return scores


def predict(dataloader):
    """Return the predicted label for every example in ``dataloader``.

    Each example's label is looked up in ``label_vocab.itos`` at the index of
    its highest score from ``decision_function``.
    """
    predicted_labels = []
    for scores in decision_function(dataloader):
        best_index = scores.argmax()
        predicted_labels.append(label_vocab.itos[best_index])
    return predicted_labels

In [8]:
def fit(
    epochs=5,
    batch_size=64,
    wd=None,
):
    """Fine-tune the notebook-level ``model`` on the ``train`` split.

    Optimises with AdamW (lr=5e-5) under CUDA automatic mixed precision with
    a gradient scaler. A new optimizer is created on every call, so optimizer
    state is not carried over between successive ``fit`` calls.

    Parameters
    ----------
    epochs : int
        Number of full passes over the training data.
    batch_size : int
        Number of examples per optimisation step.
    wd : float or None
        Weight decay passed to AdamW. ``None`` keeps the optimizer's default.

    Prints the wall-clock time of each epoch.
    """
    optimizer_kwargs = {"lr": 5e-5}
    if wd is not None:
        # Only override the library default when the caller asked for it.
        optimizer_kwargs["weight_decay"] = wd
    optimizer = torch.optim.AdamW(model.parameters(), **optimizer_kwargs)
    scaler = torch.cuda.amp.GradScaler(enabled=True)

    train_dataset = list(zip(train["sentiment"], train["text"]))
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_batch,
    )

    for epoch in range(1, epochs + 1):

        start = time.time()

        model.train()
        for labels, input_ids, attention_mask in tqdm(train_dataloader):
            optimizer.zero_grad()
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            with torch.cuda.amp.autocast(enabled=True):
                outputs = model(
                    input_ids=input_ids, attention_mask=attention_mask, labels=labels
                )
                # With `labels` supplied, the first output element is the loss.
                loss = outputs[0]
            # Scale the loss before backward; the scaler then steps the
            # optimizer and updates its scale factor.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        # Report timing once per epoch (inside the epoch loop).
        elapsed = time.time() - start
        print(f"Epoch: {epoch:2d} Time: {elapsed:6.2f}s")

### Instantiate model

When using a pre-trained model, most of the training choices are already made for us, which makes things a lot easier! We'll use [DistilBERT (`distilbert-base-uncased`)](https://huggingface.co/distilbert-base-uncased).

In [9]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast

In [10]:
# Tokenizer for the same "distilbert-base-uncased" checkpoint that the model
# is loaded from below.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# Label <-> index mapping built from the training labels. `specials=[]` adds
# no special tokens, so the vocabulary holds only sentiment labels.
label_vocab = Vocab(Counter(train["sentiment"]), specials=[])

In [11]:
# Load pre-trained DistilBERT with a classification head sized to the number
# of labels, then move it to the selected device.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(label_vocab)
).to(device)

# First fine-tuning epoch.
fit(epochs=1, batch_size=16)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

  0%|          | 0/2500 [00:00<?, ?it/s]

Epoch:  1 Time: 779.82s


In [12]:
# Build the evaluation loader once; later cells reuse `test_dataloader`.
# `shuffle=False` keeps predictions aligned with the rows of `test`.
test_dataset = list(zip(test["sentiment"], test["text"]))
test_dataloader = DataLoader(
    test_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch
)
# Evaluate after epoch 1: accuracy and macro-averaged F1, as percentages.
test_predicted = predict(tqdm(test_dataloader))
acc = 100 * accuracy_score(test["sentiment"], test_predicted)
f1 = 100 * f1_score(test["sentiment"], test_predicted, average="macro")
print(f"Accuracy = {acc:.3f} F1 = {f1:.3f}")

  0%|          | 0/2500 [00:00<?, ?it/s]

Accuracy = 91.780 F1 = 89.718


In [13]:
# Save the model and tokenizer after the first epoch to the "distilbert.1"
# directory. The tokenizer call's return value is the cell output.
model.save_pretrained("distilbert.1")
tokenizer.save_pretrained("distilbert.1")

('distilbert.1/tokenizer_config.json',
 'distilbert.1/special_tokens_map.json',
 'distilbert.1/vocab.txt',
 'distilbert.1/added_tokens.json')

In [14]:
# Second epoch: continue fine-tuning the same model (fit() builds a new
# optimizer on every call).
fit(epochs=1, batch_size=16)

  0%|          | 0/2500 [00:00<?, ?it/s]

Epoch:  1 Time: 795.12s


In [15]:
# Evaluate after epoch 2: accuracy and macro-averaged F1, as percentages.
test_predicted = predict(tqdm(test_dataloader))
acc = 100 * accuracy_score(test["sentiment"], test_predicted)
f1 = 100 * f1_score(test["sentiment"], test_predicted, average="macro")
print(f"Accuracy = {acc:.3f} F1 = {f1:.3f}")

  0%|          | 0/2500 [00:00<?, ?it/s]

Accuracy = 91.970 F1 = 89.732


In [16]:
# Save the model and tokenizer after the second epoch to the "distilbert.2"
# directory. The tokenizer call's return value is the cell output.
model.save_pretrained("distilbert.2")
tokenizer.save_pretrained("distilbert.2")

('distilbert.2/tokenizer_config.json',
 'distilbert.2/special_tokens_map.json',
 'distilbert.2/vocab.txt',
 'distilbert.2/added_tokens.json')

In [17]:
# Third epoch: continue fine-tuning the same model (fit() builds a new
# optimizer on every call).
fit(epochs=1, batch_size=16)

  0%|          | 0/2500 [00:00<?, ?it/s]

Epoch:  1 Time: 776.41s


In [18]:
# Evaluate after epoch 3: accuracy and macro-averaged F1, as percentages.
test_predicted = predict(tqdm(test_dataloader))
acc = 100 * accuracy_score(test["sentiment"], test_predicted)
f1 = 100 * f1_score(test["sentiment"], test_predicted, average="macro")
print(f"Accuracy = {acc:.3f} F1 = {f1:.3f}")

  0%|          | 0/2500 [00:00<?, ?it/s]

Accuracy = 91.790 F1 = 89.484


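At epoch 3, both accuracy (91.97 → 91.79) and macro-F1 (89.73 → 89.48) drop slightly compared with epoch 2. Thus, the best epoch is epoch 2, i.e. the checkpoint saved as `distilbert.2`.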