# LAB 9: Sentiment analysis using Deep Learning

In [1]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

tqdm.pandas()

### Set-up

In [2]:
import time
from collections import Counter

import torch
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.data import get_tokenizer
from torchtext.vocab import Vocab, build_vocab_from_iterator

Connect to the GPU (training RNNs without a GPU is veeery slow)

In [3]:
device = torch.device("cuda")
torch.cuda.get_device_name(0)

'NVIDIA Tesla T4'

Load data

In [4]:
df = pd.read_parquet("s3://ling583/sentiment.parquet", storage_options={"anon": True})

In [6]:
# split train/test set 
train, test = train_test_split(
    df, test_size=0.2, stratify=df["sentiment"], random_state=619
)

### Training loop

Because every problem and every model is a little bit different, pytorch (unlike scikit-learn) doesn't have built-in `fit` and `predict` methods. We need to define them ourselves here.

This function gathers up a batch of training examples, encodes them, and sends them to the GPU for processing.

In [7]:
# take review, process, and transfer to GPU processing
def collate_batch(batch):
    labels, texts = zip(*batch)
    texts = [
        [vocab[token] for token in ["<s>"] + tokenize(t) + ["</s>"]] for t in texts
    ]
    texts = [torch.tensor(t, dtype=torch.int64) for t in texts]
    texts = pad_sequence(texts, padding_value=vocab["<pad>"])
    labels = torch.tensor([label_vocab[l] for l in labels], dtype=torch.int64)
    return labels.to(device), texts.to(device)

This one applies the model to some test data, for evaluation. 

In [8]:
def decision_function(dataloader):
    model.eval()
    with torch.no_grad():
        with torch.cuda.amp.autocast(enabled=True):
            results = []
            for _, text in dataloader:
                results.extend(model(text))
    return results


def predict(dataloader):
    predicted = decision_function(dataloader)
    return [label_vocab.itos[p.argmax()] for p in predicted]

And this is the important part: the function that actually trains the model! 

In [9]:
def fit(epochs=5, batch_size=64, wd=None, clip=None):
    criterion = torch.nn.CrossEntropyLoss()
    if wd:
        optimizer = torch.optim.AdamW(model.parameters(), weight_decay=wd)
    else:
        optimizer = torch.optim.Adam(model.parameters())
    scaler = torch.cuda.amp.GradScaler(enabled=True)

    t, v = train_test_split(train, test_size=0.1, stratify=train["sentiment"])
    train_dataset = list(zip(t["sentiment"], t["text"]))
    valid_dataset = list(zip(v["sentiment"], v["text"]))

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_batch,
    )
    valid_dataloader = DataLoader(
        valid_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_batch,
    )

    for epoch in range(1, epochs + 1):

        start = time.time()

        model.train()
        correct = 0
        n = 0
        for label, text in tqdm(train_dataloader):
            optimizer.zero_grad(set_to_none=True)
            with torch.cuda.amp.autocast(enabled=True):
                predicted = model(text)
                loss = criterion(predicted, label)
                correct += (predicted.argmax(1) == label).sum().item()
                n += len(label)
            scaler.scale(loss).backward()
            if clip:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            scaler.step(optimizer)
            scaler.update()

        elapsed = time.time() - start
        train_acc = correct / n * 100.0
        valid_pred = predict(valid_dataloader)
        valid_acc = accuracy_score(v["sentiment"], valid_pred) * 100.0

        print(
            f"Epoch: {epoch:2d} Time: {elapsed:6.3f}s "
            f"Train acc: {train_acc:5.3f} Valid acc: {valid_acc:5.3f}"
        )

----

### Define the model class

Okay, most of what's above is mostly [boilerplate](https://en.wikipedia.org/wiki/Boilerplate_code). Now we'll define the specific model and hyperparameter settings that we're using for this task.

First, the model architecture. This is a basic [RNN](https://en.wikipedia.org/wiki/Recurrent_neural_network), using [GRU](https://en.wikipedia.org/wiki/Gated_recurrent_unit)s are the recurrent units.

The hyperparameters of interest are:

* `hidden_size`
* `embedding_size`
* `hidden_layers`
* `bidirectional`
* `dropout`

They control the ability of the model to learn details. Higher values for the first 3, plus setting `bidirectional` to `True`, increase the representational power of the model. That means it can learn more complex patterns and learn them more quickly. If these values are set too high, though, then the model can learn *too* well--it will simply memorize the training data and you'll get overfitting. The last value, dropout, helps control that. Higher values for `dropout` reduce the model's ability to learn and slow down training. The trick is finding a balance among all these settings that maximize learning while minimizing overfitting, which is unfortunately not easy to achieve.

In [10]:
class TextClassificationModel(nn.Module):
    def __init__(
        self,
        vocab,
        num_class,
        hidden_size,
        embedding_dim=128,
        hidden_layers=1,
        dropout=0.0,
        bidirectional=True,
    ):
        super(TextClassificationModel, self).__init__()
        if not vocab.vectors is None:
            self.embedding = nn.Embedding.from_pretrained(
                vocab.vectors, freeze=True, padding_idx=vocab["<pad>"]
            )
        else:
            self.embedding = nn.Embedding(
                len(vocab), embedding_dim, padding_idx=vocab["<pad>"]
            )

        self.rnn = nn.GRU(
            input_size=self.embedding.embedding_dim,
            hidden_size=hidden_size,
            num_layers=hidden_layers,
            bidirectional=bidirectional,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(dropout)
        out_size = hidden_size * hidden_layers
        if bidirectional:
            out_size = out_size * 2
        self.fc = nn.Linear(out_size, num_class)

    def forward(self, text, lengths=None):
        embedded = self.embedding(text)
        _, hidden = self.rnn(embedded)
        out = torch.cat(torch.unbind(hidden), axis=1)
        out = self.dropout(out)
        out = self.fc(out)
        return out

Next we set up the vocabulary (this is the step performed by `CountVectorizer` in scikit-learn) using a [basic tokenizer](https://pytorch.org/text/stable/data_utils.html#get-tokenizer) that comes with pytorch. 

The "specials" are vocabulary items that don't correspond to words but are used internally by the model:

* `<pad>` : For implementation reasons the documents in a batch all have to be the same length, so we add copies of the pseudo-word `<pad>` to the end of shorter reviews to make them as long as the longest one. 
* `<s>`, `</s>` : These mark the beginning and end of the reviews.
* `<unk>` : Unknown words (i.e., words which are used in the test data that didn't get seen in the training data) get replaced with `<unk>`

There's one adjustable parameter here: raising the value of `min_freq` removes low frequency lexical items (similar to `min_df` in scikit-learn). Increasing it usually doesn't improve the model.

In [11]:
tokenize = get_tokenizer("basic_english")
counter = Counter(concat(map(tokenize, tqdm(train["text"]))))
vocab = Vocab(
    counter,
    min_freq=1, # 1 means include everything
    specials=("<pad>", "<s>", "</s>", "<unk>"),
)
label_vocab = Vocab(Counter(train["sentiment"]), specials=[])

  0%|          | 0/40000 [00:00<?, ?it/s]

Now we instantiate the model:

In [12]:
model = TextClassificationModel(
    vocab,
    len(label_vocab),
    hidden_size=256,
    embedding_dim=128,
    hidden_layers=2,
    dropout=0.0,
    bidirectional=True,
).to(device)

And finally, we train! There are three important settings here:

* `epochs` : This is the number of passes over the training data that we make when fitting the model. A crude way to avoid overfitting is to reduce this, which stops before training before the model has converged.
* `batch_size` : This is the number of reviews that get processed at once during training. In general, increasing `batch_size` makes the program run faster (since it lets us take better advantage of the GPU) but may require more epochs to converge. Setting `batch_size` too high can overload the GPUs memory and lead to a crash. The effects of changing `batch_size` on the final results are hard to predict, but it can make a big difference.
* `wd` : This is the "[weight decay](https://www.fast.ai/2018/07/02/adam-weight-decay/)" parameter. Setting this to a value other than `None` regularizes the model and can reduce overfitting (similar to setting `alpha` for `SGDClassifier`).

In [13]:
fit(epochs=5, batch_size=64, wd=None)
# if acc started to go down, it's starting to be underfitting

  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  1 Time: 74.916s Train acc: 80.969 Valid acc: 87.300


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  2 Time: 73.924s Train acc: 90.478 Valid acc: 88.800


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  3 Time: 78.823s Train acc: 93.081 Valid acc: 89.725


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  4 Time: 77.448s Train acc: 95.389 Valid acc: 89.650


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  5 Time: 75.946s Train acc: 97.267 Valid acc: 89.775


In [14]:
# accuracy of the train set is increasing each time 
# accuracy of the validation set is also increasing each time 
test_dataset = list(zip(test["sentiment"], test["text"]))
test_dataloader = DataLoader(
    test_dataset, batch_size=128, shuffle=False, collate_fn=collate_batch
)
test_predicted = predict(test_dataloader)
acc = 100 * accuracy_score(test["sentiment"], test_predicted)
f1 = 100 * f1_score(test["sentiment"], test_predicted, average="macro")
print(f"Accuracy = {acc:.3f} F1 = {f1:.3f}")

Accuracy = 90.010 F1 = 87.104


The accuracy and F1 score here is really good. But we will try different parameters to see. 

In [15]:
model = TextClassificationModel(
    vocab,
    len(label_vocab),
    hidden_size=256,
    embedding_dim=128,
    hidden_layers=2,
    dropout=0.5, # change 0.0 to 0.5 = after each batch, half of the network will turn off , only half of the model get adjusted, different half each time  
    bidirectional=True,
).to(device)

In [16]:
fit(epochs=5, batch_size=64, wd=None)

  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  1 Time: 90.326s Train acc: 80.433 Valid acc: 88.825


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  2 Time: 90.334s Train acc: 90.053 Valid acc: 88.850


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  3 Time: 92.697s Train acc: 92.608 Valid acc: 89.625


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  4 Time: 90.942s Train acc: 94.561 Valid acc: 89.200


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  5 Time: 90.266s Train acc: 96.375 Valid acc: 89.575


In [17]:
test_dataset = list(zip(test["sentiment"], test["text"]))
test_dataloader = DataLoader(
    test_dataset, batch_size=128, shuffle=False, collate_fn=collate_batch
)
test_predicted = predict(test_dataloader)
acc = 100 * accuracy_score(test["sentiment"], test_predicted)
f1 = 100 * f1_score(test["sentiment"], test_predicted, average="macro")
print(f"Accuracy = {acc:.3f} F1 = {f1:.3f}")

Accuracy = 90.410 F1 = 87.639


In [19]:
# rerun the model to restart the epic accuracy 
model = TextClassificationModel(
    vocab,
    len(label_vocab),
    hidden_size=256,
    embedding_dim=128,
    hidden_layers=2,
    dropout=0.5, 
    bidirectional=True,
).to(device)

In [20]:
# if the number start to drop out after a certain epic, we can change the number of epochs to that number to avoid getting lower accuracy score 
fit(epochs=3, batch_size=64, wd=None)

  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  1 Time: 89.223s Train acc: 79.044 Valid acc: 87.125


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  2 Time: 91.005s Train acc: 89.494 Valid acc: 90.400


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  3 Time: 89.895s Train acc: 92.339 Valid acc: 90.275


In [23]:
test_dataset = list(zip(test["sentiment"], test["text"]))
test_dataloader = DataLoader(
    test_dataset, batch_size=128, shuffle=False, collate_fn=collate_batch
)
test_predicted = predict(test_dataloader)
acc = 100 * accuracy_score(test["sentiment"], test_predicted)
f1 = 100 * f1_score(test["sentiment"], test_predicted, average="macro")
print(f"Accuracy = {acc:.3f} F1 = {f1:.3f}")

Accuracy = 90.860 F1 = 87.906


### Run the Model using glove 

In [22]:
tokenize = get_tokenizer("basic_english")
counter = Counter(concat(map(tokenize, tqdm(train["text"]))))
vocab = Vocab(
    counter,
    min_freq=1, # 1 means include everything
    specials=("<pad>", "<s>", "</s>", "<unk>"),
)
label_vocab = Vocab(Counter(train["sentiment"]), specials=[])

  0%|          | 0/40000 [00:00<?, ?it/s]

In [21]:
vocab.load_vectors(
    "glove.6B.200d", unk_init=lambda t: torch.nn.init.uniform_(t, -1.0, 1.0)
)

.vector_cache/glove.6B.zip: 862MB [02:43, 5.28MB/s]                               
100%|█████████▉| 399999/400000 [00:21<00:00, 19010.91it/s]


Now we instantiate the model:

In [24]:
model = TextClassificationModel(
    vocab,
    len(label_vocab),
    hidden_size=256,
    embedding_dim=128,
    hidden_layers=2,
    dropout=0.5, 
    bidirectional=True,
).to(device)

In [25]:
fit(epochs=3, batch_size=64, wd=None)

  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  1 Time: 89.936s Train acc: 79.903 Valid acc: 88.875


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  2 Time: 94.186s Train acc: 89.958 Valid acc: 90.325


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  3 Time: 92.789s Train acc: 92.075 Valid acc: 90.925


In [29]:
test_dataset = list(zip(test["sentiment"], test["text"]))
test_dataloader = DataLoader(
    test_dataset, batch_size=128, shuffle=False, collate_fn=collate_batch
)
test_predicted = predict(test_dataloader)
acc = 100 * accuracy_score(test["sentiment"], test_predicted)
f1 = 100 * f1_score(test["sentiment"], test_predicted, average="macro")
print(f"Accuracy = {acc:.3f} F1 = {f1:.3f}")

Accuracy = 90.000 F1 = 87.569


In [27]:
fit(epochs=2, batch_size=64, wd=None) # add 2 more epoch to see what a 5 epoch would look like

  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  1 Time: 91.334s Train acc: 93.800 Valid acc: 93.700


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  2 Time: 90.889s Train acc: 95.528 Valid acc: 93.475


In [30]:
test_dataset = list(zip(test["sentiment"], test["text"]))
test_dataloader = DataLoader(
    test_dataset, batch_size=128, shuffle=False, collate_fn=collate_batch
)
test_predicted = predict(test_dataloader)
acc = 100 * accuracy_score(test["sentiment"], test_predicted)
f1 = 100 * f1_score(test["sentiment"], test_predicted, average="macro")
print(f"Accuracy = {acc:.3f} F1 = {f1:.3f}")

Accuracy = 90.000 F1 = 87.569
