In [None]:
!pip install transformers datasets
!pip install pandas
!pip install torch
!pip install tqdm

## Setup
1. Create a virtual environment and install the required packages (e.g. with pipenv). Alternatively, run the code cell above, which installs the required packages with pip.

**To-Do**
- Add an [Open in Colab](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb#scrollTo=8QAWNjizy_3O) badge to this notebook.

**Notes**
- Attention: which words are important for the decoder to focus on at a specific timestep?
    - Q = Query
    - K = Key
    - V = Value
- Self-attention: attention in which Q, K, and V are all derived from the same sentence, so every word attends to the other words of that sentence.
- Multi-head self-attention: several self-attention "heads" computed independently and in parallel, then combined (lets the transformer attend to information from different representation subspaces at different positions).

**Sources**
-  [Sentiment Analysis Text Classification Tutorial](https://www.youtube.com/watch?v=8N-nM3QW7O0)
- [Using Catalyst for Training Organization](https://github.com/catalyst-team/catalyst)



In [18]:
# Constants and imports
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer
import torch
# TODO: env file
# Use the first CUDA GPU when torch can see one; otherwise fall back to the CPU
# (e.g. on macOS, which is incompatible with NVIDIA GPUs).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

PRETRAINED_MODEL_NAME = 'distilbert-base-cased'  # Casing can matter ("bad" vs. "BAD")
RANDOM_SEED = 42
# Seed torch's global RNG so that random operations (dropout, weight
# initialisation, shuffling) are repeatable across runs.
torch.manual_seed(RANDOM_SEED)
MAX_LEN = 160
BATCH_SIZE = 16
EPOCHS = 3

In [58]:
from datasets import load_dataset
from transformers import DistilBertTokenizerFast

# Load, preprocess, and encode data
N_TRAIN_EXAMPLES = 30  # only a small slice of the training split is used here

dataset = load_dataset('imdb')
train = dataset['train'].select(range(N_TRAIN_EXAMPLES))
tokenizer = DistilBertTokenizerFast.from_pretrained(PRETRAINED_MODEL_NAME)


def tokenize_batch(batch):
    """Tokenize a batch of rows into fixed-length ``MAX_LEN`` sequences.

    Every example is padded/truncated to the same length. Padding only to the
    longest example of each ``map`` batch (``padding=True``) would give rows
    of different widths as soon as the data spans more than one map batch,
    and such rows cannot be stacked into a single tensor later on.
    """
    return tokenizer(
        batch['text'],
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN,
    )


encoded_train_dataset = train.map(tokenize_batch, batched=True)
encoded_train_dataset.set_format(type='torch', output_all_columns=True)
encoded_train_dataset

Reusing dataset imdb (/Users/danielpham/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)
Loading cached processed dataset at /Users/danielpham/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3/cache-f3f65001ae7f72b9.arrow


Dataset({
    features: ['attention_mask', 'input_ids', 'label', 'text'],
    num_rows: 30
})

In [59]:
from transformers import DistilBertModel
from torch import nn, optim

# Generic tester sentiment classifier
class SentimentClassifier(nn.Module):
    """DistilBERT encoder followed by dropout and a linear classification head.

    Args:
        n_classes: Number of output classes (width of the final linear layer).
    """

    def __init__(self, n_classes: int):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(PRETRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)  # Linear classification head
        # Kept as an attribute for callers that want probabilities
        # (``model.softmax(logits)``); deliberately NOT applied in ``forward``.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Compute unnormalised class scores (logits) for a batch.

        Args:
            input_ids: Token ids, passed straight to the encoder.
            attention_mask: Attention mask, passed straight to the encoder.

        Returns:
            The output of the linear head applied to the first token's final
            hidden state: one row of ``n_classes`` raw scores per example.
            The scores are logits, which is what ``nn.CrossEntropyLoss``
            expects; feeding it softmax probabilities would squash the
            gradients. ``argmax`` over logits and over probabilities agree.
        """
        encoder_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask)
        # DistilBERT has no pooler, so the encoder yields no ``pooled_output``
        # to unpack. Element 0 is the last hidden state both for tuple outputs
        # and for ``ModelOutput`` objects.
        last_hidden_state = encoder_output[0]
        # Use the hidden state of the first token as the sentence summary.
        first_token_state = last_hidden_state[:, 0]
        output = self.drop(first_token_state)
        return self.out(output)


In [66]:
# Training
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader

# NOTE(review): 5 output classes looks aimed at 5-star review data; the IMDB
# 'label' column used for training here is presumably binary - confirm.
# The model is moved to `device` before the optimizer is built so that the
# optimised parameters already live where training will happen.
model = SentimentClassifier(5).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
dataloader = DataLoader(encoded_train_dataset, batch_size=BATCH_SIZE, num_workers=4)

# The scheduler is stepped once per batch, so the schedule has to span the
# number of batches per epoch (len(dataloader)), not the number of examples.
total_training_steps = len(dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

loss_fn = nn.CrossEntropyLoss()



In [22]:
# Load the 'en' configuration of the 'amazon_reviews_multi' dataset and show
# the resulting splits as the cell output.
reviews = load_dataset(path='amazon_reviews_multi', name='en')
reviews

Reusing dataset amazon_reviews_multi (/Users/danielpham/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/f3357bd271e187385a38574fe31b8fb10055303f67fa9fce55e84d08c4870efd)


DatasetDict({
    train: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
})

In [70]:
from typing import Tuple
def run_epoch(model, dataloader, loss_fn, optimizer, device, scheduler, n_examples) -> Tuple[float, float]:
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning one row of class scores per example.
        dataloader: Iterable of batches; each batch is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` holding
            tensors.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss.
        optimizer: Optimizer stepped once per batch.
        device: Device the model and every batch are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Number of examples seen in the epoch; the denominator of
            the accuracy.

    Returns:
        ``(accuracy, mean_loss)`` as Python floats. ``accuracy`` is ``0.0``
        when ``n_examples`` is 0 and ``mean_loss`` is ``nan`` when the
        dataloader yields no batches.
    """
    # Moving is a no-op when the model already lives on `device`; it guards
    # against a model/batch device mismatch.
    model = model.to(device).train()
    losses = []
    correct_predictions = 0

    for data in dataloader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        preds = outputs.argmax(dim=1)
        loss = loss_fn(outputs, labels)

        correct_predictions += (preds == labels).sum().item()
        losses.append(loss.item())

        loss.backward()
        # Clip the gradient norm to keep single updates from exploding.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()  # clear out gradients

    accuracy = correct_predictions / n_examples if n_examples else 0.0
    mean_loss = sum(losses) / len(losses) if losses else float('nan')
    return accuracy, mean_loss

def eval_model(model, dataloader, loss_fn, device, n_examples):
    """Evaluate ``model`` on every batch of ``dataloader`` without training.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning one row of class scores per example.
        dataloader: Iterable of batches; each batch is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` holding
            tensors.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss.
        device: Device the model and every batch are moved to.
        n_examples: Number of examples evaluated; the denominator of the
            accuracy.

    Returns:
        ``(accuracy, mean_loss)`` as Python floats. ``accuracy`` is ``0.0``
        when ``n_examples`` is 0 and ``mean_loss`` is ``nan`` when the
        dataloader yields no batches.
    """
    # Moving is a no-op when the model already lives on `device`.
    model = model.to(device).eval()
    losses = []
    correct_predictions = 0

    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for data in dataloader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            preds = outputs.argmax(dim=1)
            loss = loss_fn(outputs, labels)

            correct_predictions += (preds == labels).sum().item()
            losses.append(loss.item())

    accuracy = correct_predictions / n_examples if n_examples else 0.0
    mean_loss = sum(losses) / len(losses) if losses else float('nan')
    return accuracy, mean_loss

        