## Import Libraries

In [8]:
import pickle
import torch
from transformers import AutoModelForSeq2SeqLM, get_scheduler, AutoTokenizer
import numpy as np
from torch.utils.data import DataLoader
from torch.optim import AdamW
from datasets import load_metric

Load the pickled train and test datasets that you created in the previous notebook.

In [2]:
cd ..

/home/eanthony/workspace/github-work/aidiv-sagemaker-examples


In [9]:
# Load the pickled train/test datasets from the current working directory.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load files that you produced yourself.
# The `with` block closes each file on exit, so no explicit close is needed.
with open('train_dataset.pkl', 'rb') as f:
    train_dataset = pickle.load(f)

with open('test_dataset.pkl', 'rb') as f:
    test_dataset = pickle.load(f)

We need to wrap each dataset in a torch `DataLoader` to handle batching. We also build a smaller loader over the first 1,000 training examples.

In [10]:
# Wrap the datasets for iteration. The train and test loaders yield one
# example per batch; the small loader covers only the first 1000 training
# examples and is built without an explicit batch_size.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=1)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=1)

first_1000_train = train_dataset.select(range(1000))
small_train_dataloader = DataLoader(dataset=first_1000_train)

## Load our model

In [11]:
# Load the 'google/flan-t5-small' sequence-to-sequence checkpoint.
checkpoint_name = 'google/flan-t5-small'
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)

Create an optimizer and a learning-rate scheduler for training.

In [12]:
# Training length: one scheduler step per batch, for every epoch.
NUM_EPOCHS = 3
NUM_TRAINING_STEPS = len(train_dataloader) * NUM_EPOCHS

# AdamW over all model parameters, paired with a 'linear' schedule that has
# no warmup steps and spans the whole run.
optimizer = AdamW(model.parameters(), lr=5e-5, capturable=False)
lr_scheduler = get_scheduler(
    'linear',
    optimizer,
    num_warmup_steps=0,
    num_training_steps=NUM_TRAINING_STEPS,
)

Finally, define the device to run on (a GPU, if you are using one) and move the model onto it.

In [7]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU so
# the notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

## TRAINING TIME

In [14]:
from tqdm.auto import tqdm

# A single progress bar for the whole run (NUM_TRAINING_STEPS total steps),
# advanced once per optimizer step.
progress_bar = tqdm(range(NUM_TRAINING_STEPS))
model.train()

for epoch in range(NUM_EPOCHS):
    for batch in train_dataloader:
        # Inputs must live on the same device as the model, otherwise the
        # forward pass raises a device-mismatch error.
        # NOTE(review): assumes every value in the batch is a tensor, as the
        # evaluation cell also does — confirm against the dataset format.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        # Clear gradients so they do not accumulate into the next step.
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/789747 [00:00<?, ?it/s]

  0%|          | 0/263249 [00:00<?, ?it/s]

## Evaluate

In [None]:
# Score the model with ROUGE.
# NOTE(review): this iterates `small_train_dataloader` (a subset of the
# training data), not `test_dataloader` — confirm that is intended.
metric = load_metric("rouge")
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-small')

model.eval()
for batch in tqdm(small_train_dataloader):
    batch = {a: b.to(device) for a, b in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    # NOTE(review): predictions are the per-position argmax of a single forward
    # pass over the batch, not the output of `model.generate` — confirm this is
    # the intended way to score summaries.
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    # -100 is the conventional "ignore" value for masked label positions and is
    # not a valid token id, so map it to the pad token before decoding. This is
    # a no-op when the labels contain no -100.
    labels = batch['labels']
    labels = torch.where(
        labels != -100,
        labels,
        torch.full_like(labels, tokenizer.pad_token_id),
    )

    decoded_predicts = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    metric.add_batch(predictions=decoded_predicts, references=decoded_labels)
metric.compute()

## Save Model

In [None]:
# Persist the model under the relative directory 'models/summarization-model'.
model.save_pretrained(save_directory='models/summarization-model')

## Use Accelerate and DeepSpeed to train your models faster!

If you have multiple GPUs and want to use distributed training, use the `training.py` script in this directory. It uses Hugging Face's Accelerate package to train models quickly with only a few lines of code changed.