# Library Setup

In [2]:
%pip install transformers torch datasets --quiet

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [25]:
from transformers import AutoTokenizer, T5EncoderModel, T5ForConditionalGeneration
from transformers import Trainer, TrainingArguments
import torch
import datasets
import numpy as np
import tqdm

# Retrieve Objects

## Grab Model

In [4]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small").to(DEVICE)

## Grab Data

In [5]:
text_dataset = datasets.load_dataset('imdb')

Found cached dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

# Prep Data for Pre-Training

## Split reviews to seq2seq examples

In [6]:
def make_sequences_for_training(text_inp, pt=False):
    max_blanks = 100
    blank_text_indicator = '<extra_id_'

    words = text_inp.split(' ')

    seq1 = []
    seq2 = []
    seq1_id = 0
    seq2_id = 0

    for ind, word in enumerate(words):
        if ind % 2 == 0:
            seq1.append(word)
            seq2.append(f'{blank_text_indicator}{seq1_id}>')
            seq2_id += 1
        else:
            seq1.append(f'{blank_text_indicator}{seq1_id}>')
            seq2.append(word)
            seq1_id += 1
    seq1 = ' '.join(seq1)
    seq2 = ' '.join(seq2)
    return seq1, seq2

In [7]:
example_text = text_dataset['unsupervised']['text'][0]
s1, s2 = make_sequences_for_training(example_text, pt=True)
input_ids = tokenizer(s1, return_tensors="pt").input_ids.to(DEVICE)
labels = tokenizer(s2, return_tensors="pt").input_ids.to(DEVICE)

with torch.no_grad():
    out = model(input_ids=input_ids, labels=labels)
print(out.loss)

tensor(14.5640, device='cuda:0')


## Apply mapping to HF dataset

In [8]:
def make_sequences(ex):
    s1, s2 = make_sequences_for_training(ex['text'], pt=False)

    input_ids = tokenizer(s1, truncation=True, padding='max_length').input_ids
    labels = tokenizer(s2, truncation=True, padding='max_length').input_ids
    ex['input_ids'] = input_ids
    ex['labels'] = labels
    
    return ex

In [16]:
train_dataset = text_dataset['test'].filter(lambda example, indice: indice % 10 == 0, with_indices=True)
train_dataset = train_dataset.map(make_sequences, remove_columns=['text', 'label'])
train_dataset

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 2500
})

In [17]:
eval_dataset = text_dataset['test'].filter(lambda example, indice: indice % 500 == 0, with_indices=True)
eval_dataset = eval_dataset.map(make_sequences, remove_columns=['text', 'label'])
eval_dataset

Loading cached processed dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-7f5fa1573d4cd14e.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-12556177b804ff41.arrow


Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 50
})

# Training

In [19]:
training_args = TrainingArguments(
    output_dir='./flan-pretraining',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [20]:
trainer.train()

***** Running training *****
  Num examples = 2500
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1875
  Number of trainable parameters = 76961152


Epoch,Training Loss,Validation Loss
1,5.4225,2.338721
2,2.3594,1.984856
3,2.0486,1.937346


***** Running Evaluation *****
  Num examples = 50
  Batch size = 4
Saving model checkpoint to ./flan-pretraining/checkpoint-625
Configuration saved in ./flan-pretraining/checkpoint-625/config.json
Configuration saved in ./flan-pretraining/checkpoint-625/generation_config.json
Model weights saved in ./flan-pretraining/checkpoint-625/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 50
  Batch size = 4
Saving model checkpoint to ./flan-pretraining/checkpoint-1250
Configuration saved in ./flan-pretraining/checkpoint-1250/config.json
Configuration saved in ./flan-pretraining/checkpoint-1250/generation_config.json
Model weights saved in ./flan-pretraining/checkpoint-1250/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 50
  Batch size = 4
Saving model checkpoint to ./flan-pretraining/checkpoint-1875
Configuration saved in ./flan-pretraining/checkpoint-1875/config.json
Configuration saved in ./flan-pretraining/checkpoint-1875/generation_config.json
Model wei

TrainOutput(global_step=1875, training_loss=3.01926591796875, metrics={'train_runtime': 755.6706, 'train_samples_per_second': 9.925, 'train_steps_per_second': 2.481, 'total_flos': 1394178785280000.0, 'train_loss': 3.01926591796875, 'epoch': 3.0})

# Train on Ephemeral SageMaker Instance