# Large Language Models in limited hardware environments - training

W notebooku wytrenujesz model enkodera na przykładowym zbiorze danych (_banking77_) przy użyciu metod optymalizacyjnych z biblioteki 🤗 Accelerate.

Wykonaj zadania oznaczone jako _ToDo_, uzupełniając kod w miejscach oznaczonych jako `#FIXME`.

In [None]:
!pip install transformers==4.28.1 torch==1.13.0 accelerate==0.18.0 datasets==2.1.0 evaluate==0.4.0 deepspeed==0.8.3

In [None]:
import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed

from accelerate import Accelerator, DistributedType, notebook_launcher, DeepSpeedPlugin
from tqdm import tqdm

from typing import Dict

In [None]:
def get_dataloaders(accelerator: Accelerator, model_id: str, batch_size: int):
    """Build the training and evaluation dataloaders for the banking77 dataset.

    Exercise template: the ``#FIXME`` placeholder of ToDo 1 must be filled in
    before this function can run.

    Args:
        accelerator: Accelerator instance; not referenced by the code as it
            stands, it is only needed for the ToDo 1 placeholder.
        model_id: Checkpoint identifier used to load the tokenizer.
        batch_size: Batch size used for both dataloaders.

    Returns:
        A ``(train_dataloader, eval_dataloader)`` tuple built from the
        ``"train"`` and ``"test"`` splits respectively.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    datasets = load_dataset("banking77")

    def tokenize_function(examples):
        # Truncation is enabled; no explicit max_length is given (None).
        outputs = tokenizer(examples["text"], truncation=True, max_length=None)
        return outputs
    
    # ToDo 1: Tokenize the dataset using main_process_first() context manager.
    # https://huggingface.co/docs/accelerate/concept_guides/deferring_execution
    ## with #FIXME: 
        # The raw "text" column is dropped once it has been tokenized.
        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["text"],
        )
    
    # After the rename the target column is exposed under the key "labels".
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

    def collate_fn(examples):
        # Both knobs are left unset, so padding is driven only by padding="longest".
        pad_to_multiple_of = None
        max_length = None

        return tokenizer.pad(
            examples,
            padding="longest",
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors="pt",
        )

    # Training batches are shuffled and the final incomplete batch is dropped.
    train_dataloader = DataLoader(
        tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size, drop_last=True
    )
    # Evaluation keeps the dataset order and does not set drop_last.
    eval_dataloader = DataLoader(
        tokenized_datasets["test"],
        shuffle=False,
        collate_fn=collate_fn,
        batch_size=batch_size
    )

    return train_dataloader, eval_dataloader

Do uzupełnienia zadań _ToDo_ w kodzie wykorzystaj [Accelerator API](https://huggingface.co/docs/accelerate/package_reference/accelerator).

In [None]:
def train_with_accelerate(train_args: Dict, model_id: str, mixed_precision: str="no", ds_config: str=None):
    
    if ds_config is None:
        # ToDo 2: Initialize accelerator (pass mixed_precision argument to it).
        ## accelerator = #FIXME
    else:
        pass
        # ToDO 17: Initialize accelerator. Pass both mixed_precision and deepspeed_plugin to it (use DeepSpeedPlugin).   
        ## accelerator = #FIXME
          
    train_dataloader, eval_dataloader = get_dataloaders(accelerator, model_id, train_args["batch_size"])
    
    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=77)
    # ToDo 3: Change the code below to to let accelerator handle the device placement.
    device = torch.device("cuda")
    model = model.to(device)

    optimizer = AdamW(params=model.parameters(), lr=train_args["lr"])
    gradient_accumulation_steps = train_args["gradient_accumulation_steps"]
    
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=train_args["num_warmup_steps"],
        num_training_steps=(len(train_dataloader) * train_args["num_epochs"]) // gradient_accumulation_steps,
    )
    
    # ToDo 4: Prepare all objects for distributed training. There is no specific order to remember, you just need 
    # to unpack the objects in the same order you gave them to the prepare method.
    ## model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = #FIXME
      
    metric = evaluate.load(train_args["metric"])

    for epoch in range(train_args["num_epochs"]):
        model.train()
        pbar = tqdm(train_dataloader)
        
        for step, batch in enumerate(pbar):
            # ToDo 5: Change the code below to to let accelerator handle the device placement.
            batch.to(device)
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / gradient_accumulation_steps
            
            # ToDo 6: Use the accelerator object for the backward pass.
            ## FIXME
            if step % gradient_accumulation_steps == 0:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                
            pbar.set_description(f"epoch {epoch} iter {step}: train loss {loss.item():.5f}.")

        model.eval()
        for step, batch in enumerate(eval_dataloader):
            # ToDo 7: Change the code below to to let accelerator handle the device placement.
            batch.to(device)
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            # ToDo 8: Gathers predictions and targets from used devices for metric calculation.
            ## predictions, references = #FIXME
            metric.add_batch(
                predictions=predictions,
                references=references,
            )

        eval_metric = metric.compute()
        
        accelerator.print(f"epoch {epoch}:", eval_metric)
    
    # ToDo 9: To save the model afterwards to use for inference wait for all of the processes to be aligned
    # https://huggingface.co/docs/accelerate/v0.18.0/en/package_reference/accelerator#synchronicity-control
    ## #FIXME
    
    # unwrap the model from any distributed wrapping that was performed
    # https://huggingface.co/docs/accelerate/v0.18.0/en/package_reference/accelerator#synchronicity-control
    ## #FIXME

    # ToDo 10: Use save() instead of torch.save to save the model once (as all workers have the same model with the same weights now)
    ## #FIXME

In [None]:
# Checkpoint identifier handed to train_with_accelerate as `model_id` by the launcher cells below.
MODEL_ID = "bert-large-cased"

In [None]:
# Baseline (single-process) hyperparameters consumed by train_with_accelerate.
train_params = {
    # optimizer / schedule
    "lr": 2e-5,
    "num_epochs": 5,
    # per-process batch size used by both dataloaders
    "batch_size": 32,
    # name handed to evaluate.load()
    "metric": "accuracy",
    # micro-batches accumulated per optimizer update
    "gradient_accumulation_steps": 8,
    "num_warmup_steps": 100,
}

## Step 1: Baseline implementation.
Complete to-dos 1 to 10 in the `get_dataloaders` and `train_with_accelerate` functions to run the baseline implementation of the training loop.

In [None]:
# Step 1 baseline: run the training function in a single process.
notebook_launcher(train_with_accelerate, args=(train_params, MODEL_ID), num_processes=1)

## Step 2: Distributed Data Parallelism.
Modify the cells below to run the same training loop with Distributed Data Parallelism on 2 GPUs.

In [None]:
# ToDo 11: Adjust the batch size per GPU and the number of gradient accumulation steps for the current settings (2 GPUs) so that the global batch size is still 256.
# Hint: global batch size = batch_size (per GPU) * gradient_accumulation_steps * number of GPUs;
# the baseline configuration used 32 * 8 * 1 = 256.
train_params = {
    "lr": 2e-5, 
    "num_epochs": 5, 
    "batch_size": #FIXME, 
    "metric": "accuracy", 
    "gradient_accumulation_steps": #FIXME, 
    "num_warmup_steps": 100
}

In [None]:
# ToDo 12: Run training on 2 GPUs.
# `num_processes` is the value to change; the baseline launch used num_processes=1.
notebook_launcher(
    train_with_accelerate, 
    args=(train_params, MODEL_ID, ), 
    num_processes=#FIXME
)

## Step 3: Mixed precision training.
Modify `ToDo 13` to run mixed precision training.

In [None]:
# ToDo 13: Pass proper mixed_precision argument to train_with_accelerate to run training in fp16.
# The third positional entry of `args` maps to the `mixed_precision` parameter of train_with_accelerate.
notebook_launcher(
    train_with_accelerate, 
    args=(train_params, MODEL_ID, #FIXME, ), 
    num_processes=#FIXME
)

## Step 4: ZeRO Stage 3.
_To-dos_ od 14 do 17:
Przygotuj plik konfiguracyjny do uruchomienia optymalizacji ZeRO Stage 3. Sprawdź [dokumentację accelerate](https://huggingface.co/docs/accelerate/v0.18.0/en/usage_guides/deepspeed#deepspeed-config-file) w celu poznania szczegółów.

In [None]:
# ToDo 14: Adjust the batch size per GPU and the number of gradient accumulation steps.
# NOTE(review): these two values presumably have to agree with "train_micro_batch_size_per_gpu" and
# "gradient_accumulation_steps" in ds_zero_config.json - confirm against the DeepSpeed guide linked above.
train_params = {
    "lr": 2e-5, 
    "num_epochs": 5, 
    "batch_size": #FIXME, 
    "metric": "accuracy", 
    "gradient_accumulation_steps": #FIXME, 
    "num_warmup_steps": 100
}

In [None]:
%%writefile ds_zero_config.json
{
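    # ToDo 15: Enable zero_optimization stage 3 with CPU offload for both parameters and optimizer states.
    # NOTE: JSON does not allow comments - delete these comment lines once the entry below is filled in,
    # and keep the trailing comma after the "zero_optimization" value.
    ## "zero_optimization": FIXME,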
    "gradient_accumulation_steps": #FIXME,
    "gradient_clipping": "auto",
    "steps_per_print": 100,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": #FIXME,
    "wall_clock_breakdown": false
}

In [None]:
# ToDo 16: Run training with the ZeRO Stage 3 optimizer.
# The fourth positional entry of `args` maps to the `ds_config` parameter of train_with_accelerate.
notebook_launcher(
    train_with_accelerate, 
    args=(train_params, MODEL_ID, "fp16", #FIXME), 
    num_processes=#FIXME
)