# Large Language Models in limited hardware environments - training template

In [None]:
!pip install torch==1.13.0 transformers==4.28.1 datasets==2.1.0 evaluate==0.4.0

In [None]:
import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed

from tqdm import tqdm

from typing import Dict

In [None]:
def get_dataloaders(model_id: str, batch_size: int):
    """Build the banking77 train and test dataloaders for ``model_id``.

    The tokenizer belonging to ``model_id`` encodes the ``text`` column,
    which is then dropped, and the ``label`` column is renamed to
    ``labels``. Batches are padded on the fly to the longest sequence
    they contain and returned as PyTorch tensors.

    Args:
        model_id: Identifier passed to ``AutoTokenizer.from_pretrained``.
        batch_size: Number of examples per batch for both loaders.

    Returns:
        A ``(train_dataloader, eval_dataloader)`` tuple. The train loader
        shuffles and drops the last incomplete batch; the eval loader
        iterates the ``test`` split in order and keeps every example.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    raw_splits = load_dataset("banking77")

    def _encode(batch):
        # No explicit max_length is given; presumably truncation then uses
        # the tokenizer's own limit -- confirm against the tokenizer docs.
        return tokenizer(batch["text"], truncation=True, max_length=None)

    encoded_splits = raw_splits.map(
        _encode,
        batched=True,
        remove_columns=["text"],
    ).rename_column("label", "labels")

    def _pad_batch(features):
        # Dynamic padding: each batch is only as wide as its longest example.
        return tokenizer.pad(
            features,
            padding="longest",
            max_length=None,
            pad_to_multiple_of=None,
            return_tensors="pt",
        )

    # Settings shared by both loaders.
    loader_kwargs = {"collate_fn": _pad_batch, "batch_size": batch_size}

    train_loader = DataLoader(
        encoded_splits["train"],
        shuffle=True,
        drop_last=True,
        **loader_kwargs,
    )
    eval_loader = DataLoader(
        encoded_splits["test"],
        shuffle=False,
        **loader_kwargs,
    )

    return train_loader, eval_loader

In [None]:
def train(train_args: Dict, model_id: str, mixed_precision: str="no", ds_config: str=None):
    """Fine-tune a 77-class sequence classifier and evaluate it every epoch.

    The model is trained with AdamW, a linear warmup/decay learning-rate
    schedule and gradient accumulation. After each epoch the configured
    metric is computed on the evaluation dataloader and printed. When
    training finishes, the model weights are written to ``model.pkl``.

    Args:
        train_args: Hyper-parameters. Keys read: ``"batch_size"``, ``"lr"``,
            ``"gradient_accumulation_steps"``, ``"num_warmup_steps"``,
            ``"num_epochs"`` and ``"metric"`` (a name for ``evaluate.load``).
        model_id: Identifier used to load both the tokenizer and the model.
        mixed_precision: Not used by this baseline implementation; kept as
            a placeholder in the template's signature.
        ds_config: Not used by this baseline implementation; kept as a
            placeholder in the template's signature.

    Raises:
        ValueError: If ``gradient_accumulation_steps`` is smaller than 1.

    Side effects:
        Prints one metric line per epoch and writes the model's
        ``state_dict`` to ``model.pkl`` in the working directory. Reload it
        with ``model.load_state_dict(torch.load("model.pkl"))``.
    """
    gradient_accumulation_steps = train_args["gradient_accumulation_steps"]
    if gradient_accumulation_steps < 1:
        raise ValueError("gradient_accumulation_steps must be >= 1")
    num_epochs = train_args["num_epochs"]

    train_dataloader, eval_dataloader = get_dataloaders(model_id, train_args["batch_size"])

    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=77)
    # Fall back to CPU so the template still runs on machines without a GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    optimizer = AdamW(params=model.parameters(), lr=train_args["lr"])

    # One optimizer update per full accumulation group, plus one for a
    # trailing partial group: ceiling division. The scheduler must be told
    # the real number of updates or the learning rate hits zero too early.
    num_batches = len(train_dataloader)
    updates_per_epoch = (
        num_batches + gradient_accumulation_steps - 1
    ) // gradient_accumulation_steps

    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=train_args["num_warmup_steps"],
        num_training_steps=updates_per_epoch * num_epochs,
    )

    metric = evaluate.load(train_args["metric"])

    for epoch in range(num_epochs):
        model.train()
        # Start every epoch from clean gradients.
        optimizer.zero_grad()
        pbar = tqdm(train_dataloader)

        for step, batch in enumerate(pbar):
            batch = batch.to(device)
            outputs = model(**batch)
            # Scale so the accumulated gradient is an average over the
            # micro-batches rather than a sum.
            loss = outputs.loss / gradient_accumulation_steps
            loss.backward()

            # Update only after a full group of micro-batches has been
            # accumulated (step is 0-based, hence step + 1), and flush
            # whatever is left on the final batch of the epoch.
            group_complete = (step + 1) % gradient_accumulation_steps == 0
            last_batch = step + 1 == num_batches
            if group_complete or last_batch:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            # NOTE: the value shown is the loss already divided by
            # gradient_accumulation_steps.
            pbar.set_description(f"epoch {epoch} iter {step}: train loss {loss.item():.5f}.")

        model.eval()
        for batch in eval_dataloader:
            batch = batch.to(device)
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions,
                references=references,
            )

        eval_metric = metric.compute()

        print(f"epoch {epoch}:", eval_metric)

    # Models expose no ``save`` method; persist the weights with torch.save.
    torch.save(model.state_dict(), "model.pkl")

In [None]:
# Model identifier handed to train(); the same id is used to load both the
# tokenizer and the sequence-classification model.
MODEL_ID = "bert-large-cased"

In [None]:
# Hyper-parameters consumed by train() through its `train_args` argument.
train_params = {
    # Learning rate given to the AdamW optimizer.
    "lr": 2e-5, 
    # Number of full passes over the training dataloader.
    "num_epochs": 5, 
    # Examples per batch for both the train and the eval dataloader.
    "batch_size": 8, 
    # Metric name passed to evaluate.load(); computed after every epoch.
    "metric": "accuracy", 
    # Divisor applied to each batch loss and spacing of optimizer updates.
    "gradient_accumulation_steps": 4, 
    # Warmup steps of the linear learning-rate schedule.
    "num_warmup_steps": 100
}

In [None]:
# Run training with the settings defined above; mixed_precision and
# ds_config keep their default values.
train(train_params, MODEL_ID)