Fine-tuning bloomz-560m using a small legal QA dataset (portuguese).

In [None]:
import pandas as pd
import torch
import json
from transformers import BloomTokenizerFast, BloomForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
from accelerate import Accelerator

In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Loading bloomz model and tokenizer 
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloomz-560m")
model = BloomForCausalLM.from_pretrained("bigscience/bloomz-560m").to("cuda")

In [4]:
# Loading dataset prompts.json built using de portuguese legalQA dataset
dataset = load_dataset("json", data_files="prompts.json")

Found cached dataset json (/home/dvianna/.cache/huggingface/datasets/json/default-5329563cfdcc119a/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 395
    })
})

In [6]:
# prepare the data for training
def prepare_train_data(data):
    # prompt + completion
    #text_input = data['prompt'] + ' ' + data['completion']
    text_input = data['text']
    # tokenize the input (prompt + completion) text
    tokenized_input = tokenizer(text_input, return_tensors='pt', padding=True)
    # generative models: labels are the same as the input
    tokenized_input['labels'] = tokenized_input['input_ids']
    return tokenized_input

In [7]:
train_dataset = dataset['train'].map(prepare_train_data, batched=True, remove_columns=["text"])

Loading cached processed dataset at /home/dvianna/.cache/huggingface/datasets/json/default-5329563cfdcc119a/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-3a16b40a0956a8e0.arrow


In [8]:
# setting arguments to be used during training
training_arguments = TrainingArguments(
    'LegalQA-bloom-560m',
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    fp16=True,
    optim="adafactor",
    gradient_accumulation_steps=4,
    gradient_checkpointing=True
)

In [9]:
trainer = Trainer(
    model = model,
    args = training_arguments,
    train_dataset = train_dataset
)

In [10]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss


TrainOutput(global_step=98, training_loss=5.802616041533801, metrics={'train_runtime': 257.7121, 'train_samples_per_second': 3.065, 'train_steps_per_second': 0.38, 'total_flos': 2008259929718784.0, 'train_loss': 5.802616041533801, 'epoch': 1.98})

In [11]:
trainer.save_model()