In [1]:
import torch

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig

from peft import LoraConfig, get_peft_model, TaskType

In [2]:
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
# https://github.com/bitsandbytes-foundation/bitsandbytes
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type='nf4',
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

In [3]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=['q_proj', 'v_proj'],
    lora_dropout=0.05,
    bias='none',
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [4]:
data = load_dataset('openai/gsm8k', 'main', split='train[:200]')

In [5]:
def tokenize(batch):
    texts = [
        f'### Instruction:\n{instruction}\n### Response:\n{out}'
        for instruction, out in zip(batch['question'], batch['answer'])
    ]
    tokens = tokenizer(
        texts,
        padding='max_length',
        max_length=256,
        truncation=True,
        return_tensors='pt',
    )
    tokens['labels'] = tokens['input_ids'].clone()
    return tokens


tokenize(data[:5])

{'input_ids': tensor([[   1,  835, 2799,  ...,    2,    2,    2],
        [   1,  835, 2799,  ...,    2,    2,    2],
        [   1,  835, 2799,  ...,    2,    2,    2],
        [   1,  835, 2799,  ...,    2,    2,    2],
        [   1,  835, 2799,  ...,    2,    2,    2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[   1,  835, 2799,  ...,    2,    2,    2],
        [   1,  835, 2799,  ...,    2,    2,    2],
        [   1,  835, 2799,  ...,    2,    2,    2],
        [   1,  835, 2799,  ...,    2,    2,    2],
        [   1,  835, 2799,  ...,    2,    2,    2]])}

In [6]:
tokenized_data = data.map(tokenize, batched=True,
                          remove_columns=data.column_names)

In [9]:
training_arguments = TrainingArguments(
    output_dir='./rodmosc/tinyllama-lora',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=1e-3,
    num_train_epochs=50,
    fp16=True,
    logging_steps=20,
    save_strategy='epoch',
    report_to='none',
    remove_unused_columns=False,
    label_names=['labels']
)

In [10]:
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_data,
    processing_class=tokenizer
)

The model is already on multiple devices. Skipping the move to device specified in `args`.


In [11]:
%time trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Step,Training Loss
20,1060.6882
40,0.0
60,0.0
80,0.0
100,0.0
120,0.0
140,0.0
160,0.0
180,0.0
200,0.0


'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 406bfeb1-9463-4c43-96db-99b67a5978dc)')' thrown while requesting HEAD https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0/resolve/main/config.json
Retrying in 1s [Retry 1/5].
'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 9be925bb-5a86-4be2-b7a4-14fd3f2f0c0f)')' thrown while requesting HEAD https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0/resolve/main/config.json
Retrying in 1s [Retry 1/5].


CPU times: user 8min 13s, sys: 2min 7s, total: 10min 20s
Wall time: 1h 44min 18s


TrainOutput(global_step=650, training_loss=32.63655949519231, metrics={'train_runtime': 6258.325, 'train_samples_per_second': 1.598, 'train_steps_per_second': 0.104, 'total_flos': 1.590741172224e+16, 'train_loss': 32.63655949519231, 'epoch': 50.0})

In [12]:
import os
import math
from torch.utils.data import DataLoader

from transformers import default_data_collator
from peft import PeftModel

In [15]:
adapter_path = './rodmosc/tinyllama-lora/checkpoint-650'
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
).eval()
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)
tmp_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
)
tuned_model = PeftModel.from_pretrained(tmp_model, adapter_path)
tuned_model = tuned_model.merge_and_unload().eval()

In [None]:
eval_dataset = data.map(tokenize, batched=True,
                        remove_columns=['question', 'answer'])
eval_dataset = eval_dataset.with_format('torch')

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
eval_loader = DataLoader(
    eval_dataset,
    batch_size=8,
    collate_fn=default_data_collator
)

In [22]:
@torch.no_grad()
def compute_perplexity(model, data_loader):
    losses = []
    for batch in data_loader:
        batch = {k: v.to('mps') for k, v in batch.items()}
        loss = model(**batch).loss
        losses.append(loss.item())
    return math.exp(sum(losses) / len(losses))

In [None]:
print(f'base model perplexity: {compute_perplexity(base_model, eval_loader)}')
print(
    f'tuned model perplexity: {compute_perplexity(tuned_model, eval_loader)}')

base model perplexity: 209.22177694621465
