In [None]:
%pip install -q peft transformers datasets accelerate peft


In [None]:
%pip install -q -i  https://pypi.org/simple/ bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from typing import Dict, List
from datasets import Dataset, load_dataset, disable_caching
disable_caching()
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch
from torch.utils.data import Dataset
from IPython.display import Markdown


In [None]:
# Load the train split of the Orca math word-problem dataset and keep only the
# first 200 rows so the fine-tuning run stays small.
dataset = load_dataset("microsoft/orca-math-word-problems-200k", split="train")
small_dataset = dataset.select(range(200))
print(small_dataset)
print(small_dataset[0])


# Prompt fed to the model: a fixed instruction, the question, then an "answer:"
# cue. The "\n" escape becomes a real newline in the resulting string.
# `postprocess` (inference cell) splits generated text on this same "answer:" cue.
prompt_template = """Below is an mathematics word problem. Solve the problem and give answer to it. question: {question}\n answer:"""
# Completion appended directly after the prompt; currently the raw answer text.
answer_template = """{answer}"""


def _add_text(rec):
    instruction = rec["question"]
    response = rec["answer"]

    if not instruction:
        raise ValueError(f"Expected an instruction in: {rec}")
    if not response:
        raise ValueError(f"Expected a response in: {rec}")
    rec["prompt"] = prompt_template.format(question=instruction)
    rec["answer"] = answer_template.format(answer=response)
    rec["text"] = rec["prompt"] + rec["answer"]
    return rec


# Apply _add_text to every record (adds "prompt" and "text"), then show the
# first record as a sanity check.
small_dataset = small_dataset.map(_add_text)
print(small_dataset[0])


Dataset({
    features: ['question', 'answer'],
    num_rows: 200
})
{'question': 'Jungkook is the 5th place. Find the number of people who crossed the finish line faster than Jungkook.', 'answer': 'If Jungkook is in 5th place, then 4 people crossed the finish line faster than him.'}


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

{'question': 'Jungkook is the 5th place. Find the number of people who crossed the finish line faster than Jungkook.', 'answer': 'If Jungkook is in 5th place, then 4 people crossed the finish line faster than him.', 'prompt': 'Below is an mathematics word problem. Solve the problem and give answer to it. question: Jungkook is the 5th place. Find the number of people who crossed the finish line faster than Jungkook.\n answer:', 'text': 'Below is an mathematics word problem. Solve the problem and give answer to it. question: Jungkook is the 5th place. Find the number of people who crossed the finish line faster than Jungkook.\n answer:If Jungkook is in 5th place, then 4 people crossed the finish line faster than him.'}


In [None]:
from transformers import BitsAndBytesConfig

# Quantization settings handed to `from_pretrained` in the model-loading cell:
# 4-bit weights using the "nf4" quant type with double quantization enabled.
# NOTE(review): the compute dtype here is bfloat16 while the TrainingArguments
# cell sets fp16=True — confirm that mixing the two is intended.
config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)


In [None]:
# Base checkpoint to fine-tune.
model_id = "databricks/dolly-v2-3b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token, so padding and EOS
# share one token; downstream code must use attention_mask to tell them apart.
tokenizer.pad_token = tokenizer.eos_token


# Load the model with the quantization `config` defined in the previous cell.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # use_cache=False,
    device_map="auto",
    quantization_config=config
)


# Size the embedding matrix to the tokenizer's vocabulary; as the last
# expression in the cell, the returned embedding module is displayed.
model.resize_token_embeddings(len(tokenizer))


tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.68G [00:00<?, ?B/s]

Embedding(50280, 2560)

In [None]:
from functools import partial
import copy
from transformers import DataCollatorForSeq2Seq

# Token budget per example: used for truncation/padding in _preprocess_batch,
# for the length filter, and as the collator's max_length.
MAX_LENGTH = 256


def _preprocess_batch(batch: Dict[str, List]):
    model_inputs = tokenizer(batch["text"], max_length=MAX_LENGTH, truncation=True, padding='max_length')
    model_inputs["labels"] = copy.deepcopy(model_inputs['input_ids'])
    return model_inputs

# partial() with no bound arguments: a thin alias for _preprocess_batch.
_preprocessing_function = partial(_preprocess_batch)


# Tokenize in batches and drop the raw string columns listed below ("text" is
# not in the list, so it is kept, as the printed features show).
encoded_small_dataset = small_dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=["question", "prompt", "answer"],
)
# NOTE(review): _preprocess_batch truncates and pads every example to
# MAX_LENGTH, so this length filter should never remove a row — confirm whether
# it was meant to run on un-padded lengths instead.
processed_dataset = encoded_small_dataset.filter(lambda rec: len(rec["input_ids"]) <= MAX_LENGTH)


# Hold out 14 examples for evaluation; the fixed seed makes the split repeatable.
split_dataset = processed_dataset.train_test_split(test_size=14, seed=0)
print(split_dataset)


# Batch collator given to the Trainer; configured to pad to MAX_LENGTH.
data_collator = DataCollatorForSeq2Seq(
        model = model, tokenizer=tokenizer, max_length=MAX_LENGTH, pad_to_multiple_of=8, padding='max_length')


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 186
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14
    })
})


In [None]:
from peft import LoraConfig, get_peft_model
from peft import prepare_model_for_kbit_training

# LoRA hyperparameters: adapter rank, scaling alpha and adapter dropout.
LORA_R = 128
LORA_ALPHA = 512
LORA_DROPOUT = 0.05

# Adapters are attached only to modules named "query_key_value"; bias terms
# are left out ("none").
lora_config = LoraConfig(
                 r = LORA_R,
                 lora_alpha = LORA_ALPHA,
                 lora_dropout = LORA_DROPOUT,
                 bias="none",
                 task_type="CAUSAL_LM",
                 target_modules=["query_key_value"],
)




# Both calls below rebind `model`, so re-running this cell applies them again
# to the already-wrapped model; reload the base model first when iterating.
model = prepare_model_for_kbit_training(model)


model = get_peft_model(model, lora_config)
# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()


trainable params: 41,943,040 || all params: 2,817,029,120 || trainable%: 1.4889104163751066


In [None]:
from transformers import TrainingArguments, Trainer
import bitsandbytes

# Training hyperparameters and the folder that receives checkpoints and the
# final adapter.
EPOCHS = 2
LEARNING_RATE = 1e-4
MODEL_SAVE_FOLDER_NAME = "dolly-3b-lora"

# Batch size 1 for train and eval; logging, evaluation and checkpointing each
# happen once per epoch.
training_args = TrainingArguments(
    output_dir=MODEL_SAVE_FOLDER_NAME,
    overwrite_output_dir=True,
    fp16=True,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=LEARNING_RATE,
    num_train_epochs=EPOCHS,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Train on the 186-example split and evaluate on the 14 held-out examples.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    data_collator=data_collator,
)

# Turn off the generation cache on the model config before training.
model.config.use_cache = False
trainer.train()

# Persist the trained model and its config into the output folder.
trainer.model.save_pretrained(MODEL_SAVE_FOLDER_NAME)
trainer.save_model(MODEL_SAVE_FOLDER_NAME)
trainer.model.config.save_pretrained(MODEL_SAVE_FOLDER_NAME)


Epoch,Training Loss,Validation Loss
1,0.8985,0.785868
2,0.5675,0.790774


Checkpoint destination directory dolly-3b-lora/checkpoint-186 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory dolly-3b-lora/checkpoint-372 already exists and is non-empty. Saving will proceed but saved results may be invalid.


In [None]:
def postprocess(response):
    """Return the text that follows the first ``answer:`` marker.

    The generated text echoes the prompt, which ends with ``answer:``; this
    strips everything up to and including that first marker and returns the
    remainder unchanged (later occurrences of ``answer:`` are preserved).

    Args:
        response: full generated string, prompt included.

    Returns:
        The substring after the first ``answer:``.

    Raises:
        ValueError: if ``response`` does not contain ``answer:`` at all.
    """
    marker = "answer:"
    # partition() reports whether the marker was found; split() always returns
    # a non-empty list, so it cannot be used to detect a missing marker.
    _, found, completion = response.partition(marker)
    if not found:
        raise ValueError("Invalid template for prompt. The template should include the term 'answer:'")
    return completion

# Sample question used as a smoke test (it matches the first dataset record
# printed earlier in the notebook).
inference_prompt = "Jungkook is the 5th place. Find the number of people who crossed the finish line faster than Jungkook."

# Text-generation pipeline built directly on the fine-tuned trainer.model.
# NOTE(review): max_length=256 presumably counts the prompt tokens as well —
# confirm, and consider max_new_tokens if only the answer should be bounded.
inf_pipeline =  pipeline('text-generation', model=trainer.model, tokenizer=tokenizer, max_length=256, trust_remote_code=True)

# Format the question with the training prompt template and take the
# generated text of the first returned candidate.
response = inf_pipeline(prompt_template.format(question=inference_prompt))[0]['generated_text']

# Keep only what follows the "answer:" cue; the bare last expression displays it.
formatted_response = postprocess(response)
formatted_response

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCa

'To find the number of people who crossed the finish line faster than Jungkook, we need to consider the order in which people crossed the finish line.\n\nJungkook crossed the finish line in 5th place, so we need to find the number of people who crossed the finish line in 5th place or less.\n\nThe order in which people crossed the finish line is:\n1. Jungkook\n2. Yoonjung\n3. Jungmin\n4. Yoonjung\n5. Jungmin\n6. Jungkook\n7. Yoonjung\n8. Jungmin\n9. Jungkook\n10. Yoonjung\n11. Jungmin\n12. Yoonjung\n13. Jungkook\n14. Yoonjung\n15. Jungmin\n16. Yoonjung\n17. Jungkook\n18. Yoonjung\n19. Jungmin\n20. Yoonjung\n21. Jungkook\n'