In [1]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os
import json

  from .autonotebook import tqdm as notebook_tqdm


## Data loading

In [2]:
with open('gen-ai-ucu-2024-task-3/zno.train.jsonl', 'r') as json_file:
    json_list = list(json_file)

all_questions = []
for json_str in json_list:
    result = json.loads(json_str)
    result['correct_answers'] = result['correct_answers'][0]
    all_questions.append(result)

train_set, test_set = all_questions[int(len(all_questions)*0.2):], all_questions[:int(len(all_questions)*0.2)]

In [3]:
import pandas as pd
df_data = pd.DataFrame(train_set)

In [4]:
from unsloth import FastLanguageModel
max_seq_length = 4096
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",#"unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,                                                                
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.48.0.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.576 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

embeddings = HuggingFaceEmbeddings(model_name="jinaai/jina-embeddings-v3",model_kwargs = {"trust_remote_code": True})

vector_store = Chroma(
    collection_name="ukrainian_language",
    persist_directory="./chroma_db",
    embedding_function=embeddings,
)

def get_similar_tasks(query:str) -> list[str]:
    retrieved_docs = vector_store.similarity_search(query,k=5)
    docs_list = [doc.page_content for doc in retrieved_docs[:1]]# skip the first one since it is the same as question
    return docs_list

flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn i

In [6]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.1.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [7]:
llama31_prompt='''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

Your input fields are:
1. `question` (str)
2. `options` (list[dict[str, str]])
3. `similar_examples` (list[str])

Your output fields are:
1. `correct_marker` (str)

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## options ## ]]
{options}

[[ ## similar_examples ## ]]
{similar_examples}

[[ ## correct_marker ## ]]
{correct_marker}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Solve exam problem.<|eot_id|><|start_header_id|>user<|end_header_id|>

[[ ## question ## ]]
{question_input}

[[ ## options ## ]]
{options_input}

[[ ## similar_examples ## ]]
{similar_examples_input}

Respond with the corresponding output fields, starting with the field `[[ ## correct_marker ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.<|eot_id|><|start_header_id|>assistant<|end_header_id|>


<|start_header_id|>assistant<|end_header_id|>
[[ ## correct_marker ## ]]
{correct_marker_input}

[[ ## completed ## ]]<|eot_id|>'''

In [8]:
import pandas as pd
def format_prompt(row):
    similar_examples = get_similar_tasks(row['question'])
    return (llama31_prompt
            .replace("{question_input}", row["question"])
            .replace("{options_input}", str(row["answers"]))
            .replace("{correct_marker_input}", row["correct_answers"])
            .replace("{similar_examples_input}",str(similar_examples)))


def prepare_train_datav2(train_data:dict):
    # Convert the datax to a Pandas DataFrame
    data_df = pd.DataFrame(train_data)
    # Create a new column called "text"
    data_df["text"] = data_df.apply(format_prompt, axis=1)
    # Create a new Dataset from the DataFrame
    data = Dataset.from_pandas(data_df)
    return data

In [9]:
dataset = prepare_train_datav2(train_set)

In [10]:
print(dataset[-1]['text'])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

Your input fields are:
1. `question` (str)
2. `options` (list[dict[str, str]])
3. `similar_examples` (list[str])

Your output fields are:
1. `correct_marker` (str)

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## options ## ]]
{options}

[[ ## similar_examples ## ]]
{similar_examples}

[[ ## correct_marker ## ]]
{correct_marker}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Solve exam problem.<|eot_id|><|start_header_id|>user<|end_header_id|>

[[ ## question ## ]]
Ознаки постмодернізму відчутні в рядках

[[ ## options ## ]]
[{'marker': 'А', 'text': '«СЕРЦЕ, КИНУТЕ В ЮРБУ, ВИБУХАЄ: - БУ! - БА! - БУ! / Небо повне ірами, тріпотіло крилами...»'}, {'marker': 'Б', 'text': '«В соняшника були руки і ноги, / Було тіло шорстке і зелене. / Він бігав н

In [11]:
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

response_template = "<|start_header_id|>assistant<|end_header_id|>"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

In [12]:
# Tokenize
tokenized = tokenizer(dataset[-1]['text'], return_tensors="pt", padding=True)

# Convert to the format expected by the collator
tokenized_example = {
    'input_ids': tokenized['input_ids'][0],  # remove the batch dimension
    'attention_mask': tokenized['attention_mask'][0]
}

# Apply the collator - it expects a list of examples
processed = collator([tokenized_example])

# Decode the labels to see what part is being trained on
labels = processed['labels'][0]
mask = labels != -100
training_text = tokenizer.decode(labels[mask])
print("Training target:", training_text)

Training target: 
[[ ## correct_marker ## ]]
А

[[ ## completed ## ]]<|eot_id|>


In [13]:
len(dataset)

2451

In [14]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    data_collator=collator,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        num_train_epochs = 10, 
        # max_steps = 120,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs_10epochs_rag",
    ),
)

Map (num_proc=2): 100%|██████████| 2451/2451 [00:02<00:00, 1041.34 examples/s]


In [15]:
trainer_stats = trainer.train(resume_from_checkpoint=True)

  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,451 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 3,060
 "-____-"     Number of trainable parameters = 41,943,040
  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss
1001,0.0005
1002,0.0207
1003,0.008
1004,0.0068
1005,0.0332
1006,0.0011
1007,0.0055
1008,0.0113
1009,0.0018
1010,0.005


KeyboardInterrupt: 

In [16]:
model.save_pretrained("./lora_adapter_llama8b_6epochs_rag")
tokenizer.save_pretrained("./lora_adapter_llama8b_6epochs_rag")

('./lora_adapter_llama8b_6epochs_rag/tokenizer_config.json',
 './lora_adapter_llama8b_6epochs_rag/special_tokens_map.json',
 './lora_adapter_llama8b_6epochs_rag/tokenizer.json')

In [66]:
model.save_pretrained_merged("llama8b120_merged_16bit", tokenizer, save_method = "merged_16bit",)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 7.48 out of 31.18 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 91%|█████████ | 29/32 [00:00<00:00, 48.36it/s]
We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:01<00:00, 22.77it/s]


Unsloth: Saving tokenizer... Done.


  gb_found = re.match("([0-9]{1,})[\s]{0,}GB", max_shard_size, flags = re.IGNORECASE)
  mb_found = re.match("([0-9]{1,})[\s]{0,}MB", max_shard_size, flags = re.IGNORECASE)
  f"   \\\   /|    [0] Installing llama.cpp might take 3 minutes.\n"\
  f"O^O/ \_/ \\    [1] Converting HF to GGUF 16bits might take 3 minutes.\n"\
  f"\        /    [2] Converting GGUF 16bits to {quantization_method} might take 10 minutes each.\n"\


SafetensorError: Error while serializing: IoError(Os { code: 28, kind: StorageFull, message: "No space left on device" })

In [38]:
from unsloth import FastLanguageModel
max_seq_length = 4096
dtype = None
load_in_4bit = True

model2, tokenizer2 = FastLanguageModel.from_pretrained(
    model_name = "./lora_adapter_llama3b",
    max_seq_length = max_seq_length,
    dtype = dtype,
    # load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

FileNotFoundError: ./lora_adapter_llama3b/*.json (invalid repository id)

In [2]:
# alpaca_prompt = Copied from above
model2 = FastLanguageModel.for_inference(model2) # Enable native 2x faster inference


In [7]:
inputs = tokenizer2(
[
    llama31_prompt.format(
        "", # instruction
        prompts[0].replace("\nhuman: в меню\n","human: в меню") + "ai: Натисніть клавішу 1 для того щоб повторити замовлення, 2 для того щоб дізнатися поточні знижки\n\nhuman: Яке моє минуле замовлення?", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model2.generate(**inputs, max_new_tokens = 2000, use_cache = True)
tokenizer2.batch_decode(outputs)

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n\nYou are the voicebot for Alyaska named Mariya, designed to assist customers in placing their orders. \n\nKey Features:\n\n    Language:\n        Respond exclusively in Ukrainian.\n\n    General Guidance:\n        Maintain a friendly and natural tone while accurately interpreting the user\'s intent.\n        Answer last user message given in a natural, human-like manner.\n \n    Here are example conversations with explanations:\n        Example 0:\n        human : Я хочу замовити лампи.\n        (Searching in the catalogue for лампи found "Лампа RGB" with price 114 грн add it to unconfirmed_products, we use specific product name from catalogue)\n        ai: Лампа RGB 1 штука, правильно?\n        human : Так і ще давайте чашку.\n        (User approved Лампа RGB with quantity 1 move to confirmed_products; search in catalogue for чашка found "Чашка подарункова" with pric

In [11]:
prompts[0]

'\nYou are the voicebot for Alyaska named Mariya, designed to assist customers in placing their orders. \n\nKey Features:\n\n    Language:\n        Respond exclusively in Ukrainian.\n\n    General Guidance:\n        Maintain a friendly and natural tone while accurately interpreting the user\'s intent.\n        Answer last user message given in a natural, human-like manner.\n \n    Here are example conversations with explanations:\n        Example 0:\n        human : Я хочу замовити лампи.\n        (Searching in the catalogue for лампи found "Лампа RGB" with price 114 грн add it to unconfirmed_products, we use specific product name from catalogue)\n        ai: Лампа RGB 1 штука, правильно?\n        human : Так і ще давайте чашку.\n        (User approved Лампа RGB with quantity 1 move to confirmed_products; search in catalogue for чашка found "Чашка подарункова" with price 321 грн add it to unconfirmed_products, use specific name from catalogue)\n        ai: Внесла в замовлення Лампа RGB