# **Install các thư viện :**

In [1]:
!pip install accelerate>=0.21.0,<0.23.0  
!pip install appdirs
!pip install bitsandbytes>=0.41.1
!pip install datasets
!pip install fire
!pip install gradio
!pip install loralib
!pip install peft
!pip install sentencepiece
!pip install scipy
!pip install transformers
!pip install torch 
!pip install requests

/bin/bash: 0.23.0: No such file or directory
Collecting black
  Downloading black-24.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.1/77.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting packaging>=22.0 (from black)
  Downloading packaging-24.1-py3-none-any.whl.metadata (3.2 kB)
Collecting pathspec>=0.9.0 (from black)
  Downloading pathspec-0.12.1-py3-none-any.whl.metadata (21 kB)
Downloading black-24.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading packaging-24.1-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.0/54.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pathspec-0.12.1-py3-none-any.whl (31 kB)
Installing collected packages: pathspec, packaging, b

# **Import thư viện :**

In [2]:
import json
import os.path as osp
from typing import Union
import os
import sys
from typing import List

import fire
import torch
import transformers
from datasets import load_dataset

from peft import (
    LoraConfig,
    get_peft_model,
    set_peft_model_state_dict,
)
from transformers import LlamaForCausalLM, LlamaTokenizer

In [3]:
#Path : 
path_data_train_model = '/kaggle/input/data-nlp/Data_train/training_data.jsonl'
path_template = '/kaggle/input/data-nlp/Data_train/templates/alpaca.json'
path_output = 'ODIE_7b'

# **Prompter :**

In [4]:
"""
A dedicated helper to manage templates and prompt building.
"""
class Prompter():
    __slots__ = ("template", "_verbose")

    def __init__(self, verbose: bool = False):
        self._verbose = verbose
        
        #Read template and set it : 
        with open(path_template) as fp:
            self.template = json.load(fp)
        if self._verbose:
            print(
                f"Using prompt template {template_name}: {self.template['description']}"
            )

    def generate_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        label: Union[None, str] = None,
    ) -> str:
        
        if input:
            res = self.template["prompt_input"].format(
                instruction=instruction, input=input
            )
        else:
            res = self.template["prompt_no_input"].format(
                instruction=instruction
            )
        if label:
            res = f"{res}{label}"
        if self._verbose:
            print(res)
        return res
    
    def generate_chat_prompt(
        self,
        messages: Union[None, str] = None,
    ) -> str:
        if len(messages) == 0:
            raise ValueError('Messages field is empty.')

        message_text = ""
        for message in messages:
            if message["role"] == "system":
                message_text += "<|system|>\n" + message["content"].strip() + "\n\n"
            elif message["role"] == "user":
                message_text += "<|user|>\n" + message["content"].strip() + "\n\n"
            elif message["role"] == "assistant":
                message_text += "<|assistant|>\n" + message["content"].strip() + "\n\n"
            else:
                raise ValueError("Invalid role: {}".format(message["role"]))
        return message_text

    def get_response(self, output: str, use_chat_prompt=False) -> str:
        if use_chat_prompt:
            return output.split('<|assistant|>\n')[-1].strip()
        else:
            return output.split(self.template["response_split"])[1].strip()

In [5]:
def train(
    # model/data params :
    base_model: str = "elinas/llama-7b-hf-transformers-4.29",  # the only required argument
    data_path: str = path_data_train_model,
    output_dir: str = path_output,
    #training hyperparams :
    batch_size: int = 64,
    micro_batch_size: int = 4,
    num_epochs: int = 3,
    learning_rate: float = 3e-4,
    cutoff_len: int = 2048,
    val_set_size: int = 1000,
    #lora hyperparams :
    lora_r: int = 16,
    lora_alpha: int = 16,
    lora_dropout: float = 0.05,
    lora_target_modules: List[str] = [
        "q_proj","v_proj","k_proj","o_proj","up_proj",
        "down_proj","gate_proj","embed_tokens","lm_head"
    ],
    #llm hyperparams :
    train_on_inputs: bool = False,  #if False, masks out inputs in loss
    add_eos_token: bool = False,
    group_by_length: bool = False,  #faster, but produces an odd training loss curve
    resume_from_checkpoint: str = None,
    use_chat_prompt: bool = False, # whether to use the prompt for multi-turn conversation
    prompt_template_name: str = "Alpaca"
   
):
    print(
        f"Training Alpaca-LoRA model with params:\n"
        f"base_model: {base_model}\n"
        f"data_path: {data_path}\n"
        f"output_dir: {output_dir}\n"
        f"batch_size: {batch_size}\n"
        f"micro_batch_size: {micro_batch_size}\n"
        f"num_epochs: {num_epochs}\n"
        f"learning_rate: {learning_rate}\n"
        f"cutoff_len: {cutoff_len}\n"
        f"val_set_size: {val_set_size}\n"
        f"lora_r: {lora_r}\n"
        f"lora_alpha: {lora_alpha}\n"
        f"lora_dropout: {lora_dropout}\n"
        f"lora_target_modules: {lora_target_modules}\n"
        f"train_on_inputs: {train_on_inputs}\n"
        f"add_eos_token: {add_eos_token}\n"
        f"group_by_length: {group_by_length}\n"
        f"resume_from_checkpoint: {resume_from_checkpoint or False}\n"
        f"prompt template: {prompt_template_name}\n"
    )
    
    gradient_accumulation_steps = batch_size // micro_batch_size
    
    prompter = Prompter()

    model = LlamaForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=False,
        torch_dtype=torch.float16,
    )

    tokenizer = LlamaTokenizer.from_pretrained(base_model,legacy=False)

    tokenizer.pad_token_id = (
        0  # unk. we want this to be different from the eos token
    )
    tokenizer.padding_side = "left"  # Allow batched inference

    def tokenize(prompt, add_eos_token=True):
        result = tokenizer(
            prompt,
            truncation=True,
            max_length=cutoff_len,
            padding=False,
            return_tensors=None,
        )
        if (
            result["input_ids"][-1] != tokenizer.eos_token_id
            and len(result["input_ids"]) < cutoff_len
            and add_eos_token
        ):
            result["input_ids"].append(tokenizer.eos_token_id)
            result["attention_mask"].append(1)

        result["labels"] = result["input_ids"].copy()

        return result

    def generate_and_tokenize_prompt(data_point):
        if use_chat_prompt:
            assert "messages" in data_point
            full_prompt = prompter.generate_chat_prompt(
                data_point["messages"]
            )
        else:
            full_prompt = prompter.generate_prompt(
                            data_point["instruction"], #Instruction
                            data_point["text"], #Input 
                            data_point["table"], #Output
                            )
        tokenized_full_prompt = tokenize(full_prompt)
        
        if not train_on_inputs:
            if use_chat_prompt:
                user_prompt = prompter.generate_chat_prompt(
                    data_point["messages"][:-1]
                )
                user_prompt += "<|assistant|>\n"
            else:
                user_prompt = prompter.generate_prompt(
                    data_point["instruction"], 
                    data_point["input"]
                )
            
            tokenized_user_prompt = tokenize(
                user_prompt, add_eos_token=add_eos_token
            )
            user_prompt_len = len(tokenized_user_prompt["input_ids"])

            if add_eos_token:
                user_prompt_len -= 1

            tokenized_full_prompt["labels"] = [
                -100
            ] * user_prompt_len + tokenized_full_prompt["labels"][
                user_prompt_len:
            ]  # could be sped up, probably

        return tokenized_full_prompt
    
    # For int8
    # model = prepare_model_for_int8_training(model)

    config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=lora_target_modules,
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, config)

    if data_path.endswith(".json") or data_path.endswith(".jsonl"):
        data = load_dataset("json", data_files=data_path)
    else:
        data = load_dataset(data_path)
    
#     #Checkpoints :
#     if resume_from_checkpoint:
#         # Check the available weights and load them
#         checkpoint_name = os.path.join(
#             resume_from_checkpoint, "pytorch_model.bin"
#         )  # Full checkpoint
#         if not os.path.exists(checkpoint_name):
#             checkpoint_name = os.path.join(
#                 resume_from_checkpoint, "adapter_model.bin"
#             )  # only LoRA model - LoRA config above has to fit
#             resume_from_checkpoint = (
#                 False  # So the trainer won't try loading its state
#             )
#         # The two files above have a different name depending on how they were saved, but are actually the same.
#         if os.path.exists(checkpoint_name):
#             print(f"Restarting from {checkpoint_name}")
#             adapters_weights = torch.load(checkpoint_name)
#             set_peft_model_state_dict(model, adapters_weights)
#         else:
#             print(f"Checkpoint {checkpoint_name} not found")

    model.print_trainable_parameters()  # Be more transparent about the % of trainable params.

    #Split data : 
    if val_set_size > 0:
        train_val = data["train"].train_test_split(
            test_size=val_set_size, shuffle=True, seed=42
        )
        train_data = (
            train_val["train"].shuffle().map(generate_and_tokenize_prompt)
        )
        val_data = (
            train_val["test"].shuffle().map(generate_and_tokenize_prompt)
        )
    else:
        train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
        val_data = None

    #Compile Model : 
    trainer = transformers.Trainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        args=transformers.TrainingArguments(
            per_device_train_batch_size=micro_batch_size,
            per_device_eval_batch_size=micro_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            warmup_ratio=0.03,
            num_train_epochs=num_epochs,
            learning_rate=learning_rate,
            fp16=False,
            logging_steps=10,
            optim="adamw_torch",
            save_strategy="epoch", #lưu checkpoint theo từng epoch
            eval_strategy="epoch" if val_set_size > 0 else "no",
            output_dir=output_dir,
            save_total_limit=10, #Giới hạn số lượng checkpoint được lưu.
            load_best_model_at_end=True if val_set_size > 0 else False,
            group_by_length=group_by_length,
            gradient_checkpointing=True,
        ),
        data_collator=transformers.DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
    )
    model.config.use_cache = False
    
    #Training Model :
    trainer.train(resume_from_checkpoint=resume_from_checkpoint)

    model.save_pretrained(output_dir)
    
    print('Completion Training !')

In [None]:
fire.Fire(train)

Training Alpaca-LoRA model with params:
base_model: huggyllama/llama-7b
data_path: yahma/alpaca-cleaned
output_dir: /kaggle/working/OutputModel
batch_size: 128
micro_batch_size: 4
num_epochs: 3
learning_rate: 0.0003
cutoff_len: 256
val_set_size: 2000
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: ['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj', 'embed_tokens', 'lm_head']
train_on_inputs: True
add_eos_token: False
group_by_length: False
resume_from_checkpoint: False
prompt template: Alpaca



config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

trainable params: 20,566,016 || all params: 6,758,981,632 || trainable%: 0.3043


Map:   0%|          | 0/49760 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

2024-06-12 16:27:51.164261: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-12 16:27:51.164407: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-12 16:27:51.303071: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011115984177777389, max=1.0…



Epoch,Training Loss,Validation Loss
