# **Ver. 1 (폐기코드)**

In [None]:
!pip install datasets
!pip install transformers
!pip install peft

In [None]:
!pip install trl

In [None]:
# coding=utf-8
import os
import torch
from dataclasses import dataclass, field
from typing import Optional
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
)
from trl import SFTTrainer

In [None]:
@dataclass
class ScriptArguments:
    """Options for the Ver. 1 fine-tuning script.

    The fields are parsed from the command line by ``HfArgumentParser`` in the
    ``__main__`` cell and then handed to ``train_model``. They fall into four
    groups: batch/optimisation settings, LoRA hyper-parameters (``lora_*``),
    4-bit quantization settings (``use_4bit``, ``use_nested_quant``,
    ``bnb_4bit_*``) and trainer/bookkeeping settings.
    """
    local_rank: Optional[int] = field(default=-1, metadata={"help": "Used for multi-gpu"})
    per_device_train_batch_size: Optional[int] = field(default=4)
    gradient_accumulation_steps: Optional[int] = field(default=4)
    learning_rate: Optional[float] = field(default=2e-4)
    max_grad_norm: Optional[float] = field(default=0.3)
    lora_alpha: Optional[int] = field(default=16)
    lora_dropout: Optional[float] = field(default=0.1)
    lora_r: Optional[int] = field(default=64)
    max_seq_length: Optional[int] = field(default=512)
    model_name: Optional[str] = field(default="meta-llama/Llama-2-7b-hf") # base model checkpoint
    dataset_name: Optional[str] = field(default="boolq") # dataset id handed to datasets.load_dataset, e.g. load_dataset("boolq")
    use_4bit: Optional[bool] = field(default=True) # load the base model in 4-bit
    use_nested_quant: Optional[bool] = field(default=False)
    bnb_4bit_compute_dtype: Optional[str] = field(default="float16")
    bnb_4bit_quant_type: Optional[str] = field(default="nf4")
    num_train_epochs: Optional[int] = field(default=1) # defaults to a single epoch
    fp16: Optional[bool] = field(default=False)
    bf16: Optional[bool] = field(default=False)
    packing: Optional[bool] = field(default=False)
    gradient_checkpointing: Optional[bool] = field(default=True)
    optim: Optional[str] = field(default="paged_adamw_32bit") # optimizer
    lr_scheduler_type: str = field(default="constant")
    max_steps: int = field(default=10000)
    warmup_ratio: float = field(default=0.03)
    group_by_length: bool = field(default=True)
    save_steps: int = field(default=10)
    logging_steps: int = field(default=10)
    merge_and_push: Optional[bool] = field(default=False)
    output_dir: str = field(default="./results") # directory for checkpoints and outputs

In [None]:
def create_and_prepare_model(args):
    """Build the quantized base model, the LoRA configuration and the tokenizer.

    Args:
        args: Object exposing ``model_name``, ``use_4bit``, ``use_nested_quant``,
            ``bnb_4bit_quant_type``, ``bnb_4bit_compute_dtype`` (the name of a
            ``torch`` dtype attribute), ``lora_alpha``, ``lora_dropout`` and
            ``lora_r``.

    Returns:
        A ``(model, peft_config, tokenizer)`` tuple.
    """
    # The compute dtype arrives as a string and is resolved to the torch attribute of that name.
    quantization = BitsAndBytesConfig(
        load_in_4bit=args.use_4bit,
        bnb_4bit_quant_type=args.bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=getattr(torch, args.bnb_4bit_compute_dtype),
        bnb_4bit_use_double_quant=args.use_nested_quant,
    )

    # {"": 0} puts the whole model on GPU 0; use device_map="auto" for multi-GPU setups.
    base_model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        quantization_config=quantization,
        device_map={"": 0},
        use_auth_token=True,
    )
    base_model.config.pretraining_tp = 1

    lora_settings = LoraConfig(
        r=args.lora_r,
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # The end-of-sequence token doubles as the padding token.
    text_tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True)
    text_tokenizer.pad_token = text_tokenizer.eos_token

    return base_model, lora_settings, text_tokenizer


In [None]:
def train_model(args):
    """Fine-tune the configured model with LoRA via ``SFTTrainer``.

    When ``args.merge_and_push`` is set, the trained adapter is saved under
    ``<output_dir>/final_checkpoints``, reloaded, merged into the base weights
    and written to ``<output_dir>/final_merged_checkpoint``.

    Args:
        args: A ``ScriptArguments``-like object; its training fields are
            forwarded to ``TrainingArguments`` / ``SFTTrainer``.

    Raises:
        ValueError: If the loaded dataset reports its column names and has no
            ``"text"`` column, which ``SFTTrainer`` is configured to read.
    """
    # Load and validate the data before the model: a schema problem should
    # surface immediately rather than after the expensive model load.
    dataset = load_dataset(args.dataset_name, split="train")
    columns = getattr(dataset, "column_names", None)
    if columns is not None and "text" not in columns:
        raise ValueError(
            f"Dataset '{args.dataset_name}' has no 'text' column (found: {list(columns)}); "
            "SFTTrainer is configured with dataset_text_field='text'. "
            "Add a 'text' column (e.g. with dataset.map) before training."
        )

    model, peft_config, tokenizer = create_and_prepare_model(args)
    model.config.use_cache = False

    # Fix the unusual overflow issue in fp16 training.
    tokenizer.padding_side = "right"

    training_arguments = TrainingArguments(
        output_dir=args.output_dir,
        per_device_train_batch_size=args.per_device_train_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        optim=args.optim,
        save_steps=args.save_steps,
        logging_steps=args.logging_steps,
        learning_rate=args.learning_rate,
        fp16=args.fp16,
        bf16=args.bf16,
        max_grad_norm=args.max_grad_norm,
        max_steps=args.max_steps,
        warmup_ratio=args.warmup_ratio,
        group_by_length=args.group_by_length,
        lr_scheduler_type=args.lr_scheduler_type,
        # These two options are declared on the argument dataclass; forward them
        # so that setting them actually has an effect. Note that a positive
        # max_steps still takes precedence over num_train_epochs.
        num_train_epochs=args.num_train_epochs,
        gradient_checkpointing=args.gradient_checkpointing,
    )

    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        peft_config=peft_config,
        dataset_text_field="text",
        max_seq_length=args.max_seq_length,
        tokenizer=tokenizer,
        args=training_arguments,
        packing=args.packing,
    )

    trainer.train()

    if args.merge_and_push:
        import gc

        output_dir = os.path.join(args.output_dir, "final_checkpoints")
        trainer.model.save_pretrained(output_dir)

        # Free up memory for merging weights. The trainer keeps its own
        # reference to the model, so both names must be dropped before the
        # CUDA cache can actually be released.
        del model
        del trainer
        gc.collect()
        torch.cuda.empty_cache()

        from peft import AutoPeftModelForCausalLM
        model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
        model = model.merge_and_unload()
        output_merged_dir = os.path.join(args.output_dir, "final_merged_checkpoint")
        model.save_pretrained(output_merged_dir, safe_serialization=True)


In [None]:
if __name__ == "__main__":
    parser = HfArgumentParser(ScriptArguments)
    # Inside a notebook, sys.argv carries the kernel's own flags (e.g. "-f <connection file>").
    # With the default strict parsing those unknown flags raise; return_remaining_strings=True
    # collects them in a trailing list instead, and element [0] is still the parsed dataclass.
    # Trade-off: a mistyped option on a real command line is ignored rather than rejected.
    script_args = parser.parse_args_into_dataclasses(return_remaining_strings=True)[0]
    train_model(script_args)

# **Ver 2.**

In [1]:
!pip install datasets
!pip install transformers
!pip install peft
!pip install bitsandbytes
!pip install accelerate

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

In [2]:
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
import bitsandbytes
import accelerate

In [3]:
import huggingface_hub

In [4]:
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Total token budget (question + answer) of one encoded example
max_length = 128

# --- Model loading ---
load_in_4bit = True         # True: 4-bit quantized load, False: 8-bit load

# --- LoRA ---
model_type = "llama"        # "falcon" or "llama"
lora_r = 16                 # Bottleneck size between the A and B matrices
lora_alpha = 16             # How much to weigh LoRA params over pretrained params
lora_dropout = 0.1          # Dropout on the LoRA weights to limit overfitting
lora_bias = "all"           # "all" or "none"

# Names of the modules (as they appear in the state_dict) that receive LoRA adapters
if model_type == "falcon":
    lora_target_modules = [
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ]
else:
    lora_target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ]

# --- Trainer ---
output_dir = "outputs"              # Where checkpoints are written
optim_type = "adamw_8bit"           # Optimizer used for training
learning_rate = 5e-4                # Learning rate
weight_decay = 2e-3                 # Weight decay
per_device_train_batch_size = 1     # Train batch size per GPU
per_device_eval_batch_size = 1      # Eval batch size per GPU
gradient_accumulation_steps = 16    # Micro-batches accumulated per optimizer update
warmup_steps = 5                    # Learning-rate warmup steps
save_steps = 100                    # Steps between checkpoints
logging_steps = 100                 # Steps between log lines


In [None]:
# pip list

In [6]:
# Resolve the checkpoint once; the model and the tokenizer are loaded from the same id.
_model_id = "tiiuae/falcon-7b" if model_type == "falcon" else "meta-llama/Llama-2-7b-hf"

# Only the quantization arguments differ between the 4-bit and the 8-bit load.
if load_in_4bit == True:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_use_double_quant=True,
    )
    _quantization_kwargs = {"quantization_config": bnb_config}
else:
    _quantization_kwargs = {"load_in_8bit": True}

model = AutoModelForCausalLM.from_pretrained(
    _model_id,
    trust_remote_code=True,
    device_map="auto",
    **_quantization_kwargs,
)

# Matching tokenizer; the end-of-sequence token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token


Downloading (…)lve/main/config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [7]:
# Load in the dataset and map using the tokenizer
dataset = load_dataset("squad") # dataset changed
# The bare string below is a note describing the target encoding. Because it is the
# last expression of the cell, the notebook also echoes it as the cell output.
"""
The dataset has context, questions, and answers.

For this example, I am just encoding the question and first answer.
when you would actually want the context and question.

We want the text string to be in the format
#### Human: {question}#### Assistant: {output}

We want to turn this into the format:
{
    "input_ids": input ids for the encoded instruction and input
    "labels": This is the input ids, but we put -100 where we want to mask the
                loss. We want to mask the loss for the instruction, input, and padding.
                We use -100 because PyTorch CrossEntropy ignores -100 labels.
    "attention_mask": attention mask so the model doesn't attend to padding
}
"""

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

'\nThe dataset has context, questions, and answers.\n\nFor this example, I am just encoding the question and first answer.\nwhen you would actually want the context and question.\n\nWe want the text string to be in the format\n#### Human: {question}#### Assistant: {output}\n\nWe want to turn this into the format:\n{\n    "input_ids": input ids for the encoded instruction and input\n    "labels": This is the input ids, but we put -100 where we want to mask the\n                loss. We want to mask the loss for the instruction, input, and padding.\n                We use -100 because PyTorch CrossEntropy ignores -100 labels.\n    "attention_mask": attention mask so the model doesn\'t attend to padding\n}\n'

In [8]:
def map_function(example):
    """Encode one SQuAD example as a fixed-length causal-LM training sample.

    The text is laid out as ``#### Human: {question}#### Assistant: {answer}``,
    using the first reference answer. Only the answer tokens (plus the closing
    end-of-sequence token) contribute to the loss.

    Relies on the notebook globals ``tokenizer`` and ``max_length``
    (``max_length`` is expected to be at least 2).

    Args:
        example: Mapping with a ``"question"`` string and ``"answers"["text"]``,
            a non-empty list of answer strings.

    Returns:
        Dict with ``input_ids``, ``labels`` and ``attention_mask``, each a list
        of exactly ``max_length`` ints. ``labels`` is -100 on the question and
        on the padding.
    """
    # Get the question and model output
    question = f"#### Human: {example['question'].strip()}"
    output = f"#### Assistant: {example['answers']['text'][0].strip()}"

    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_id

    # Encode the question with the tokenizer's usual special tokens, but cap it
    # so that at least one position is left for the answer. Without the cap a
    # long question makes the remaining budget zero or negative.
    question_ids = tokenizer(question)["input_ids"][: max_length - 1]
    budget = max_length - len(question_ids)

    # Encode the answer WITHOUT special tokens: it continues the question, so a
    # second beginning-of-sequence token in the middle would be wrong. Always
    # close it with EOS so the model is trained to stop after the answer.
    answer_ids = tokenizer(output, add_special_tokens=False)["input_ids"]
    answer_ids = answer_ids[: budget - 1] + [eos_id]

    # Pad explicitly on the right so the layout does not depend on the
    # tokenizer's configured padding side.
    pad_len = budget - len(answer_ids)
    input_ids = question_ids + answer_ids + [pad_id] * pad_len

    # The labels are the input ids, but the loss is masked (-100) on the question and the padding
    labels = [-100] * len(question_ids) + answer_ids + [-100] * pad_len

    # Attend to the question and the answer, never to the padding
    attention_mask = [1] * (len(question_ids) + len(answer_ids)) + [0] * pad_len

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": attention_mask
    }

# Encode both SQuAD splits with the same per-example function (train first, then validation).
data_train, data_test = (
    dataset[split_name].map(map_function) for split_name in ("train", "validation")
)


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [9]:
# Wrap the loaded model with trainable LoRA adapters
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=lora_bias,
    target_modules=lora_target_modules,
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


# Training configuration; every value comes from the hyper-parameter cell
training_args = TrainingArguments(
    output_dir=output_dir,
    do_train=True,
    evaluation_strategy="epoch",  # run evaluation once per epoch
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim_type,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    warmup_steps=warmup_steps,
    logging_steps=logging_steps,
    save_steps=save_steps,
)

# Train on the encoded train split, evaluate on the encoded validation split
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_test,
)


trainable params: 39,976,960 || all params: 6,778,392,576 || trainable%: 0.589770503135875


In [10]:
# Train the model
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored

In [None]:
# Get perplexity
import math

result = trainer.evaluate()

# evaluate() reports the mean evaluation loss under "eval_loss"; perplexity is its exponential.
# Guard the key and the overflow so a missing or huge loss does not abort the cell.
if "eval_loss" in result:
    try:
        result["perplexity"] = math.exp(result["eval_loss"])
    except OverflowError:
        result["perplexity"] = float("inf")

print(result)

# **Inference**

In [None]:
# model_path = "outputs/merged_model"
# Path to the combined weights

# Prompt should be in this style due to how the data was created
prompt = "#### Human: What is the capital of South Korea?#### Assistant:"

'''
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_use_double_quant=True,
    )
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    device_map=device,
    # load_in_8bit=True,
    quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
'''

inputs = tokenizer(prompt, return_tensors="pt")

# Put the inputs on whatever device the model lives on instead of hard-coding 'cuda'.
inputs = inputs.to(model.device)

# Not every tokenizer returns token_type_ids, so an unconditional `del` raises KeyError
# when the key is absent. pop() drops it if present and is a no-op otherwise.
inputs.pop("token_type_ids", None)

# Mind the generation options: sampling is enabled, so the output differs between runs.
generated_ids = model.generate(**inputs, do_sample=True, top_p=0.95, top_k=60, max_new_tokens=100)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# The prompt itself ends with "#### Assistant:" (no trailing space), so splitting on exactly
# that marker always yields an element [1], even when the model's first generated character
# is not a space. strip() removes the separating whitespace.
print(output.split("#### Assistant:")[1].strip())


# **Test (Accuracy) -  미완성 코드, 추후 보완필요**

**squad 데이터셋에는 train / validation dataset 만 존재하기 때문에, squad_v2 dataset 의 일부를 활용하여 Test 수행**

In [None]:
# Load SQuAD v2 and keep its "train" split as the ground-truth data for the test cells.
# Note: this rebinds `dataset`, which until now referred to the "squad" dataset used for training.
dataset = load_dataset("squad_v2")
gt_dataset = dataset["train"]

In [None]:
def map_function_for_test(dataset, generation):
    """Encode a question together with a generated answer as a fixed-length sample.

    Uses the same ``#### Human: {question}#### Assistant: {answer}`` layout and
    the same encoding rules as the training-time encoder, with ``generation``
    in place of the reference answer.

    Relies on the notebook globals ``tokenizer`` and ``max_length``
    (``max_length`` is expected to be at least 2).

    Args:
        dataset: A single example mapping with a ``"question"`` string.
        generation: The generated answer text to encode.

    Returns:
        Dict with ``input_ids``, ``labels`` and ``attention_mask``, each a list
        of exactly ``max_length`` ints. ``labels`` is -100 on the question and
        on the padding.
    """
    # Get the question and model output
    question = f"#### Human: {dataset['question'].strip()}"
    output = f"#### Assistant: {generation}"

    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_id

    # Cap the question so at least one position is left for the answer; otherwise
    # a long question makes the remaining budget zero or negative.
    question_ids = tokenizer(question)["input_ids"][: max_length - 1]
    budget = max_length - len(question_ids)

    # The answer continues the question: encode it without special tokens (no
    # second beginning-of-sequence token mid-sequence) and close it with EOS.
    answer_ids = tokenizer(output, add_special_tokens=False)["input_ids"]
    answer_ids = answer_ids[: budget - 1] + [eos_id]

    # Pad explicitly on the right, independent of the tokenizer's padding side.
    pad_len = budget - len(answer_ids)
    input_ids = question_ids + answer_ids + [pad_id] * pad_len

    # The labels are the input ids, but the loss is masked (-100) on the question and the padding
    labels = [-100] * len(question_ids) + answer_ids + [-100] * pad_len

    # Attend to the question and the answer, never to the padding
    attention_mask = [1] * (len(question_ids) + len(answer_ids)) + [0] * pad_len

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": attention_mask
    }


In [None]:
def inference_for_test(model, dataset, num_examples=100):
    """Generate an answer for each of the first questions of ``dataset["train"]``.

    Each question is wrapped in the training prompt format
    (``#### Human: {question}#### Assistant:``), passed through
    ``model.generate`` and the text after the assistant marker is collected.
    Relies on the notebook global ``tokenizer``.

    Args:
        model: Model exposing ``generate`` and ``device``.
        dataset: Mapping whose ``"train"`` entry supports ``len()`` and integer
            indexing, each item being a mapping with a ``"question"`` string.
        num_examples: How many examples from the top of the split to run
            (default 100, capped at the split size).

    Returns:
        List of generated answer strings, in dataset order.
    """
    split = dataset["train"]
    predictions = []

    for index in range(min(num_examples, len(split))):
        example = split[index]
        prompt = f"#### Human: {example['question'].strip()}#### Assistant:"

        inputs = tokenizer(prompt, return_tensors="pt")
        inputs = inputs.to(model.device)

        # Not every tokenizer returns token_type_ids; drop the key only if it is there.
        inputs.pop("token_type_ids", None)

        # Mind the generation options: sampling is enabled, so results vary between runs.
        generated_ids = model.generate(**inputs, do_sample=True, top_p=0.95, top_k=60, max_new_tokens=100)
        decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        # The prompt always contains the marker, so element [1] exists.
        predictions.append(decoded.split("#### Assistant:")[1].strip())

    return predictions

In [None]:
# Evaluation
import evaluate

accuracy_metric = evaluate.load("accuracy")
# Placeholder call: both lists are empty, so nothing is actually scored yet.
# NOTE(review): presumably the predictions/references must first be converted into
# whatever form the "accuracy" metric accepts - confirm before plugging in generated text.
results = accuracy_metric.compute(references=[], predictions=[]) # TODO: replace the empty lists with real references/predictions

print(results)