In [1]:
import os
import gc
import sys

## 데이터 관련 라이브러리 로드 

import pandas as pd 
import numpy as np
import re
from tqdm import tqdm

from datasets import Dataset, DatasetDict, load_dataset, concatenate_datasets


## LLM, 딥러닝  관련 라이브러리 로드 

import torch 

from transformers import AutoTokenizer #토크나이저 
from transformers import LlamaForCausalLM,  AutoModelForCausalLM
 # LLM 모델 
from transformers import BitsAndBytesConfig # 양자화 라이브러리 
from transformers import GenerationConfig
from transformers import DataCollatorForLanguageModeling

from peft import PeftModel
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training # 효율적 학습을 위한 라이브러리 , LORA 관련 라이브러리 
from transformers import Trainer, TrainingArguments # 학습 관련된 모델 


load dataset

In [None]:
#load dataset : maywell/ko_wikidata_QA
dataset = load_dataset('maywell/ko_wikidata_QA')
dataset

load tokenizer

In [2]:
base_model = "beomi/llama-2-ko-7b"
tokenizer = AutoTokenizer.from_pretrained(base_model, padding_side = 'right')

In [None]:
dataset_prompt = """ ###

### %s 

### %s 

"""

In [None]:
def gen_prompt(element):
    return DatasetDict({'tmp_promt': dataset_prompt%(element['instruction'], element['output'])})


dataset['train'] = dataset['train'].map(gen_prompt)

In [None]:
dataset['train'][0]

In [None]:
def tokenize(element):
    
    outputs = tokenizer(
        element['tmp_promt'],
        truncation=True,
        max_length=2048
    )

    return {"input_ids": outputs["input_ids"]}

In [None]:
tokenized_datasets = dataset['train'].map(
    tokenize, batched=True, remove_columns=dataset['train'].column_names
)

In [None]:
tokenized_datasets = tokenized_datasets.train_test_split(test_size = 0.2, shuffle =True)
tokenized_datasets

check for mps device

In [None]:
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    
mps_device

In [None]:
## 4bit quantaziation (양자화 라이브러리가 m1 실리콘에는 지원하지 않기 때문에 모델을 따로 다운로드하여 양자화 진행한 후 mps 환경에서 훈련합니다.)

bnb_4bit_compute_dtype = "bfloat16"
use_4bit = True


In [None]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
compute_dtype

In [None]:
# Check GPU compatibility with bfloat16
if compute_dtype == torch.bfloat16 and use_4bit:
    major = torch.mps.driver_allocated_memory()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

Apply quantization on model and save it

In [3]:

model = LlamaForCausalLM.from_pretrained(base_model)
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True) #freeze

output_dir = "./prac/output"

model.save_pretrained(output_dir)
model_path = os.path.join(output_dir, "pytorch_model.bin")
tokenizer.save_pretrained(output_dir)

torch.save({}, model_path)

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 4096}


In [4]:
!pwd

/Users/inhwancho/Desktop/dino_ai/dino_2nd_LLM/training


In [5]:
import os
os.chdir("../../llama.cpp/")

In [6]:
!python convert.py ../dino_2nd_LLM/training/prac/output

Loading model file ../dino_2nd_LLM/training/prac/output/model-00001-of-00006.safetensors
Loading model file ../dino_2nd_LLM/training/prac/output/model-00001-of-00006.safetensors
Loading model file ../dino_2nd_LLM/training/prac/output/model-00002-of-00006.safetensors
Loading model file ../dino_2nd_LLM/training/prac/output/model-00003-of-00006.safetensors
Loading model file ../dino_2nd_LLM/training/prac/output/model-00004-of-00006.safetensors
Loading model file ../dino_2nd_LLM/training/prac/output/model-00005-of-00006.safetensors
Loading model file ../dino_2nd_LLM/training/prac/output/model-00006-of-00006.safetensors
params = Params(n_vocab=46336, n_embd=4096, n_layer=32, n_ctx=2048, n_ff=11008, n_head=32, n_head_kv=32, n_experts=None, n_experts_used=None, f_norm_eps=1e-05, rope_scaling_type=None, f_rope_freq_base=10000.0, f_rope_scale=None, n_orig_ctx=None, rope_finetuned=None, ftype=None, path_model=PosixPath('../dino_2nd_LLM/training/prac/output'))
Loaded vocab file PosixPath('../dino

In [7]:
!./quantize ../dino_2nd_LLM/training/prac/output/ggml-model-f32.gguf ../dino_2nd_LLM/training/prac/output/ggml-model-f32_q4_0.gguf Q4_0

main: build = 2616 (75cd4c77)
main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.4.0
main: quantizing '../dino_2nd_LLM/training/prac/output/ggml-model-f32.gguf' to '../dino_2nd_LLM/training/prac/output/ggml-model-f32_q4_0.bin' as Q4_0
llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from ../dino_2nd_LLM/training/prac/output/ggml-model-f32.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = prac
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 46336
llama_model_loader: - kv   3:                       llama.context_length u32              = 2048
llama_model_loader: - kv   4:                     llama.embedding_length u

In [8]:
os.chdir("/Users/inhwancho/Desktop/dino_ai/dino_2nd_LLM/training")


In [10]:
from llama_cpp import Llama

model_path = "./prac/output/ggml-model-f32_q4_0.gguf"
model = Llama(model_path = model_path,
              n_ctx = 2048,            # context window size
              n_gpu_layers = 1,        # enable GPU
              use_mlock = True)        # enable memory lock so not swap

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from ./prac/output/ggml-model-f32_q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = prac
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 46336
llama_model_loader: - kv   3:                       llama.context_length u32              = 2048
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                          llama.block_count u32              = 32
llama_model_loader: - kv   6:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   7:                 llama.rope.dimension_count u

In [11]:
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM,
                        inference_mode=False, # 학습하는지  
                        r=16, # 작을 수록 trainable 한 파라미터의 개수가 낮아진다
                        lora_alpha=16,  # scaling factor 
                        lora_dropout=0.1) # dropout 

model = get_peft_model(model, peft_config)


ValueError: Please specify `target_modules` in `peft_config`

load model

In [None]:
prompt = """
###당신은 친절하고 정직한 인공지능 비서입니다. 당신은 항상 유용하고 안전한 답변을 기용하고 유해하거나, 비윤리적이거나, 인종차별적이거나, 성차별적이거나, 위험하거나 불법적인 답변이 포함되어서는 안 됩니다. 당신의 응답은 사회적으로 편견이 없고 긍정적인 내용이어야 합니다. 질문이 의미가 없거나 사실적으로 일관성이 없다면, 옳지 않은 것에 대답하는 대신 이유를 설명하십시오. 질문에 대한 답을 모르는 경우 허위 정보를 공유하지 마십시오.

코딩 공부를 하는 법을 알려줘
"""

output = model(prompt = prompt, max_tokens = 120, temperature = 0.2)
output

In [None]:

trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

In [None]:
def get_max_length(model):
    conf = model.config
    max_length = None
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max length: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length


# Change the max length depending on hardware constraints.
max_length = get_max_length(model)
print(max_length)

In [None]:
tokenizer.pad_token = tokenizer.eos_token

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)


In [None]:
args = TrainingArguments(
    output_dir="common_sense_llama",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=3000,
    logging_steps=1000,
    gradient_accumulation_steps=4,
    num_train_epochs=10,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    fp16=True,
    push_to_hub=False,
    optim = "adamw_torch",
    save_strategy = "steps",
    save_steps = 300,
    save_total_limit=2

)

trainer = Trainer(
    accelerator="mps",
    device=1,
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [None]:
trainer.train()

여기부터 위로 올릴거임