In [1]:
import os
import gc
import sys

## 데이터 관련 라이브러리 로드 

import pandas as pd 
import numpy as np
import re
from tqdm import tqdm

from datasets import Dataset, DatasetDict, load_dataset, concatenate_datasets


## LLM, 딥러닝  관련 라이브러리 로드 

import torch 

from transformers import AutoTokenizer #토크나이저 
from transformers import LlamaForCausalLM,  AutoModelForCausalLM
 # LLM 모델 
from transformers import BitsAndBytesConfig # 양자화 라이브러리 
from transformers import GenerationConfig
from transformers import DataCollatorForLanguageModeling

from peft import PeftModel
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training # 효율적 학습을 위한 라이브러리 , LORA 관련 라이브러리 
from transformers import Trainer, TrainingArguments # 학습 관련된 모델 


load dataset

In [62]:
# Fetch the maywell/ko_wikidata_QA corpus through the `datasets` loader.
dataset = load_dataset(path='maywell/ko_wikidata_QA')
# Bare expression: lets the notebook render the loaded object's summary.
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 137505
    })
})

load tokenizer

In [2]:
# Identifier of the base checkpoint; the model-loading cell reuses this name.
base_model = "beomi/llama-2-ko-7b"
# Matching tokenizer, configured to pad on the right-hand side.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=base_model,
    padding_side='right',
)

In [63]:
# %-style template for a single training sample. The two %s slots are filled,
# in order, with a row's 'instruction' and 'output' fields by gen_prompt.
dataset_prompt = """ ###

### %s 

### %s 

"""

In [64]:
def gen_prompt(element):
    """Render one dataset row into the training-prompt text.

    Args:
        element: A row mapping that provides 'instruction' and 'output' fields.

    Returns:
        A plain dict with a single 'tmp_promt' key holding the filled-in
        `dataset_prompt` template; `Dataset.map` stores it as a new column.
    """
    # Return a plain dict: DatasetDict is a container for split -> Dataset
    # mappings, not for per-row column updates.
    # The (misspelled) key is kept as-is because later cells read 'tmp_promt'.
    return {'tmp_promt': dataset_prompt % (element['instruction'], element['output'])}


# Add the rendered prompt column to the train split.
dataset['train'] = dataset['train'].map(gen_prompt)

In [71]:
# Collect every rendered prompt and dump them into one plain-text corpus file.
lines = [row['tmp_promt'] for row in dataset['train']]
# Explicit UTF-8 so the Korean text is written identically regardless of the
# platform's default locale encoding.
with open('../Dataset/data.txt', 'w', encoding='utf-8') as f:
    f.writelines(lines)

check for mps device

In [42]:
# Select the Metal (MPS) backend when it is available; otherwise fall back to
# the CPU so `mps_device` is always defined and the display line below cannot
# fail with a NameError on machines without MPS.
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
else:
    mps_device = torch.device("cpu")

mps_device

device(type='mps')

In [43]:
# 4-bit quantization settings.
# (The quantization library does not support M1 silicon, so the model is
# downloaded separately, quantized, and then trained in the MPS environment.)
use_4bit, bnb_4bit_compute_dtype = True, "bfloat16"


In [44]:
# Resolve the dtype name stored in bnb_4bit_compute_dtype to the torch attribute
# of the same name.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
# Bare expression: display the resolved dtype.
compute_dtype

torch.bfloat16

In [45]:
# Check whether the MPS backend can work with bfloat16.
# torch.mps.driver_allocated_memory() reports a number of bytes, not a device
# capability level, so comparing it with 8 (the CUDA compute-capability
# threshold) says nothing about bf16 support. Probe the backend directly by
# allocating a tiny bfloat16 tensor instead.
if compute_dtype == torch.bfloat16 and use_4bit:
    bf16_supported = False
    if torch.backends.mps.is_available():
        try:
            torch.zeros(1, dtype=torch.bfloat16, device="mps")
            bf16_supported = True
        except (RuntimeError, TypeError):
            # The backend rejected the dtype; treat bf16 as unsupported.
            bf16_supported = False
    if bf16_supported:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

Your GPU supports bfloat16: accelerate training with bf16=True


Save the base model and tokenizer (quantization is applied afterwards with llama.cpp)

In [None]:

# Load the base checkpoint and pass it through PEFT's k-bit preparation helper.
model = LlamaForCausalLM.from_pretrained(base_model)
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)  # freeze

output_dir = "./prac/output"

# Write the model and the tokenizer side by side into output_dir.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# The former `torch.save({}, <output_dir>/pytorch_model.bin)` step is removed on
# purpose: it wrote an empty state dict under the standard checkpoint file name,
# which either overwrites real weights saved under that name or leaves a bogus
# extra checkpoint file next to them.

In [6]:
# Print the current working directory of the notebook kernel's shell.
!pwd

/Users/inhwancho/Desktop/dino_ai/dino_2nd_LLM/training


In [73]:
import os
# Move the kernel's working directory to the llama.cpp folder two levels up.
# NOTE(review): the path is relative, so re-running this cell changes directory
# again from wherever the kernel currently is.
os.chdir("../../llama.cpp/")

In [None]:
# Run convert.py (from the current directory) on the saved checkpoint directory.
!python convert.py ../dino_2nd_LLM/training/prac/output

In [None]:
# Run the quantize binary: <input f32 GGUF> <output GGUF> <type>, here Q4_0.
!./quantize ../dino_2nd_LLM/training/prac/output/ggml-model-f32.gguf ../dino_2nd_LLM/training/prac/output/ggml-model-f32_q4_0.gguf Q4_0

Create a LoRA adapter

In [6]:
# LoRA settings for causal language-model fine-tuning.
peft_config = LoraConfig(
    r=16,                          # a smaller value means fewer trainable parameters
    lora_alpha=16,                 # scaling factor
    lora_dropout=0.1,              # dropout
    inference_mode=False,          # whether we are training (False = training)
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the current `model` with the adapter configuration above.
model = get_peft_model(model=model, peft_config=peft_config)  # author's note: "not possible"

In [None]:
output_dir = "./prac/LoRA"

# Write the current `model` and the tokenizer side by side into output_dir.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# The former `torch.save({}, <output_dir>/pytorch_model.bin)` step is removed on
# purpose: it only created an empty state-dict file under the standard
# checkpoint name, which carries no weights and can be mistaken for a real one.

In [9]:
# Run convert-lora-to-ggml.py (from the current directory) on the saved LoRA directory.
!python convert-lora-to-ggml.py ../dino_2nd_LLM/training/prac/LoRA

model.layers.0.self_attn.q_proj => blk.0.attn_q.weight.loraA (4096, 16) float32 0.25MB
model.layers.0.self_attn.q_proj => blk.0.attn_q.weight.loraB (4096, 16) float32 0.25MB
model.layers.0.self_attn.v_proj => blk.0.attn_v.weight.loraA (4096, 16) float32 0.25MB
model.layers.0.self_attn.v_proj => blk.0.attn_v.weight.loraB (4096, 16) float32 0.25MB
model.layers.1.self_attn.q_proj => blk.1.attn_q.weight.loraA (4096, 16) float32 0.25MB
model.layers.1.self_attn.q_proj => blk.1.attn_q.weight.loraB (4096, 16) float32 0.25MB
model.layers.1.self_attn.v_proj => blk.1.attn_v.weight.loraA (4096, 16) float32 0.25MB
model.layers.1.self_attn.v_proj => blk.1.attn_v.weight.loraB (4096, 16) float32 0.25MB
model.layers.10.self_attn.q_proj => blk.10.attn_q.weight.loraA (4096, 16) float32 0.25MB
model.layers.10.self_attn.q_proj => blk.10.attn_q.weight.loraB (4096, 16) float32 0.25MB
model.layers.10.self_attn.v_proj => blk.10.attn_v.weight.loraA (4096, 16) float32 0.25MB
model.layers.10.self_attn.v_proj => b

In [19]:
# Run the main binary with the Q4_0 model (-m) and the converted adapter (--lora).
# NOTE(review): no prompt argument is passed on this command line.
! ./main -m ../dino_2nd_LLM/training/prac/output/ggml-model-f32_q4_0.gguf --lora ../dino_2nd_LLM/training/prac/LoRA/ggml-adapter-model.bin

Log start
main: build = 2616 (75cd4c77)
main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.4.0
main: seed  = 1712591948
llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from ../dino_2nd_LLM/training/prac/output/ggml-model-f32_q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = prac
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 46336
llama_model_loader: - kv   3:                       llama.context_length u32              = 2048
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                          llama.block_count u32           

In [76]:
# Switch the working directory back to the training folder.
# NOTE(review): absolute, machine-specific path; it will not exist on another machine.
os.chdir("/Users/inhwancho/Desktop/dino_ai/dino_2nd_LLM/training")


In [2]:
from llama_cpp import Llama

# Location of the Q4_0 GGUF file, relative to the current working directory.
model_path = "./prac/output/ggml-model-f32_q4_0.gguf"

# Rebinds `model` to a llama.cpp-backed instance.
# NOTE(review): n_gpu_layers=1 may offload only one layer; confirm whether
# full offload was intended.
model = Llama(
    model_path=model_path,
    use_mlock=True,      # enable memory lock so the model is not swapped out
    n_gpu_layers=1,      # enable GPU
    n_ctx=2048,          # context window size
)

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from ./prac/output/ggml-model-f32_q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = prac
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 46336
llama_model_loader: - kv   3:                       llama.context_length u32              = 2048
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                          llama.block_count u32              = 32
llama_model_loader: - kv   6:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   7:                 llama.rope.dimension_count u

Run inference with the loaded model

In [6]:
# Prompt text sent to the model; the Korean question asks
# "What kind of plant is chamomile?".
prompt = """
###캐모마일은 어떤 식물인가요?

###
"""

# Request a completion of at most 120 tokens at temperature 0.2.
output = model(prompt = prompt, max_tokens = 120, temperature = 0.2)
# Bare expression: display the returned completion object.
output


llama_print_timings:        load time =    1012.99 ms
llama_print_timings:      sample time =      13.38 ms /   120 runs   (    0.11 ms per token,  8970.62 tokens per second)
llama_print_timings: prompt eval time =    1012.95 ms /    18 tokens (   56.27 ms per token,    17.77 tokens per second)
llama_print_timings:        eval time =    8803.01 ms /   119 runs   (   73.97 ms per token,    13.52 tokens per second)
llama_print_timings:       total time =   10090.26 ms /   137 tokens


{'id': 'cmpl-7bde12b1-22d1-41b2-971d-36e4bff4d5b3',
 'object': 'text_completion',
 'created': 1712637524,
 'model': 'prac/output/ggml-model-f32_q4_0.gguf',
 'choices': [{'text': '\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b\u200b',
   'index': 0,
   'logprobs': None,
   'finish_reason': 'length'}],
 'usage': {'prompt_tokens': 18

In [None]:
# Run the main binary with the Q4_0 model (-m) and the converted adapter (--lora).
# NOTE(review): no prompt argument is passed on this command line.
! ./main -m ../dino_2nd_LLM/training/prac/output/ggml-model-f32_q4_0.gguf --lora ../dino_2nd_LLM/training/prac/LoRA/ggml-adapter-model.bin

In [74]:
# Run the finetune tool on the Q4_0 base model with data.txt as training data.
# Flags set here: checkpoint in/out paths, LoRA output path, --save-every 300,
# --threads 6, --adam-iter 30, --batch 4, --ctx 64 and --use-checkpointing.
# NOTE(review): --checkpoint-in points at the converted adapter file (.bin),
# not at a file written by --checkpoint-out; confirm this is accepted as intended.
# NOTE(review): invoked as `finetune` without `./`, unlike `./quantize`; this
# relies on the binary being resolvable from PATH.
!finetune --model-base ../dino_2nd_LLM/training/prac/output/ggml-model-f32_q4_0.gguf --checkpoint-in  ../dino_2nd_LLM/training/prac/LoRA/ggml-adapter-model.bin --checkpoint-out ../dino_2nd_LLM/training/prac/LoRA/checkpoint-ITERATION.gguf --lora-out ../dino_2nd_LLM/training/prac/LoRA/ggml-lora-ITERATION-f32.gguf --train-data /Users/inhwancho/Desktop/dino_ai/dino_2nd_LLM/Dataset/data.txt --save-every 300 --threads 6 --adam-iter 30 --batch 4 --ctx 64 --use-checkpointing


main: seed: 1712635283
main: model base = '../dino_2nd_LLM/training/prac/output/ggml-model-f32_q4_0.gguf'
llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from ../dino_2nd_LLM/training/prac/output/ggml-model-f32_q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = prac
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 46336
llama_model_loader: - kv   3:                       llama.context_length u32              = 2048
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                          llama.block_count u32              = 32
llama_model_loader: - kv   6:              

In [5]:
from llama_cpp import Llama

# Q4_0 GGUF file, resolved against the current working directory.
model_path = "prac/output/ggml-model-f32_q4_0.gguf"

# Build the llama.cpp-backed model object and bind it to `model`.
model = Llama(
    n_ctx=2048,          # context window size
    use_mlock=True,      # enable memory lock so the model is not swapped out
    n_gpu_layers=1,      # enable GPU
    model_path=model_path,
)

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from prac/output/ggml-model-f32_q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = prac
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 46336
llama_model_loader: - kv   3:                       llama.context_length u32              = 2048
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                          llama.block_count u32              = 32
llama_model_loader: - kv   6:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   7:                 llama.rope.dimension_count u32

In [75]:
# Predict: run the main binary with the Q4_0 model (-m) and the
# ggml-lora-LATEST-f32.gguf adapter (--lora).
# NOTE(review): no prompt argument is passed on this command line.
!main -m ../dino_2nd_LLM/training/prac/output/ggml-model-f32_q4_0.gguf --lora ../dino_2nd_LLM/training/prac/LoRA/ggml-lora-LATEST-f32.gguf

Log start
main: build = 2616 (75cd4c77)
main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.4.0
main: seed  = 1712636494
llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from ../dino_2nd_LLM/training/prac/output/ggml-model-f32_q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = prac
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 46336
llama_model_loader: - kv   3:                       llama.context_length u32              = 2048
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                          llama.block_count u32           