In [22]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import transformers
from peft import PeftModel, PeftConfig
from datasets import Dataset
import pandas as pd
import os

In [23]:
data_path = '...'   # my data path

# df = pd.read_json(data_path, lines=True)    # if jsonl file  
# df = pd.read_json(data_path)        # if json file
df = pd.read_csv(data_path)       # if csv file

my_data = Dataset.from_dict(df)

질문 답변 셋의 데이터 필요 -> 컬럼명 바꾸기: 질문 -> instruction / 답변 -> output

In [None]:
my_data = my_data.map(
lambda x: {'text': f"질문: {x['instruction']}\n답변: {x['output']}<|endoftext|>" }
)

my_data = my_data.map(lambda samples: tokenizer(samples["text"]), batched=True)

In [25]:
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1
if ddp:
    device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}

In [26]:
model_id = "kfkas/Llama-2-ko-7b-Chat"      # 원하는 모델 허깅페이스에서 불러오기
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [28]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

DataParallelism : GPU가 여러 개일 때 균등하게 RAM 용량 부여

In [29]:
if not ddp and torch.cuda.device_count() > 1:
    # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
    model.is_parallelizable = True
    model.model_parallel = True

In [30]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query_key_value", "xxx"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

In [32]:
# needed for gpt-neo-x tokenizer
tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=my_data,    # 나의 데이터로 설정
    args=transformers.TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        warmup_steps=200,
        # max_steps=1000, 
        num_train_epochs= 3,  
        learning_rate=2e-4,
        fp16=True,
        logging_steps=100,
        output_dir="output",
        optim="paged_adamw_8bit",
        ddp_find_unused_parameters=False if ddp else None
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,1.8617
200,1.5517
300,1.5432
400,1.489
500,1.5365
600,1.3363
700,1.3395
800,1.1892
900,1.0481
1000,1.0537




TrainOutput(global_step=1000, training_loss=1.3948916625976562, metrics={'train_runtime': 5355.7469, 'train_samples_per_second': 0.747, 'train_steps_per_second': 0.187, 'total_flos': 6.983893950824448e+16, 'train_loss': 1.3948916625976562, 'epoch': 4.0})

모델 저장

In [None]:
output_dir = 'output'
model.save_pretrained(output_dir)

In [None]:
model.eval()
model.config.use_cache = True  # silence the warnings. Please re-enable for inference!

In [None]:
def gen(x):
    q = f"### 질문: {x}\n\n### 답변:"
    # print(q)
    gened = model.generate(
        **tokenizer(
            q,
            return_tensors='pt',
            return_token_type_ids=False
        ).to('cuda'),
        max_new_tokens=300,     # 답변 양 (크면 더 많은 답변 대답)
        early_stopping=True,
        # do_sample=True,
        eos_token_id=2,
    )
    print(tokenizer.decode(gened[0]))

In [None]:
gen('대나무가 높을까? 아파트가 높을까?')