In [1]:
!pip cache purge

Files removed: 236


In [2]:
%%capture
!pip uninstall -y bitsandbytes
!pip install -U bitsandbytes

!pip install datasets
!pip install accelerate
!pip install peft

!pip install -U trl
!pip install --upgrade typing_extensions

In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
from transformers import TrainingArguments
from peft import PeftModel

from transformers import pipeline

In [2]:
import torch
torch.cuda.empty_cache()

In [3]:
print(f"Allocated Memory: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
print(f"Cached Memory: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

Allocated Memory: 0.00 MB
Cached Memory: 0.00 MB


In [4]:
dataset = load_dataset("csv", data_files='./train.csv', encoding = 'utf-8-sig')

In [5]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['ID', 'input', 'output'],
        num_rows: 11263
    })
})


In [6]:
def generate_prompt(examples):
  messages = [
      f"input : {examples['input']}, output : {examples['output']}",
  ]

  prompt = "\n".join([m for m in messages]).strip()

  return {"prompt": prompt}

In [7]:
prompted_datasets = dataset["train"].map(
    generate_prompt,
    # remove_columns=dataset["train"].column_names,
)

In [8]:
print(prompted_datasets)

Dataset({
    features: ['ID', 'input', 'output', 'prompt'],
    num_rows: 11263
})


In [9]:
prompted_datasets['prompt'][0]

'input : 별 한 게토 았깝땀. 왜 싸람듯릭 펼 1캐를 쥰눈징 컥꺾폰 싸람믐롯섞 맒록 섧멍핥쟈닐 탯끎룐눈 녀뮤 퀼교... 야뭍툰 둠 변 닺씨 깍낄 싫훈 굣. 깸삥읊 20여 년 댜녁뵨 곧 중 쩨윌 귑푼 낙팠떤 곶., output : 별 한 개도 아깝다. 왜 사람들이 별 1개를 주는지 겪어본 사람으로서 말로 설명하자니 댓글로는 너무 길고... 아무튼 두 번 다시 가길 싫은 곳. 캠핑을 20여 년 다녀본 곳 중 제일 기분 나빴던 곳.'

In [10]:
model_id = 'beomi/gemma-ko-7b'
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

In [11]:
def tokenize_function(examples):
  tokenized = tokenizer(examples['prompt'], padding="max_length", truncation=True)
  return tokenized

tokenized_datasets = prompted_datasets.map(
    tokenize_function,
    batched=True,
    # num_proc=4  # Use 4 CPU cores
)

In [12]:
print(tokenized_datasets)

Dataset({
    features: ['ID', 'input', 'output', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 11263
})


In [13]:
split_dataset = tokenized_datasets.train_test_split(test_size=0.1)

In [14]:
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['ID', 'input', 'output', 'prompt', 'input_ids', 'attention_mask'],
        num_rows: 10136
    })
    test: Dataset({
        features: ['ID', 'input', 'output', 'prompt', 'input_ids', 'attention_mask'],
        num_rows: 1127
    })
})


In [15]:
ADAPTER_MODEL = "lora_adapter_7b_2"
model = AutoModelForCausalLM.from_pretrained("beomi/gemma-ko-7b", device_map='auto', torch_dtype=torch.float16)
model = PeftModel.from_pretrained(model, ADAPTER_MODEL, 
                                  is_trainable=True, device_map='auto', torch_dtype=torch.float16)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [16]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable Parameters: {trainable_params:,}")

non_trainable_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)
print(f"NOT Trainable Parameters: {non_trainable_params:,}")

Trainable Parameters: 3,211,264
NOT Trainable Parameters: 8,537,680,896


In [17]:
lora_config = LoraConfig(
    target_modules = ["q_proj", "k_proj"],
    # ["q_proj", "k_proj", "v_proj", "o_proj"],
    # target_modules = [
    #     "q_proj", "v_proj", "k_proj", "o_proj",
    #     "gate_proj", "down_proj", "up_proj"
    # ],
    # init_lora_weights = False,
    lora_alpha=32,
    lora_dropout=0.1,
    r=4,
    bias="none",
    task_type="CAUSAL_LM",
)

In [18]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    optim="paged_adamw_32bit",
    eval_strategy="epoch", # epoch 마다 eval
    eval_steps=100, # 모델의 평가 주기
    logging_dir="./logs",
    logging_steps=100,
    warmup_steps=10, # 학습률 스케줄링
    logging_strategy="epoch", # epoch 마다 log
    learning_rate=2e-4,
    group_by_length=True,
    fp16=True
)

In [19]:
trainer = SFTTrainer(
    model=model,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    args=training_args,
    peft_config=lora_config,
    formatting_func=lambda x: x['input_ids'],
    # compute_metrics=compute_metrics,
)



Converting train dataset to ChatML:   0%|          | 0/10136 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/10136 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/10136 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/1127 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/1127 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1127 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.7386,1.778242
2,1.6782,1.75737
3,1.6161,1.741204
4,1.5052,1.739605


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=1580, training_loss=1.61899427582946, metrics={'train_runtime': 1632.2358, 'train_samples_per_second': 31.049, 'train_steps_per_second': 0.968, 'total_flos': 4.324872732191539e+17, 'train_loss': 1.61899427582946})

In [21]:
ADAPTER_MODEL = "lora_adapter_7b_3"
trainer.model.save_pretrained(ADAPTER_MODEL)

In [22]:
model = AutoModelForCausalLM.from_pretrained("beomi/gemma-ko-7b", device_map='auto', torch_dtype=torch.float16)
model = PeftModel.from_pretrained(model, ADAPTER_MODEL, device_map='auto', torch_dtype=torch.float16)
model.save_pretrained('finetune_weight_3')



Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]



In [23]:
FINETUNE_MODEL = "./finetune_weight_3"

finetune_model = AutoModelForCausalLM.from_pretrained(
    FINETUNE_MODEL, device_map= {"":0})

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [24]:
pipe = pipeline(
    task="text-generation",
    model=finetune_model,
    tokenizer=tokenizer
)

Device set to use cuda:0


In [25]:
def query_to_prompt(query):
  messages = [
      {
          "role": "system",
          "content": (
              "You are a helpful assistant specializing in restoring obfuscated Korean reviews. "
              "Your task is to transform the given obfuscated Korean review into a clear, correct, "
              "and natural-sounding Korean review that reflects its original meaning. "
              "Spacing and word length in the output must be restored to the same as in the input. "
              "Do not provide any description. Print only in Korean."
          )
      },
      {
          "role": "user",
          "content": f"input : {query}, output : "
      },
  ]

  prompt = "\n".join([m["content"] for m in messages]).strip()

  return prompt

In [26]:
def restore_review(query, query_len):
  prompt = query_to_prompt(query)

  outputs = pipe(
      prompt,
      do_sample=True,
      temperature=0.2,
      top_p=0.9,
      max_new_tokens=len(query),
      eos_token_id=pipe.tokenizer.eos_token_id
  )

  generated_text = outputs[0]['generated_text']
  print(generated_text)
  result = generated_text[len(prompt):].strip()

  # clean
  result = result.split("'")[0]
  result = result.split("\n")[0]
  result = result[:query_len]

  return result

In [29]:
# query = dataset['train']['input'][25]
# query_len = len(dataset['train']['input'][25])
query = dataset['train']['input'][10]
query_len = len(dataset['train']['input'][10])
prompt = query_to_prompt(query)
print(prompt)

You are a helpful assistant specializing in restoring obfuscated Korean reviews. Your task is to transform the given obfuscated Korean review into a clear, correct, and natural-sounding Korean review that reflects its original meaning. Spacing and word length in the output must be restored to the same as in the input. Do not provide any description. Print only in Korean.
input : 편힘 30퓬 넒계 쉰효 밭았써 긷타린는 쭐 봤욺면써 먈 얀항교 윗딱갸 거우 좌훽전을 받았섞 둘얹갼닙 쭈짰쟝 윕균셔 만짤략곡 팔료 똘럽본냅뉘따. 키타림먼 않돼냘 헤돋 큰낳 타 톨럼뾰냅뉜따. 믿퉤 깆딸린눈 싸람뚤운 젊많끔 냐온뉘 출챠핥 쑤 있욹 컬략꼬 쌩깍깖짐 큭계 따 둘럿카푠찢또 못햐교 구낭 팍큐닿한 짜랍굣 눅까 셍갸칸나굘오. 끓쳇써약 야난튁꼬프예 쥬짯햐쿄 셧틀 탸쿄 욜략교 일악귀합닙닥. 군낭 철음붙떠 셔툴 따눈 촉읏로 않넵를 졺 핫턴찌! 쭈윤 낡 읽항씬는 커 앉수러윤갖 헷는떼 쩐헐... 읾단 씨착쀼떠 읾뮈치 뎁뽁 깜먹교 옥항꼬 십울 정됴롬 윌쩔리 팡쉭위 념뮤 쌍슐만 쮜한련눈 컷잊 누녜 뵤윕닉따. 구리코 눈 내린눈 겁 꼭 앉 봔토 됩닢따. 흩뿔립뜻 뽈뿜엾열셔 큰냥 엎는 궤 칼큼한 눅킴밉뉘댜., output :


In [30]:
result = restore_review(query, query_len)
print(result)

You are a helpful assistant specializing in restoring obfuscated Korean reviews. Your task is to transform the given obfuscated Korean review into a clear, correct, and natural-sounding Korean review that reflects its original meaning. Spacing and word length in the output must be restored to the same as in the input. Do not provide any description. Print only in Korean.
input : 편힘 30퓬 넒계 쉰효 밭았써 긷타린는 쭐 봤욺면써 먈 얀항교 윗딱갸 거우 좌훽전을 받았섞 둘얹갼닙 쭈짰쟝 윕균셔 만짤략곡 팔료 똘럽본냅뉘따. 키타림먼 않돼냘 헤돋 큰낳 타 톨럼뾰냅뉜따. 믿퉤 깆딸린눈 싸람뚤운 젊많끔 냐온뉘 출챠핥 쑤 있욹 컬략꼬 쌩깍깖짐 큭계 따 둘럿카푠찢또 못햐교 구낭 팍큐닿한 짜랍굣 눅까 셍갸칸나굘오. 끓쳇써약 야난튁꼬프예 쥬짯햐쿄 셧틀 탸쿄 욜략교 일악귀합닙닥. 군낭 철음붙떠 셔툴 따눈 촉읏로 않넵를 졺 핫턴찌! 쭈윤 낡 읽항씬는 커 앉수러윤갖 헷는떼 쩐헐... 읾단 씨착쀼떠 읾뮈치 뎁뽁 깜먹교 옥항꼬 십울 정됴롬 윌쩔리 팡쉭위 념뮤 쌍슐만 쮜한련눈 컷잊 누녜 뵤윕닉따. 구리코 눈 내린눈 겁 꼭 앉 봔토 됩닢따. 흩뿔립뜻 뽈뿜엾열셔 큰냥 엎는 궤 칼큼한 눅킴밉뉘댜., output : 편히 30분 넘게 신호 받아서 기다리는 줄 봤으면서 말 안하고 있다가 겨우 자리를 받아서 들어가니 주차장 입구서 만차라고 바로 돌려보냅니다. 기다리면 안되나 해도 그냥 다 돌려보냅니다. 미친 기다리는 사람들은 잠깐 밖으로 나오니 주차할 수 있을 거라고 생각할 거라고 쌩각할지 크게 다 둘러가보지도 못하고 그냥 빠꾸당한 차라고 느까 생각하나요. 그쳐서야 아난티코브에 주차하고 셔틀 타고 오라