In [1]:
!pip cache purge

Files removed: 40


In [3]:
%%capture
!pip uninstall -y bitsandbytes
!pip install -U bitsandbytes

In [4]:
%%capture
!pip install datasets
!pip install accelerate
!pip install peft

In [5]:
%%capture
!pip install -U trl

In [22]:
%%capture
!pip install --upgrade typing_extensions

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [55]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [56]:
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
from transformers import TrainingArguments
from peft import PeftModel

In [57]:
from transformers import pipeline

In [58]:
# Read the training CSV; 'utf-8-sig' decodes UTF-8 and drops a leading BOM if present.
dataset = load_dataset(
    "csv",
    data_files="./train.csv",
    encoding="utf-8-sig",
)

In [59]:
# Inspect what was loaded (splits, column names, row counts).
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['ID', 'input', 'output'],
        num_rows: 11263
    })
})


In [60]:
def generate_prompt(examples):
  """Build the training text for one dataset row.

  Args:
    examples: Mapping with an 'input' entry and an 'output' entry; both are
      interpolated into the prompt text.

  Returns:
    A dict with the single key "prompt". Its value is the fixed instruction
    text, a newline, then "input : <input>, output : <output>", with leading
    and trailing whitespace stripped.
  """
  instruction = (
      "You are a helpful assistant specializing in restoring obfuscated Korean reviews. "
      "Your task is to transform the given obfuscated Korean review into a clear, correct, "
      "and natural-sounding Korean review that reflects its original meaning. "
      "Spacing and word length in the output must be restored to the same as in the input. "
      "Do not provide any description. Print only in Korean."
  )
  pair_text = f"input : {examples['input']}, output : {examples['output']}"

  # Instruction and input/output pair are separated by exactly one newline.
  return {"prompt": f"{instruction}\n{pair_text}".strip()}

In [61]:
# Add a 'prompt' column to every training row; the original columns are kept.
train_rows = dataset["train"]
prompted_datasets = train_rows.map(generate_prompt)

In [62]:
# Check that the 'prompt' column was added.
print(prompted_datasets)

Dataset({
    features: ['ID', 'input', 'output', 'prompt'],
    num_rows: 11263
})


In [63]:
# Show the first generated prompt as a sanity check of the prompt format.
prompted_datasets['prompt'][0]

'You are a helpful assistant specializing in restoring obfuscated Korean reviews. Your task is to transform the given obfuscated Korean review into a clear, correct, and natural-sounding Korean review that reflects its original meaning. Spacing and word length in the output must be restored to the same as in the input. Do not provide any description. Print only in Korean.\ninput : 별 한 게토 았깝땀. 왜 싸람듯릭 펼 1캐를 쥰눈징 컥꺾폰 싸람믐롯섞 맒록 섧멍핥쟈닐 탯끎룐눈 녀뮤 퀼교... 야뭍툰 둠 변 닺씨 깍낄 싫훈 굣. 깸삥읊 20여 년 댜녁뵨 곧 중 쩨윌 귑푼 낙팠떤 곶., output : 별 한 개도 아깝다. 왜 사람들이 별 1개를 주는지 겪어본 사람으로서 말로 설명하자니 댓글로는 너무 길고... 아무튼 두 번 다시 가길 싫은 곳. 캠핑을 20여 년 다녀본 곳 중 제일 기분 나빴던 곳.'

In [64]:
model_id = 'beomi/gemma-ko-7b'
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Pad on the right, and reuse the end-of-sequence token as the padding token.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token

In [65]:
def tokenize_function(examples):
  """Tokenize the 'prompt' entries of a batch with the notebook-level tokenizer.

  Args:
    examples: Mapping whose 'prompt' entry holds the text(s) to encode.

  Returns:
    The tokenizer's output for those prompts, produced with truncation
    enabled and padding="max_length".
  """
  # No explicit max_length is passed, so the padded/truncated length comes from
  # the tokenizer's own default -- TODO confirm what that is for this model.
  return tokenizer(examples['prompt'], truncation=True, padding="max_length")

# Tokenize all prompts; batched=True hands tokenize_function batches of rows.
tokenized_datasets = prompted_datasets.map(tokenize_function, batched=True)

In [66]:
# Check which columns tokenization added.
print(tokenized_datasets)

Dataset({
    features: ['ID', 'input', 'output', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 11263
})


In [67]:
# Hold out 10% of the rows for evaluation. The seed is fixed so that the same
# rows land in train/test on every run, which keeps eval losses comparable
# across re-runs of the notebook.
split_dataset = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

In [68]:
# Check the resulting train/test sizes.
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['ID', 'input', 'output', 'prompt', 'input_ids', 'attention_mask'],
        num_rows: 10136
    })
    test: Dataset({
        features: ['ID', 'input', 'output', 'prompt', 'input_ids', 'attention_mask'],
        num_rows: 1127
    })
})


In [15]:
model_id = 'beomi/gemma-ko-7b'

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# device_map={"": 0} places the whole model on device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
)

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/6 [00:00<?, ?it/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/2.93G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/2.92G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/2.99G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/2.92G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/2.37G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [16]:
# Tally parameter element counts by whether they currently require gradients.
trainable_params = 0
non_trainable_params = 0
for param in model.parameters():
  if param.requires_grad:
    trainable_params += param.numel()
  else:
    non_trainable_params += param.numel()

print(f"Trainable Parameters: {trainable_params:,}")
print(f"NOT Trainable Parameters: {non_trainable_params:,}")

Trainable Parameters: 786,607,104
NOT Trainable Parameters: 3,875,536,896


In [17]:
# LoRA adapter settings: rank-4 adapters on the four attention projections only.
lora_config = LoraConfig(
    r=4,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

In [18]:
# Wrap the loaded model with the adapters described by lora_config.
# `model` is rebound to the wrapped model from here on.
model = get_peft_model(model, lora_config)

In [19]:
# Pick the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Switch to training mode, then move the model to the chosen device.
# NOTE(review): the model was loaded with device_map={"": 0}, so this move is
# presumably a no-op on a GPU machine -- confirm before relying on it.
model.train()
model.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GemmaForCausalLM(
      (model): GemmaModel(
        (embed_tokens): Embedding(256000, 3072, padding_idx=0)
        (layers): ModuleList(
          (0-27): 28 x GemmaDecoderLayer(
            (self_attn): GemmaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (

In [31]:
# NOTE(review): this cell's execution count (31) is later than the cell that
# reloads `model` through PeftModel.from_pretrained (30). The recorded
# "trainable params: 0" therefore presumably describes that reloaded model, not
# the freshly wrapped training model -- re-run in file order to confirm.
model.print_trainable_parameters()

trainable params: 0 || all params: 8,540,892,160 || trainable%: 0.0000


In [24]:
# Training hyperparameters for the LoRA fine-tune.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # 4 sequences x 8 accumulation steps = 32 sequences per optimizer step per device.
    gradient_accumulation_steps=8,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    warmup_steps=10,  # learning-rate warmup
    # Evaluate and log once per epoch. The former eval_steps=100 / logging_steps=100
    # were removed: step intervals only apply to the "steps" strategies, so they
    # contradicted these settings without having any effect.
    eval_strategy="epoch",
    logging_strategy="epoch",
    logging_dir="./logs",
    group_by_length=True,
    # NOTE(review): the quantization config uses bnb_4bit_compute_dtype=torch.bfloat16
    # while this enables fp16 mixed precision; consider bf16=True instead on
    # hardware that supports it.
    fp16=True,
)

In [25]:
# Build the trainer on the pre-tokenized splits.
#
# Two arguments of the earlier version were dropped:
# - formatting_func returned x['input_ids'] (token ids), but a formatting function
#   must produce text. The splits already contain 'input_ids', so no text
#   formatting step is needed at all.
# - peft_config repeated the LoRA config although `model` has already been wrapped
#   by get_peft_model(model, lora_config); the adapters must only be attached once.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
)



Converting train dataset to ChatML:   0%|          | 0/10136 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/10136 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/10136 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/1127 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/1127 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/1127 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [26]:
# Run the fine-tuning loop; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.5856,1.383088
2,1.2921,1.278292
3,1.2099,1.233846
4,1.1311,1.2023


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=1580, training_loss=1.275977817969986, metrics={'train_runtime': 3218.111, 'train_samples_per_second': 15.748, 'train_steps_per_second': 0.491, 'total_flos': 6.009973377429504e+17, 'train_loss': 1.275977817969986})

In [29]:
# Directory the trained adapter is written to; a later cell reads it back by this name.
ADAPTER_MODEL = "lora_adapter_7b"
trainer.model.save_pretrained(ADAPTER_MODEL)

In [30]:
# Reload the base model in float16 and attach the trained adapter to it.
model = AutoModelForCausalLM.from_pretrained("beomi/gemma-ko-7b", device_map='auto', torch_dtype=torch.float16)
model = PeftModel.from_pretrained(model, ADAPTER_MODEL, device_map='auto', torch_dtype=torch.float16)

# Fold the LoRA weights into the base weights before saving. Calling
# save_pretrained on the PeftModel itself would only write the adapter files
# again, so 'finetune_weight_1' would not be a standalone fine-tuned checkpoint.
# The merged model gets its own name so `model` keeps referring to the PeftModel.
merged_model = model.merge_and_unload()
merged_model.save_pretrained('finetune_weight_1')

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [70]:
# Load the saved fine-tuned checkpoint onto device 0 for inference.
FINETUNE_MODEL = "./finetune_weight_1"
finetune_model = AutoModelForCausalLM.from_pretrained(
    FINETUNE_MODEL,
    device_map={"": 0},
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [72]:
# Text-generation pipeline over the fine-tuned model, reusing the tokenizer
# that was configured for training.
pipe = pipeline(
    task="text-generation",
    model=finetune_model,
    tokenizer=tokenizer
)

Device set to use cuda:0


In [73]:
def query_to_prompt(query):
  """Build the inference prompt for one obfuscated review.

  Args:
    query: Text interpolated after "input : ".

  Returns:
    The fixed instruction text, a newline, then "input : <query>, output :".
    The final strip() removes the space that follows "output :" in the template.
  """
  instruction = (
      "You are a helpful assistant specializing in restoring obfuscated Korean reviews. "
      "Your task is to transform the given obfuscated Korean review into a clear, correct, "
      "and natural-sounding Korean review that reflects its original meaning. "
      "Spacing and word length in the output must be restored to the same as in the input. "
      "Do not provide any description. Print only in Korean."
  )
  request = f"input : {query}, output : "

  return f"{instruction}\n{request}".strip()

In [74]:
def restore_review(query, query_len):
  """Generate a restored review for `query` and trim it to `query_len` characters.

  Args:
    query: Obfuscated review text; also sets the generation budget
      (max_new_tokens=len(query)).
    query_len: Maximum number of characters kept from the cleaned result.

  Returns:
    The generated continuation after the prompt, cut at the first apostrophe,
    then at the first newline, then to at most `query_len` characters.

  Side effects:
    Prints the full generated text (prompt included).
  """
  prompt = query_to_prompt(query)

  # Low-temperature nucleus sampling; generation may also stop at the EOS token.
  generation_kwargs = dict(
      do_sample=True,
      temperature=0.2,
      top_p=0.9,
      max_new_tokens=len(query),
      eos_token_id=pipe.tokenizer.eos_token_id,
  )
  generated_text = pipe(prompt, **generation_kwargs)[0]['generated_text']
  print(generated_text)

  # Assumes the generated text begins with the prompt itself -- TODO confirm.
  continuation = generated_text[len(prompt):].strip()

  # clean: keep only what precedes the first apostrophe, then the first line.
  before_quote = continuation.partition("'")[0]
  first_line = before_quote.partition("\n")[0]

  return first_line[:query_len]

In [75]:
# Take one training review (row 10) as a smoke-test query and show its prompt.
query = dataset['train']['input'][10]
query_len = len(query)

prompt = query_to_prompt(query)
print(prompt)

You are a helpful assistant specializing in restoring obfuscated Korean reviews. Your task is to transform the given obfuscated Korean review into a clear, correct, and natural-sounding Korean review that reflects its original meaning. Spacing and word length in the output must be restored to the same as in the input. Do not provide any description. Print only in Korean.
input : 편힘 30퓬 넒계 쉰효 밭았써 긷타린는 쭐 봤욺면써 먈 얀항교 윗딱갸 거우 좌훽전을 받았섞 둘얹갼닙 쭈짰쟝 윕균셔 만짤략곡 팔료 똘럽본냅뉘따. 키타림먼 않돼냘 헤돋 큰낳 타 톨럼뾰냅뉜따. 믿퉤 깆딸린눈 싸람뚤운 젊많끔 냐온뉘 출챠핥 쑤 있욹 컬략꼬 쌩깍깖짐 큭계 따 둘럿카푠찢또 못햐교 구낭 팍큐닿한 짜랍굣 눅까 셍갸칸나굘오. 끓쳇써약 야난튁꼬프예 쥬짯햐쿄 셧틀 탸쿄 욜략교 일악귀합닙닥. 군낭 철음붙떠 셔툴 따눈 촉읏로 않넵를 졺 핫턴찌! 쭈윤 낡 읽항씬는 커 앉수러윤갖 헷는떼 쩐헐... 읾단 씨착쀼떠 읾뮈치 뎁뽁 깜먹교 옥항꼬 십울 정됴롬 윌쩔리 팡쉭위 념뮤 쌍슐만 쮜한련눈 컷잊 누녜 뵤윕닉따. 구리코 눈 내린눈 겁 꼭 앉 봔토 됩닢따. 흩뿔립뜻 뽈뿜엾열셔 큰냥 엎는 궤 칼큼한 눅킴밉뉘댜., output :


In [76]:
# Restore the sample query; restore_review also prints the full generated text.
result = restore_review(query, query_len)
print(result)

You are a helpful assistant specializing in restoring obfuscated Korean reviews. Your task is to transform the given obfuscated Korean review into a clear, correct, and natural-sounding Korean review that reflects its original meaning. Spacing and word length in the output must be restored to the same as in the input. Do not provide any description. Print only in Korean.
input : 편힘 30퓬 넒계 쉰효 밭았써 긷타린는 쭐 봤욺면써 먈 얀항교 윗딱갸 거우 좌훽전을 받았섞 둘얹갼닙 쭈짰쟝 윕균셔 만짤략곡 팔료 똘럽본냅뉘따. 키타림먼 않돼냘 헤돋 큰낳 타 톨럼뾰냅뉜따. 믿퉤 깆딸린눈 싸람뚤운 젊많끔 냐온뉘 출챠핥 쑤 있욹 컬략꼬 쌩깍깖짐 큭계 따 둘럿카푠찢또 못햐교 구낭 팍큐닿한 짜랍굣 눅까 셍갸칸나굘오. 끓쳇써약 야난튁꼬프예 쥬짯햐쿄 셧틀 탸쿄 욜략교 일악귀합닙닥. 군낭 철음붙떠 셔툴 따눈 촉읏로 않넵를 졺 핫턴찌! 쭈윤 낡 읽항씬는 커 앉수러윤갖 헷는떼 쩐헐... 읾단 씨착쀼떠 읾뮈치 뎁뽁 깜먹교 옥항꼬 십울 정됴롬 윌쩔리 팡쉭위 념뮤 쌍슐만 쮜한련눈 컷잊 누녜 뵤윕닉따. 구리코 눈 내린눈 겁 꼭 앉 봔토 됩닢따. 흩뿔립뜻 뽈뿜엾열셔 큰냥 엎는 궤 칼큼한 눅킴밉뉘댜., output : 편히 30분 넓게 쉬려고 기다리는 줄 봤으면서 말 안하고 있다가 거의 자리가 났을 때 뛰어보냅니다. 기다리면 안된다 해도 그냥 다 뛰어보냅니다. 믿을 수 있을까 나오니까 주차할 수 있을 거라고 생각할 겨 같게 다 뛰어가보지도 못하고 그냥 박구당한 차라고 누가 생각하나요. 글쎄써야 아난티코브에 주차하고 셔틀 타고 올라고 이야기합니다. 그냥 처음부터 셔틀 다는 쪽으로 안내를 좀 하던지! 주

In [77]:
# Show only the cleaned, length-trimmed restoration.
print(result)

편히 30분 넓게 쉬려고 기다리는 줄 봤으면서 말 안하고 있다가 거의 자리가 났을 때 뛰어보냅니다. 기다리면 안된다 해도 그냥 다 뛰어보냅니다. 믿을 수 있을까 나오니까 주차할 수 있을 거라고 생각할 겨 같게 다 뛰어가보지도 못하고 그냥 박구당한 차라고 누가 생각하나요. 글쎄써야 아난티코브에 주차하고 셔틀 타고 올라고 이야기합니다. 그냥 처음부터 셔틀 다는 쪽으로 안내를 좀 하던지! 주운 날 일하시는 거 안쓰러운가 했는데 전혀... 일단 시착부터 이미지 대표 감먹고 욕하고 싶을 정도로 일처리 빵식이 너무 상술만 찌한려는 것이 눈에 보입니다. 그리고 눈 내리는 거 꼭 안 봐도 됩니다. 허브리듯 폭풍여서 그냥 없는 게 깔끔한 느낌입니다.
