In [1]:
!pip cache purge

Files removed: 40


In [2]:
%%capture
!pip uninstall -y bitsandbytes
!pip install -U bitsandbytes

In [3]:
%%capture
!pip install datasets
!pip install accelerate
!pip install peft

In [4]:
%%capture
!pip install -U trl

In [5]:
%%capture
!pip install --upgrade typing_extensions

In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [2]:
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
from transformers import TrainingArguments
from peft import PeftModel

In [3]:
from transformers import pipeline

In [34]:
# Read the augmented training CSV; the 'utf-8-sig' codec tolerates a leading
# byte-order mark in the file.
dataset = load_dataset(
    "csv",
    data_files='./train_augmented_2.csv',
    encoding='utf-8-sig',
)

Generating train split: 0 examples [00:00, ? examples/s]

In [35]:
# Inspect the loaded DatasetDict: split names, column names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['ID', 'input', 'output', 'restored', 'noised'],
        num_rows: 11263
    })
})


In [36]:
def generate_prompt(examples):
  """Build one multi-line training prompt from a single dataset row.

  Three "input : ..., output : ..." lines are produced, pairing the row's
  'input', 'restored' and 'noised' text (in that order) with the same 'output'
  text. The lines are joined with newlines and whitespace at both ends of the
  joined string is stripped.

  Args:
    examples: mapping with the keys 'input', 'restored', 'noised' and 'output'.

  Returns:
    dict with the single key "prompt" holding the assembled string.
  """
  source_columns = ("input", "restored", "noised")
  lines = (
      f"input : {examples[column]}, output : {examples['output']}"
      for column in source_columns
  )
  return {"prompt": "\n".join(lines).strip()}

In [37]:
# Add a 'prompt' column to every training row; the existing columns are kept.
prompted_datasets = dataset["train"].map(generate_prompt)

Map:   0%|          | 0/11263 [00:00<?, ? examples/s]

In [38]:
# Confirm that the mapped dataset now carries the extra 'prompt' column.
print(prompted_datasets)

Dataset({
    features: ['ID', 'input', 'output', 'restored', 'noised', 'prompt'],
    num_rows: 11263
})


In [39]:
# Show the prompt built for the first row (fetch the row, then the column).
prompted_datasets[0]['prompt']

'input : 별 한 게토 았깝땀. 왜 싸람듯릭 펼 1캐를 쥰눈징 컥꺾폰 싸람믐롯섞 맒록 섧멍핥쟈닐 탯끎룐눈 녀뮤 퀼교... 야뭍툰 둠 변 닺씨 깍낄 싫훈 굣. 깸삥읊 20여 년 댜녁뵨 곧 중 쩨윌 귑푼 낙팠떤 곶., output : 별 한 개도 아깝다. 왜 사람들이 별 1개를 주는지 겪어본 사람으로서 말로 설명하자니 댓글로는 너무 길고... 아무튼 두 번 다시 가길 싫은 곳. 캠핑을 20여 년 다녀본 곳 중 제일 기분 나빴던 곳.\ninput : 별 한 개도 아깝다. 왜 사람들이 별 1개를 주는지 경험한 사람으로서 말로 설명하자니 텍스트로는 너무 길고... 아무튼 두 번 다시 가기 싫은 곳. 캠핑을 20여 년 다녀본 곳 중 제일 기분 나빴던 곳., output : 별 한 개도 아깝다. 왜 사람들이 별 1개를 주는지 겪어본 사람으로서 말로 설명하자니 댓글로는 너무 길고... 아무튼 두 번 다시 가길 싫은 곳. 캠핑을 20여 년 다녀본 곳 중 제일 기분 나빴던 곳.\ninput : 별 한 개됴 야깝댜. 왜 샤람들이 별 1개를 쥬는지 겪여본 샤람으료셔 말료 설명햐쟈니 댓글료는 녀뮤 길교... 야뮤튼 듀 번 댜시 갸길 싫은 곳. 캠핑을 20어 년 댜너본 곳 중 제일 기분 냐빴던 곳., output : 별 한 개도 아깝다. 왜 사람들이 별 1개를 주는지 겪어본 사람으로서 말로 설명하자니 댓글로는 너무 길고... 아무튼 두 번 다시 가길 싫은 곳. 캠핑을 20여 년 다녀본 곳 중 제일 기분 나빴던 곳.'

In [40]:
# Load the tokenizer that belongs to the base checkpoint.
model_id = 'beomi/gemma-ko-7b'
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the EOS token as the padding token and pad on the right-hand side.
# NOTE(review): the model printout further down shows an embedding with
# padding_idx=0, which suggests the checkpoint already has its own pad token —
# confirm that overriding it with EOS is intended.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

In [41]:
def tokenize_function(examples, max_length=None):
  """Tokenize the 'prompt' entries of a batch.

  Requesting padding="max_length" and truncation=True without a length only
  produced warnings with this tokenizer (it has no predefined maximum length)
  and left the sequences unpadded and untruncated. That effective behavior is
  now the explicit default; pass `max_length` to really pad and truncate to a
  fixed size.

  Args:
    examples: batch mapping that contains a 'prompt' entry.
    max_length: optional fixed token length. None (the default) means no
      padding and no truncation.

  Returns:
    The tokenizer output for the batch.
  """
  if max_length is None:
    # Variable-length sequences; padding is left to batching time.
    return tokenizer(examples['prompt'])
  return tokenizer(
      examples['prompt'],
      padding="max_length",
      truncation=True,
      max_length=max_length,
  )

# Tokenize every prompt, handing rows to tokenize_function in batches.
tokenized_datasets = prompted_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/11263 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [42]:
# Confirm that tokenization added the 'input_ids' and 'attention_mask' columns.
print(tokenized_datasets)

Dataset({
    features: ['ID', 'input', 'output', 'restored', 'noised', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 11263
})


In [43]:
# Hold out 10% of the rows for evaluation. A fixed seed keeps the split
# identical across kernel restarts so validation losses stay comparable.
split_dataset = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

In [44]:
# Show the sizes and columns of the resulting train/test splits.
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['ID', 'input', 'output', 'restored', 'noised', 'prompt', 'input_ids', 'attention_mask'],
        num_rows: 10136
    })
    test: Dataset({
        features: ['ID', 'input', 'output', 'restored', 'noised', 'prompt', 'input_ids', 'attention_mask'],
        num_rows: 1127
    })
})


In [19]:
# Directory holding a LoRA adapter (presumably saved by an earlier run — confirm).
ADAPTER_MODEL = "lora_adapter_7b"
# Load the base checkpoint in float16 with automatic device placement.
model = AutoModelForCausalLM.from_pretrained("beomi/gemma-ko-7b", device_map='auto', torch_dtype=torch.float16)
# Attach the adapter on top of the base model; is_trainable=True is requested so
# that training can continue from the adapter's current weights.
model = PeftModel.from_pretrained(model, ADAPTER_MODEL, 
                                  is_trainable=True, device_map='auto', torch_dtype=torch.float16)

config.json:   0%|          | 0.00/668 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/6 [00:00<?, ?it/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/2.93G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/2.92G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/2.99G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/2.92G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/2.37G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [20]:
# Display the module tree to see which layers are wrapped by LoRA.
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GemmaForCausalLM(
      (model): GemmaModel(
        (embed_tokens): Embedding(256000, 3072, padding_idx=0)
        (layers): ModuleList(
          (0-27): 28 x GemmaDecoderLayer(
            (self_attn): GemmaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=3072, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj):

In [21]:
# Tally parameter counts in a single pass, split by whether gradients are enabled.
trainable_params = 0
non_trainable_params = 0
for param in model.parameters():
    if param.requires_grad:
        trainable_params += param.numel()
    else:
        non_trainable_params += param.numel()

print(f"Trainable Parameters: {trainable_params:,}")
print(f"NOT Trainable Parameters: {non_trainable_params:,}")

Trainable Parameters: 3,211,264
NOT Trainable Parameters: 8,537,680,896


In [22]:
# Collect every state-dict entry whose key mentions "lora_A".
lora_a_weights = {
    key: tensor
    for key, tensor in model.state_dict().items()
    if "lora_A" in key
}

# Print the leading rows of the first five matching tensors as a sanity check.
for name, weight in list(lora_a_weights.items())[:5]:
    print(f"{name}:\n{weight[:5]}\n")

base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight:
tensor([[ 0.0196, -0.0005,  0.0051,  ..., -0.0146, -0.0170,  0.0043],
        [-0.0118, -0.0008,  0.0108,  ...,  0.0160, -0.0115, -0.0055],
        [ 0.0006,  0.0159,  0.0060,  ...,  0.0022,  0.0118,  0.0117],
        [ 0.0151,  0.0111, -0.0057,  ..., -0.0030, -0.0148,  0.0075]],
       device='cuda:0')

base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight:
tensor([[ 0.0133, -0.0134, -0.0107,  ...,  0.0104, -0.0093,  0.0022],
        [-0.0066, -0.0068, -0.0155,  ..., -0.0019,  0.0063,  0.0136],
        [-0.0110, -0.0045, -0.0050,  ..., -0.0138,  0.0077, -0.0079],
        [ 0.0159, -0.0320, -0.0012,  ..., -0.0027, -0.0076,  0.0043]],
       device='cuda:0')

base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight:
tensor([[ 0.0058, -0.0134,  0.0018,  ...,  0.0064, -0.0117,  0.0095],
        [ 0.0062, -0.0234,  0.0032,  ..., -0.0149, -0.0008, -0.0170],
        [-0.0028,  0.0039, -0.00

In [26]:
# LoRA hyperparameters: rank-4 adapters on the four attention projections,
# scaling alpha 32, 10% dropout on the LoRA path, no bias terms trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=4,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

In [46]:
# Training hyperparameters for the supervised fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # 8 micro-batches of 8 sequences are accumulated per optimizer step.
    gradient_accumulation_steps=8,
    optim="paged_adamw_32bit",
    # Evaluate and log once per epoch. Step-based intervals (eval_steps /
    # logging_steps) only apply to the "steps" strategy and were contradictory
    # here, so they are no longer passed.
    eval_strategy="epoch",
    logging_strategy="epoch",
    logging_dir="./logs",
    warmup_steps=10,  # learning-rate warmup
    learning_rate=2e-4,
    group_by_length=True,
    fp16=True
)

In [47]:
# Supervised fine-tuning trainer over the pre-tokenized train/test splits.
# NOTE(review): `model` is already a PeftModel with a LoRA adapter attached, yet
# `peft_config` is passed as well — confirm how the installed TRL version
# treats this combination.
# NOTE(review): both splits already contain 'input_ids', and `formatting_func`
# returns token ids rather than text; the progress output lists no formatting
# step, so this callable appears not to be applied — TODO confirm and drop it.
# NOTE(review): the tokenizer configured earlier is not passed to the trainer —
# verify which tokenizer/padding settings it ends up using.
trainer = SFTTrainer(
    model=model,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    args=training_args,
    peft_config=lora_config,
    formatting_func=lambda x: x['input_ids'],
    # compute_metrics=compute_metrics,
)



Converting train dataset to ChatML:   0%|          | 0/10136 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/10136 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/10136 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/1127 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/1127 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1127 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [48]:
# Run fine-tuning; the returned TrainOutput (step count, loss, runtime metrics)
# is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.7882,0.774416
2,0.7287,0.756229
3,0.7049,0.750294
4,0.6715,0.738431


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=790, training_loss=0.7160028771509098, metrics={'train_runtime': 2606.6383, 'train_samples_per_second': 19.443, 'train_steps_per_second': 0.303, 'total_flos': 1.0084180393343386e+18, 'train_loss': 0.7160028771509098})

In [49]:
# Save the trained model held by the trainer to a new directory.
# Note that ADAPTER_MODEL is reassigned here: from this point on it names the
# new output directory, not the adapter that was loaded before training.
ADAPTER_MODEL = "lora_adapter_7b_2"
trainer.model.save_pretrained(ADAPTER_MODEL)

In [50]:
# Reload the base checkpoint in float16 and attach the adapter just saved in
# ADAPTER_MODEL. This rebinds `model`, replacing the training-time object.
model = AutoModelForCausalLM.from_pretrained("beomi/gemma-ko-7b", device_map='auto', torch_dtype=torch.float16)
model = PeftModel.from_pretrained(model, ADAPTER_MODEL, device_map='auto', torch_dtype=torch.float16)
# NOTE(review): `model` is a PeftModel here and no merge step is visible, so this
# presumably writes adapter files rather than a standalone merged checkpoint —
# confirm; call merge_and_unload() first if merged weights are what is wanted.
model.save_pretrained('finetune_weight_2')

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [51]:
# Directory written by the save_pretrained call in the previous cell.
FINETUNE_MODEL = "./finetune_weight_2"

# Load it as a causal LM, mapping the whole model ("" key) to device 0.
# NOTE(review): no torch_dtype is given here, unlike the float16 loads earlier —
# confirm the intended inference precision.
finetune_model = AutoModelForCausalLM.from_pretrained(
    FINETUNE_MODEL, device_map= {"":0})

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [52]:
# Text-generation pipeline around the reloaded fine-tuned model, reusing the
# tokenizer configured earlier in the notebook.
pipe = pipeline(
    task="text-generation",
    model=finetune_model,
    tokenizer=tokenizer
)

Device set to use cuda:0


In [53]:
def query_to_prompt(query):
  """Assemble the inference prompt for one obfuscated review.

  The prompt is a fixed English instruction, a newline, and then
  "input : <query>, output : ". Whitespace at both ends of the combined text
  is stripped, so the returned prompt ends with "output :".

  Args:
    query: the obfuscated review text to restore.

  Returns:
    The prompt string.
  """
  system_instruction = (
      "You are a helpful assistant specializing in restoring obfuscated Korean reviews. "
      "Your task is to transform the given obfuscated Korean review into a clear, correct, "
      "and natural-sounding Korean review that reflects its original meaning. "
      "Spacing and word length in the output must be restored to the same as in the input. "
      "Do not provide any description. Print only in Korean."
  )
  user_turn = f"input : {query}, output : "

  return f"{system_instruction}\n{user_turn}".strip()

In [54]:
def restore_review(query, query_len):
  """Generate a restored review for `query`, trimmed to `query_len` characters.

  Builds the prompt with query_to_prompt, samples a continuation from the
  module-level `pipe`, prints the raw generated text, and then cleans the part
  that follows the prompt: it is cut at the first apostrophe and at the first
  newline before being truncated.

  Args:
    query: obfuscated review text; its character count is also used as
      max_new_tokens for generation.
    query_len: maximum number of characters kept in the returned string.

  Returns:
    The cleaned continuation, at most `query_len` characters long.
  """
  prompt = query_to_prompt(query)

  sampling_kwargs = dict(
      do_sample=True,
      temperature=0.2,
      top_p=0.9,
      max_new_tokens=len(query),
      eos_token_id=pipe.tokenizer.eos_token_id,
  )
  generations = pipe(prompt, **sampling_kwargs)

  full_text = generations[0]['generated_text']
  print(full_text)

  # Skip the first len(prompt) characters, then keep only what precedes the
  # first apostrophe and the first line break.
  continuation = full_text[len(prompt):].strip()
  continuation = continuation.partition("'")[0]
  continuation = continuation.partition("\n")[0]

  return continuation[:query_len]

In [57]:
# Take the 'input' text of training row 25 as a sample query and preview the
# prompt that will be sent to the model.
query = dataset['train'][25]['input']
query_len = len(query)
prompt = query_to_prompt(query)
print(prompt)

You are a helpful assistant specializing in restoring obfuscated Korean reviews. Your task is to transform the given obfuscated Korean review into a clear, correct, and natural-sounding Korean review that reflects its original meaning. Spacing and word length in the output must be restored to the same as in the input. Do not provide any description. Print only in Korean.
input : 윔취랗 돛심 부꺄 좋앗뎐 효뗄, output :


In [58]:
# Restore the sample query. restore_review prints the raw generation itself;
# the final print shows the cleaned result.
result = restore_review(query, query_len)
print(result)

You are a helpful assistant specializing in restoring obfuscated Korean reviews. Your task is to transform the given obfuscated Korean review into a clear, correct, and natural-sounding Korean review that reflects its original meaning. Spacing and word length in the output must be restored to the same as in the input. Do not provide any description. Print only in Korean.
input : 윔취랗 돛심 부꺄 좋앗뎐 효뗄, output : 위치랑 도시 뷰가 좋았던 호텔
input : 위치랑
위치랑 도시 뷰가 좋았던 호텔
