# Installation

In [1]:
%%capture
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U peft
%pip install -U accelerate
%pip install -U trl
%pip install -U datasets
%pip install -U kaggle

In [2]:
import torch
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, TrainingArguments
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [3]:
import os
from google.colab import userdata

# Note: `userdata.get` is a Colab API. If you're not using Colab, set the env
# vars as appropriate for your system.

os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')
os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')

In [4]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
BASE_MODEL = "google/gemma-2b-it"

# Load Dataset


In [12]:
from datasets import load_dataset

ds = load_dataset("jugg1024/pokemon-gpt4o-captions", split='train')
ds = ds.with_format("np", columns="en_text", output_all_columns=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/355 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/341M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/833 [00:00<?, ? examples/s]

In [13]:
import re

def preprocess_and_extract_name(data):
    """
    포켓몬 설명에서 포켓몬 이름을 찾아 'its'로 치환하는 함수.

    Args:
        data (str): 포켓몬 이름: 설명 형식의 문자열 데이터.
    Returns:
        str: 포켓몬 이름을 'its'로 치환한 문자열 데이터.
    """
    # 데이터에서 포켓몬 이름과 설명을 분리 (이름과 설명이 ':'로 구분되어 있다고 가정)
    pokemon_name, explanation = data.split(':', 1)

    # 포켓몬 이름을 대소문자 구분 없이 찾아서 'its'로 치환
    pattern = re.compile(re.escape(pokemon_name), re.IGNORECASE)
    processed_explanation = pattern.sub('it', explanation)

    return pokemon_name, processed_explanation

In [14]:
ds = ds.map(lambda x: {
    'pokemon_name': preprocess_and_extract_name(x['en_text'])[0],  # 이름
    'en_text': preprocess_and_extract_name(x['en_text'])[1],  # 설명
})

Map:   0%|          | 0/833 [00:00<?, ? examples/s]

# Load model

In [15]:
# Load base model(Gemma 2B-it)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [16]:
doc = ds[0]['en_text']

In [17]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)

In [18]:
messages = [
    {
        "role": "user",
        "content": "다음 특성을 살린 포켓몬식 이름을 지어줘:\n\n{}".format(doc)
    }
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [19]:
prompt

'<bos><start_of_turn>user\n다음 특성을 살린 포켓몬식 이름을 지어줘:\n\n A small, quadruped creature with a blue-green body, sharp triangular eyes with red irises, and noticeable dark patches on its skin. It has a plant bulb on its back, which is thick and green, signifying its Grass/Poison typing. The bulb is prominent and resembles a small cabbage or plant bud. The creature has pointed, stubby legs with claws and an ear-like protrusion on each side of its head.<end_of_turn>\n<start_of_turn>model\n'

In [20]:
outputs = pipe(
    prompt,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    add_special_tokens=True
)

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


In [21]:
print(outputs[0]["generated_text"][len(prompt):])

Sure, here's a few names that fit the description:

1. Sprigatito
2. Petalpuff
3. Leafbert
4. Bonsai
5. Thymeleaf


In [22]:
def generate_prompt(example):
    prompt_list = []
    for i in range(len(example['en_text'])):
        prompt_list.append(r"""<bos>다음 특성을 살린 포켓몬식 이름을 지어줘:\n\n{}<end_of_turn>
<start_of_turn>model 포켓몬식 이름:
{}<end_of_turn><eos>""".format(example['en_text'][i], example['pokemon_name'][i]))
    return prompt_list

In [23]:
train_data = ds

## LoRA

In [24]:
lora_config = LoraConfig(
    r=6,
    lora_alpha = 8,
    lora_dropout = 0.05,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [25]:
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="auto", quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.padding_side = 'right'

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Training

In [26]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    max_seq_length=512,
    args=TrainingArguments(
        output_dir="/content/drive/MyDrive/gemma/2b-it-check",
        num_train_epochs = 8,
        max_steps=3000,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        optim="paged_adamw_8bit",
        warmup_steps=0,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=100,
        push_to_hub=False,
        report_to='none',
    ),
    peft_config=lora_config,
    formatting_func=generate_prompt,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/833 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [27]:
trainer.train()

Step,Training Loss
100,2.1406
200,1.7021
300,1.524
400,1.5106
500,1.2999
600,1.2993
700,1.0774
800,1.0243
900,0.8288
1000,0.7411


TrainOutput(global_step=3000, training_loss=0.5613124100367228, metrics={'train_runtime': 3495.6152, 'train_samples_per_second': 3.433, 'train_steps_per_second': 0.858, 'total_flos': 2.272712869827379e+16, 'train_loss': 0.5613124100367228, 'epoch': 14.40576230492197})

# Save fine-tuned model

In [28]:
ADAPTER_MODEL = "/content/drive/MyDrive/gemma/2b-it-LoRA"

trainer.model.save_pretrained(ADAPTER_MODEL)

In [29]:
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map='auto', torch_dtype=torch.float16)
model = PeftModel.from_pretrained(model, ADAPTER_MODEL, device_map='auto', torch_dtype=torch.float16)

model = model.merge_and_unload()
model.save_pretrained('/content/drive/MyDrive/gemma/2b-it-poke-agg')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Saving checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Inference


In [None]:
BASE_MODEL = "google/gemma-2b-it"
FINETUNE_MODEL = "/content/drive/MyDrive/gemma/2b-it-poke-agg"

finetune_model = AutoModelForCausalLM.from_pretrained(FINETUNE_MODEL, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

In [34]:
doc = ds[0]['en_text']

In [35]:
pipe_finetuned = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)

In [36]:
messages = [
    {
        "role": "user",
        "content": "다음 특성을 살린 포켓몬식 이름을 지어줘:\n\n{}".format(doc)
    }
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [37]:
prompt

'<bos><start_of_turn>user\n다음 특성을 살린 포켓몬식 이름을 지어줘:\n\n A small, quadruped creature with a blue-green body, sharp triangular eyes with red irises, and noticeable dark patches on its skin. It has a plant bulb on its back, which is thick and green, signifying its Grass/Poison typing. The bulb is prominent and resembles a small cabbage or plant bud. The creature has pointed, stubby legs with claws and an ear-like protrusion on each side of its head.<end_of_turn>\n<start_of_turn>model\n'

In [38]:
outputs = pipe(
    prompt,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    add_special_tokens=True
)

In [41]:
print(outputs[0]["generated_text"][len(prompt):])

Sure, here's a few names that fit the description you provided:

1. Grasspuff
2. Sprigatito
3. Petalpuff
4. Leafie
5. Blosspik
