<a href="https://colab.research.google.com/github/dbwofla11/DaconBaseLine_LLM-gemma-7b-/blob/master/TransFormer_Base1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
!pip install --upgrade transformers
!pip install -U bitsandbytes
!pip install datasets
!pip install peft




In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline , Trainer, TrainingArguments

In [3]:
# prompt: 구글 드라이브 마운팅

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:


# 데이터 경로 설정 (DaconData 폴더 내부의 파일명으로 수정)
train_path = '/content/drive/MyDrive/Colab Notebooks/DaconData/train.csv'
test_path = '/content/drive/MyDrive/Colab Notebooks/DaconData/test.csv'


try:
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    print("Train data shape:", train.shape)
    print("Test data shape:", test.shape)
    samples = []

    for i in range(10):
        sample = f"input : {train['input'][i]} \n output : {train['output'][i]}"
        samples.append(sample)


except FileNotFoundError:
    print(f"Error: Train or Test file not found at specified paths.")
    print(f"Train Path: {train_path}")
    print(f"Test Path: {test_path}")
except Exception as e:
    print(f"An error occurred: {e}")




Train data shape: (11263, 3)
Test data shape: (1689, 2)


In [5]:
model_id = 'beomi/gemma-ko-7b'  # 모델 ID
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type= 'nf4',
    bnb_4bit_use_double_quant = True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
try:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",  # 자동으로 장치 맵핑
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    print("모델이 성공적으로 로드되었습니다!")
except KeyError as e:
    print(f"KeyError 발생: {e}")
    print("모델 ID 또는 클래스 호환성을 확인하세요.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

모델이 성공적으로 로드되었습니다!


In [8]:
from datasets import Dataset

def tokenize_function(examples):
    return tokenizer(examples['input'], padding="max_length", truncation=True, max_length=128)

# Hugging Face의 Dataset 객체로 변환
train_data = [{"input": row['input'], "output": row['output']} for _, row in train.iterrows()]
train_dataset = Dataset.from_list(train_data)
train_dataset = train_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/11263 [00:00<?, ? examples/s]

In [11]:
# 1) Fine-tuning을 위한 Trainer 설정
training_args = TrainingArguments(
    output_dir="./results",            # 모델 저장 디렉터리
    evaluation_strategy="no",       # 평가 주기 (여기서는 에폭마다 평가)
    learning_rate=2e-5,                # 학습률
    per_device_train_batch_size=8,     # 배치 크기
    num_train_epochs=3,                # 학습 에폭 수
    weight_decay=0.01,                 # 가중치 감소 (L2 정규화)
    save_steps=10_000,                 # 모델 저장 주기
    logging_dir='./logs',              # 로그 디렉터리
    logging_steps=200,                 # 로그 주기
)

trainer = Trainer(
    model=model,                      # fine-tuning할 모델
    args=training_args,               # 학습 하이퍼파라미터
    train_dataset=train_dataset,      # 훈련 데이터셋
    tokenizer=tokenizer,              # 토크나이저
)

# 학습 수행
trainer.train()

# 모델 저장
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')

  trainer = Trainer(


ValueError: You cannot perform fine-tuning on purely quantized models. Please attach trainable adapters on top of the quantized model to correctly perform fine-tuning. Please see: https://huggingface.co/docs/transformers/peft for more details

In [None]:
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer
)

restored_reviews = []


for index, row in test.iterrows():
    query = row['input']

    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful assistant specializing in restoring obfuscated Korean reviews. "
                "Your task is to transform the given obfuscated Korean review into a clear, correct, "
                "and natural-sounding Korean review that reflects its original meaning. "
                "Below are examples of obfuscated Korean reviews and their restored forms:\n\n"
                f"Example, {samples}"
                "Spacing and word length in the output must be restored to the same as in the input. "
                "Do not provide any description. Print only in Korean."
            )
        },
        {
            "role": "user",
            "content": f"input : {query}, output : "
        },
    ]

    prompt = "\n".join([m["content"] for m in messages]).strip()


    outputs = pipe(
        prompt,
        do_sample=True,
        temperature=0.2,
        top_p=0.9,
        max_new_tokens=len(query),
        eos_token_id=pipe.tokenizer.eos_token_id
    )

    generated_text = outputs[0]['generated_text']
    result = generated_text[len(prompt):].strip()


    restored_reviews.append(result)

In [None]:
submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DaconData/sample_submission.csv', encoding = 'utf-8-sig')
submission['output'] = converted_reviews
submission.to_csv('./baseline_submission.csv', index = False, encoding = 'utf-8-sig')

submission.head()