<a href="https://colab.research.google.com/github/dbwofla11/DaconBaseLine_LLM-gemma-7b-/blob/master/TransFormer_Base1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --upgrade transformers
!pip install -U bitsandbytes
!pip install datasets
!pip install peft
!pip install wandb




In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline , Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig

In [3]:
# prompt: 구글 드라이브 마운팅

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:


# 데이터 경로 설정 (DaconData 폴더 내부의 파일명으로 수정)
train_path = '/content/drive/MyDrive/Colab Notebooks/DaconData/train.csv'
test_path = '/content/drive/MyDrive/Colab Notebooks/DaconData/test.csv'


try:
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    print("Train data shape:", train.shape)
    print("Test data shape:", test.shape)
    samples = []

    for i in range(10):
        sample = f"input : {train['input'][i]} \n output : {train['output'][i]}"
        samples.append(sample)


except FileNotFoundError:
    print(f"Error: Train or Test file not found at specified paths.")
    print(f"Train Path: {train_path}")
    print(f"Test Path: {test_path}")
except Exception as e:
    print(f"An error occurred: {e}")




Train data shape: (11263, 3)
Test data shape: (1689, 2)


In [5]:
model_id = 'beomi/gemma-ko-7b'  # 모델 ID
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type= 'nf4',
    bnb_4bit_use_double_quant = True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
try:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",  # 자동으로 장치 맵핑
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    print("모델이 성공적으로 로드되었습니다!")

    # PEFT 어댑터 설정 (LoraConfig)
    lora_config = LoraConfig(
        r=8,  # 어댑터 크기
        lora_alpha=32,  # 학습률 스케일
        lora_dropout=0.1,  # 드롭아웃 비율
    )

    # PEFT 모델로 변환
    peft_model = get_peft_model(model, lora_config)
    print("PEFT 모델로 변환 완료!")

except KeyError as e:
    print(f"KeyError 발생: {e}")
    print("모델 ID 또는 클래스 호환성을 확인하세요.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

모델이 성공적으로 로드되었습니다!
PEFT 모델로 변환 완료!


In [6]:
from datasets import Dataset

def tokenize_function(examples):
    return tokenizer(examples['input'], padding="max_length", truncation=True, max_length=128)

# Hugging Face의 Dataset 객체로 변환
train_data = [{"input": row['input'], "output": row['output']} for _, row in train.iterrows()]
train_dataset = Dataset.from_list(train_data)
train_dataset = train_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/11263 [00:00<?, ? examples/s]

In [12]:
import wandb

# API 키를 입력하여 로그인
wandb.login(key="6778ce23e83d79821865aa382f32b3d94bd774b5")
# WandB 프로젝트 초기화
wandb.init(project="your_project_name", name="experiment_name")

# 학습 인자 설정
training_args = TrainingArguments(
    output_dir="./results",  # 결과 디렉토리
    run_name="my_experiment",
    eval_strategy="no",  # 평가 비활성화
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir="./logs",
)

# Trainer 설정
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,  # 올바른 매개변수
)

# 학습 수행
trainer.train()


# 모델 저장
model_dir = './fine_tuned_model_baseGemma7'
peft_model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# WandB에 모델 저장
wandb.save(f"{model_dir}/*")


  trainer = Trainer(


ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values. For reference, the inputs it received are input_ids,attention_mask.

In [None]:
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer
)

restored_reviews = []


for index, row in test.iterrows():
    query = row['input']

    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful assistant specializing in restoring obfuscated Korean reviews. "
                "Your task is to transform the given obfuscated Korean review into a clear, correct, "
                "and natural-sounding Korean review that reflects its original meaning. "
                "Below are examples of obfuscated Korean reviews and their restored forms:\n\n"
                f"Example, {samples}"
                "Spacing and word length in the output must be restored to the same as in the input. "
                "Do not provide any description. Print only in Korean."
            )
        },
        {
            "role": "user",
            "content": f"input : {query}, output : "
        },
    ]

    prompt = "\n".join([m["content"] for m in messages]).strip()


    outputs = pipe(
        prompt,
        do_sample=True,
        temperature=0.2,
        top_p=0.9,
        max_new_tokens=len(query),
        eos_token_id=pipe.tokenizer.eos_token_id
    )

    generated_text = outputs[0]['generated_text']
    result = generated_text[len(prompt):].strip()


    restored_reviews.append(result)

In [None]:
submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DaconData/sample_submission.csv', encoding = 'utf-8-sig')
submission['output'] = converted_reviews
submission.to_csv('./baseline_submission.csv', index = False, encoding = 'utf-8-sig')

submission.head()