# 베이스라인

In [1]:
# 필요한 라이브러리 설치
!pip install transformers tqdm pandas numpy torch sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.3.1


In [2]:
# 필요한 라이브러리를 임포트
import pandas as pd
import numpy as np
import torch
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, AdamW
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

In [3]:
# Mount Google Drive inside the Colab runtime at /content/drive
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [4]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [5]:
# Path of the training CSV (under the /content/drive mount point)
file_path = '/content/drive/My Drive/한솔데코/train.csv'

# Read the CSV into a pandas DataFrame
data = pd.read_csv(filepath_or_buffer=file_path)


In [6]:
# Load the tokenizer for the KoGPT2 checkpoint.
# The previous call used `GPT2Tokenizer`, which this notebook never imports
# (NameError on a fresh kernel). `PreTrainedTokenizerFast` is already imported
# and is the class the inference cell uses to reload the tokenizer.
# The special tokens are passed explicitly; '</s>' is used as the separator
# between question and answer below, so it must be registered as eos_token.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    'skt/kogpt2-base-v2',
    bos_token='</s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
)

# Build one training sample per (question, answer) combination of each row:
# "<question></s><answer>", encoded as a (1, seq_len) tensor of token ids.
formatted_data = []
for _, row in tqdm(data.iterrows(), total=len(data)):  # total lets tqdm show progress/ETA
    for q_col in ['질문_1', '질문_2']:
        for a_col in ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']:
            input_text = row[q_col] + tokenizer.eos_token + row[a_col]
            input_ids = tokenizer.encode(input_text, return_tensors='pt')
            formatted_data.append(input_ids)
print('Done.')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

323it [00:01, 267.20it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (616 > 512). Running this sequence through the model will result in indexing errors
644it [00:02, 248.67it/s]

Done.





In [7]:
# Load the pretrained KoGPT2 language model and move it to the selected device
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
model.to(device)

# Training hyperparameters
CFG = {
    'LR': 2e-5,
    'EPOCHS': 10,
}

# Use PyTorch's AdamW instead of the deprecated `transformers.AdamW`.
# eps and weight_decay are set explicitly to the defaults of the deprecated
# class (PyTorch's own defaults are eps=1e-8, weight_decay=0.01) so the
# optimisation setup stays as close as possible to the previous one.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=CFG['LR'],
    eps=1e-6,
    weight_decay=0.0,
)
model.train()

# Fine-tune with one sample per optimisation step (each element of
# `formatted_data` is a single tokenized sequence).
for epoch in range(CFG['EPOCHS']):
    total_loss = 0
    progress_bar = tqdm(enumerate(formatted_data), total=len(formatted_data))
    for batch_idx, batch in progress_bar:
        batch = batch.to(device)
        # Passing the inputs as labels makes the model return the causal
        # language-modelling loss over the whole sequence.
        outputs = model(batch, labels=batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        progress_bar.set_description(f"Epoch {epoch+1} - Avg Loss: {total_loss / (batch_idx+1):.4f}")
    print(f"Epoch {epoch+1}/{CFG['EPOCHS']}, Average Loss: {total_loss / len(formatted_data)}")

# Save the fine-tuned model and its tokenizer (adjust the path as needed)
save_directory = "/content/drive/My Drive/한솔데코/kogpt2_updated"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)


config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/6440 [00:00<?, ?it/s]


TypeError: RobertaForQuestionAnswering.forward() got an unexpected keyword argument 'labels'

In [None]:
# Load the fine-tuned model and tokenizer.
# This must be the directory the training cell saves to; the previous value
# ("hansoldeco-kogpt2") is never written by this notebook, so a fresh
# Restart & Run All could not find the model. Adjust the path if needed.
model_dir = "/content/drive/My Drive/한솔데코/kogpt2_updated"
model = GPT2LMHeadModel.from_pretrained(model_dir)
model.to(device)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_dir)

test = pd.read_csv('/content/drive/My Drive/한솔데코/test.csv')
preds = []

for test_question in tqdm(test['질문']):
    # Same prompt format as in training: "<question></s>"
    input_ids = tokenizer.encode(test_question + tokenizer.eos_token, return_tensors='pt')
    prompt_length = input_ids.shape[-1]
    output_sequences = model.generate(
        input_ids=input_ids.to(device),
        max_length=300,  # upper bound on prompt + generated tokens
        temperature=0.9,
        top_k=1,  # only the single most likely token is ever a sampling candidate
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        num_return_sequences=1
    )

    for generated_sequence in output_sequences:
        # The returned sequence starts with the prompt tokens, so keep only
        # what was generated after them. Decoding with
        # skip_special_tokens=True drops the terminating '</s>', which the
        # previous string-based slicing left at the end of every answer.
        answer_ids = generated_sequence[prompt_length:]
        answer_text = tokenizer.decode(answer_ids, skip_special_tokens=True)
        answer_only = answer_text.strip().replace('\n', ' ')
        preds.append(answer_only)

# NOTE: `model` is rebound here from the language model to the sentence encoder.
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
pred_embeddings = model.encode(preds)
print(pred_embeddings.shape)

submit = pd.read_csv('/content/drive/My Drive/한솔데코/sample_submission.csv')
# Fail with a clear message instead of a broadcasting error (or a silent
# broadcast) when the embeddings do not match the submission layout.
expected_shape = submit.iloc[:, 1:].shape
if pred_embeddings.shape != expected_shape:
    raise ValueError(
        f"Embedding shape {pred_embeddings.shape} does not match "
        f"submission shape {expected_shape}"
    )
submit.iloc[:,1:] = pred_embeddings
print(submit.head())
submit.to_csv('/content/drive/My Drive/한솔데코/code_submit.csv', index=False)
