# 베이스라인

In [None]:
# 필요한 라이브러리 설치
!pip install transformers tqdm pandas numpy torch sentence_transformers

In [None]:
# 필요한 라이브러리를 임포트
import pandas as pd
import numpy as np
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer, AdamW
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

In [None]:
# Mount Google Drive at /content/drive so the files referenced by the later
# cells (paths under '/content/drive/My Drive/...') can be read and written.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Select the compute device: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Location of the training CSV on the mounted Drive.
file_path = '/content/drive/My Drive/한솔데코/train.csv'

# Read the training set into a pandas DataFrame.
data = pd.read_csv(filepath_or_buffer=file_path)


In [None]:
# Load the tokenizer.
# The klue/roberta-large checkpoint ships a BERT-style WordPiece vocabulary, so
# it has to be loaded through AutoTokenizer (which resolves to the BERT
# tokenizer). RobertaTokenizer expects byte-level BPE vocab/merges files that
# this checkpoint does not provide.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('klue/roberta-large')

# Upper bound on tokens per training example. Longer inputs would index past
# the model's position-embedding table and crash the forward pass.
# NOTE(review): 512 is assumed to be the limit for this checkpoint - confirm.
MAX_SEQ_LEN = 512

# Format and tokenize the data: every question variant of a row is paired with
# every answer variant of the same row, joined by the tokenizer's
# end-of-sequence token, and encoded as a (1, seq_len) tensor.
question_cols = ['질문_1', '질문_2']
answer_cols = ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']

formatted_data = []
# total= lets tqdm show a real progress fraction; iterrows() has no length.
for _, row in tqdm(data.iterrows(), total=len(data)):
    for q_col in question_cols:
        for a_col in answer_cols:
            input_text = row[q_col] + tokenizer.eos_token + row[a_col]
            input_ids = tokenizer.encode(
                input_text,
                return_tensors='pt',
                truncation=True,
                max_length=MAX_SEQ_LEN,
            )
            formatted_data.append(input_ids)

In [None]:
# Load the pretrained weights with a causal language-modeling head.
# The inference cell produces answers with `generate()`, which needs a language
# model; the previously used RobertaForSequenceClassification has no label
# column to train on in this dataset and cannot generate text.
# `is_decoder=True` is required when RobertaForCausalLM is used standalone so
# that attention is causal (left-to-right).
from transformers import AutoConfig, RobertaForCausalLM

config = AutoConfig.from_pretrained('klue/roberta-large')
config.is_decoder = True
model = RobertaForCausalLM.from_pretrained('klue/roberta-large', config=config)
model.to(device)

# Training hyperparameters.
CFG = {
    'LR': 2e-5,
    'EPOCHS': 10,
}

optimizer = AdamW(model.parameters(), lr=CFG['LR'])
model.train()

# Fine-tuning loop. Each element of `formatted_data` is a single tokenized
# "question <eos> answer" sequence of shape (1, seq_len), so the effective
# batch size is 1.
num_examples = len(formatted_data)
for epoch in range(CFG['EPOCHS']):
    total_loss = 0
    progress_bar = tqdm(enumerate(formatted_data), total=num_examples)
    for batch_idx, batch in progress_bar:
        batch = batch.to(device)
        # Causal-LM objective: the inputs double as labels; the model shifts
        # them internally so each position predicts the next token.
        outputs = model(input_ids=batch, labels=batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        progress_bar.set_description(f"Epoch {epoch+1} - Avg Loss: {total_loss / (batch_idx+1):.4f}")
    # max(..., 1) keeps the summary line from dividing by zero on an empty dataset.
    print(f"Epoch {epoch+1}/{CFG['EPOCHS']}, Average Loss: {total_loss / max(num_examples, 1)}")

# Save the fine-tuned model and the tokenizer side by side so the inference
# cell can reload both from the same directory.
model.save_pretrained("/content/drive/My Drive/한솔데코/hansoldeco-klue/roberta-large")
tokenizer.save_pretrained("/content/drive/My Drive/한솔데코/hansoldeco-klue/roberta-large")


In [None]:
# Reload the fine-tuned model and tokenizer (change the path if the checkpoint
# was saved elsewhere).
# The Auto* classes pick the architecture/tokenizer recorded in the saved
# checkpoint. The previously referenced GPT2LMHeadModel and
# PreTrainedTokenizerFast were never imported and do not match a RoBERTa
# checkpoint.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "/content/drive/My Drive/한솔데코/hansoldeco-klue/roberta-large"
model = AutoModelForCausalLM.from_pretrained(model_dir)
model.to(device)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_dir)

test = pd.read_csv('/content/drive/My Drive/한솔데코/test.csv')
preds = []

eos_id = tokenizer.eos_token_id

for test_question in tqdm(test['질문']):
    # Build the prompt the same way the training text was built:
    # "<question><eos>", after which the model is expected to continue with the answer.
    input_ids = tokenizer.encode(test_question + tokenizer.eos_token, return_tensors='pt')
    # Tokenizers that append their own end marker leave the prompt ending in
    # two terminators, which never occurs before an answer in the training
    # sequences. Drop the duplicate so the prompt is an exact training prefix.
    if input_ids[0, -2:].tolist() == [eos_id, eos_id]:
        input_ids = input_ids[:, :-1]
    prompt_length = input_ids.shape[-1]

    output_sequences = model.generate(
        input_ids=input_ids.to(device),
        max_length=300,  # counts prompt tokens as well as generated ones
        temperature=0.9,
        top_k=1,  # NOTE(review): top_k=1 makes sampling effectively greedy - confirm intent
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        num_return_sequences=1
    )

    for generated_sequence in output_sequences:
        # The returned sequence starts with the prompt; keep only the newly
        # generated tokens. Slicing by length avoids the wrong offset that
        # str.find() yields when the EOS string is absent, and
        # skip_special_tokens=True keeps markers out of the answer text.
        answer_ids = generated_sequence[prompt_length:]
        answer_text = tokenizer.decode(answer_ids, skip_special_tokens=True)
        answer_only = answer_text.strip().replace('\n', ' ')
        preds.append(answer_only)

# Embed the generated answers for the submission file.
# Note: this rebinds `model` from the fine-tuned generator to the sentence encoder.
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
pred_embeddings = model.encode(preds)
print(pred_embeddings.shape)

# Fill every column after the first of the sample submission with the
# embeddings and write the result back to Drive.
submit = pd.read_csv('/content/drive/My Drive/한솔데코/sample_submission.csv')
submit.iloc[:,1:] = pred_embeddings
print(submit.head())
submit.to_csv('/content/drive/My Drive/한솔데코/code_submit.csv', index=False)
