In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

In [None]:
# 1. Load the Excel data, keep only the text and label columns, and drop
#    any row where either of them is missing.
df = (
    pd.read_excel("/Users/cscs0829/Downloads/단발성.xlsx")  # path to the Excel file
    .loc[:, ['document', 'label']]
    .dropna()
)

In [None]:
# Emotion label mapping (text -> integer class id).
label_map = {'놀람': 0, '공포': 1, '분노': 2, '슬픔': 3, '중립': 4, '행복': 5, '혐오': 6}

# Series.map yields NaN for every value that is not a key of label_map, which
# would silently turn the column into floats with missing labels. Validate here
# so a typo, stray whitespace, or an accidental re-run of this cell (labels
# already numeric) fails immediately with a clear message.
_encoded_labels = df['label'].map(label_map)
_unknown_labels = df.loc[_encoded_labels.isna(), 'label'].unique().tolist()
if _unknown_labels:
    raise ValueError(f"Labels missing from label_map: {_unknown_labels}")

# No NaN remains at this point, so the column can be stored as real integers.
df['label'] = _encoded_labels.astype('int64')


In [None]:
# 2. Split the data into train and test sets (70% / 30%) with a fixed seed.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['document'].to_list(),
    df['label'].to_list(),
    random_state=42,
    test_size=0.3,
)

In [None]:
# 3. Load the tokenizer.
# Uses KcBERT; the model name/path below can be changed.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="beomi/kcbert-base",
)


# Dataset class definition
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text sample per lookup.

    Args:
        texts: Indexable sequence of input texts.
        labels: Indexable sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors="pt")``. Its result
            must expose ``'input_ids'`` and ``'attention_mask'`` tensors that
            carry a leading batch dimension of size 1.
        max_length: Length passed to the tokenizer for padding/truncation.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Fail at construction time instead of with an IndexError on some
        # arbitrary sample in the middle of an epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it as a dict of tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (batch dimension
            removed) and ``'labels'`` as a ``torch.long`` scalar tensor.
        """
        text = self.texts[idx]
        # Spreadsheet cells holding e.g. a bare number are read as int/float;
        # coerce them so a single odd row does not abort tokenization.
        if not isinstance(text, str):
            text = str(text)

        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            # return_tensors="pt" adds a batch dimension of 1; drop it so the
            # DataLoader can stack samples into a batch itself.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [None]:
# 4. Build the datasets: wrap each (texts, labels) split in a SentimentDataset
#    that shares the same tokenizer.
train_dataset, test_dataset = (
    SentimentDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (test_texts, test_labels),
    )
)

In [None]:
# 5. Tokenizer helper
def tokenize_function(examples):
    """Tokenize a batch of examples with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping whose "document" entry holds the text(s) to encode.

    Returns:
        The result of calling ``tokenizer`` on those texts with
        ``padding="max_length"`` and ``truncation=True``.
    """
    return tokenizer(examples["document"], padding="max_length", truncation=True)

# No `.map(tokenize_function, batched=True)` pass is applied to train_dataset /
# test_dataset: they are SentimentDataset instances (torch.utils.data.Dataset
# subclasses), which have no `.map` method, so such a call raises
# AttributeError. SentimentDataset already tokenizes every sample inside
# __getitem__, so the datasets are ready for the Trainer as they are.

In [None]:
# 6. Model setup
# The classification head size and the label names are derived from label_map
# rather than a hard-coded 7, so the model always matches the label encoding and
# the saved config records which class id corresponds to which emotion.
model = BertForSequenceClassification.from_pretrained(
    "beomi/kcbert-base",
    num_labels=len(label_map),
    id2label={class_id: name for name, class_id in label_map.items()},
    label2id=dict(label_map),
)

In [None]:
# 7. Training configuration
training_args = TrainingArguments(
    # Output location; at most two checkpoints are retained.
    output_dir='./results',
    save_total_limit=2,
    # Optimisation settings.
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the best eval_loss when training finishes.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# 8. Trainer setup: train on the training split and evaluate on the held-out
#    test split.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

In [None]:
# 9. Train the model using the Trainer configured above in this notebook.
trainer.train()

In [None]:
# 10. Save the model and its tokenizer side by side in the same directory.
model.save_pretrained(save_directory="/Users/cscs0829/Downloads/kcbert_sentiment")
tokenizer.save_pretrained(save_directory="/Users/cscs0829/Downloads/kcbert_sentiment")

In [None]:
# 11. Evaluate the model on the Trainer's eval dataset and print the metrics.
eval_results = trainer.evaluate()
print(eval_results)