In [None]:
# import pandas as pd


# original_path = "merged_emotion_dataset.csv"
# val_path = "감성대화말뭉치(최종데이터)_Validation.xlsx"


# original_df = pd.read_csv(original_path)
# original_df = original_df[['Sentence', 'Emotion']].dropna()


# val_df = pd.read_excel(val_path)


# emotion_mapping = {
#     "상처": "슬픔",
#     "슬픔": "슬픔",
#     "불안": "공포",
#     "당황": "놀람",
#     "기쁨": "행복",
#     "분노": "분노"
# }

# processed_rows = []
# for _, row in val_df.iterrows():
#     original_emotion = row['감정_대분류']
#     mapped_emotion = emotion_mapping.get(original_emotion.strip(), None)
#     if mapped_emotion:
#         for i in range(1, 4):
#             sentence = row.get(f'사람문장{i}', None)
#             if pd.notna(sentence):
#                 processed_rows.append({
#                     "Sentence": sentence.strip(),
#                     "Emotion": mapped_emotion
#                 })


# val_processed_df = pd.DataFrame(processed_rows)


# merged_df = pd.concat([original_df, val_processed_df], ignore_index=True)


# emotion_counts = merged_df["Emotion"].value_counts()
# print(emotion_counts)

# merged_df.to_csv("merged_emotion_dataset_최종.csv", index=False, encoding="utf-8-sig")



In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)
from sklearn.metrics import accuracy_score
import torch
import numpy as np

# Excel sheets of the emotional-dialogue corpus, resolved relative to the notebook's working directory.
train_path = "./감성대화말뭉치(최종데이터)_Training.xlsx"
val_path="./감성대화말뭉치(최종데이터)_Validation.xlsx"

# Emotion mapping: corpus top-level category -> training label.
# The six source categories collapse into five labels because both "상처" and
# "슬픔" map to "슬픔". Categories missing from this dict are dropped by
# preprocess_emotion_data.
emotion_mapping = {
    "상처": "슬픔",
    "슬픔": "슬픔",
    "불안": "불안",
    "당황": "놀람",
    "기쁨": "행복",
    "분노": "분노"
}

def preprocess_emotion_data(df, mapping=None):
    """Flatten a raw corpus sheet into one (Sentence, Emotion) row per utterance.

    Each input row carries a top-level emotion category in the "감정_대분류"
    column and up to three speaker utterances in "사람문장1".."사람문장3".
    Every non-empty utterance becomes its own output row, labelled with the
    mapped emotion of its source row.

    Args:
        df: Raw DataFrame; must contain the "감정_대분류" column. Missing
            utterance columns are tolerated and treated as empty.
        mapping: Optional dict from source category to training label.
            Defaults to the notebook-level ``emotion_mapping``.

    Returns:
        A DataFrame with exactly the columns ["Sentence", "Emotion"], in source
        order (row by row, utterance 1 to 3). The columns are present even when
        no row survives, so downstream column access never raises KeyError.
    """
    if mapping is None:
        mapping = emotion_mapping

    sentence_columns = [f"사람문장{i}" for i in range(1, 4)]
    processed_rows = []

    for _, row in df.iterrows():
        # str() guards against NaN / non-string cells; unmapped categories are dropped.
        mapped_emotion = mapping.get(str(row["감정_대분류"]).strip())
        if not mapped_emotion:
            continue

        for column in sentence_columns:
            sentence = row.get(column)
            if pd.isna(sentence):
                continue
            text = str(sentence).strip()
            # Whitespace-only cells would otherwise become empty training samples.
            if text:
                processed_rows.append({"Sentence": text, "Emotion": mapped_emotion})

    return pd.DataFrame(processed_rows, columns=["Sentence", "Emotion"])

# Read both corpus sheets and flatten them to (Sentence, Emotion) rows.
train_df = preprocess_emotion_data(pd.read_excel(train_path))
val_df = preprocess_emotion_data(pd.read_excel(val_path))



# Label encoding: fit on the training labels only, then reuse the same
# encoder for validation so both splits share one label -> id mapping.
label_encoder = LabelEncoder()
train_df["label"] = label_encoder.fit_transform(train_df["Emotion"])
val_df["label"] = label_encoder.transform(val_df["Emotion"])

emotion_labels = label_encoder.classes_.tolist()  # label names; position i corresponds to class id i

# Spot-check a few rows from each split (fixed random_state for repeatable output).
print(train_df.sample(n=5, random_state=10))
print(val_df.sample(n=5, random_state=10))

# Number of utterances in each split.
print(len(train_df))
print(len(val_df))

  from .autonotebook import tqdm as notebook_tqdm
  warn(


                                      Sentence Emotion  label
113114                      운전을 못 하게 되었어. 우울해.      불안      2
113361  모아든 돈도 없고 남들처럼 개인연금 하나 들지 않은 것이 후회가 돼.      불안      2
59602     어 이전에도 비슷한 일이 있었지. 여자 친구와 대화를 해봐야겠다.      놀람      0
54829    같이 가고 싶다고 사실대로 말하는 것이 가장 좋은 방법인 것 같아.      슬픔      3
98135            내가 잘하는 걸 하는 게 나에게 더 행복할 거 같아!      불안      2
                                                Sentence Emotion  label
11449  아무래도 없는 사람에 대해서 욕하는 모습을 보면 나에 대해서도 말할 거라고 생각이 ...      슬픔      3
2382          결혼기념일에 특별히 아내에게 대접하려고 예약한 레스토랑이 생각보다 별로였어.      슬픔      3
9740                      나 혼자라도 마스크 쓰고 다니면서 조심해야지 어쩌겠어.      분노      1
10157                           아무래도 오늘 밤에 아빠랑 얘길 해봐야겠어.      분노      1
13757          주식에 투자한 것도 전망이 좋으니 남은 여생은 전원생활을 즐기며 살까 해.      행복      4
145955
17968


In [None]:
# Keep only the text and the integer label when converting to Hugging Face datasets.
train_dataset = Dataset.from_pandas(train_df[["Sentence", "label"]])
val_dataset = Dataset.from_pandas(val_df[["Sentence", "label"]])


# Alternative checkpoints that were tried; uncomment one to switch the backbone.
# model_name = "monologg/koelectra-base-v3-discriminator"
# model_name = "beomi/KcELECTRA-base"
model_name = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization
def tokenize_fn(example):
    """Tokenize the "Sentence" field of ``example`` with the notebook-level tokenizer.

    Padding is set to "max_length" and truncation is enabled with
    max_length=128, so the tokenizer is asked for fixed-length encodings.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(example["Sentence"], **tokenizer_options)

# batched=True passes lists of sentences to tokenize_fn instead of one example
# per call; because every sequence is padded/truncated to the same fixed
# length, the resulting columns are the same as in the unbatched run.
train_dataset = train_dataset.map(tokenize_fn, batched=True)
val_dataset = val_dataset.map(tokenize_fn, batched=True)


# Store the label names in the model config so saved checkpoints stay
# self-describing (class id <-> emotion name) without needing the LabelEncoder.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(emotion_labels),
    id2label={idx: name for idx, name in enumerate(emotion_labels)},
    label2id={name: idx for idx, name in enumerate(emotion_labels)},
)


def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is either
            the logits array with the class scores on the last axis, or a
            tuple whose first element is that array (the Trainer may pass a
            tuple when the model returns extra outputs).

    Returns:
        ``{"accuracy": float}`` — the fraction of rows whose arg-max class
        equals the label, as a built-in float.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Extra model outputs follow the logits; only the logits are scored.
        predictions = predictions[0]

    preds = np.asarray(predictions).argmax(axis=-1)
    labels = np.asarray(labels)
    return {"accuracy": float((preds == labels).mean())}



# Training configuration for the 20-epoch run.
training_args = TrainingArguments(
    # Checkpoints are written under this directory.
    output_dir="./results_RoBerta_base_20epoch",
    # Evaluate and save once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=20,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    learning_rate=2e-5,
    logging_dir="./logs",
    # After training, reload the checkpoint that scored best on the
    # "accuracy" metric returned by compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)


Map: 100%|██████████| 145955/145955 [00:27<00:00, 5276.53 examples/s]
Map: 100%|██████████| 17968/17968 [00:03<00:00, 5139.32 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

from transformers import TrainerCallback
import csv


# (epoch, accuracy) pairs collected by AccuracyLoggerCallback during evaluation.
accuracy_log = []

class AccuracyLoggerCallback(TrainerCallback):
    """Record and print the evaluation accuracy after every evaluation pass.

    Each time the Trainer finishes an evaluation whose metrics contain
    "eval_accuracy", an ``(epoch, accuracy)`` tuple is appended to the
    module-level ``accuracy_log`` list and a one-line summary is printed.
    """

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if not metrics or "eval_accuracy" not in metrics:
            return

        # state.epoch is None when evaluation runs before any training step;
        # record such a pass as epoch 0 instead of raising TypeError.
        epoch = int(state.epoch) if state.epoch is not None else 0
        accuracy = metrics["eval_accuracy"]
        accuracy_log.append((epoch, accuracy))
        print(f"[📊 로그] Epoch {epoch} - Accuracy: {accuracy:.4f}")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[AccuracyLoggerCallback()]
)

# Train from the pretrained weights. To continue an interrupted run instead,
# use the commented call below with the checkpoint to resume from.
trainer.train()
# trainer.train(resume_from_checkpoint = "./results_kcelectra_감성둘다_20epoch/checkpoint-22820")


# Score the training set under its own metric prefix. With the default "eval"
# prefix the metrics would contain "eval_accuracy", and AccuracyLoggerCallback
# would append the training-set accuracy to accuracy_log as if it were one
# more validation result.
train_metrics = trainer.evaluate(eval_dataset=train_dataset, metric_key_prefix="train_set")
for k, v in train_metrics.items():
    print(f"{k}: {v:.4f}")

# Persist the per-epoch validation accuracy collected by the callback.
with open("accuracy_log.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["epoch", "accuracy"])
    writer.writerows(accuracy_log)

print("로그가 'accuracy_log.csv'로 저장")

def predict_emotion(text):
    """Print the per-emotion probabilities and the top prediction for ``text``.

    Uses the notebook-level ``model``, ``tokenizer`` and ``emotion_labels``.
    The model is switched to eval mode, the text is tokenized (truncated at
    128 tokens) and moved to the model's device, and the logits are turned
    into probabilities with a softmax over dim 1. Nothing is returned; the
    result is only printed.
    """
    model.eval()

    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    device = model.device
    batch = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**batch).logits
        distribution = torch.nn.functional.softmax(logits, dim=1)
        probs = distribution.squeeze().cpu().numpy()

    print(f"\n입력 문장: {text}")
    print("감정별 확률:")
    for emotion, probability in zip(emotion_labels, probs):
        print(f"  {emotion}: {probability.item():.4f}")

    # The label at the arg-max probability is the predicted emotion.
    pred = emotion_labels[np.argmax(probs)]

    print(f"\n예측된 감정: {pred}")

# Example: run the classifier on a single test sentence.
predict_emotion("기분이 너무 안 좋아요. 아무것도 하기 싫어요.")


  trainer = Trainer(
There were missing keys in the checkpoint model loaded: ['roberta.embeddings.word_embeddings.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.embeddings.token_type_embeddings.weight', 'roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.self.query.weight', 'roberta.encoder.layer.0.attention.self.query.bias', 'roberta.encoder.layer.0.attention.self.key.weight', 'roberta.encoder.layer.0.attention.self.key.bias', 'roberta.encoder.layer.0.attention.self.value.weight', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.0.attention.output.dense.weight', 'roberta.encoder.layer.0.attention.output.dense.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.intermediate.dense.weight', 'roberta.encoder.layer.0.intermediate.dense.bias', 'roberta.encoder.layer.0.output.dense.weight'

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
def predict_emotion(text):
    """Print the per-emotion probabilities and the top prediction for ``text``.

    Relies on the notebook-level ``model``, ``tokenizer`` and
    ``emotion_labels``. The text is tokenized (truncated at 128 tokens),
    moved to the model's device and scored without gradient tracking; a
    softmax over dim 1 converts the logits to probabilities. Nothing is
    returned; the result is only printed.
    """
    model.eval()

    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    target_device = model.device
    model_inputs = {key: value.to(target_device) for key, value in encoded.items()}

    with torch.no_grad():
        logits = model(**model_inputs).logits
        softmaxed = torch.nn.functional.softmax(logits, dim=1)
        probs = softmaxed.squeeze().cpu().numpy()

    print(f"\n 입력 문장: {text}")
    print(" 감정별 확률:")
    for emotion, probability in zip(emotion_labels, probs):
        print(f"  {emotion}: {probability.item():.4f}")

    # The label at the arg-max probability is the predicted emotion.
    pred = emotion_labels[np.argmax(probs)]

    print(f"\n 예측된 감정: {pred}")

# Example: run the classifier on a single test sentence.
predict_emotion("흔들리는 나를 왜 모르시나요요")


📝 입력 문장: 흔들리는 나를 왜 모르시나요요
📊 감정별 확률:
  놀람: 0.0258
  분노: 0.1724
  불안: 0.0018
  슬픔: 0.7999
  행복: 0.0001

🟢 예측된 감정: 슬픔


In [None]:
import os
import json
import shutil

# Directory that holds the Trainer checkpoints, and where the winner is copied to.
checkpoint_root = "./results_RoBerta_base_20epoch"
best_model_dir = "./best_model_RoBerta_base_20epoch"


def find_best_checkpoint(root):
    """Find the checkpoint under ``root`` with the highest evaluation accuracy.

    Every sub-directory whose name starts with "checkpoint" and that contains
    a ``trainer_state.json`` is inspected. Its score is the most recent
    "eval_accuracy" entry in that file's "log_history".

    Args:
        root: Directory containing the checkpoint sub-directories.

    Returns:
        ``(path, accuracy)`` of the best checkpoint, or ``(None, -1.0)`` when
        ``root`` does not exist or no checkpoint has an "eval_accuracy" entry.
        On ties the checkpoint that sorts first by directory name wins.
    """
    best_path, best_score = None, -1.0
    if not os.path.isdir(root):
        # A missing output directory means "nothing trained yet", not an error.
        return best_path, best_score

    # Sorted so the result does not depend on the file system's listing order.
    for subdir in sorted(os.listdir(root)):
        path = os.path.join(root, subdir)
        state_path = os.path.join(path, "trainer_state.json")
        if not (subdir.startswith("checkpoint") and os.path.isdir(path) and os.path.exists(state_path)):
            continue

        with open(state_path, "r", encoding="utf-8") as f:
            log_history = json.load(f).get("log_history", [])

        # The last logged evaluation is the one belonging to this checkpoint.
        score = next(
            (log["eval_accuracy"] for log in reversed(log_history) if "eval_accuracy" in log),
            None,
        )
        if score is not None and score > best_score:
            best_path, best_score = path, score

    return best_path, best_score


best_checkpoint_path, best_acc = find_best_checkpoint(checkpoint_root)

# Copy the most accurate checkpoint, replacing any previous copy.
if best_checkpoint_path:
    if os.path.exists(best_model_dir):
        shutil.rmtree(best_model_dir)
    shutil.copytree(best_checkpoint_path, best_model_dir)
    print(f"가장 정확도 높은 모델을 복사: {best_checkpoint_path}")
    print(f"정확도: {best_acc:.4f}")
else:
    print(" 적절한 checkpoint가 없음음")


✅ 가장 정확도 높은 모델을 복사했습니다: ./results_koelectra_감성둘다_20epoch/checkpoint-18256
➡️  정확도: 0.6913


In [None]:
import json
import matplotlib.pyplot as plt


# NOTE(review): this path belongs to a different run than the training cell's
# output_dir ("./results_RoBerta_base_20epoch"); point it at an existing
# checkpoint before running — TODO confirm which run should be plotted.
trainer_state_path = "./results_kcelectra_2/checkpoint-22820/trainer_state.json"


with open(trainer_state_path, "r", encoding="utf-8") as f:
    state = json.load(f)

logs = state["log_history"]


# Training-loss entries and evaluation entries are logged at different times,
# so each series keeps its own epoch values for the x-axis.
train_epochs = []
train_loss = []
eval_loss = []
eval_accuracy = []
epochs = []

for log in logs:
    if "epoch" not in log:
        continue
    if "loss" in log:
        train_epochs.append(log["epoch"])
        train_loss.append(log["loss"])
    if "eval_loss" in log:
        eval_loss.append(log["eval_loss"])
        eval_accuracy.append(log["eval_accuracy"])
        epochs.append(log["epoch"])


fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 6))

# 1. Loss — both curves share the same epoch axis.
ax_loss.plot(train_epochs, train_loss, label="Train Loss", marker='o')
ax_loss.plot(epochs, eval_loss, label="Validation Loss", marker='o')
ax_loss.set_xlabel("Epoch")
ax_loss.set_ylabel("Loss")
ax_loss.set_title("Training & Validation Loss")
ax_loss.legend()
ax_loss.grid(True)

# 2. Accuracy
ax_acc.plot(epochs, eval_accuracy, label="Validation Accuracy", marker='o', color='green')
ax_acc.set_xlabel("Epoch")
ax_acc.set_ylabel("Accuracy")
ax_acc.set_title("Validation Accuracy")
ax_acc.legend()
ax_acc.grid(True)

fig.tight_layout()
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: './results_kcelectra_2/checkpoint-22820/trainer_state.json'