In [None]:
import csv
import json
import os

import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [2]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# later `.to(device)` calls do not fail on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Load the novel data to be classified; the inference loop further down reads
# its 'text' column and adds the prediction columns to this frame.
data = pd.read_csv('../datasets/novels/novel_data1.csv')

In [None]:
# Quick sanity check: show the first rows of the novel data.
print(data.head())

In [5]:
# 1. Load the data
# Read the training split first, then the validation split.
train, val = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/sentiment_conversation/train.csv',
        '../datasets/sentiment_conversation/val.csv',
    )
)

In [None]:
# Print how often each "sentiment" value occurs, training split first and
# validation split second.
for _split_frame in (train, val):
    print(_split_frame["sentiment"].value_counts())

In [8]:
# 2. Apply the KLUE/roberta-base tokenizer
# `model_name` is reused below when the classification model is loaded, so the
# tokenizer and the model come from the same checkpoint.
model_name = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the "text" entry of `examples` with the notebook-level tokenizer.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    Returns whatever the tokenizer call returns.
    """
    return tokenizer(
        examples["text"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

In [None]:
# 3. Convert to Hugging Face Datasets
# Upper bounds on the number of rows used from each split (temporary, to keep
# training time manageable).
MAX_TRAIN_SAMPLES = 50000
MAX_VAL_SAMPLES = 7000


def _prepare_split(frame, max_samples):
    """Turn a pandas split into a tokenized, torch-formatted Dataset.

    Args:
        frame: DataFrame with a "text" column and a "sentiment" column.
        max_samples: Maximum number of leading rows to keep.

    Returns:
        A Dataset holding the tokenizer outputs plus a "labels" column
        (renamed from "sentiment"), with the raw "text" column removed.
    """
    dataset = Dataset.from_pandas(frame)

    # Clamp to the real size: selecting indices past the end of a smaller
    # split would raise instead of simply using everything available.
    dataset = dataset.select(range(min(max_samples, len(dataset))))

    dataset = (
        dataset
        .map(tokenize_function, batched=True)
        .remove_columns(["text"])
        .rename_column("sentiment", "labels")
    )
    dataset.set_format("torch")  # in-place; makes indexing return tensors
    return dataset


train_dataset = _prepare_split(train, MAX_TRAIN_SAMPLES)
val_dataset = _prepare_split(val, MAX_VAL_SAMPLES)

In [None]:
# 4. Define the model
# One output class per distinct value of the training "sentiment" column; the
# model is moved to `device` right after loading.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(train["sentiment"].unique()),
).to(device)
model

In [None]:
# 5. Configure TrainingArguments
# Checkpoints go to a directory relative to the notebook, like the log
# directory and the exported CSV; the previous "/results" pointed at the
# filesystem root.
# Evaluation and checkpointing both happen once per epoch, which
# `load_best_model_at_end=True` relies on.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,  # keep at most two checkpoints on disk
    load_best_model_at_end=True
)

In [12]:
# 6. Define the Trainer (training itself is started in the next cell)
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,  # reduced validation subset
    train_dataset=train_dataset,  # reduced training subset
)

In [None]:
# Start fine-tuning with the Trainer defined above.
trainer.train()

In [None]:
# Put the model into evaluation mode before it is used for inference below.
model.eval()

In [16]:
# Class id -> emotion name; the id is the position in this list.
# NOTE(review): assumes these ids match the label encoding of the training
# data - confirm against the dataset.
sentiment_mapping = dict(enumerate([
    '기쁨',  # joy
    '슬픔',  # sadness
    '분노',  # anger
    '상처',  # hurt
    '불안',  # anxiety
    '당황',  # embarrassment
]))

In [None]:
# Classify every row of `data` and store the predicted class id ("labels")
# together with its emotion name ("sentiment").
#
# Texts are scored in mini-batches instead of one forward pass per row, and
# both result columns are assigned once at the end. Creating "labels" cell by
# cell with `data.at` first fills the new column with NaN, which turns the
# class ids into floats; assigning the whole list keeps them integers.
INFERENCE_BATCH_SIZE = 32

model.eval()  # evaluation mode, even if the dedicated cell was skipped

texts = [str(text) for text in data['text']]
predicted_labels = []

with torch.no_grad():
    for start in range(0, len(texts), INFERENCE_BATCH_SIZE):
        stop = min(start + INFERENCE_BATCH_SIZE, len(texts))

        inputs = tokenizer(
            texts[start:stop],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128,
        )
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # The argmax over the logits is the same class as the argmax over
        # their softmax, so the probabilities are not needed here.
        logits = model(**inputs).logits
        predicted_labels.extend(torch.argmax(logits, dim=1).tolist())

        # Progress: index label of the last row classified so far.
        print(f"Index: {data.index[stop - 1]}")

data["labels"] = predicted_labels
data["sentiment"] = [sentiment_mapping.get(label, "Unknown") for label in predicted_labels]

In [None]:
# Export the classified rows
output_file = './results/classified_novel1.csv'

# open() does not create missing directories, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Save as a CSV file
with open(output_file, 'w', encoding='utf-8', newline='') as csv_f:
    writer = csv.writer(csv_f)

    # Header row with the column names
    writer.writerow(data.columns)

    # One CSV row per DataFrame row, without the index
    writer.writerows(data.itertuples(index=False))