In [2]:
import json
from pathlib import Path

import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Prefer the GPU when one is available; otherwise fall back to the CPU so the
# later model.to(device) / tensor.to(device) calls do not fail on CUDA-less machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
# Read the story JSON file and parse it into `data`.
json_data_path = '../datasets/gpt_story.json'

with open(json_data_path, mode="r", encoding="utf-8") as story_file:
    data = json.loads(story_file.read())

In [13]:
# Peek at the record at index 3 to check the structure of the loaded JSON.
print(data[3])

{'id': 4, 'content': "산길을 걷던 남자는 우연히 떨어진 그림엽서를 발견했다. 그림 속에는 자신이 어릴 적 갔던 바다가 그려져 있었다. 뒷면에는 낯익은 필체로 '다시 만나자'라는 문구가 적혀 있었다. 누군가의 초대 같았다."}


In [14]:
# 1. Load data
# Read the training and validation CSVs, in that order, into DataFrames.
train, val = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/sentiment_conversation/train.csv',
        '../datasets/sentiment_conversation/val.csv',
    )
)

In [15]:
# Show how many training rows carry each value of the "sentiment" column.
print(train["sentiment"].value_counts())

sentiment
4    26301
2    26000
3    25957
1    25814
5    24936
0    16947
Name: count, dtype: int64


In [16]:
# 2. Apply the KLUE/roberta-base tokenizer
model_name = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the "text" entry of `examples` with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: Mapping with a "text" entry holding the raw text(s).

    Returns:
        The tokenizer's output for those texts.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        max_length=128,
        truncation=True,
        padding="max_length",
    )

In [17]:
# 3. Convert to Hugging Face Datasets
# Temporary row caps (first N rows of each split) so a run finishes quickly.
# TODO: set both to None to train and validate on the full splits.
TRAIN_SUBSET_SIZE = 3000
VAL_SUBSET_SIZE = 500


def _prepare_split(frame, subset_size=None):
    """Turn a DataFrame with "text"/"sentiment" columns into a tokenized Dataset.

    Args:
        frame: pandas DataFrame holding one split.
        subset_size: Keep only the first `subset_size` rows; None keeps every row.
            Values larger than the split are clamped to the split length.

    Returns:
        Dataset with the tokenizer outputs added, the raw "text" column removed,
        "sentiment" renamed to "labels", and its format set to "torch".
    """
    dataset = Dataset.from_pandas(frame)
    if subset_size is not None:
        # Clamp so a split shorter than the cap does not raise an out-of-range error.
        dataset = dataset.select(range(min(subset_size, len(dataset))))
    dataset = dataset.map(tokenize_function, batched=True)
    dataset = dataset.remove_columns(["text"])
    dataset = dataset.rename_column("sentiment", "labels")
    dataset.set_format("torch")
    return dataset


train_dataset = _prepare_split(train, TRAIN_SUBSET_SIZE)
val_dataset = _prepare_split(val, VAL_SUBSET_SIZE)

Map: 100%|██████████| 3000/3000 [00:00<00:00, 18925.86 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 18400.59 examples/s]


In [18]:
# 4. Define the model
# One output class per distinct value in the training "sentiment" column
# (dropna=False keeps the count identical to len(Series.unique())).
num_sentiment_classes = train["sentiment"].nunique(dropna=False)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_sentiment_classes,
)
model.to(device)

  state_dict = torch.load(resolved_archive_file, map_location="cpu")
Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initiali

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [19]:
# 5. Configure TrainingArguments
training_args = TrainingArguments(
    # Relative path, consistent with logging_dir and with the ./results JSON
    # written at the end of the notebook; "/results" pointed at the filesystem root.
    output_dir="./results",
    # Evaluate and checkpoint once per epoch (the two strategies are kept identical,
    # which load_best_model_at_end relies on).
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    # Keep at most two checkpoints on disk.
    save_total_limit=2,
    # Reload the best-scoring checkpoint once training ends.
    load_best_model_at_end=True
)

In [20]:
# 6. Define the Trainer and train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # uses the small (subset) training dataset
    eval_dataset=val_dataset,  # uses the small (subset) validation dataset
)

In [21]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

***** Running training *****
  Num examples = 3000
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 940
 20%|██        | 188/940 [00:33<01:56,  6.45it/s]***** Running Evaluation *****
  Num examples = 500
  Batch size = 16
                                                 
 20%|██        | 188/940 [00:34<01:56,  6.45it/s]Saving model checkpoint to /results\checkpoint-188
Configuration saved in /results\checkpoint-188\config.json


{'eval_loss': 1.1837528944015503, 'eval_runtime': 1.5255, 'eval_samples_per_second': 327.769, 'eval_steps_per_second': 20.977, 'epoch': 1.0}


Model weights saved in /results\checkpoint-188\pytorch_model.bin
Deleting older checkpoint [\results\checkpoint-376] due to args.save_total_limit
 40%|████      | 376/940 [01:08<01:26,  6.54it/s]***** Running Evaluation *****
  Num examples = 500
  Batch size = 16
                                                 
 40%|████      | 376/940 [01:10<01:26,  6.54it/s]Saving model checkpoint to /results\checkpoint-376
Configuration saved in /results\checkpoint-376\config.json


{'eval_loss': 1.1352373361587524, 'eval_runtime': 1.5327, 'eval_samples_per_second': 326.218, 'eval_steps_per_second': 20.878, 'epoch': 2.0}


Model weights saved in /results\checkpoint-376\pytorch_model.bin
Deleting older checkpoint [\results\checkpoint-564] due to args.save_total_limit
 53%|█████▎    | 501/940 [01:33<01:19,  5.49it/s]

{'loss': 1.0588, 'learning_rate': 2.340425531914894e-05, 'epoch': 2.66}


 60%|██████    | 564/940 [01:44<01:00,  6.26it/s]***** Running Evaluation *****
  Num examples = 500
  Batch size = 16
                                                 
 60%|██████    | 564/940 [01:46<01:00,  6.26it/s]Saving model checkpoint to /results\checkpoint-564
Configuration saved in /results\checkpoint-564\config.json


{'eval_loss': 1.234623670578003, 'eval_runtime': 1.5605, 'eval_samples_per_second': 320.401, 'eval_steps_per_second': 20.506, 'epoch': 3.0}


Model weights saved in /results\checkpoint-564\pytorch_model.bin
Deleting older checkpoint [\results\checkpoint-188] due to args.save_total_limit
 80%|████████  | 752/940 [02:20<00:29,  6.45it/s]***** Running Evaluation *****
  Num examples = 500
  Batch size = 16
                                                 
 80%|████████  | 752/940 [02:22<00:29,  6.45it/s]Saving model checkpoint to /results\checkpoint-752
Configuration saved in /results\checkpoint-752\config.json


{'eval_loss': 1.4105887413024902, 'eval_runtime': 1.5478, 'eval_samples_per_second': 323.031, 'eval_steps_per_second': 20.674, 'epoch': 4.0}


Model weights saved in /results\checkpoint-752\pytorch_model.bin
Deleting older checkpoint [\results\checkpoint-564] due to args.save_total_limit
100%|██████████| 940/940 [02:56<00:00,  6.32it/s]***** Running Evaluation *****
  Num examples = 500
  Batch size = 16
                                                 
100%|██████████| 940/940 [02:58<00:00,  6.32it/s]Saving model checkpoint to /results\checkpoint-940
Configuration saved in /results\checkpoint-940\config.json


{'eval_loss': 1.4811369180679321, 'eval_runtime': 1.5584, 'eval_samples_per_second': 320.835, 'eval_steps_per_second': 20.533, 'epoch': 5.0}


Model weights saved in /results\checkpoint-940\pytorch_model.bin
Deleting older checkpoint [\results\checkpoint-752] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from /results\checkpoint-376 (score: 1.1352373361587524).
  state_dict = torch.load(best_model_path, map_location="cpu")
100%|██████████| 940/940 [02:59<00:00,  5.23it/s]

{'train_runtime': 179.8574, 'train_samples_per_second': 83.399, 'train_steps_per_second': 5.226, 'train_loss': 0.766505415896152, 'epoch': 5.0}





TrainOutput(global_step=940, training_loss=0.766505415896152, metrics={'train_runtime': 179.8574, 'train_samples_per_second': 83.399, 'train_steps_per_second': 5.226, 'train_loss': 0.766505415896152, 'epoch': 5.0})

In [22]:
# Switch the model to evaluation mode before running inference.
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [23]:
# Predict a sentiment label for each story and store it on the record under 'sentiment'.
for story in data:
    encoded = tokenizer(
        story['content'],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    )
    # Move every input tensor to the same device as the model.
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        outputs = model(**model_inputs)
        scores = outputs.logits.softmax(dim=1)  # per-class sentiment probabilities

    predicted_label = scores.argmax(dim=1).item()
    story['sentiment'] = predicted_label

    print(f"Text: {story['content']}")
    print(f"Scores: {scores.tolist()}")
    print(f"Predicted label: {predicted_label}\n")

Text: 별이 가득한 하늘 아래, 소년은 잃어버린 물건을 찾기 위해 숲을 헤맸다. 한참을 걸은 끝에 반짝이는 물체를 발견했는데, 그것은 오래된 나침반이었다. 나침반을 손에 쥔 순간, 이상한 일이 벌어졌다. 화살표가 빛나며 소년을 새로운 모험으로 이끌었다.
Scores: [[0.22582344710826874, 0.08359823375940323, 0.03415907174348831, 0.09653785079717636, 0.06759481132030487, 0.4922865331172943]]
Predicted label: 5

Text: 도시는 조용했다. 평소라면 차와 사람들이 가득했을 거리가 텅 비어 있었다. 한 소녀가 작은 강아지를 안고 거리를 걸으며 주변을 살폈다. 그러다 발견한 붉은 종이비행기. 비행기 안쪽에는 '희망은 어디에나 있다'라는 문구가 적혀 있었다.
Scores: [[0.339579313993454, 0.3605920374393463, 0.0220583975315094, 0.06683194637298584, 0.0857134759426117, 0.12522482872009277]]
Predicted label: 1

Text: 작은 시골 마을의 우체국에는 매주 편지가 도착했다. 하지만 이상하게도 보낸 사람은 적혀 있지 않았다. 편지는 항상 누군가에게 따뜻한 위로와 희망을 전하는 내용이었다. 사람들은 궁금해했지만, 끝내 발신자는 밝혀지지 않았다.
Scores: [[0.6719879508018494, 0.086091548204422, 0.018563617020845413, 0.06782062351703644, 0.047260452061891556, 0.10827581584453583]]
Predicted label: 0

Text: 산길을 걷던 남자는 우연히 떨어진 그림엽서를 발견했다. 그림 속에는 자신이 어릴 적 갔던 바다가 그려져 있었다. 뒷면에는 낯익은 필체로 '다시 만나자'라는 문구가 적혀 있었다. 누군가의 초대 같았다.
Scores:

In [24]:
# Write the results: dump `data` to a JSON file.
output_file = './results/using_test_data.json'
# Create the target directory first; open(..., 'w') raises FileNotFoundError if it is missing.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as json_f:
    # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
    json.dump(data, json_f, ensure_ascii=False, indent=4)