In [6]:
# !pip install datasets
# !pip install transformers

In [7]:

import time
import numpy as np
import pandas as pd

import tensorflow as tf
import torch

import re
import os

import datasets
from datasets import load_dataset, load_metric, ClassLabel, Sequence, Dataset

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split as tts

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
import matplotlib.pyplot as plt

print('torch version:', torch.__version__)
print('tf version:', tf.__version__)

torch version: 1.12.1+cu113
tf version: 2.9.2


In [8]:
# random seed fix
import random

random.seed(2022)
torch.manual_seed(2022)
np.random.seed(2022)

In [14]:
# data load
sentiment_data = pd.read_csv('/content/data_labeling.csv', encoding='cp949')
sentiment_data.head()

Unnamed: 0,content,emotion,label
0,아내가 드디어 출산하게 되어서 정말 신이 나,기쁨,0
1,당뇨랑 합병증 때문에 먹어야 할 약이 열 가지가 넘어가니까 스트레스야,긴장,1
2,고등학교에 올라오니 중학교 때보다 수업이 갑자기 어려워져서 당황스러워,긴장,1
3,재취업이 돼서 받게 된 첫 월급으로 온 가족이 외식을 할 예정이야 너무 행복해,기쁨,0
4,이제 곧 은퇴할 시기가 되었어 내가 먼저 은퇴를 하고 육 개월 후에 남편도 은퇴를 ...,긴장,1


In [15]:
sentiment_data.isna().sum()

content    0
emotion    0
label      0
dtype: int64

In [16]:
sentiment_data.drop('emotion', axis=1, inplace=True)
sentiment_data.head()

Unnamed: 0,content,label
0,아내가 드디어 출산하게 되어서 정말 신이 나,0
1,당뇨랑 합병증 때문에 먹어야 할 약이 열 가지가 넘어가니까 스트레스야,1
2,고등학교에 올라오니 중학교 때보다 수업이 갑자기 어려워져서 당황스러워,1
3,재취업이 돼서 받게 된 첫 월급으로 온 가족이 외식을 할 예정이야 너무 행복해,0
4,이제 곧 은퇴할 시기가 되었어 내가 먼저 은퇴를 하고 육 개월 후에 남편도 은퇴를 ...,1


In [19]:
# data split
train_data, test_data = tts(sentiment_data, test_size=0.2)
print(train_data.shape, test_data.shape)

(66088, 2) (16522, 2)


In [20]:
# model setting
num_labels = 6
model = AutoModelForSequenceClassification.from_pretrained('klue/roberta-large', num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained('klue/roberta-large')

Downloading:   0%|          | 0.00/547 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'cl

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/752k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [21]:
# preprocessing function
def preprocess_function(data):
    return tokenizer(
        # tokenizing
        data['content'],

        max_length=512,

        # 최대 길이보다 긴 시퀀스는 최대 길이에 맞춰 자름
        truncation=True,

        # tokenizer가 token_type_ids를 return하지 않게 함
        # roberta는 필요없기 때문
        return_token_type_ids=False,
    )


In [22]:
# dataset transform
df_train = pd.DataFrame({'content':train_data['content'], 'label':train_data['label']})
dataset_train = Dataset.from_pandas(df_train)

df_val = pd.DataFrame({'content':test_data['content'], 'label':test_data['label']})
dataset_val = Dataset.from_pandas(df_val)

In [23]:
tokenized_train_datasets = dataset_train.map(preprocess_function, batched=True)
tokenized_val_datasets = dataset_val.map(preprocess_function, batched=True)

  0%|          | 0/67 [00:00<?, ?ba/s]

  0%|          | 0/17 [00:00<?, ?ba/s]

In [26]:
def eval_metric(pred, real):
    f1 = {'micro_f1':f1_score(real, pred, average='micro')}
    return f1

# eval metric
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    result = eval_metric(predictions, labels)
    return result

In [27]:
metric_name = "micro_f1"

# batch size 지정
batch_size = 8
num_train_epochs = 20

# path 설정
trained_model_path = '/content'

args = TrainingArguments(
    output_dir=trained_model_path,
    overwrite_output_dir=True,
    # evaluation_strategy="epoch",
    evaluation_strategy="steps",
    learning_rate=1e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    save_total_limit=2,
    tpu_num_cores = 85,
    seed = 2022,
    data_seed = 2022,
    dataloader_pin_memory = True,
    optim = 'adafactor'
)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_val_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, content. If __index_level_0__, content are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 66088
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 165220
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Micro F1
500,1.5278,1.262699,0.522879
1000,1.0991,0.974597,0.633398
1500,0.9866,0.908454,0.653553
2000,0.8933,0.876664,0.667776
2500,0.8783,0.847158,0.679458
3000,0.8394,0.833992,0.685268
3500,0.8426,0.813092,0.695678
4000,0.7931,0.798448,0.701005
4500,0.8144,0.798859,0.696647
5000,0.7879,0.810565,0.699007


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, content. If __index_level_0__, content are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 16522
  Batch size = 8
Saving model checkpoint to /content/checkpoint-500
Configuration saved in /content/checkpoint-500/config.json
Model weights saved in /content/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, content. If __index_level_0__, content are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignor

TrainOutput(global_step=15000, training_loss=0.8082748616536458, metrics={'train_runtime': 9640.9298, 'train_samples_per_second': 137.099, 'train_steps_per_second': 17.137, 'total_flos': 7194024894862080.0, 'train_loss': 0.8082748616536458, 'epoch': 1.82})