## NSMC(Naver Sentiment Movie Corpis
네이버 영화 리뷰 말뭉치<br>
from Korpora import Korpora


In [4]:
from dataclasses import dataclass

@dataclass
class ClassificationTrainArguments:
    pretrained_model_name: str
    downstream_corpus_name: str
    downstream_corpus_root_dir: str
    downstream_model_dir: str
    learning_rate: float
    batch_size: int

args = ClassificationTrainArguments(
    pretrained_model_name="beomi/kcbert-base",#사전 학습된 모델, Hugging Face의 모델 허브
    downstream_corpus_name="nsmc", #네이버 corpus 다운
    downstream_corpus_root_dir="./data",
    downstream_model_dir="./model",
    learning_rate=5e-5,
    batch_size=32
)

In [6]:
from Korpora import Korpora

Korpora.fetch(
    corpus_name=args.downstream_corpus_name,
    root_dir=args.downstream_corpus_root_dir,
    force_download=True,
)

[nsmc] download ratings_train.txt: 14.6MB [00:01, 11.2MB/s]                     
[nsmc] download ratings_test.txt: 4.90MB [00:00, 10.1MB/s]                      


In [7]:
#kcbert-base 모델 준비
from transformers import BertConfig, BertForSequenceClassification
pretrained_model_config = BertConfig.from_pretrained(
    args.pretrained_model_name,
    num_labels=2,
)
model = BertForSequenceClassification.from_pretrained(
    args.pretrained_model_name,
    config=pretrained_model_config,
)

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Tokenizer
토큰화 수행 프로그램<br>
kcbert-base 모델

In [8]:
#kcbert-base 토크나이저 준비
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(
    args.pretrained_model_name,
    do_lower_case=False, #대소문자 유지, True면 모든 입력 소문자로 변환
) 

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/250k [00:00<?, ?B/s]

# Pytorch's Data Loader

* 파이토치로 딥러닝 모델을 만들려면 반드시 정의해야 한다.
* 데이터를 배치(batch)단위로 모델에 밀어 넣어주는 역할
* 전체 데이터 가운데 일부 인스턴스를 뽑아 배치를 구성
* 데이터셋은 데이터 로더의 구성 요소 중 하나
* 데이터셋은 여러 인스턴스를 보유

데이터 로더 > 데이터셋 > 인스턴스

* batch는 그 모양이 고정적이어야 할 때가 많다. -> 문장들의 토큰(input_ids) 개수가 같아야 한다.

그래서 batch의 shape을 동일하게 만들어 주는 과정을 collate라고 한다.

### Collate
* list -> pytorch의 tensor로 변환
* batch size 통일

In [None]:
# NSMC 데이터 다운로드 및 로드
data_dir = "./data/nsmc"  # 데이터 저장 경로
os.makedirs(data_dir, exist_ok=True)

train_file = f"{data_dir}/ratings_train.txt"
test_file = f"{data_dir}/ratings_test.txt"

if not os.path.exists(train_file):
    from Korpora import Korpora
    Korpora.fetch("nsmc", root_dir=data_dir)



In [11]:
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler
from transformers import BertTokenizer
import pandas as pd
import os

# 데이터 불러오기 (NSMC 데이터셋은 탭(`\t`)으로 구분된 txt 파일)
train_df = pd.read_csv(train_file, sep="\t").dropna()
test_df = pd.read_csv(test_file, sep="\t").dropna()

# PyTorch Dataset 클래스 정의
class NsmcDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = str(self.df.iloc[idx]["document"])
        label = int(self.df.iloc[idx]["label"])

        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # 텐서 변환
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

# 5️⃣ 데이터셋 생성
train_dataset = NsmcDataset(train_df, tokenizer)
test_dataset = NsmcDataset(test_df, tokenizer)

# 6️⃣ DataLoader 생성
train_dataloader = DataLoader(
    train_dataset,
    batch_size=32,  # args.batch_size
    sampler=RandomSampler(train_dataset, replacement=False),
    drop_last=False,
    num_workers=0,  # args.cpu_workers
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,  # 테스트 데이터는 랜덤 샘플링 불필요
    drop_last=False,
    num_workers=0,
)

# ✅ 데이터 샘플 확인
sample = next(iter(train_dataloader))
print(sample["input_ids"].shape)  # torch.Size([32, 128])
print(sample["label"])  # tensor([0, 1, 1, ...]) (배치 크기 32개 레이블)

torch.Size([32, 128])
tensor([0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0,
        1, 0, 0, 1, 0, 1, 0, 1])


### Pytorch Lightning
https://minjoo-happy-blog.tistory.com/140

In [13]:
import torch
import torch.nn.functional as F
import pytorch_lightning as pl
from transformers import AdamW

# PyTorch Lightning을 사용한 감성 분석 모델 정의
class SentimentClassificationTask(pl.LightningModule):
    def __init__(self, model, learning_rate=5e-5):
        super().__init__()
        self.model = model
        self.learning_rate = learning_rate

    def forward(self, input_ids, attention_mask):
        return self.model(input_ids, attention_mask=attention_mask)

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["label"]

        outputs = self(input_ids, attention_mask)
        logits = outputs.logits
        loss = F.cross_entropy(logits, labels)

        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=self.learning_rate)

# 모델 및 학습 설정
task = SentimentClassificationTask(model, learning_rate=args.learning_rate)

# PyTorch Lightning Trainer 설정
trainer = pl.Trainer(
    max_epochs=3,  # 학습 횟수
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    log_every_n_steps=10
)

# 모델 학습 실행
trainer.fit(
    task,
    train_dataloaders=train_dataloader,
)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/homebrew/anaconda3/envs/myenv/lib/python3.9/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.

  | Name  | Type                          | Params | Mode
---------------------------------------------------------------
0 | model | BertForSequenceClassification | 108 M  | eval
---------------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
435.680   Total estimated model params size (MB)
0         Modules in train mode
231       Modules in eval mode
/opt/homebrew/anaconda3/envs/myenv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` 

Training: |                                               | 0/? [00:00<?, ?it/s]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined