필요 라이브러리 설치

In [11]:
# (필요 시, colab/로컬 환경에서 먼저 설치)
%pip install transformers torch tqdm --quiet
%pip install ipywidgets --upgrade
!jupyter nbextension enable --py widgetsnbextension

import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import ElectraTokenizer, ElectraForSequenceClassification
from torch.optim import AdamW
from tqdm.notebook import tqdm
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
import os


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
               [--paths] [--json] [--debug]
               [subcommand]

Jupyter: Interactive Computing

positional arguments:
  subcommand     the subcommand to launch

options:
  -h, --help     show this help message and exit
  --version      show the versions of core jupyter packages and exit
  --config-dir   show Jupyter config dir
  --data-dir     show Jupyter data dir
  --runtime-dir  show Jupyter runtime dir
  --paths        show all Jupyter paths. Add --json for machine-readable
                 format.
  --json         output paths as machine-readable json
  --debug        output debug information about paths

Available subcommands: dejavu events execute kernel kernelspec lab
labextension labhub migrate nbconvert notebook run server troubleshoot trust

Jupyter command `jup

하이퍼파라미터 및 경로 세팅

In [12]:
# Paths and hyperparameters
MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
BATCH_SIZE = 64
EPOCHS = 4
LR = 2e-5
MAX_LEN = 64  # token length every question is padded/truncated to

# Dataset locations. These match the files the data-loading cell actually
# reads ("../data/..."); the previous "/mnt/data/..." values disagreed with it.
TRAIN_PATH = "../data/train.json"
VALID_PATH = "../data/valid.json"
TEST_PATH  = "../data/test_cls.json"
OUTPUT_PATH = "../outputs/cls_output.json"
MODEL_SAVE_PATH = "./koelectra_cls.pt"
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


데이터셋 클래스 정의

In [13]:
class QCDataset(Dataset):
    """Question-classification dataset that tokenizes one record per access.

    Args:
        data: Indexable collection of records. Each record must provide a
            'question' entry and may provide a 'label' entry.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            truncation=True, padding='max_length', return_tensors='pt')`` and
            returning a mapping of tensors with a leading batch dimension of 1.
        max_len: Length every question is truncated/padded to.
        with_label: When True, a record's 'label' (if present) is returned
            under the 'labels' key.
    """

    def __init__(self, data, tokenizer, max_len, with_label=True):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.with_label = with_label

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized record at ``idx`` as a dict of tensors."""
        item = self.data[idx]
        encoded = self.tokenizer(
            item['question'],
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        # Drop only the leading batch dimension added by return_tensors='pt'.
        # A bare squeeze() would also collapse the sequence dimension when
        # max_len == 1, producing 0-d tensors that do not batch as sequences.
        inputs = {k: v.squeeze(0) for k, v in encoded.items()}
        if self.with_label and 'label' in item:
            # Class indices are stored as int64 for the classification loss.
            inputs['labels'] = torch.tensor(item['label'], dtype=torch.long)
        return inputs


데이터 로드/토크나이저 준비

In [14]:
# Data loading
def load_json(path):
    """Read the UTF-8 encoded JSON file at ``path`` and return the parsed object."""
    with open(path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Read the three splits from disk, in train / valid / test order.
train_data, valid_data, test_data = map(
    load_json,
    ("../data/train.json", "../data/valid.json", "../data/test_cls.json"),
)

tokenizer = ElectraTokenizer.from_pretrained(MODEL_NAME)

# The test split is built with with_label=False, so its items carry no 'labels'.
train_ds = QCDataset(data=train_data, tokenizer=tokenizer, max_len=MAX_LEN)
valid_ds = QCDataset(data=valid_data, tokenizer=tokenizer, max_len=MAX_LEN)
test_ds = QCDataset(
    data=test_data,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    with_label=False,
)

# Only the training loader shuffles; valid/test keep the file order.
train_loader = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=valid_ds, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(dataset=test_ds, batch_size=BATCH_SIZE, shuffle=False)


모델 준비

In [15]:
# Build the 5-class sequence classifier from the pretrained checkpoint and
# place it on the selected device in one expression.
model = ElectraForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=5,
).to(DEVICE)


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


학습 함수

In [16]:
def train_one_epoch(model, dataloader, optimizer):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    Each batch is a dict providing 'input_ids', 'attention_mask' and 'labels';
    the model is called with the labels so that it returns the loss itself.
    """
    model.train()
    epoch_losses = []
    for step_batch in tqdm(dataloader):
        token_ids = step_batch['input_ids'].to(DEVICE)
        mask = step_batch['attention_mask'].to(DEVICE)
        targets = step_batch['labels'].to(DEVICE)

        result = model(token_ids, attention_mask=mask, labels=targets)
        result.loss.backward()
        optimizer.step()
        # Gradients are cleared after the update so the next batch starts clean.
        optimizer.zero_grad()

        epoch_losses.append(result.loss.item())
    return np.mean(epoch_losses)


검증 함수

In [17]:
def eval_model(model, dataloader):
    """Evaluate ``model`` on a labelled dataloader.

    Args:
        model: Classifier whose forward call returns an object with ``.logits``.
        dataloader: Iterable of dict batches providing 'input_ids',
            'attention_mask' and 'labels'.

    Returns:
        Tuple ``(accuracy, macro_f1)`` computed over all batches.
    """
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits.cpu().numpy()
            preds.extend(np.argmax(logits, axis=1))
            # Labels are only compared on the host, so they are not moved to
            # DEVICE first (the original copied them there and straight back).
            trues.extend(batch['labels'].cpu().numpy())
    acc = accuracy_score(trues, preds)
    f1 = f1_score(trues, preds, average='macro')
    return acc, f1


전체 학습 루프

In [18]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

# Start below any reachable score so the first epoch always writes a
# checkpoint. With the previous initial value of 0, a run whose macro F1 stayed
# at 0.0 never saved anything, leaving MODEL_SAVE_PATH missing or stale for the
# later load.
best_f1 = float("-inf")
for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")
    train_loss = train_one_epoch(model, train_loader, optimizer)
    acc, f1 = eval_model(model, valid_loader)
    print(f"Train loss: {train_loss:.4f} | Valid Acc: {acc:.4f} | Valid F1: {f1:.4f}")
    # Keep only the weights of the epoch with the best validation macro F1.
    if f1 > best_f1:
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        best_f1 = f1
        print("Best model saved!")


Epoch 1/4


  0%|          | 0/719 [00:00<?, ?it/s]

Train loss: 0.1564 | Valid Acc: 0.9069 | Valid F1: 0.9015
Best model saved!
Epoch 2/4


  0%|          | 0/719 [00:00<?, ?it/s]

Train loss: 0.0053 | Valid Acc: 0.8777 | Valid F1: 0.8743
Epoch 3/4


  0%|          | 0/719 [00:00<?, ?it/s]

Train loss: 0.0012 | Valid Acc: 0.9307 | Valid F1: 0.9293
Best model saved!
Epoch 4/4


  0%|          | 0/719 [00:00<?, ?it/s]

Train loss: 0.0003 | Valid Acc: 0.9375 | Valid F1: 0.9359
Best model saved!


추론 및 결과 저장

In [19]:
def refine_label(question, predicted_label):
    """Override a predicted label with keyword rules, or keep it unchanged.

    The lower-cased question is checked against an ordered list of keyword
    rules; the first rule that matches decides the returned label. If no rule
    matches, ``predicted_label`` is returned as is.
    """
    text = question.lower()

    def mentions(*keywords):
        # True when at least one keyword occurs as a substring of the question.
        return any(word in text for word in keywords)

    def first_match(rules):
        # Label of the first (keywords, label) rule that matches, else None.
        for keywords, label in rules:
            if mentions(*keywords):
                return label
        return None

    # 1. Scholarship compound cases
    if '장학' in text:
        label = first_match((
            (('발표', '결과', '공지', '수혜자'), 1),
            (('기간', '일정', '날짜', '신청', '수령', '방법'), 2),
            (('대상', '기준', '자격', '조건'), 0),
        ))
        if label is not None:
            return label

    # 2. Lecture reviews
    if mentions('강의평', '강의 평가'):
        return 1 if mentions('어디', '사이트', '조회', '확인') else 0

    # 3. Detailed procedure / how-to guidance
    if mentions('방법', '절차', '이용법', '이용 방법', '안내해', '알려주'):
        label = first_match((
            (('도서관', '시설', '예약', '복사기'), 1),
            (('셔틀', '버스'), 4),
            (('졸업', '전공', '학점'), 0),
            (('신청', '수강'), 2),
        ))
        if label is not None:
            return label

    # 4. Course registration
    if '수강신청' in text and mentions('기간', '일정', '정정', '대기', '팁', '사이트', '오류'):
        return 2

    # 5-12. Flat keyword rules, checked in this order.
    flat_rules = (
        # 5. Notices / announcements
        (('공지', '알림', '안내', '공지사항', '공고'), 1),
        # 6. Cafeteria / menu
        (('학식', '식단', '메뉴', '밥', '중식', '석식', '아침', '점심', '석식'), 3),
        # 7. Shuttle / transport
        (('셔틀', '버스', '정류장', '교통', '노선', '막차', '위치'), 4),
        # 8. Schedule / period / date / exams / tuition / cancelled classes /
        #    return to school / vacation / attendance / midterm / final
        (('일정', '기간', '날짜', '시간', '시험', '등록금', '휴강', '복학', '방학', '출석', '중간', '기말'), 2),
        # 9. Graduation / major / credits / requirements / thesis / deferral /
        #    certification / mandatory / internship / exchange student
        (('졸업', '전공', '학점', '교양', '요건', '논문', '유예', '인증', '필수', '인턴', '교환학생', '동아리', '멘토링', '실습'), 0),
        # 10. Library / reservation / seats / copier
        (('도서관', '예약', '자리', '좌석', '복사기'), 1),
        # 11. Wi-Fi / password / internet / access
        (('와이파이', '비번', '인터넷', '접속'), 1),
        # 12. Campus map, merit/demerit points, student ID are also guidance (1)
        (('지도', '상벌점', '학생증'), 1),
    )
    label = first_match(flat_rules)

    # Anything else keeps the model's prediction.
    return predicted_label if label is None else label


In [19]:
# Load the best checkpoint. map_location lets a checkpoint saved on GPU be
# restored on a CPU-only machine; weights_only=True restricts unpickling to
# tensors/containers, which is all a state_dict holds.
state_dict = torch.load(MODEL_SAVE_PATH, map_location=DEVICE, weights_only=True)
model.load_state_dict(state_dict)
model.eval()

# test_loader does not shuffle, so a running index into test_data pairs each
# prediction with its source question.
results = []
cur_idx = 0  # position in test_data across all batches

with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits.cpu().numpy()
        preds = np.argmax(logits, axis=1)
        for pred in preds:
            question = test_data[cur_idx]['question']
            # refine_label() is not applied here; the raw argmax prediction is
            # saved. int() turns the numpy integer into a JSON-serializable int.
            results.append({
                "question": question,
                "label": int(pred)
            })
            cur_idx += 1


# Save as JSON, creating the output directory first if it does not exist yet.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"Saved predictions to {OUTPUT_PATH}")


  model.load_state_dict(torch.load(MODEL_SAVE_PATH))


  0%|          | 0/10 [00:00<?, ?it/s]

Saved predictions to ../outputs/cls_output.json


검증 데이터 추가 평가

In [20]:
# Re-check validation performance of the currently loaded weights. This reuses
# eval_model instead of repeating its batch loop, so both places compute the
# metrics the same way.
valid_acc, valid_f1 = eval_model(model, valid_loader)

print("Valid Accuracy:", valid_acc)
print("Valid Macro F1:", valid_f1)


Valid Accuracy: 0.9375
Valid Macro F1: 0.9358911968896706
