In [12]:
!pip install pandas numpy scikit-learn transformers torch


Collecting scikit-learn
  Using cached scikit_learn-1.7.1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting transformers
  Using cached transformers-4.55.2-py3-none-any.whl.metadata (41 kB)
Collecting torch
  Using cached torch-2.8.0-cp311-cp311-win_amd64.whl.metadata (30 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Using cached huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting requests (from transformers)
  Using cached requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.4-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.34.0->transformers)
  Using cached fsspec-2025.7.0-py3-none-any.whl.metadata (12

In [1]:
import os
import glob
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extract_aspects_from_json(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    records = []

    # data가 dict이면 단일 처리, list면 각 원소 처리
    data_items = data if isinstance(data, list) else [data]

    for item in data_items:
        aspects_list = item.get("Aspects", [])  # 각 item이 dict라고 가정
        for aspect in aspects_list:
            records.append({
                "text": aspect.get("SentimentText", ""),
                "aspect": aspect.get("Aspect", "Unknown"),
                "polarity": int(aspect.get("SentimentPolarity", 0))
            })
    return records


In [8]:
!pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp311-cp311-win_amd64.whl.metadata (4.3 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.11.0-cp311-cp311-win_amd64.whl.metadata (681 bytes)
Downloading fastparquet-2024.11.0-cp311-cp311-win_amd64.whl (671 kB)
   ---------------------------------------- 0.0/671.0 kB ? eta -:--:--
   ---------------------------------------- 671.0/671.0 kB 6.5 MB/s eta 0:00:00
Downloading cramjam-2.11.0-cp311-cp311-win_amd64.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 1.7/1.7 MB 10.3 MB/s eta 0:00:00
Installing collected packages: cramjam, fastparquet

   -------------------- ------------------- 1/2 [fastparquet]
   ---------------------------------------- 2/2 [fastparquet]

Successfully installed cramjam-2.11.0 fastparquet-2024.11.0


In [10]:
def load_dataset(folder_path):
    all_data = []
    for file_path in glob.glob(os.path.join(folder_path, "**", "*.json"), recursive=True):
        all_data.extend(extract_aspects_from_json(file_path))
    return pd.DataFrame(all_data)

# Training / Validation
df_train = load_dataset("./dataset/Training")
df_val = load_dataset("./dataset/Validation")

print("Training 데이터 개수:", len(df_train))
print("Validation 데이터 개수:", len(df_val))

# === 여기서 병합 저장 ===
df_train.to_parquet("train.parquet", index=False, engine="fastparquet")
df_val.to_parquet("val.parquet", index=False, engine="fastparquet")

Training 데이터 개수: 629755
Validation 데이터 개수: 85611


# 라벨 인코딩 진행

In [20]:
from sklearn.preprocessing import LabelEncoder

# Aspect 인코딩
aspect_encoder = LabelEncoder()
df_train["aspect_label"] = aspect_encoder.fit_transform(df_train["aspect"])
df_val["aspect_label"] = aspect_encoder.transform(df_val["aspect"])

# Polarity 인코딩
polarity_mapping = {-1:0, 0:1, 1:2}
df_train["polarity_label"] = df_train["polarity"].map(polarity_mapping)
df_val["polarity_label"] = df_val["polarity"].map(polarity_mapping)


# 토크나이징

In [21]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

train_encodings = tokenizer(
    df_train["text"].tolist(),
    truncation=True,
    padding=True,
    max_length=128
)
val_encodings = tokenizer(
    df_val["text"].tolist(),
    truncation=True,
    padding=True,
    max_length=128
)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


# Dataset 객체

In [22]:
import torch

class AspectDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, aspect_labels, polarity_labels):
        self.encodings = encodings
        self.aspect_labels = aspect_labels
        self.polarity_labels = polarity_labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["aspect_labels"] = torch.tensor(self.aspect_labels[idx])
        item["polarity_labels"] = torch.tensor(self.polarity_labels[idx])
        return item

    def __len__(self):
        return len(self.aspect_labels)

train_dataset = AspectDataset(train_encodings, df_train["aspect_label"].tolist(), df_train["polarity_label"].tolist())
val_dataset = AspectDataset(val_encodings, df_val["aspect_label"].tolist(), df_val["polarity_label"].tolist())


# DataLoader 생성

In [23]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


# 모델 정의

In [24]:
import torch
import torch.nn as nn
from transformers import AutoModel

class KoELECTRAMultiTask(nn.Module):
    def __init__(self, model_name, num_aspect_classes, num_polarity_classes):
        super().__init__()
        self.koelectra = AutoModel.from_pretrained(model_name)
        hidden_size = self.koelectra.config.hidden_size
        
        # Aspect 분류기
        self.aspect_classifier = nn.Linear(hidden_size, num_aspect_classes)
        # Polarity 분류기
        self.polarity_classifier = nn.Linear(hidden_size, num_polarity_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.koelectra(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:,0]  # [CLS] token 사용

        aspect_logits = self.aspect_classifier(pooled_output)
        polarity_logits = self.polarity_classifier(pooled_output)

        return aspect_logits, polarity_logits


# 학습 준비

In [28]:
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_aspect_classes = len(aspect_encoder.classes_)
num_polarity_classes = 3

model = KoELECTRAMultiTask("monologg/koelectra-base-v3-discriminator",
                           num_aspect_classes, num_polarity_classes)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = CrossEntropyLoss()

print(device)


cpu


# 학습 진행

In [27]:
epochs = 3

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_loader, 1):  # enumerate로 step 추적
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        aspect_labels = batch['aspect_labels'].to(device)
        polarity_labels = batch['polarity_labels'].to(device)

        aspect_logits, polarity_logits = model(input_ids, attention_mask)

        loss_aspect = criterion(aspect_logits, aspect_labels)
        loss_polarity = criterion(polarity_logits, polarity_labels)
        loss = loss_aspect + loss_polarity

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # step 단위 출력
        print(f"Epoch {epoch+1}/{epochs}, Step {step}/{len(train_loader)}, Loss: {loss.item():.4f}")

    # epoch 종료 후 평균 loss 출력
    print(f"Epoch {epoch+1} Finished, Average Loss: {total_loss/len(train_loader):.4f}\n")


Epoch 1/3, Step 1/19680, Loss: 4.1516
Epoch 1/3, Step 2/19680, Loss: 3.7671


KeyboardInterrupt: 

# 평가

In [None]:
from sklearn.metrics import accuracy_score

model.eval()
all_aspect_preds = []
all_aspect_labels = []
all_polarity_preds = []
all_polarity_labels = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        aspect_labels = batch['aspect_labels'].to(device)
        polarity_labels = batch['polarity_labels'].to(device)

        aspect_logits, polarity_logits = model(input_ids, attention_mask)
        aspect_preds = torch.argmax(aspect_logits, dim=1)
        polarity_preds = torch.argmax(polarity_logits, dim=1)

        all_aspect_preds.extend(aspect_preds.cpu().numpy())
        all_aspect_labels.extend(aspect_labels.cpu().numpy())
        all_polarity_preds.extend(polarity_preds.cpu().numpy())
        all_polarity_labels.extend(polarity_labels.cpu().numpy())

aspect_acc = accuracy_score(all_aspect_labels, all_aspect_preds)
polarity_acc = accuracy_score(all_polarity_labels, all_polarity_preds)
print(f"Validation Accuracy - Aspect: {aspect_acc:.4f}, Polarity: {polarity_acc:.4f}")
