In [10]:
# !pip install -q pandas numpy scikit-learn transformers torch
!pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fastparquet
Successfully installed fastparquet-2024.11.0


In [2]:
import os
import glob
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
import torch

In [3]:
# Mount Google Drive into the Colab runtime at /content/drive.
# This cell only works inside Google Colab, where the `google.colab` package is importable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
def extract_aspects_from_json(file_path):
    """Read one annotation JSON file and flatten its aspect entries into records.

    The file may contain either a single JSON object or a list of objects.
    Every entry found under an object's "Aspects" key becomes one record.
    Malformed containers (a non-object item, a null/missing "Aspects" value,
    or a non-object aspect entry) are skipped instead of aborting the file.

    Args:
        file_path: Path to a UTF-8 encoded JSON file.

    Returns:
        list[dict]: One dict per aspect entry with the keys
            "text"     - value of "SentimentText" (default ""),
            "aspect"   - value of "Aspect" (default "Unknown"),
            "polarity" - int() of "SentimentPolarity" (default 0).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        ValueError, TypeError: If a present "SentimentPolarity" value cannot
            be converted with int().
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # A single top-level object is handled like a one-element list.
    data_items = data if isinstance(data, list) else [data]

    records = []
    for item in data_items:
        if not isinstance(item, dict):
            # Stray strings/numbers/nulls in the list carry no aspects.
            continue

        # `or []` also covers an explicit `"Aspects": null` in the JSON.
        for aspect in item.get("Aspects") or []:
            if not isinstance(aspect, dict):
                continue
            records.append({
                "text": aspect.get("SentimentText", ""),
                "aspect": aspect.get("Aspect", "Unknown"),
                "polarity": int(aspect.get("SentimentPolarity", 0)),
            })
    return records


In [6]:
def load_dataset(folder_path):
    """Collect the aspect records of every JSON file under a folder into a DataFrame.

    The folder is searched recursively for ``*.json`` files and each file is
    parsed with ``extract_aspects_from_json``.

    Args:
        folder_path: Root directory to search.

    Returns:
        pandas.DataFrame: One row per aspect record with the columns
        "text", "aspect" and "polarity". The columns are present even when
        no record was found, so downstream column access does not fail.
    """
    pattern = os.path.join(folder_path, "**", "*.json")

    all_data = []
    # glob returns files in arbitrary, filesystem-dependent order; sorting keeps
    # the row order of the resulting frame stable between runs.
    for file_path in sorted(glob.glob(pattern, recursive=True)):
        all_data.extend(extract_aspects_from_json(file_path))

    return pd.DataFrame(all_data, columns=["text", "aspect", "polarity"])


In [12]:
# Load the pre-built training and validation splits from parquet files.
# NOTE(review): the original note said these files are stored on Google Drive, but
# the paths below are relative to the current working directory — confirm the
# notebook runs from the folder that contains them.
df_train = pd.read_parquet("./train.parquet", engine="fastparquet")
df_val = pd.read_parquet("./val.parquet", engine="fastparquet")

# Report the number of rows in each split.
print("Training 데이터 개수:", len(df_train))
print("Validation 데이터 개수:", len(df_val))

Training 데이터 개수: 629755
Validation 데이터 개수: 85611


# 라벨 인코딩 진행

In [13]:
from sklearn.preprocessing import LabelEncoder

# Aspect encoding: fit the encoder on the training split only and reuse it for
# validation, so both splits share the same aspect -> integer id assignment.
aspect_encoder = LabelEncoder()
df_train["aspect_label"] = aspect_encoder.fit_transform(df_train["aspect"])
df_val["aspect_label"] = aspect_encoder.transform(df_val["aspect"])

# Polarity encoding: shift the raw polarity values -1/0/1 to the class ids 0/1/2.
polarity_mapping = {-1:0, 0:1, 1:2}
for split_name, split_df in (("train", df_train), ("val", df_val)):
    mapped = split_df["polarity"].map(polarity_mapping)
    # Series.map yields NaN for any value missing from the mapping, which would
    # silently turn the label column into floats and only fail later in the loss.
    unmapped = mapped.isna()
    if unmapped.any():
        bad_values = split_df.loc[unmapped, "polarity"].unique().tolist()
        raise ValueError(
            f"Unexpected polarity values in {split_name} split: {bad_values}"
        )
    split_df["polarity_label"] = mapped.astype("int64")


# 토크나이징

In [14]:
from transformers import AutoTokenizer

# Tokenizer that belongs to the KoELECTRA checkpoint used for the model.
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

# Both splits are tokenised with the same settings: padding and truncation
# enabled, with a maximum length of 128.
_tokenize_options = dict(truncation=True, padding=True, max_length=128)

train_encodings = tokenizer(df_train["text"].tolist(), **_tokenize_options)
val_encodings = tokenizer(df_val["text"].tolist(), **_tokenize_options)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

# Dataset 객체

In [15]:
import torch

class AspectDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with two label sequences.

    Args:
        encodings: Mapping from field name to a per-sample indexable sequence
            (whatever the tokenizer call returned).
        aspect_labels: Per-sample aspect class ids.
        polarity_labels: Per-sample polarity class ids.
    """

    def __init__(self, encodings, aspect_labels, polarity_labels):
        self.encodings = encodings
        self.aspect_labels = aspect_labels
        self.polarity_labels = polarity_labels

    def __len__(self):
        # The dataset size is taken from the aspect labels.
        return len(self.aspect_labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors: every encoding field plus both labels."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["aspect_labels"] = torch.tensor(self.aspect_labels[idx])
        sample["polarity_labels"] = torch.tensor(self.polarity_labels[idx])
        return sample

# Wrap each split's encodings and label columns in an AspectDataset.
train_dataset = AspectDataset(
    encodings=train_encodings,
    aspect_labels=df_train["aspect_label"].tolist(),
    polarity_labels=df_train["polarity_label"].tolist(),
)
val_dataset = AspectDataset(
    encodings=val_encodings,
    aspect_labels=df_val["aspect_label"].tolist(),
    polarity_labels=df_val["polarity_label"].tolist(),
)


# DataLoader 생성

In [28]:
from torch.utils.data import DataLoader

# Same batch size for both loaders; only the training loader shuffles.
BATCH_SIZE = 160

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


# 모델 정의

In [29]:
import torch
import torch.nn as nn
from transformers import AutoModel

class KoELECTRAMultiTask(nn.Module):
    """Shared pretrained encoder with two linear heads: aspect and polarity.

    Args:
        model_name: Checkpoint name or path passed to ``AutoModel.from_pretrained``.
        num_aspect_classes: Output size of the aspect head.
        num_polarity_classes: Output size of the polarity head.
    """

    def __init__(self, model_name, num_aspect_classes, num_polarity_classes):
        super().__init__()
        self.koelectra = AutoModel.from_pretrained(model_name)

        # Both heads read the encoder's hidden representation directly.
        encoder_dim = self.koelectra.config.hidden_size
        self.aspect_classifier = nn.Linear(encoder_dim, num_aspect_classes)
        self.polarity_classifier = nn.Linear(encoder_dim, num_polarity_classes)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and return ``(aspect_logits, polarity_logits)``."""
        encoder_out = self.koelectra(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Use the hidden state at sequence position 0 (the [CLS] token) as the
        # sentence representation for both heads.
        cls_state = encoder_out.last_hidden_state[:, 0]

        return (
            self.aspect_classifier(cls_state),
            self.polarity_classifier(cls_state),
        )


# 학습 준비

In [30]:
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

# Seed PyTorch's random number generator so the randomly initialised classifier
# heads (and other torch-driven randomness) are repeatable across runs.
SEED = 42
torch.manual_seed(SEED)

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One output per aspect class seen by the fitted encoder; polarity has 3 classes.
num_aspect_classes = len(aspect_encoder.classes_)
num_polarity_classes = 3

model = KoELECTRAMultiTask("monologg/koelectra-base-v3-discriminator",
                           num_aspect_classes, num_polarity_classes)
model.to(device)

# A single optimizer updates the encoder and both heads; the same loss object
# is used for both classification tasks.
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = CrossEntropyLoss()


# 학습 진행과 평가

In [None]:
import torch
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt


def _train_one_epoch(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over `loader`; return the mean per-batch loss.

    The per-batch loss is the sum of the aspect loss and the polarity loss.
    """
    model.train()
    total_loss = 0.0

    for batch in loader:
        optimizer.zero_grad()
        aspect_logits, polarity_logits = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
        )

        loss_aspect = criterion(aspect_logits, batch['aspect_labels'].to(device))
        loss_polarity = criterion(polarity_logits, batch['polarity_labels'].to(device))
        loss = loss_aspect + loss_polarity

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(loader)


def _evaluate(model, loader, device):
    """Return `(aspect_accuracy, polarity_accuracy)` of `model` over `loader`."""
    model.eval()
    aspect_preds, aspect_true = [], []
    polarity_preds, polarity_true = [], []

    with torch.no_grad():
        for batch in loader:
            aspect_logits, polarity_logits = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
            )
            aspect_preds.extend(torch.argmax(aspect_logits, dim=1).cpu().tolist())
            polarity_preds.extend(torch.argmax(polarity_logits, dim=1).cpu().tolist())
            # Labels are only compared on the host, so they never need to be
            # moved to the device.
            aspect_true.extend(batch['aspect_labels'].tolist())
            polarity_true.extend(batch['polarity_labels'].tolist())

    return (
        accuracy_score(aspect_true, aspect_preds),
        accuracy_score(polarity_true, polarity_preds),
    )


def _save_checkpoint(path, epoch, model, optimizer, loss):
    """Write the epoch index, model/optimizer state and training loss to `path`."""
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }, path)


epochs = 100
train_losses = []
val_aspect_accs = []
val_polarity_accs = []

# Early-stopping state
best_aspect_acc = 0.0   # best validation aspect accuracy seen so far
patience = 5            # stop after this many epochs without improvement
counter = 0             # consecutive epochs without improvement

for epoch in range(epochs):
    # --- Training ---
    avg_loss = _train_one_epoch(model, train_loader, optimizer, criterion, device)
    train_losses.append(avg_loss)

    # --- Validation ---
    aspect_acc, polarity_acc = _evaluate(model, val_loader, device)
    val_aspect_accs.append(aspect_acc)
    val_polarity_accs.append(polarity_acc)

    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, "
          f"Aspect Acc: {aspect_acc:.4f}, Polarity Acc: {polarity_acc:.4f}")

    # --- Checkpoint after every epoch ---
    # NOTE(review): each file holds the full model and optimizer state, so a long
    # run accumulates many large files — confirm there is enough disk space.
    _save_checkpoint(f"checkpoint_epoch{epoch+1}.pt", epoch, model, optimizer, avg_loss)

    # --- Early-stopping check (driven by aspect accuracy only) ---
    if aspect_acc > best_aspect_acc:
        best_aspect_acc = aspect_acc
        counter = 0
        # Keep a separate copy of the best model so far.
        _save_checkpoint("best_model.pt", epoch, model, optimizer, avg_loss)
    else:
        counter += 1

    if counter >= patience:
        print(f"Early stopping triggered at epoch {epoch+1}")
        break

# --- Plot the training curves ---
plt.figure(figsize=(10,5))
plt.plot(train_losses, label="Train Loss")
plt.plot(val_aspect_accs, label="Aspect Acc")
plt.plot(val_polarity_accs, label="Polarity Acc")
plt.xlabel("Epoch")
plt.ylabel("Value")
plt.legend()
plt.title("Training Curve")
plt.savefig("training_curve.png")
plt.show()


Epoch 1/100, Loss: 1.4029, Aspect Acc: 0.7931, Polarity Acc: 0.9574
