In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import time
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# ✅ Device selection: use the MPS backend when it is available, otherwise the CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# ✅ CNN + LSTM model definition
class CNNLSTMSentimentClassifier(nn.Module):
    """Embedding -> Conv1d -> bidirectional LSTM -> mean pooling -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding (input channels of the conv).
        hidden_dim: LSTM hidden size per direction; the classifier sees 2 * hidden_dim.
        num_classes: Number of output logits.
        dropout: Dropout probability applied to the pooled features.
        padding_idx: Token id passed to ``nn.Embedding(padding_idx=...)``.
            NOTE(review): the default of 1 is kept for backward compatibility;
            confirm it matches the pad id of the tokenizer that produced the data.
    """

    def __init__(self, vocab_size=30522, embedding_dim=64, hidden_dim=32, num_classes=3, dropout=0.5, padding_idx=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        # kernel_size=3 with padding=1 (stride 1) keeps the sequence length unchanged.
        self.conv = nn.Conv1d(embedding_dim, 64, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.lstm = nn.LSTM(input_size=64, hidden_size=hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, input_ids, attention_mask=None):
        """Return class logits of shape (B, num_classes).

        Args:
            input_ids: Integer token ids of shape (B, L).
            attention_mask: Optional (B, L) tensor, non-zero for real tokens and
                zero for padding. When given, the LSTM outputs are averaged over
                the unmasked positions only. When omitted, every position is
                averaged, exactly as before this argument existed.
        """
        x = self.embedding(input_ids)       # (B, L, E)
        x = x.permute(0, 2, 1)              # (B, E, L) - Conv1d convolves over the last dim
        x = self.relu(self.conv(x))         # (B, C, L)
        x = x.permute(0, 2, 1)              # (B, L, C)
        lstm_out, _ = self.lstm(x)          # (B, L, H*2)

        if attention_mask is None:
            pooled = torch.mean(lstm_out, dim=1)
        else:
            # Masked mean: zero out padded steps, then divide by the real token count.
            mask = attention_mask.to(device=lstm_out.device, dtype=lstm_out.dtype).unsqueeze(-1)  # (B, L, 1)
            # clamp keeps a fully masked row from dividing by zero.
            token_counts = mask.sum(dim=1).clamp(min=1.0)
            pooled = (lstm_out * mask).sum(dim=1) / token_counts

        return self.fc(self.dropout(pooled))

# ✅ Dataset definition
class DeBERTaSentimentDataset(Dataset):
    """Serve pre-tokenized rows of a DataFrame as dictionaries of tensors.

    Each row must provide the columns ``input_ids``, ``attention_mask`` and
    ``sentiment_label``. The two sequence columns are truncated to
    ``max_length`` items; no padding is applied here.
    """

    # Sequence-valued columns that are truncated and converted to long tensors.
    _SEQUENCE_COLUMNS = ("input_ids", "attention_mask")

    def __init__(self, dataframe, max_length=128):
        self.df = dataframe
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Positional lookup, so the DataFrame's index labels are irrelevant.
        record = self.df.iloc[idx]
        limit = self.max_length
        sample = {
            column: torch.tensor(record[column][:limit], dtype=torch.long)
            for column in self._SEQUENCE_COLUMNS
        }
        sample["label"] = torch.tensor(record["sentiment_label"], dtype=torch.long)
        return sample

# ✅ Configuration: the values a reader is most likely to tune
SEED = 42
DATA_PATH = "data/merged_0503_tokenized.json"
BATCH_SIZE = 32
LEARNING_RATE = 1e-4

# Seed PyTorch's global RNG so weight initialisation and the training shuffle
# order repeat on a fresh Restart & Run All. Device kernels may still introduce
# small nondeterminism.
torch.manual_seed(SEED)

# ✅ Load the data and split it 80 / 10 / 10 into train / validation / test
df = pd.read_json(DATA_PATH, lines=True)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=SEED, shuffle=True)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SEED)

# ✅ Build the DataLoaders
train_dataset = DeBERTaSentimentDataset(train_df)
val_dataset = DeBERTaSentimentDataset(val_df)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# ✅ Model, optimizer, loss function
model = CNNLSTMSentimentClassifier().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

# ✅ Training function (reports elapsed time and ETA)
def train_model(model, dataloader, optimizer, criterion, device, epochs=3):
    """Train ``model`` in place for ``epochs`` passes over ``dataloader``.

    Args:
        model: Module called as ``model(input_ids)`` to produce logits.
        dataloader: Sized iterable of batches; each batch is a mapping with
            ``"input_ids"`` and ``"label"`` tensors.
        optimizer: Optimizer over the model's parameters.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``dataloader``.

    Raises:
        ValueError: If ``dataloader`` has no batches. Previously this surfaced
            as a ``ZeroDivisionError`` when the epoch average was computed.
    """
    total = len(dataloader)
    if total == 0:
        raise ValueError("dataloader has no batches; nothing to train on")

    model.train()
    for epoch in range(1, epochs + 1):
        total_loss = 0.0
        start_time = time.time()
        pbar = tqdm(enumerate(dataloader), total=total, desc=f"📘 Epoch {epoch}/{epochs}")
        for step, batch in pbar:
            input_ids = batch["input_ids"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Read the scalar once per step; each .item() copies it back to the host.
            loss_value = loss.item()
            total_loss += loss_value

            # Linear extrapolation: average seconds per finished step times steps left.
            elapsed = time.time() - start_time
            eta = (elapsed / (step + 1)) * (total - step - 1)
            pbar.set_postfix({
                "Loss": f"{loss_value:.4f}",
                "Elapsed": f"{elapsed:.1f}s",
                "ETA": f"{eta:.1f}s"
            })

        avg_loss = total_loss / total
        print(f"✅ Epoch {epoch} 완료 | 평균 Loss: {avg_loss:.4f} | 총 소요: {time.time() - start_time:.1f}s")


In [2]:
# ✅ Start training
train_model(
    model=model,
    dataloader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    epochs=3,
)

📘 Epoch 1/3: 100%|██████████| 9903/9903 [06:19<00:00, 26.10it/s, Loss=0.9981, Elapsed=379.4s, ETA=0.0s]  


✅ Epoch 1 완료 | 평균 Loss: 1.0055 | 총 소요: 379.4s


📘 Epoch 2/3: 100%|██████████| 9903/9903 [06:28<00:00, 25.47it/s, Loss=0.8069, Elapsed=388.8s, ETA=0.0s]  


✅ Epoch 2 완료 | 평균 Loss: 0.9342 | 총 소요: 388.8s


📘 Epoch 3/3: 100%|██████████| 9903/9903 [06:10<00:00, 26.70it/s, Loss=0.8779, Elapsed=370.9s, ETA=0.0s]  

✅ Epoch 3 완료 | 평균 Loss: 0.8996 | 총 소요: 370.9s





In [3]:
from sklearn.metrics import classification_report, accuracy_score

# ✅ Evaluation function (validation or test)
def evaluate_model(model, dataloader, device):
    """Print accuracy and a per-class classification report for ``model``.

    Args:
        model: Module called as ``model(input_ids)`` to produce logits;
            it is switched to eval mode and left there.
        dataloader: Iterable of batches; each batch is a mapping with
            ``"input_ids"`` and ``"label"`` tensors.
        device: Device the input ids are moved to.

    Label ids 0, 1 and 2 are reported as negative, neutral and positive.
    """
    class_names = ["negative", "neutral", "positive"]

    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="🔍 Evaluating"):
            input_ids = batch["input_ids"].to(device)
            outputs = model(input_ids)
            preds = torch.argmax(outputs, dim=1)

            all_preds.extend(preds.cpu().tolist())
            # Labels are only compared on the host, so they are not moved to the device.
            all_labels.extend(batch["label"].tolist())

    acc = accuracy_score(all_labels, all_preds)
    # Passing labels explicitly keeps the report from failing when a class is
    # absent from both the targets and the predictions (e.g. a small split).
    report = classification_report(
        all_labels,
        all_preds,
        labels=list(range(len(class_names))),
        target_names=class_names,
    )
    print(f"\n✅ 정확도: {acc:.4f}\n")
    print("📋 분류 리포트:\n")
    print(report)

In [5]:
# To score the validation split after training instead, run:
# evaluate_model(model, val_loader, device)

# Final evaluation on the held-out test split
test_dataset = DeBERTaSentimentDataset(dataframe=test_df)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)
evaluate_model(model=model, dataloader=test_loader, device=device)

🔍 Evaluating: 100%|██████████| 1238/1238 [00:17<00:00, 72.16it/s]



✅ 정확도: 0.5625

📋 분류 리포트:

              precision    recall  f1-score   support

    negative       0.65      0.66      0.65     11231
     neutral       0.46      0.36      0.40     13972
    positive       0.58      0.68      0.63     14407

    accuracy                           0.56     39610
   macro avg       0.56      0.57      0.56     39610
weighted avg       0.55      0.56      0.55     39610

