In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import time

import pandas as pd
from ast import literal_eval

In [8]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

# 1. Load the pre-tokenized dataset; lines=True reads the file as JSON Lines (one record per line).
df = pd.read_json("data/merged_0503_tokenized.json", lines=True)

# 2. Type check: confirm that input_ids is already a Python list so it can be used as-is.
#    Only the first row is inspected.
assert isinstance(df["input_ids"].iloc[0], list), "input_ids가 리스트 타입이 아님"

# ✅ 3. Dataset 정의
class DeBERTaSentimentDataset(Dataset):
    """Map-style dataset that serves pre-tokenized rows of a DataFrame as tensors.

    Each row is expected to provide the columns ``input_ids``,
    ``attention_mask`` and ``sentiment_label``.

    Args:
        dataframe: Source table; it is stored by reference, not copied.
        max_length: Upper bound on the number of tokens kept per sample.
            Longer sequences are truncated; shorter ones are returned as-is
            (no padding is applied here).
    """

    # Columns that hold per-token sequences and are truncated to ``max_length``.
    _SEQUENCE_COLUMNS = ("input_ids", "attention_mask")

    def __init__(self, dataframe, max_length=128):
        self.max_length = max_length
        self.df = dataframe

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample at positional index ``idx`` as a dict of long tensors.

        The dict has the keys ``input_ids``, ``attention_mask`` and ``label``
        (in that order).
        """
        record = self.df.iloc[idx]
        limit = self.max_length

        sample = {
            column: torch.tensor(record[column][:limit], dtype=torch.long)
            for column in self._SEQUENCE_COLUMNS
        }
        sample["label"] = torch.tensor(record["sentiment_label"], dtype=torch.long)
        return sample

# 4. Build the DataLoader: shuffled mini-batches of 32 samples drawn from the whole DataFrame.
# NOTE(review): no collate_fn is supplied, so default batching is used — presumably every
# row's token lists have the same length after truncation; verify against the tokenization step.
dataset = DeBERTaSentimentDataset(df)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [9]:
# Device selection: use Apple MPS when it is available, otherwise fall back to the CPU
# (other GPU backends are deliberately not considered).
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"✅ 사용 디바이스: {device}")

# ✅ CNN 모델 정의
class CNNSentimentClassifier(nn.Module):
    """Single-layer 1D-CNN text classifier over token-id sequences.

    Pipeline: embedding -> Conv1d (kernel 3, "same" padding, 300 filters)
    -> ReLU -> global max pooling over the sequence -> dropout -> linear.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding (also the conv input channels).
        num_classes: Number of output logits.
        dropout: Dropout probability applied to the pooled features.
        padding_idx: Token id whose embedding is kept at zero and not updated.
    """

    def __init__(self, vocab_size=30522, embedding_dim=300, num_classes=3, dropout=0.5, padding_idx=1):
        super().__init__()
        conv_channels = 300

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx,
        )
        self.conv = nn.Conv1d(embedding_dim, conv_channels, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(in_features=conv_channels, out_features=num_classes)

    def forward(self, input_ids):
        """Return class logits of shape (B, num_classes) for a batch of token ids (B, L)."""
        embedded = self.embedding(input_ids)              # (B, L, E)
        channels_first = embedded.transpose(1, 2)         # (B, E, L) — Conv1d wants channels before length
        activated = F.relu(self.conv(channels_first))     # (B, C, L)
        pooled = self.pool(activated).squeeze(-1)         # (B, C)
        return self.fc(self.dropout(pooled))              # (B, num_classes)

# ✅ 학습 함수
def train_model(model, dataloader, optimizer, criterion, device, epochs=1):
    """Train ``model`` for ``epochs`` passes over ``dataloader``.

    The model is moved to ``device`` and put into training mode once, before
    the first epoch. For every batch the running average loss, the elapsed
    time and an ETA estimate are shown on the progress bar, and a summary
    line is printed at the end of each epoch.

    Args:
        model: Module called as ``model(input_ids)`` and returning logits.
        dataloader: Sized iterable of batches; each batch is a mapping with
            the keys ``"input_ids"`` and ``"label"`` holding tensors.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        device: Device the model and each batch are moved to.
        epochs: Number of passes over ``dataloader``.

    Returns:
        None. The model is updated in place.
    """
    model.to(device)
    model.train()

    # The number of batches does not change between steps, so look it up once.
    num_batches = len(dataloader)

    for epoch in range(epochs):
        print(f"\n📘 Epoch {epoch+1}/{epochs}")
        total_loss = 0
        # Initialised here so that the end-of-epoch summary still works when the
        # dataloader yields no batches (this used to raise UnboundLocalError).
        avg_loss = 0.0
        elapsed = 0.0
        start_time = time.time()

        pbar = tqdm(enumerate(dataloader), total=num_batches, desc="Training")
        for step, batch in pbar:
            input_ids = batch["input_ids"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            avg_loss = total_loss / (step + 1)
            elapsed = time.time() - start_time
            # Mean time per finished step multiplied by the number of steps left.
            eta = (elapsed / (step + 1)) * (num_batches - step - 1)

            pbar.set_postfix({
                "Loss": f"{avg_loss:.4f}",
                "Elapsed": f"{elapsed:.1f}s",
                "ETA": f"{eta:.1f}s"
            })

        print(f"✅ Epoch {epoch+1} 완료 | 평균 Loss: {avg_loss:.4f} | 총 소요: {elapsed:.1f}s")


✅ 사용 디바이스: mps


In [13]:
# Instantiate the CNN with its default hyperparameters, then train it for 10 epochs
# with Adam (lr=1e-4) and cross-entropy loss on the shuffled DataLoader built earlier.
model = CNNSentimentClassifier()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

train_model(model, dataloader, optimizer, criterion, device, epochs=10)


📘 Epoch 1/10


Training: 100%|██████████| 12379/12379 [05:22<00:00, 38.36it/s, Loss=0.9620, Elapsed=322.7s, ETA=0.0s] 


✅ Epoch 1 완료 | 평균 Loss: 0.9620 | 총 소요: 322.7s

📘 Epoch 2/10


Training: 100%|██████████| 12379/12379 [05:25<00:00, 37.97it/s, Loss=0.8931, Elapsed=326.0s, ETA=0.0s] 


✅ Epoch 2 완료 | 평균 Loss: 0.8931 | 총 소요: 326.0s

📘 Epoch 3/10


Training: 100%|██████████| 12379/12379 [05:28<00:00, 37.71it/s, Loss=0.8747, Elapsed=328.3s, ETA=0.0s] 


✅ Epoch 3 완료 | 평균 Loss: 0.8747 | 총 소요: 328.3s

📘 Epoch 4/10


Training: 100%|██████████| 12379/12379 [05:43<00:00, 36.08it/s, Loss=0.8621, Elapsed=343.1s, ETA=0.0s] 


✅ Epoch 4 완료 | 평균 Loss: 0.8621 | 총 소요: 343.1s

📘 Epoch 5/10


Training: 100%|██████████| 12379/12379 [05:44<00:00, 35.96it/s, Loss=0.8509, Elapsed=344.3s, ETA=0.0s] 


✅ Epoch 5 완료 | 평균 Loss: 0.8509 | 총 소요: 344.3s

📘 Epoch 6/10


Training: 100%|██████████| 12379/12379 [05:39<00:00, 36.45it/s, Loss=0.8407, Elapsed=339.7s, ETA=0.0s] 


✅ Epoch 6 완료 | 평균 Loss: 0.8407 | 총 소요: 339.7s

📘 Epoch 7/10


Training: 100%|██████████| 12379/12379 [05:27<00:00, 37.85it/s, Loss=0.8316, Elapsed=327.1s, ETA=0.0s] 


✅ Epoch 7 완료 | 평균 Loss: 0.8316 | 총 소요: 327.1s

📘 Epoch 8/10


Training: 100%|██████████| 12379/12379 [05:26<00:00, 37.97it/s, Loss=0.8228, Elapsed=326.0s, ETA=0.0s] 


✅ Epoch 8 완료 | 평균 Loss: 0.8228 | 총 소요: 326.0s

📘 Epoch 9/10


Training: 100%|██████████| 12379/12379 [05:12<00:00, 39.55it/s, Loss=0.8141, Elapsed=313.0s, ETA=0.0s] 


✅ Epoch 9 완료 | 평균 Loss: 0.8141 | 총 소요: 313.0s

📘 Epoch 10/10


Training: 100%|██████████| 12379/12379 [05:25<00:00, 38.08it/s, Loss=0.8050, Elapsed=325.1s, ETA=0.0s] 

✅ Epoch 10 완료 | 평균 Loss: 0.8050 | 총 소요: 325.1s





In [14]:
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

def evaluate_model(model, dataloader, device, target_names=("negative", "neutral", "positive")):
    """Run ``model`` over ``dataloader`` and report accuracy plus a per-class report.

    The model is switched to evaluation mode and moved to ``device``;
    predictions are the argmax over dimension 1 of the model output. The
    accuracy and the scikit-learn classification report are printed and
    returned.

    Args:
        model: Module called as ``model(input_ids)`` and returning logits.
        dataloader: Iterable of batches; each batch is a mapping with the
            keys ``"input_ids"`` and ``"label"`` holding tensors.
        device: Device the model and the inputs are moved to.
        target_names: Display name for each class, ordered by class id.
            Class ids are taken to be ``0 .. len(target_names) - 1``.

    Returns:
        Tuple ``(accuracy, report)`` where ``report`` is the text produced by
        ``classification_report``.
    """
    model.eval()
    model.to(device)

    class_names = list(target_names)
    # Passing the label set explicitly keeps the report aligned with
    # ``class_names`` even when a class is missing from this evaluation set;
    # without it sklearn infers the classes from the data and raises on a
    # length mismatch with ``target_names``.
    class_ids = list(range(len(class_names)))

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="📊 Evaluating"):
            input_ids = batch["input_ids"].to(device)

            outputs = model(input_ids)
            preds = torch.argmax(outputs, dim=1)

            all_preds.extend(preds.cpu().tolist())
            # Labels are only consumed on the CPU by sklearn, so they are not
            # moved to ``device`` first.
            all_labels.extend(batch["label"].cpu().tolist())

    # Metrics via scikit-learn.
    accuracy = accuracy_score(all_labels, all_preds)
    report = classification_report(
        all_labels,
        all_preds,
        labels=class_ids,
        target_names=class_names,
        digits=4,
    )

    print(f"\n✅ 정확도: {accuracy:.4f}")
    print("\n📋 분류 리포트:\n")
    print(report)

    return accuracy, report


In [15]:
# NOTE(review): `dataloader` is the same loader the model was trained on, so the numbers
# reported here are training-set metrics, not a held-out estimate — consider evaluating
# on a separate validation split.
evaluate_model(model, dataloader, device)

📊 Evaluating: 100%|██████████| 12379/12379 [01:47<00:00, 115.66it/s]



✅ 정확도: 0.6569

📋 분류 리포트:

              precision    recall  f1-score   support

    negative     0.7398    0.7431    0.7414    112726
     neutral     0.5732    0.4658    0.5139    139184
    positive     0.6575    0.7741    0.7110    144188

    accuracy                         0.6569    396098
   macro avg     0.6568    0.6610    0.6555    396098
weighted avg     0.6513    0.6569    0.6504    396098



(0.6569359098001,
 '              precision    recall  f1-score   support\n\n    negative     0.7398    0.7431    0.7414    112726\n     neutral     0.5732    0.4658    0.5139    139184\n    positive     0.6575    0.7741    0.7110    144188\n\n    accuracy                         0.6569    396098\n   macro avg     0.6568    0.6610    0.6555    396098\nweighted avg     0.6513    0.6569    0.6504    396098\n')