<a href="https://colab.research.google.com/github/choijonghong/transformer/blob/main/%ED%8A%B8%EB%9E%9C%EC%8A%A4%ED%8F%AC%EB%A8%B8%2B%EA%B0%90%EC%A0%95%EB%B6%84%EB%A5%98.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ==========================================
# 0. 환경 준비
# ==========================================
import torch, torch.nn as nn, torch.nn.functional as F
import math, random, re
import pandas as pd
from collections import Counter

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ==========================================
# 1. NSMC 데이터 다운로드
# ==========================================
!wget -nc https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt
!wget -nc https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt

train_df = pd.read_table("ratings_train.txt")
test_df  = pd.read_table("ratings_test.txt")

print("훈련 샘플 수:", len(train_df))
print("테스트 샘플 수:", len(test_df))
print(train_df.head())

# ==========================================
# 2. 토크나이저 & Vocab 구축
# ==========================================
def simple_tokenizer(text):
    # 한글/영문/숫자만 남기고 띄어쓰기 기준 토큰화
    return re.sub(r"[^ㄱ-ㅎㅏ-ㅣ가-힣0-9a-zA-Z ]", "", str(text)).split()

counter = Counter()
for doc in train_df['document'].dropna():
    counter.update(simple_tokenizer(doc))

# 최소 5번 이상 나온 단어만 vocab에 포함
min_freq = 5
vocab = {"<pad>":0, "<unk>":1}
for word, freq in counter.items():
    if freq >= min_freq and word not in vocab:
        vocab[word] = len(vocab)

print("Vocab size:", len(vocab))

def encode(line, max_len=50):
    tokens = [vocab.get(tok, vocab["<unk>"]) for tok in simple_tokenizer(line)]
    if len(tokens) < max_len:
        tokens += [vocab["<pad>"]] * (max_len - len(tokens))
    return torch.tensor(tokens[:max_len], dtype=torch.long)

train_data = [(encode(row['document']), row['label']) for _, row in train_df.iterrows() if pd.notna(row['document'])]
test_data  = [(encode(row['document']), row['label']) for _, row in test_df.iterrows() if pd.notna(row['document'])]

print("샘플 인코딩:", train_data[0])

# ==========================================
# 3. Positional Encoding
# ==========================================
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

# ==========================================
# 4. Transformer 분류기 정의
# ==========================================
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, d_model=128, nhead=4, num_layers=2, num_classes=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=vocab["<pad>"])
        self.pos_enc = PositionalEncoding(d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, 256)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = self.pos_enc(x)
        x = x.transpose(0,1)
        out = self.encoder(x)
        out = out.mean(dim=0)
        return self.fc(out)

# ==========================================
# 5. 학습 준비
# ==========================================
model = TransformerClassifier(len(vocab)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

BATCH_SIZE = 64
EPOCHS = 2  # Colab에서는 1~2 epoch만 돌려도 확인 가능

def batchify(data, bsz):
    random.shuffle(data)
    for i in range(0, len(data), bsz):
        batch = data[i:i+bsz]
        X = torch.stack([x for x,y in batch]).to(device)
        y = torch.tensor([y for x,y in batch], dtype=torch.long).to(device)
        yield X, y

# ==========================================
# 6. 학습 루프
# ==========================================
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for X,y in batchify(train_data, BATCH_SIZE):
        out = model(X)
        loss = criterion(out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss {total_loss:.4f}")

# ==========================================
# 7. 평가
# ==========================================
model.eval()
correct = 0
with torch.no_grad():
    for X,y in batchify(test_data, BATCH_SIZE):
        pred = model(X).argmax(dim=1)
        correct += (pred == y).sum().item()
print("테스트 정확도:", correct / len(test_data))

# ==========================================
# 8. 직접 문장 예측
# ==========================================
def predict_sentiment(sentence):
    model.eval()
    X = encode(sentence).unsqueeze(0).to(device)
    pred = model(X).argmax(dim=1).item()
    return "긍정" if pred==1 else "부정"

print(predict_sentiment("이 영화 아주 재미있다."))
print(predict_sentiment("이 영화 정말 최악이다."))


Device: cuda
--2025-08-25 05:10:29--  https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14628807 (14M) [text/plain]
Saving to: ‘ratings_train.txt’


2025-08-25 05:10:30 (143 MB/s) - ‘ratings_train.txt’ saved [14628807/14628807]

--2025-08-25 05:10:30--  https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4893335 (4.7M) [application/octet-stream]
Saving to: ‘ratings_test.txt’


2025-08-25 05:10:30 (104 MB



Epoch 1, Loss 1246.8972
Epoch 2, Loss 998.1554
테스트 정확도: 0.7792667560053603
긍정
부정


In [3]:
print(predict_sentiment("정말지겨운 영화...이런거 누가 만들었어."))
print(predict_sentiment("스토리도 별로고 지루했다."))


부정
부정
