<a href="https://colab.research.google.com/github/faid011/machine-learning/blob/main/movie_review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch torchvision torchaudio --quiet

import pandas as pd
import numpy as np
import re
from collections import Counter
import json
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

device: cpu


In [8]:
TRAIN_PATH = "/content/nsmc/ratings_train.txt"
TEST_PATH  = "/content/nsmc/ratings_test.txt"

def preprocess_text_basic(text: str) -> str:
    text = re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", text)
    text = re.sub("^ +", "", text)
    text = text.strip()
    return text

def preprocess_data(file_path):
    df = pd.read_csv(file_path, sep="\t")
    df = df.dropna(subset=["document"])
    df = df.drop_duplicates(subset=["document"])
    df["document"] = df["document"].apply(preprocess_text_basic)
    df["document"] = df["document"].replace("", np.nan)
    df = df.dropna(subset=["document"])
    return df

def build_vocab(df, min_freq=2):
    counter = Counter()
    for text in df["document"]:
        tokens = text.split()
        counter.update(tokens)

    # 0: pad, 1: unk
    word2id = {"<pad>": 0, "<unk>": 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            word2id[word] = len(word2id)
    return word2id

def encode_sentence(text, word2id, max_len):
    tokens = text.split()
    ids = [word2id.get(tok, word2id["<unk>"]) for tok in tokens][:max_len]
    if len(ids) < max_len:
        ids += [word2id["<pad>"]] * (max_len - len(ids))
    return ids

class NSMCDataset(Dataset):
    def __init__(self, dataframe, word2id, max_len):
        self.texts = dataframe["document"].values
        self.labels = dataframe["label"].values.astype("int64")
        self.word2id = word2id
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        input_ids = encode_sentence(text, self.word2id, self.max_len)

        input_ids = torch.tensor(input_ids, dtype=torch.long)
        label = torch.tensor(label, dtype=torch.long)
        return input_ids, label

In [9]:
class RNNClassifier(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        embed_dim: int = 100,
        hidden_dim: int = 128,
        num_layers: int = 1,
        num_classes: int = 2,
        bidirectional: bool = True,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0.0,
        )

        self.bidirectional = bidirectional
        fc_in_dim = hidden_dim * (2 if bidirectional else 1)
        self.fc = nn.Linear(fc_in_dim, num_classes)

    def forward(self, x):
        emb = self.embedding(x)
        output, (h_n, c_n) = self.lstm(emb)

        if self.bidirectional:
            h_forward = h_n[-2, :, :]
            h_backward = h_n[-1, :, :]
            h = torch.cat([h_forward, h_backward], dim=1)
        else:
            h = h_n[-1, :, :]

        logits = self.fc(h)
        return logits

In [10]:
def train_one_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    for input_ids, labels in dataloader:
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(input_ids)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * labels.size(0)
        preds = torch.argmax(logits, dim=1)
        total_correct += (preds == labels).sum().item()
        total_samples += labels.size(0)

    avg_loss = total_loss / total_samples
    acc = total_correct / total_samples
    return avg_loss, acc

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for input_ids, labels in dataloader:
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            logits = model(input_ids)
            loss = criterion(logits, labels)

            total_loss += loss.item() * labels.size(0)
            preds = torch.argmax(logits, dim=1)
            total_correct += (preds == labels).sum().item()
            total_samples += labels.size(0)

    avg_loss = total_loss / total_samples
    acc = total_correct / total_samples
    return avg_loss, acc

MAX_LEN = 80
BATCH_SIZE = 64
EPOCHS = 5
LR = 3e-4
EMBED_DIM = 100
HIDDEN_DIM = 128
NUM_LAYERS = 1
BIDIRECTIONAL = True

print("train 데이터 전처리중...")
train_df = preprocess_data(TRAIN_PATH)
print("test 데이터 전처리중...")
test_df = preprocess_data(TEST_PATH)

print("vocab 생성중...")
word2id = build_vocab(train_df, min_freq=2)
vocab_size = len(word2id)
print("vocab size:", vocab_size)

train_dataset = NSMCDataset(train_df, word2id, MAX_LEN)
test_dataset  = NSMCDataset(test_df,  word2id, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE)

model = RNNClassifier(
    vocab_size=vocab_size,
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    num_classes=2,
    bidirectional=BIDIRECTIONAL,
).to(device)

optimizer = AdamW(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

for epoch in range(EPOCHS):
    print(f"\n=== Epoch {epoch+1}/{EPOCHS} ===")
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, test_loader, criterion, device)

    print(f"[Train] loss: {train_loss:.4f}, acc: {train_acc*100:.2f}%")
    print(f"[Test ] loss: {val_loss:.4f}, acc: {val_acc*100:.2f}%")

SAVE_DIR = "/content/rnn_nsmc_model"
os.makedirs(SAVE_DIR, exist_ok=True)
torch.save(model.state_dict(), os.path.join(SAVE_DIR, "model.pt"))
with open(os.path.join(SAVE_DIR, "vocab.json"), "w", encoding="utf-8") as f:
    json.dump(word2id, f, ensure_ascii=False)

print("\n저장 완료:", SAVE_DIR)

train 데이터 전처리중...
test 데이터 전처리중...
vocab 생성중...
vocab size: 67195

=== Epoch 1/5 ===
[Train] loss: 0.5827, acc: 67.10%
[Test ] loss: 0.5493, acc: 70.35%

=== Epoch 2/5 ===
[Train] loss: 0.4729, acc: 76.07%
[Test ] loss: 0.4693, acc: 76.12%

=== Epoch 3/5 ===
[Train] loss: 0.4090, acc: 79.99%
[Test ] loss: 0.4488, acc: 77.50%

=== Epoch 4/5 ===
[Train] loss: 0.3646, acc: 82.65%
[Test ] loss: 0.4496, acc: 77.64%

=== Epoch 5/5 ===
[Train] loss: 0.3263, acc: 84.68%
[Test ] loss: 0.4562, acc: 77.77%

저장 완료: /content/rnn_nsmc_model


In [11]:
# 저장된 모델 / vocab 로드 + 한 줄 예측

def load_vocab_and_model(
    model_dir,
    embed_dim=100,
    hidden_dim=128,
    num_layers=1,
    bidirectional=True,
):
    with open(os.path.join(model_dir, "vocab.json"), "r", encoding="utf-8") as f:
        word2id = json.load(f)
    vocab_size = len(word2id)

    model = RNNClassifier(
        vocab_size=vocab_size,
        embed_dim=embed_dim,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        num_classes=2,
        bidirectional=bidirectional,
    )
    state = torch.load(os.path.join(model_dir, "model.pt"), map_location=device)
    model.load_state_dict(state)
    return model, word2id

def predict_one(text, model, word2id, max_len=80):
    model.eval()
    text_clean = preprocess_text_basic(text)
    ids = encode_sentence(text_clean, word2id, max_len)
    input_ids = torch.tensor([ids], dtype=torch.long).to(device)

    with torch.no_grad():
        logits = model(input_ids)
        probs = F.softmax(logits, dim=1)
        pred = torch.argmax(probs, dim=1).item()
    return pred, probs[0].cpu().numpy()

MODEL_DIR = "/content/rnn_nsmc_model"
model_loaded, vocab_loaded = load_vocab_and_model(MODEL_DIR)
model_loaded.to(device)

while True:
    text = input("리뷰 입력 (q 입력 시 종료): ").strip()
    if text.lower() == "q":
        break
    if not text:
        print("공백입니다.\n")
        continue

    pred, probs = predict_one(text, model_loaded, vocab_loaded, max_len=MAX_LEN)
    label = "긍정" if pred == 1 else "부정"
    print(f"리뷰: {text}")
    print(f"예측: {label} (부정={probs[0]:.3f}, 긍정={probs[1]:.3f})\n")

리뷰 입력 (q 입력 시 종료):  리얼 미국판 홍길동 ㅇㅈ
리뷰: 리얼 미국판 홍길동 ㅇㅈ
예측: 부정 (부정=0.729, 긍정=0.271)

리뷰 입력 (q 입력 시 종료):  호스맨 기다렸다고ㅠㅠ 꼭 보세요.. 시리즈 팬들도 충분히 만족할만함
리뷰: 호스맨 기다렸다고ㅠㅠ 꼭 보세요.. 시리즈 팬들도 충분히 만족할만함
예측: 긍정 (부정=0.095, 긍정=0.905)

리뷰 입력 (q 입력 시 종료):  이런 영화 기다려왔다... 전작보다 더 재밌어진 느낌..? 시간 가는줄 몰랐음
리뷰: 이런 영화 기다려왔다... 전작보다 더 재밌어진 느낌..? 시간 가는줄 몰랐음
예측: 긍정 (부정=0.214, 긍정=0.786)

리뷰 입력 (q 입력 시 종료):  구관이 명관이라더니 예전에 재밌게봤던 시리즈가 나와서 반가워서 봤는데 역시 존잼탱이네요
리뷰: 구관이 명관이라더니 예전에 재밌게봤던 시리즈가 나와서 반가워서 봤는데 역시 존잼탱이네요
예측: 긍정 (부정=0.038, 긍정=0.962)

리뷰 입력 (q 입력 시 종료):  이거 재밌다고 추천하는사람들은 억빠들인거임? 나만당할수 없다고 악질인거임?솔직히 나우유씨미 호스맨 볼라고 보는건데 호스맨 활약 전혀없고 왠 갑자기 뭔 꼬맹이하나가 나 설계자임 ㅇㅇ 하면서 깝치는건데 솔직히 스토리 너무 산으로 가는 느낌이고 그저 다음화 발사대용 0.5화 정도에 미공개판에 들어갈만한 내용들인데 길게 풀어놓은 느낌인데 범인 잡힌거 마무리도 ㅈㄴ 어이없이 갑자기 범인만 끌려가고 뭘 말하고싶은지 1도 모르겠음그냥 나 설계자임 ㅇㅇ 개쩔지? ㅇㅇ 하다가 끝나고 우리 할아범 그설계자놈때매 괜히죽고 솔직히 체인소맨 레제 2번 보는게 더 재밌음
리뷰: 이거 재밌다고 추천하는사람들은 억빠들인거임? 나만당할수 없다고 악질인거임?솔직히 나우유씨미 호스맨 볼라고 보는건데 호스맨 활약 전혀없고 왠 갑자기 뭔 꼬맹이하나가 나 설계자임 ㅇㅇ 하면서 깝치는건데 솔직히 스토리 너무 산으로 가는 느낌이고 그저 다음화 발사대용 0.5화 정도에 미공개판에 들어갈만