In [38]:
!pip install konlpy



In [39]:
import pandas as pd
from konlpy.tag import Okt
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

In [40]:
# Read the tab-separated train/test rating files, then preview the first training rows.
train_data = pd.read_csv('ratings_train.txt', sep='\t')
test_data = pd.read_csv('ratings_test.txt', sep='\t')
train_data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [41]:
# Remove duplicates: drop rows with a repeated 'document' value, modifying both frames in place.
train_data.drop_duplicates(subset=['document'], inplace=True)
test_data.drop_duplicates(subset=['document'], inplace=True)

In [42]:
# Remove missing values: drop rows whose 'document' is NaN, modifying both frames in place.
train_data.dropna(subset=['document'], inplace=True)
test_data.dropna(subset=['document'], inplace=True)

In [43]:
# Remove special characters: keep only Hangul (jamo and syllables) and spaces.
# regex=True is required; pandas >= 2.0 treats the pattern as a literal string by
# default, in which case nothing would be removed.
train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
test_data['document'] = test_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)

# Stopword handling: read one stopword per line, stripping surrounding whitespace.
with open('stopword_kr.txt', 'r', encoding='utf-8') as stopword_file:
    stopwords = list(map(str.strip, stopword_file))

# Tokenization
# Create the Okt tagger instance that later cells pass to the preprocessing helpers.
okt = Okt()
# Take the first training review as a sample input.
sample_text = train_data['document'].iloc[0]
# Example call only. NOTE(review): this is not the last expression of the cell, so its
# result is presumably not displayed — confirm whether the call is still wanted.
okt.morphs(sample_text, stem = True) # example

# Tokenization and stopword removal
def preprocess_text(data, okt, stopwords):
    """Tokenize every sentence with ``okt`` and drop stopword tokens.

    Args:
        data: Iterable of sentence strings.
        okt: Tokenizer exposing ``morphs(text, stem=True)``.
        stopwords: Collection of (hashable) tokens to discard.

    Returns:
        A list holding, for each input sentence in order, the list of tokens
        that are not stopwords.
    """
    # Build the lookup set once: the membership test runs for every token, and
    # a list would be scanned linearly each time.
    stopword_set = set(stopwords)

    processed = []
    for sentence in tqdm(data):
        tokens = okt.morphs(sentence, stem=True)
        processed.append([token for token in tokens if token not in stopword_set])
    return processed

In [44]:
# X_train = preprocess_text(train_data['document'], okt, stopwords)
# X_test = preprocess_text(test_data['document'], okt, stopwords)

# y_train = torch.tensor(train_data['label'].values, dtype=torch.float32)
# y_test = torch.tensor(test_data['label'].values, dtype=torch.float32)

In [45]:
# File paths of the saved inputs (X_*) and labels (y_*) that the next statements load.
# NOTE(review): presumably produced by the commented-out preprocessing cell — confirm.
X_train_path = "X_train.pt"
X_test_path = "X_test.pt"
y_train_path = "y_train.pt"
y_test_path = "y_test.pt"

# Data loading helper
def load_tensor_data(path):
    """Load the object saved at ``path`` via ``torch.load`` and announce it.

    ``weights_only=True`` is forwarded to ``torch.load``. A confirmation line
    is printed after a successful load, and the loaded object is returned.
    """
    loaded = torch.load(path, weights_only=True)
    print(f"Loaded tensor from {path}")
    return loaded

# Reload the saved data when needed: the inputs (X_*) as well as the labels (y_*).
X_train = load_tensor_data(X_train_path)
X_test = load_tensor_data(X_test_path)
y_train = load_tensor_data(y_train_path)
y_test = load_tensor_data(y_test_path)

Loaded tensor from X_train.pt
Loaded tensor from X_test.pt
Loaded tensor from y_train.pt
Loaded tensor from y_test.pt


In [46]:
# Build the vocabulary
from collections import Counter

# Flatten every training sentence into a single list of words.
all_words = []
for tokens in X_train:
    all_words.extend(tokens)

# Count how often each word occurs.
word_counts = Counter(all_words)

# Ids 0 and 1 are reserved for padding and unknown words; every counted word
# receives the next free id in the order the counter first saw it.
vocab = {"<PAD>": 0, "<UNK>": 1}
vocab.update(zip(word_counts, range(2, len(word_counts) + 2)))

vocab_size = len(vocab)

In [47]:
import torch
from torch.nn.utils.rnn import pad_sequence

def encode_and_pad(data, vocab, max_len=30):
    """Integer-encode tokenized sentences into one right-padded LongTensor.

    Each sentence is truncated to ``max_len`` before padding, so one very long
    sentence no longer forces a huge intermediate tensor. With a truthy
    ``max_len`` the result is always exactly ``max_len`` wide, matching the
    fixed-width input built by ``predict_sentiment``.

    Args:
        data: Iterable of token sequences.
        vocab: Mapping token -> id; must contain "<PAD>" and "<UNK>".
        max_len: Fixed sequence length. When falsy (``None`` or ``0``) no
            truncation happens and rows are padded to the longest sentence.

    Returns:
        ``torch.long`` tensor of shape ``(len(data), width)``; an empty
        ``data`` yields a tensor with zero rows instead of raising.
    """
    pad_id = vocab["<PAD>"]
    unk_id = vocab["<UNK>"]

    # Integer encoding: unknown words map to <UNK>; truncate per sentence.
    encoded = []
    for sentence in data:
        ids = [vocab.get(word, unk_id) for word in sentence]
        encoded.append(ids[:max_len] if max_len else ids)

    # Padding: right-pad every row with <PAD> up to the target width.
    width = max_len if max_len else max(map(len, encoded), default=0)
    rows = [ids + [pad_id] * (width - len(ids)) for ids in encoded]

    # reshape keeps the 2-D shape even when there are no rows or width is 0.
    return torch.tensor(rows, dtype=torch.long).reshape(len(rows), width)

In [48]:
# encoded = []
# for sentence in data:
#     encoded_sentence = []
#     for word in sentence:
#         # vocab에서 단어를 찾고, 없으면 <UNK>로 처리
#         encoded_sentence.append(vocab.get(word, vocab["<UNK>"]))
#     encoded.append(encoded_sentence)

In [49]:
# Encode both splits with the vocabulary built from the training data (default max_len).
X_train_padded = encode_and_pad(X_train, vocab)
X_test_padded = encode_and_pad(X_test, vocab)

In [50]:
# PyTorch Dataset definition
class TextDataset(Dataset):
    """Dataset that pairs each input with the label stored at the same index."""

    def __init__(self, inputs, labels):
        # Both containers are kept as given; nothing is copied or validated.
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        # The dataset size is taken from the inputs container alone.
        return len(self.inputs)

    def __getitem__(self, idx):
        sample = self.inputs[idx]
        target = self.labels[idx]
        return sample, target

# Wrap the padded inputs and their labels into datasets.
train_dataset = TextDataset(X_train_padded, y_train)
test_dataset = TextDataset(X_test_padded, y_test)

# Batches of 64; only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

In [51]:
# Model definition
class SentimentModel(nn.Module):
    """Embedding -> LSTM -> linear -> sigmoid classifier with one output per sample."""

    def __init__(self, vocab_size, embedding_dim, hidden_units):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_units, batch_first=True)
        self.fc = nn.Linear(hidden_units, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid of the linear layer applied to the LSTM's final hidden state."""
        embedded = self.embedding(x)
        # Only the hidden state returned by the LSTM is used; hidden_state[-1]
        # selects its last entry along the first dimension.
        _, (hidden_state, _) = self.lstm(embedded)
        logits = self.fc(hidden_state[-1])
        return self.sigmoid(logits)

# Hyperparameters: embedding size and LSTM hidden size.
embedding_dim = 100
hidden_units = 128

# Build the model together with its loss (binary cross-entropy on probabilities)
# and its optimizer (RMSprop, learning rate 0.001).
model = SentimentModel(vocab_size, embedding_dim, hidden_units)
criterion = nn.BCELoss()
optimizer = optim.RMSprop(model.parameters(), lr=0.001)

In [52]:
### 양방향 모델
# class SentimentModel(nn.Module):
#     def __init__(self, vocab_size, embedding_dim, hidden_units):
#         super(SentimentModel, self).__init__()
#         self.embedding = nn.Embedding(vocab_size, embedding_dim)
#         self.lstm = nn.LSTM(embedding_dim, hidden_units, batch_first=True, bidirectional=True)
#         self.fc = nn.Linear(hidden_units * 2, 1)  # Bidirectional이므로 hidden_units * 2

#     def forward(self, x):
#         x = self.embedding(x)
#         _, (hidden, _) = self.lstm(x)  # hidden: [2, batch_size, hidden_units]
#         hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)  # 양방향 결합
#         x = self.fc(hidden)  # [batch_size, 1]
#         return x

# embedding_dim = 100
# hidden_units = 128

# model = SentimentModel(vocab_size, embedding_dim, hidden_units)
# criterion = nn.BCEWithLogitsLoss()
# optimizer = optim.RMSprop(model.parameters(), lr=0.001)

In [53]:
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.BCELoss()
optimizer = optim.RMSprop(model.parameters(), lr=0.001)

# Training loop
epochs = 15
for epoch in range(epochs):
  model.train()
  epoch_loss = 0.0
  for inputs, labels in train_loader:
    # BCELoss needs floating-point targets on the same device as the outputs.
    inputs, labels = inputs.to(device), labels.to(device).float()
    optimizer.zero_grad()
    # squeeze(-1) drops only the trailing size-1 output dimension; a bare
    # squeeze() would also drop the batch dimension when the final batch
    # holds a single sample, making the shapes disagree in the loss.
    outputs = model(inputs).squeeze(-1)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()
  # Report inside the epoch loop so the mean batch loss is shown after every
  # epoch rather than once after the whole run.
  print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / len(train_loader)}")

KeyboardInterrupt: 

In [None]:
# # 모델 저장 경로
# model_path = "sentiment_model.pth"

# # 모델 로드 함수
# def load_model(model, path):
#     model.load_state_dict(torch.load(path))
#     model.eval()  # 모델을 평가 모드로 설정
#     print(f"Model loaded from {path}")
#     return model

# # 모델 로드
# loaded_model = SentimentModel(vocab_size, embedding_dim, hidden_units)
# loaded_model = load_model(loaded_model, model_path)

In [None]:
# Evaluation
model.eval()
# Evaluate on the device that holds the model's parameters; feeding CPU batches
# to a model that was moved to the GPU raises a device-mismatch error.
eval_device = next(model.parameters()).device
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(eval_device), labels.to(eval_device)
        # squeeze(-1) keeps the batch dimension even for a final batch of one.
        outputs = model(inputs).squeeze(-1)
        # Probabilities above 0.5 count as the positive class.
        predictions = (outputs > 0.5).float()
        correct += (predictions == labels).sum().item()
        total += labels.size(0)
    print(f"Test Accuracy: {correct / total:.4f}")

In [None]:
import re
max_len = 30

# Prediction helper
def predict_sentiment(sentence, model, vocab, max_len, okt, stopwords):
    """Print the predicted sentiment of a single raw sentence.

    Args:
        sentence: Raw review text.
        model: ``nn.Module`` returning one probability for a ``(1, max_len)``
            LongTensor of token ids.
        vocab: Mapping token -> id containing "<PAD>" and "<UNK>".
        max_len: Fixed input length; longer inputs are cut, shorter ones padded.
        okt: Tokenizer exposing ``morphs(text, stem=True)``.
        stopwords: Collection of tokens to discard.

    Returns:
        None. The verdict is printed, quoting the cleaned sentence.
    """
    # 1. Preprocess the input sentence
    sentence = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', sentence)  # keep only Hangul and spaces
    tokenized_sentence = okt.morphs(sentence, stem=True)  # morphological analysis
    filtered_sentence = [word for word in tokenized_sentence if word not in stopwords]  # drop stopwords

    # 2. Integer-encode, then truncate / right-pad to exactly max_len
    encoded = [vocab.get(word, vocab["<UNK>"]) for word in filtered_sentence][:max_len]
    padded = [encoded + [vocab["<PAD>"]] * (max_len - len(encoded))]

    # 3. Feed the model and read its prediction
    model.eval()
    # Build the input on the device holding the model's parameters; a CPU
    # tensor fed to a model on the GPU raises a device-mismatch error.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    with torch.no_grad():
        input_tensor = torch.tensor(padded, dtype=torch.long, device=target_device)
        output = model(input_tensor).item()

    # 4. Print the result
    if output > 0.5:
        print(f"'{sentence}' -> {output * 100:.2f}% 확률로 긍정 리뷰입니다.")
    else:
        print(f"'{sentence}' -> {(1 - output) * 100:.2f}% 확률로 부정 리뷰입니다.")

# Run the prediction helper on two sample reviews.
predict_sentiment("ㅋㅋㅋ", model, vocab, max_len, okt, stopwords)
predict_sentiment("이 영화 핵노잼 ㅠㅠ", model, vocab, max_len, okt, stopwords)