In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from gensim.models import KeyedVectors
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained word vectors (word2vec text format) from the local file.
word2vec_model = KeyedVectors.load_word2vec_format('Word2Vec_100.model')

# Read the tab-separated ratings file from the e9t/nsmc repository, drop rows
# with missing values and renumber the index so it is contiguous from 0.
df = (
    pd.read_csv('https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt', sep='\t')
    .dropna()
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


In [None]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m93.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jpype1-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (493 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.8/493.8 kB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.1 konlpy-0.6.0


In [None]:

import re
from konlpy.tag import Okt
# Shared Okt morphological analyzer instance used for tokenizing reviews.
okt = Okt()

# Stopwords come from the 'stopword' column of the local stopword file.
stopword = pd.read_csv('stopword_kr.txt')['stopword'].tolist()

In [None]:
class MovieReviewDataset(Dataset):
  """Dataset that turns each review into the mean of its word vectors.

  Each item is a ``(sentence_vector, label)`` pair where ``sentence_vector`` is
  a 1-D float tensor of length ``word2vec_model.vector_size`` and ``label`` is
  a 0-d tensor built from the ``label`` column. Both are moved to the
  notebook-level ``device``.

  Args:
    dataframe: Frame with a ``document`` column (review text) and a ``label``
      column.
    word2vec_model: Keyed word vectors exposing ``key_to_index``,
      ``vector_size`` and ``model[word]`` lookup.
  """

  def __init__(self, dataframe, word2vec_model):
    self.dataframe = dataframe
    self.word2vec_model = word2vec_model
    # Kept as a list for any code that reads ``dataset.vocab``; membership
    # tests below use the underlying dict instead of scanning this list.
    self.vocab = list(word2vec_model.key_to_index)
    # Snapshot of the notebook-level stopword list as a set for O(1) lookups.
    self.stopwords = frozenset(stopword)

  def __len__(self):
    """Return the number of reviews."""
    return len(self.dataframe)

  def __getitem__(self, idx):
    """Return ``(sentence_vector, label)`` for the row at position ``idx``."""
    # Positional access: samplers and random_split hand out positions
    # 0..len-1, which only coincide with index labels on a freshly reset index.
    review = self.dataframe['document'].iloc[idx]
    label = self.dataframe['label'].iloc[idx]
    sentence_vector = self.get_sentence_vector(review)
    return sentence_vector.to(device), torch.tensor(label).to(device)

  def get_sentence_vector(self, review):
    """Embed one review as the average of its in-vocabulary word vectors.

    Args:
      review: Raw review text (converted with ``str`` before cleaning).

    Returns:
      A 1-D float tensor of length ``word2vec_model.vector_size``; all zeros
      when no token survives cleaning, stopword removal and vocabulary lookup.
    """
    # Preprocessing: keep only Hangul characters and spaces, then tokenize
    # into stemmed morphemes.
    cleaned_text = re.sub(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", str(review))
    tokenized_text = okt.morphs(cleaned_text, stem=True)

    # Vectorize every token that is neither a stopword nor out of vocabulary.
    # Built with torch only, so this method does not depend on numpy having
    # been imported by some other cell.
    key_to_index = self.word2vec_model.key_to_index
    word_vectors = [
        torch.as_tensor(self.word2vec_model[word], dtype=torch.float32)
        for word in tokenized_text
        if word not in self.stopwords and word in key_to_index
    ]

    if word_vectors:
      return torch.stack(word_vectors).mean(dim=0)
    # No usable token: fall back to a zero vector of the embedding size.
    return torch.zeros(self.word2vec_model.vector_size)


In [None]:
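# NOTE: scratch code kept for reference only (nothing in this cell executes).
# It is the explicit-loop form of the in-vocabulary word-vector lookup done in
# MovieReviewDataset.get_sentence_vector.
# word_vectors = []
# for word in stopword_removed_text:
#   if word in vocab:
#     word_vector = word2vec_model[word]
#     word_vectors.append(word_vector)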

In [None]:
from torch.nn.utils.rnn import pad_sequence

def custom_collate_fn(batch):
    """Collate ``(input_tensor, label)`` samples into one batch.

    Args:
        batch: Sequence of ``(input_tensor, label)`` pairs.

    Returns:
        Tuple ``(inputs, labels)``: the inputs stacked batch-first and
        zero-padded along their first dimension to the longest sample, and the
        labels as a float32 tensor.
    """
    features, targets = zip(*batch)
    return (
        pad_sequence(features, batch_first=True, padding_value=0),
        torch.tensor(targets, dtype=torch.float32),
    )


In [None]:
## dataset split
from torch.utils.data import random_split

# Wrap the review frame so each item is (sentence_vector, label).
dataset = MovieReviewDataset(df, word2vec_model)
BATCH_SIZE = 128
# Fixed seed so the train/valid/test membership is the same on every run.
# Without it, a checkpoint loaded after a kernel restart would be evaluated on
# a "test" subset that may contain samples it was trained on.
SPLIT_SEED = 42

# Split ratios. TEST_RATIO is informational: the test split takes whatever is
# left after the integer-truncated train and validation sizes.
# NOTE: VALIATION_RATIO (sic) keeps its original spelling in case other cells
# reference it.
total_size = len(dataset)
TRAIN_RATIO, VALIATION_RATIO, TEST_RATIO = 0.6, 0.3, 0.1
train_size = int(total_size * TRAIN_RATIO)
valid_size = int(total_size * VALIATION_RATIO)
test_size = total_size - train_size - valid_size

# Reproducible random split.
train_dataset, valid_dataset, test_dataset = random_split(
    dataset, [train_size, valid_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED))

# DataLoaders: only the training data needs shuffling; evaluation metrics do
# not depend on batch order.
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=custom_collate_fn)
valid_loader = DataLoader(
    valid_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate_fn)
test_loader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate_fn)

In [None]:
class Classifier(nn.Module):
  """Two-layer feed-forward network ending in a sigmoid.

  Args:
    embedding_dim: Size of each input feature vector.
    hidden_dim: Width of the hidden layer.
    output_dim: Number of output units.
  """

  def __init__(self, embedding_dim, hidden_dim, output_dim):
    super().__init__()
    self.fc1 = nn.Linear(in_features=embedding_dim, out_features=hidden_dim)
    self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

  def forward(self, x):
    """Map inputs to values in (0, 1): linear -> ReLU -> linear -> sigmoid."""
    hidden = torch.relu(self.fc1(x))
    # Binary classification, so the final activation is a sigmoid.
    return self.fc2(hidden).sigmoid()

# Model hyperparameters: the input width follows the word-vector size and the
# single output unit carries the binary prediction.
embedding_dim = word2vec_model.vector_size
hidden_dim = 30
output_dim = 1

# Build the classifier and place it on the selected device.
model = Classifier(embedding_dim, hidden_dim, output_dim)
model = model.to(device)

# Binary cross-entropy on the sigmoid outputs, optimized with Adam.
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
import numpy as np
def train(model, train_loader, optimizer, criterion, num_epoch, val_loader=None):
  """Train ``model`` and print validation loss/accuracy after every epoch.

  Args:
    model: Module mapping a batch of inputs to outputs of shape (batch, 1)
      holding probabilities (predictions are thresholded at 0.5).
    train_loader: Iterable of ``(inputs, labels)`` training batches.
    optimizer: Optimizer over ``model``'s parameters.
    criterion: Loss called as ``criterion(output, target)``.
    num_epoch: Number of passes over ``train_loader``.
    val_loader: Iterable of ``(inputs, labels)`` validation batches. When
      omitted, the notebook-level ``valid_loader`` is used, which is what the
      original five-argument call relied on.

  Returns:
    None. One summary line is printed per epoch.
  """
  if val_loader is None:
    val_loader = valid_loader  # notebook-level default

  for epoch in range(num_epoch):
    model.train()
    for sentence_vector, label in train_loader:
      optimizer.zero_grad()
      output = model(sentence_vector)
      # Give the labels the output's shape/dtype/device instead of relying on
      # a global ``device``.
      target = label.unsqueeze(1).float().to(output.device)
      loss = criterion(output, target)
      loss.backward()
      optimizer.step()

    # validation
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    num_batches = 0
    with torch.no_grad():
      for sentence_vector, label in val_loader:
        output = model(sentence_vector)
        target = label.unsqueeze(1).float().to(output.device)
        total_loss += criterion(output, target).item()
        predicted = (output > 0.5).float()
        total_correct += (predicted == target).sum().item()
        # Count what was actually evaluated so the averages stay correct for
        # any loader configuration.
        total_samples += target.size(0)
        num_batches += 1

    # max(..., 1) avoids a ZeroDivisionError on an empty validation loader.
    average_loss = total_loss / max(num_batches, 1)
    accuracy = total_correct / max(total_samples, 1)
    print(f'Epoch [{epoch+1}/{num_epoch}], Loss: {average_loss:.4f}, Accuracy: {accuracy:.4f}')

# Number of full passes over the training loader.
epoch = 30
train(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    num_epoch=epoch,
)

KeyboardInterrupt: 

In [None]:
# Restore saved weights (presumably the checkpoint written after epoch 1;
# nothing in this notebook shows how the file was produced).
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine;
# weights_only=True restricts unpickling to tensors and plain containers
# instead of executing arbitrary pickled code.
model.load_state_dict(
    torch.load('model_epoch1.pt', map_location=device, weights_only=True))

  model.load_state_dict(torch.load('model_epoch1.pt'))


<All keys matched successfully>

In [None]:
# Model Test
def model_test(net=None, loader=None, loss_fn=None):
    """Evaluate a model on a data loader and print mean loss and accuracy.

    Args:
        net: Module producing outputs of shape (batch, 1) with probabilities.
            Defaults to the notebook-level ``model``.
        loader: Iterable of ``(inputs, labels)`` batches. Defaults to the
            notebook-level ``test_loader``.
        loss_fn: Loss called as ``loss_fn(outputs, targets)``. Defaults to the
            notebook-level ``criterion``.

    Returns:
        None. Prints ``Test Loss`` (mean over batches) and ``Test Acc``
        (correct predictions / evaluated samples).
    """
    # Fall back to the notebook globals so the zero-argument call keeps working.
    net = model if net is None else net
    loader = test_loader if loader is None else loader
    loss_fn = criterion if loss_fn is None else loss_fn

    net.eval()  # evaluation mode
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    num_batches = 0

    with torch.no_grad():
        for inputs, labels in loader:
            outputs = net(inputs)
            # Labels must be (batch, 1) like the outputs; comparing against
            # the raw (batch,) labels would broadcast to (batch, batch) and
            # inflate the correct count.
            targets = labels.unsqueeze(1).float().to(outputs.device)
            total_loss += loss_fn(outputs, targets).item()
            predict = (outputs > 0.5).float()
            total_correct += (predict == targets).sum().item()
            total_samples += targets.size(0)
            num_batches += 1

    # Normalise by what was evaluated here, not by the validation loader.
    avg_loss = total_loss / max(num_batches, 1)
    accuracy = total_correct / max(total_samples, 1)
    print(f'Test Loss: {avg_loss}, Test Acc: {accuracy}')
# Run the evaluation function defined in this cell; it prints the loss and accuracy summary.
model_test()

KeyboardInterrupt: 

In [None]:
# Classify one sentence with the trained model.
text = '재밌다'
model.eval()  # evaluation mode
with torch.no_grad():
    sentence_vector = dataset.get_sentence_vector(text).to(device)
    output = model(sentence_vector)
    # Threshold the probability at 0.5 to get a hard 0/1 prediction.
    prediction = (output > 0.5).float()

# 0 maps to the negative label, anything else to the positive label.
result = '부정' if prediction.item() == 0 else '긍정'
print(result)

긍정
