In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import urllib.request
from konlpy.tag import Okt
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from collections import Counter

# Load data: download the NSMC ratings files next to the notebook and read
# them as tab-separated tables.
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')

print('Train data len: ', len(train_data))
train_data[:5]

# Remove rows whose 'document' text duplicates an earlier row.
train_data.drop_duplicates(subset=['document'], inplace=True)

train_data.groupby('label').size()

# Keep only Hangul (jamo and syllables) and spaces.
# regex=True is required: since pandas 2.0 `str.replace` treats the pattern as
# a literal string by default, which would silently leave the text unchanged.
train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
train_data[:5]

# Strip leading spaces so whitespace-only documents become empty strings,
# then mark empty strings as missing. The result is assigned back instead of
# calling `replace(..., inplace=True)` on the column selection, which may act
# on a temporary copy and leave the frame untouched.
train_data['document'] = train_data['document'].str.replace('^ +', "", regex=True)
train_data['document'] = train_data['document'].replace('', np.nan)
print(train_data.isnull().sum())

# Drop the rows counted above: a missing document is a float NaN, which the
# morphological tokenizer used in the next cell cannot process.
train_data.dropna(subset=['document'], inplace=True)

In [None]:
# Tokens that are discarded after morphological analysis.
stopwords = ['의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로', '자', '에', '와', '한', '하다']

okt = Okt()
X_train = []
for sentence in tqdm(train_data['document']):
    # Missing documents are float NaN and would make okt.morphs raise; map any
    # non-string row to an empty token list so X_train stays aligned
    # row-for-row with train_data['label'].
    if isinstance(sentence, str):
        tokenized_sentences = okt.morphs(sentence, stem=True) # tokenize
    else:
        tokenized_sentences = []
    stopwords_removed = [word for word in tokenized_sentences if word not in stopwords] # remove stopwords
    X_train.append(stopwords_removed)
print(X_train[:3])

In [None]:
X_test = []
for sentence in tqdm(test_data['document']):
    # Apply the same cleaning the training documents receive (keep only Hangul
    # and spaces, strip leading spaces) so both splits are tokenized from
    # identically preprocessed text.
    # Missing documents (float NaN) and documents that are empty after cleaning
    # yield an empty token list; str(NaN) would otherwise be tokenized as the
    # literal text "nan". Rows are never skipped, so X_test stays aligned with
    # test_data['label'].
    if isinstance(sentence, str):
        cleaned = re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", sentence).lstrip(" ")
    else:
        cleaned = ""
    tokenized_sentences = okt.morphs(cleaned, stem=True) if cleaned else [] # tokenize
    stopwords_removed = [word for word in tokenized_sentences if word not in stopwords] # remove stopwords
    X_test.append(stopwords_removed)

In [None]:
def _as_object_array(sequences):
  """Pack variable-length lists into a 1-D object array, one list per slot.

  `np.array` on ragged nested lists raises on recent NumPy versions, and on
  equal-length lists it would build a 2-D array; filling element by element
  always yields shape `(len(sequences),)`.
  """
  packed = np.empty(len(sequences), dtype=object)
  for position, sequence in enumerate(sequences):
    packed[position] = sequence
  return packed


def tokenize(x_train, y_train, x_val, y_val, vocab_size=10000):
  """Integer-encode tokenized sentences with a vocabulary built from x_train.

  Args:
    x_train: iterable of token lists used to build the vocabulary and encoded.
    y_train: labels for x_train; returned as a NumPy array.
    x_val: iterable of token lists encoded with the training vocabulary.
    y_val: labels for x_val; returned as a NumPy array.
    vocab_size: number of most frequent training tokens to keep
      (None keeps every token). Defaults to 10000.

  Returns:
    A 5-tuple `(encoded_train, y_train, encoded_val, y_val, vocab)` where the
    encoded items are 1-D object arrays of integer lists and `vocab` maps each
    kept token to its index. Indices start at 1 (most frequent token), so 0 is
    never assigned to a token. Tokens outside the vocabulary are dropped.
  """
  frequencies = Counter(word for sentence in x_train for word in sentence)
  word_to_index = {
      word: index
      for index, (word, _) in enumerate(frequencies.most_common(vocab_size), start=1)
  }

  def encode(sentences):
    # Membership is tested per word against the vocabulary mapping.
    return [[word_to_index[word] for word in sentence if word in word_to_index]
            for sentence in sentences]

  return (_as_object_array(encode(x_train)), np.array(y_train),
          _as_object_array(encode(x_val)), np.array(y_val), word_to_index)

In [None]:
# Integer-encode both splits.
# NOTE(review): five values are unpacked here, so tokenize must return the
# vocabulary mapping as its fifth element.
x_train, y_train, x_test, y_test, vocab = tokenize(
    X_train, train_data['label'], X_test, test_data['label']
)

# Number of encoded tokens in each training review.
rev_len = list(map(len, x_train))
rev_len_series = pd.Series(rev_len)

# Histogram of review lengths, followed by summary statistics as cell output.
rev_len_series.hist()
plt.show()
rev_len_series.describe()