## 데이터 불러오기

In [1]:
import pandas as pd

In [2]:
# Read the tab-separated ratings_train.txt file straight from the NSMC
# GitHub repository into a DataFrame.
NSMC_TRAIN_URL = 'https://github.com/e9t/nsmc/raw/master/ratings_train.txt'
nsmc = pd.read_csv(NSMC_TRAIN_URL, sep='\t')

In [3]:
# Preview the first rows to check the loaded columns (id, document, label).
nsmc.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


## 토큰화

In [4]:
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 4.3MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.91


In [13]:
# Write the reviews to nsmc.txt, one per line, as the SentencePiece training
# corpus. .loc slices by label and includes the end label 10000; missing
# reviews become empty strings so that str.join accepts every item.
with open('nsmc.txt', mode='w', encoding='utf8') as corpus_file:
    reviews = nsmc.loc[:10000, 'document'].fillna('')
    corpus_file.write('\n'.join(reviews))

In [14]:
from sentencepiece import SentencePieceTrainer

# Train a tokenizer on the dumped corpus. The flags are listed one per line
# and joined with single spaces into the command string Train() expects.
SentencePieceTrainer.Train(' '.join([
    '--input=nsmc.txt',
    '--model_prefix=nsmc',
    '--vocab_size=3000',
]))

In [15]:
from sentencepiece import SentencePieceProcessor

# Load nsmc.model (the file name follows from --model_prefix=nsmc in the
# training cell). Load() is left as the last expression so that its result
# is displayed as the cell output.
sp = SentencePieceProcessor()
sp.Load("nsmc.model")

True

In [16]:
# Sanity check: split the first review into subword pieces.
sp.encode_as_pieces(nsmc.at[0, 'document'])

['▁아', '▁더빙', '..', '▁진짜', '▁짜증나', '네요', '▁목소리']

## 정리

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
# Count SentencePiece pieces instead of the vectorizer's built-in word
# tokens; lowercasing is switched off.
cv = CountVectorizer(
    tokenizer=sp.encode_as_pieces,
    lowercase=False,
)

In [20]:
# Build the document-term matrix from the same rows that were written to
# nsmc.txt. Missing reviews are replaced with '' (as was done when the
# corpus file was written) because CountVectorizer raises on NaN documents.
# The row count is unchanged, so the labels taken from the same slice still
# line up.
tdm = cv.fit_transform(nsmc.loc[:10000, 'document'].fillna(''))

In [21]:
from sklearn.model_selection import train_test_split

In [24]:
# Features are the document-term matrix; labels are taken from the same
# label-inclusive row slice (.loc[:10000]) so the rows stay aligned.
x, y = tdm, nsmc.loc[:10000, 'label'].values

In [25]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=42,
)

## 모형

In [26]:
import tensorflow as tf

In [33]:
# Number of feature columns (distinct tokens) in the training matrix; it is
# used as the model's input width.
NUM_WORDS = x_train.shape[-1]

In [34]:
# Seed TensorFlow's global random generator before the layer is created so
# that weight initialisation is repeatable across runs. The train/test split
# is already seeded with random_state=42, so the same value is reused here.
tf.random.set_seed(42)

# A single sigmoid unit over the NUM_WORDS token counts, with an L2 penalty
# of 0.001 on the kernel.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(
        1,
        input_shape=(NUM_WORDS,),
        activation='sigmoid',
        kernel_regularizer=tf.keras.regularizers.l2(0.001))
])

In [35]:
# Print the layer table to confirm the output shape and parameter count.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1)                 3180      
Total params: 3,180
Trainable params: 3,180
Non-trainable params: 0
_________________________________________________________________


## 학습

In [36]:
# Binary classification setup: cross-entropy loss, Adam optimizer, and
# accuracy reported during training.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [37]:
# Train for at most 100 epochs with 10% of the training rows held out for
# validation. EarlyStopping is used with its library defaults and may end
# training before the epoch cap.
stop_early = tf.keras.callbacks.EarlyStopping()
model.fit(
    x_train.toarray(),  # convert the matrix to a dense array for Keras
    y_train,
    epochs=100,
    validation_split=0.1,
    callbacks=[stop_early],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100


<tensorflow.python.keras.callbacks.History at 0x7efda2dbb780>

## 토큰별 가중치

In [38]:
# Unpack the model's two trainable variables; only the first is used later.
# The second (presumably the Dense bias, TODO confirm) goes to a named
# throwaway, because assigning to `_` would shadow IPython's last-output
# variable for the rest of the session.
weights, _bias = model.trainable_weights

In [39]:
import pandas as pd

# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(), so use whichever the installed version provides.
if hasattr(cv, 'get_feature_names_out'):
    tokens = cv.get_feature_names_out()
else:
    tokens = cv.get_feature_names()

# Pair each token with its learned weight. ravel() flattens the weight
# array to 1-D so it lines up with the token list.
token_weight = pd.DataFrame({'토큰': tokens, '가중치': weights.numpy().ravel()})

In [40]:
# Tokens with the lowest (most negative) weights.
token_weight.sort_values(by='가중치').head()

Unnamed: 0,토큰,가중치
682,▁쓰레기,-0.725988
930,▁재미없,-0.624085
689,▁아깝다,-0.588454
541,▁별로,-0.551379
1072,▁최악의,-0.521486


In [41]:
# Tokens with the highest (most positive) weights; the sort is ascending,
# so they are at the tail.
token_weight.sort_values(by='가중치').tail()

Unnamed: 0,토큰,가중치
942,▁재밌게,0.514918
935,▁재미있,0.529214
85,^^,0.690314
1069,▁최고의,0.718136
1067,▁최고,0.770125
