In [1]:
import pandas as pd

## 데이터 열기

In [2]:
# Download the NSMC training ratings file (tab-separated) straight from GitHub.
nsmc_train_url = 'https://github.com/e9t/nsmc/raw/master/ratings_train.txt'
nsmc = pd.read_csv(nsmc_train_url, sep='\t')

In [4]:
# Preview the first five reviews.
nsmc.head(n=5)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [5]:
# (rows, columns) of the frame as loaded, before any subsampling.
nsmc.shape

(150000, 3)

In [6]:
# Shrink the data to the first 2,000 reviews.
n_reviews = 2000
nsmc = nsmc.iloc[:n_reviews]
nsmc.shape

(2000, 3)

## 준단어 토큰화

In [7]:
# Dump the review texts, one per line, as the training corpus for the subword model.
with open('nsmc.txt', mode='w', encoding='utf8') as corpus_file:
    corpus_text = '\n'.join(nsmc['document'])
    corpus_file.write(corpus_text)

In [8]:
pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.94-cp38-cp38-macosx_10_6_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 1.4 MB/s eta 0:00:01
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.94
Note: you may need to restart the kernel to use updated packages.


In [11]:
from sentencepiece import SentencePieceTrainer

# Train a subword model on the corpus file and save it under the "nsmc" prefix.
train_flags = ' '.join([
    '--input=nsmc.txt',
    '--model_prefix=nsmc',
    '--vocab_size=3000',
])
SentencePieceTrainer.Train(train_flags)

In [12]:
from sentencepiece import SentencePieceProcessor

# Load the trained subword model; Load's return value is the cell output.
model_path = "nsmc.model"
sp = SentencePieceProcessor()
sp.Load(model_path)

True

## 확인해보기

In [13]:
# Raw text of the first review.
nsmc.at[0, 'document']

'아 더빙.. 진짜 짜증나네요 목소리'

In [14]:
# Split the first review into subword pieces with the trained model.
first_review = nsmc.loc[0, 'document']
sp.encode_as_pieces(first_review)

['▁아', '▁더빙', '..', '▁진짜', '▁짜증나', '네요', '▁목소리']

## 벡터화

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Count subword pieces per review; lowercasing is switched off and tokenization
# is delegated to the SentencePiece processor.
cv = CountVectorizer(tokenizer=sp.encode_as_pieces, lowercase=False)
documents = nsmc['document']
tdm = cv.fit_transform(documents)

## train, test split

In [18]:
from sklearn.model_selection import train_test_split

# Features are the token counts; targets are the label column.
X, y = tdm, nsmc['label']

# Hold out 20% of the rows for testing, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)


## 모델링

In [19]:
import tensorflow as tf

In [22]:
# (rows, columns) of the training split; the column count is the model's input width.
X_train.shape

(1600, 3019)

In [27]:
# A single sigmoid unit over the token counts, with L2 weight regularization.
# The input width is read from the training matrix instead of being hard-coded,
# so the model still builds if the vectorizer's vocabulary size changes.
n_features = X_train.shape[1]

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(
        1,
        input_shape=(n_features,),
        activation='sigmoid',
        kernel_regularizer=tf.keras.regularizers.l2(0.001)))

In [28]:
# Print the layers, their output shapes and parameter counts.
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 1)                 3020      
Total params: 3,020
Trainable params: 3,020
Non-trainable params: 0
_________________________________________________________________


## 학습

In [30]:
# optimizer 'adam': a gradient-descent variant
# loss 'binary_crossentropy': the loss function
# metric 'accuracy': how many examples out of the total were classified correctly
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [32]:
# Train for up to 100 epochs on dense arrays, holding out 10% of the training
# rows for validation; EarlyStopping is used with its default settings.
early_stopping = tf.keras.callbacks.EarlyStopping()
model.fit(
    X_train.toarray(),
    y_train.values,
    epochs=100,
    validation_split=0.1,
    callbacks=[early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100


<tensorflow.python.keras.callbacks.History at 0x14a1e5b20>

## 최종 성능 측정

In [36]:
# Score the held-out split; the result lists the loss followed by the accuracy metric.
test_features = X_test.toarray()
model.evaluate(test_features, y_test)



[0.6105234026908875, 0.7225000262260437]

## 가중치 확인

In [43]:
# The model's two trainable tensors are the Dense layer's kernel and bias.
# Bind the second one to a real name: assigning to `_` would shadow IPython's
# "last output" variable for the rest of the session.
weights, bias = model.trainable_weights

In [44]:
import numpy

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back on older releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Pair every vocabulary token with its learned weight (kernel flattened to 1-D).
token_weight = pd.DataFrame({'토큰': feature_names, '가중치': weights.numpy().ravel()})

In [49]:
# Tokens with the lowest weights come first.
token_weight.sort_values(by='가중치', ascending=True).head(n=20)

Unnamed: 0,토큰,가중치
607,▁별로,-0.35778
511,▁망,-0.344924
863,▁없고,-0.330531
759,▁실망,-0.328871
774,▁쓰레기,-0.328093
779,▁아까운,-0.321137
443,▁돈,-0.319618
1000,▁일본,-0.312998
1054,▁재미없다,-0.309839
1568,냐,-0.308651


In [48]:
# Tokens with the highest weights come first.
token_weight.sort_values(by='가중치', ascending=False).head(n=20)

Unnamed: 0,토큰,가중치
1202,▁최고의,0.481892
1,!!,0.461479
0,!,0.433039
1201,▁최고,0.419698
1036,▁잘,0.344245
89,^^,0.336882
536,▁명작,0.331261
2687,찮,0.330589
1094,▁정말,0.328217
1507,꽤,0.32366


## 새로운 데이터에 적용

In [50]:
# Two unseen review sentences to score with the trained model.
new_data = [
    '뽀로로는 정말 재미있는 영화다.',
    '이런 영화를 만들다니 감독은 무슨 생각이냐?',
]

In [52]:
# Point the vectorizer at the SentencePiece tokenizer again, then turn the new
# sentences into count vectors using the already-fitted vocabulary.
cv.tokenizer = sp.encode_as_pieces
x_new = cv.transform(raw_documents=new_data)

In [53]:
# 각 데이터가 긍정(1) 일 확률
model.predict(x_new.toarray())

array([[0.63866615],
       [0.33196622]], dtype=float32)