In [1]:
!pip install cssselect tokenizers

Collecting cssselect
  Downloading https://files.pythonhosted.org/packages/3b/d4/3b5c17f00cce85b9a1e6f91096e1cc8e8ede2e1be8e96b87ce1ed09e92c5/cssselect-1.1.0-py2.py3-none-any.whl
Collecting tokenizers
[?25l  Downloading https://files.pythonhosted.org/packages/6b/15/1c026f3aeafd26db30cb633d9915aae666a415179afa5943263e5dbd55a6/tokenizers-0.8.0-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 3.9MB/s 
[?25hInstalling collected packages: cssselect, tokenizers
Successfully installed cssselect-1.1.0 tokenizers-0.8.0


## 영화평 수집

In [10]:
import requests
import lxml.html
import tqdm
import re

In [9]:
# Netizen rating/review listing for one movie (movieId=73253).
# The trailing '{}' is the page number, filled in with url.format(page).
url = 'https://movie.daum.net/moviedb/grade?movieId=73253&type=netizen&page={}'

In [11]:
# tqdm.tnrange is deprecated; tqdm.auto picks the notebook widget when
# available and falls back to the console bar otherwise.
from tqdm.auto import trange

data = []  # (score text, review text) pairs accumulated over all pages

for page in trange(1, 68):
    # Fail fast on stalled connections and HTTP errors instead of silently
    # parsing an error page into zero reviews.
    res = requests.get(url.format(page), timeout=10)
    res.raise_for_status()
    root = lxml.html.fromstring(res.text)

    scores = root.cssselect('em.emph_grade')   # star ratings
    reviews = root.cssselect('p.desc_review')  # review bodies

    # Pair each rating with its review by position on the page.
    for score, review in zip(scores, reviews):
        # Trim, then collapse every whitespace run (newlines included)
        # into a single space so each review fits on one line.
        content = re.sub(r'\s+', ' ', review.text_content().strip())
        data.append((score.text, content))

  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))




In [53]:
import pandas as pd

# Tabulate the scraped (score, review) tuples under named columns.
review_columns = ['sentiment', 'review']
df = pd.DataFrame(data=data, columns=review_columns)

In [54]:
# Preview the first three collected rows.
df.head(3)

Unnamed: 0,sentiment,review
0,4,
1,5,"21세기 대부분의 괴수 영화들이 그렇듯이 볼거리와 영화적 스케일은 출중하나, 각본과..."
2,10,하고싶어요안될까요제발보내드립니다!네


In [14]:
# (number of collected reviews, number of columns)
df.shape

(662, 2)

## 준단어 토큰화

In [55]:
# Write the reviews, newline-separated, as the training corpus for the
# subword tokenizer. The encoding is spelled out because the reviews are
# Korean text: relying on the platform default can fail or produce a file
# that is not UTF-8 (presumably what the tokenizer trainer expects — confirm).
with open('review.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(df.review))

In [17]:
 from tokenizers import SentencePieceBPETokenizer

In [56]:
# Fresh, untrained tokenizer; its vocabulary is learned from review.txt by
# the tokenizer.train(...) call.
tokenizer = SentencePieceBPETokenizer()

In [57]:
# Learn the subword vocabulary from the review corpus, with a requested
# vocabulary size of 2000.
tokenizer.train('review.txt', vocab_size=2000)

In [58]:
# Encode a sample sentence to inspect how the trained tokenizer splits it.
sample_sentence = '괴수 나오는 영화 재밌다!'
encoded = tokenizer.encode(sample_sentence)

In [59]:
# Subword pieces of the sample sentence. In the recorded output, '▁' prefixes
# the pieces that begin a new word.
encoded.tokens

['▁괴수', '▁나오', '는', '▁영화', '▁재밌', '다!']

## 단어 문서 행렬

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

In [60]:
def _subword_tokens(text):
    """Split ``text`` into subword pieces with the trained ``tokenizer``."""
    return tokenizer.encode(text).tokens


# Count subword pieces rather than using CountVectorizer's built-in tokenization.
cv = CountVectorizer(tokenizer=_subword_tokens)

In [61]:
# Fit the vocabulary and build the term-document count matrix
# (one row per review, one column per token).
tdm = cv.fit_transform(df.review)

In [62]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() when it exists and fall back
# to the old accessor on older releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Total occurrences of each token over all reviews (column sums of tdm).
word_freq = pd.DataFrame({
    '단어': feature_names,
    '빈도': tdm.sum(axis=0).flat
})

In [65]:
# Ten most frequent tokens, highest count first.
word_freq.sort_values(by='빈도', ascending=False).iloc[:10]

Unnamed: 0,단어,빈도
91,▁,613
1546,이,282
1033,도,176
1545,의,162
773,가,156
1538,을,154
980,는,152
14,.,151
1468,에,148
824,고,137


## 감성 분석

In [66]:
import tensorflow as tf

In [68]:
# Vocabulary size = number of columns of the term-document matrix.
# Index the shape instead of unpacking into `_`: assigning to `_` shadows
# IPython's "last cell output" variable for the rest of the session.
NUM_WORDS = tdm.shape[1]

In [69]:
# Logistic regression over token counts, expressed as a single sigmoid unit.
sentiment_layer = tf.keras.layers.Dense(
    units=1,
    input_shape=(NUM_WORDS,),
    activation='sigmoid',
    # L2 penalty on the per-token weights.
    kernel_regularizer=tf.keras.regularizers.l2(0.001),
)
model = tf.keras.models.Sequential([sentiment_layer])

In [70]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is
# tracked as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

0       4
1       5
2      10
3       0
4       2
       ..
657    10
658    10
659    10
660    10
661     5
Name: sentiment, Length: 662, dtype: int64

In [74]:
# Dense token-count matrix used as the model input.
x = tdm.toarray()

# Binary label: 1 when the rating is above 5, otherwise 0.
ratings = df.sentiment.astype(int)
y = ratings.gt(5).astype(int)

In [None]:
# Train for up to 100 epochs with 10% of the samples held out for validation;
# the EarlyStopping callback (library defaults) may end training sooner.
early_stop = tf.keras.callbacks.EarlyStopping()
model.fit(x, y, epochs=100, validation_split=0.1, callbacks=[early_stop])

## 가중치 확인

In [76]:
# The model's single Dense layer exposes two trainable tensors (presumably
# the kernel followed by the bias). Binding the second one to a real name
# avoids assigning to `_`, which IPython uses for the last cell output.
weights, bias = model.trainable_weights

In [77]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older releases.
token_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)

# Pair every vocabulary token with its learned weight.
token_weight = pd.DataFrame({'토큰': token_names, '가중치': weights.numpy().flat})

In [81]:
# Ten tokens with the lowest weights (pushing the prediction toward label 0).
token_weight.sort_values(by='가중치', ascending=True).iloc[:10]

Unnamed: 0,토큰,가중치
266,▁돈,-0.291141
121,▁ᅲᅲ,-0.287512
447,▁아까워,-0.264941
568,▁이렇게,-0.24782
446,▁아까,-0.246646
986,능,-0.242427
437,▁싸,-0.242119
442,▁쓰레기,-0.240085
146,▁건,-0.224563
229,▁낫,-0.222865


In [80]:
# Ten tokens with the highest weights (pushing the prediction toward label 1),
# listed in ascending order so the largest weight is last.
token_weight.sort_values(by='가중치', ascending=True).iloc[-10:]

Unnamed: 0,토큰,가중치
306,▁만족,0.230813
165,▁고질라의,0.231593
651,▁즐,0.238916
690,▁충분히,0.239152
218,▁나름,0.247696
1465,었,0.251758
172,▁괜찮,0.261776
640,▁좋았,0.280364
605,▁재미있,0.282193
594,▁잘,0.350881


## 혼동 행렬

In [84]:
 # Sequential.predict_classes() is deprecated (see the warning this cell
 # printed) and absent from later TensorFlow releases. For a sigmoid output,
 # thresholding predict() at 0.5 is the documented replacement.
 p = (model.predict(x) > 0.5).astype("int32")

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [86]:
from sklearn.metrics import confusion_matrix

In [87]:
# Rows are true labels, columns are predicted labels. x and y are the same
# arrays that were passed to model.fit, so this is not a held-out evaluation.
confusion_matrix(y_true=y, y_pred=p)

array([[336,  10],
       [ 41, 275]])