In [1]:
import pandas as pd

## 데이터 불러오기

In [2]:
# Load the IMDB review data; pandas reads the zipped CSV directly from the URL.
IMDB_URL = 'https://github.com/euphoris/datasets/raw/master/imdb.zip'
df = pd.read_csv(IMDB_URL)

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(1000, 2)

## 단어 문서 행렬 만들기

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term matrix: drop scikit-learn's built-in English stop
# words and cap the vocabulary at 1000 terms.
cv = CountVectorizer(
    stop_words='english',
    max_features=1000,
)
reviews = df['review']
tdm = cv.fit_transform(reviews)

## 데이터 분할 (train/test split)

In [6]:
from sklearn.model_selection import train_test_split

# Features are the word-count columns; the target is the 0/1 sentiment label.
# NOTE(review): `cv` was fit on every review before this split, so the
# vocabulary was chosen with the test reviews included — confirm this is acceptable.
X = tdm
y = df['sentiment']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1111,
)


In [8]:
# Check that features and labels have matching row counts in each partition.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

(800, 1000) (800,)
(200, 1000) (200,)


## 로지스틱 회귀분석 with 텐서플로우

In [10]:
# 설치
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.3.1-cp38-cp38-macosx_10_14_x86_64.whl (165.2 MB)
[K     |████████████████████████████████| 165.2 MB 7.0 MB/s eta 0:00:01   |█                               | 5.6 MB 4.1 MB/s eta 0:00:40     |███                             | 15.2 MB 4.9 MB/s eta 0:00:31     |█████▎                          | 27.0 MB 3.3 MB/s eta 0:00:42     |█████████                       | 46.0 MB 4.9 MB/s eta 0:00:25     |██████████▍                     | 53.4 MB 10.5 MB/s eta 0:00:11     |████████████▉                   | 66.4 MB 2.4 MB/s eta 0:00:42     |██████████████▍                 | 74.5 MB 1.8 MB/s eta 0:00:51     |███████████████▏                | 78.0 MB 1.8 MB/s eta 0:00:49     |████████████████████▏           | 103.9 MB 3.5 MB/s eta 0:00:18     |████████████████████████▉       | 128.3 MB 3.1 MB/s eta 0:00:12
[?25hCollecting opt-einsum>=2.3.2
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 5.5 MB/s 

In [20]:
import tensorflow as tf

### 모델 생성

In [21]:
# Logistic regression written as a one-layer network: a single sigmoid unit
# applied to the bag-of-words counts.

# Seed TensorFlow's global RNG (same seed as the train/test split) so the
# initial weights do not change from one run of this cell to the next.
tf.random.set_seed(1111)

# Read the input width from the data instead of repeating the literal 1000:
# CountVectorizer's max_features is only an upper bound on the vocabulary size.
n_features = X_train.shape[1]

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(
    1,  # a single output: positive (1) / negative (0)
    input_shape=(n_features,),  # one input per vocabulary term
    activation='sigmoid'))

In [22]:
# Print the layer-by-layer summary and parameter counts of the model.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1)                 1001      
Total params: 1,001
Trainable params: 1,001
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Configure training: Adam optimizer, binary cross-entropy loss for the 0/1
# target, and accuracy reported as an additional metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

### 학습

In [31]:
# Train for 50 epochs in mini-batches of 32.
# Keras is given dense arrays here, so the sparse count matrix is converted with
# .toarray() and the label Series with .values.
# The returned History object is kept so the per-epoch loss/accuracy can be
# inspected afterwards (history.history) instead of being discarded as a bare repr.
history = model.fit(
    X_train.toarray(),
    y_train.values,
    batch_size=32,
    epochs=50,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x13b56ae20>

### 테스트

In [32]:
# Score the held-out split; the result is [loss, accuracy], following the
# loss and metrics passed to compile().
# Labels are passed as a plain array (.values), matching the fit() call.
model.evaluate(X_test.toarray(), y_test.values)



[0.4993995428085327, 0.7950000166893005]

## 모델 저장

In [33]:
# Persist the trained model to disk under 'imdb-sentiment.krs'.
# NOTE(review): the fitted vectorizer `cv` is not saved in the visible cells, and it
# would be needed to featurize new text for this model — confirm it is persisted elsewhere.
model.save('imdb-sentiment.krs')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: imdb-sentiment.krs/assets


## 단어별 가중치 확인

In [35]:
# The model's only layer is a Dense layer, so it has exactly two trainable
# variables: the per-word weights (kernel) and the bias.
weight, bias = model.trainable_weights

In [36]:
# Pair every vocabulary term with its learned coefficient.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); fall back to the old name on older installs.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

word_weight = pd.DataFrame({
    '단어': feature_names,
    # The kernel has one row per term and a single output column; flatten it
    # to a 1-D array with one weight per term.
    '가중치': weight.numpy().ravel(),
})

In [39]:
# Negative words: the 20 terms with the lowest weights.
word_weight.sort_values(by='가중치', ascending=True).iloc[:20]

Unnamed: 0,단어,가중치
69,bad,-0.884455
68,awful,-0.77436
986,worst,-0.703701
842,stupid,-0.69691
763,script,-0.615045
134,cheap,-0.593982
658,plot,-0.591006
950,wasted,-0.585696
827,started,-0.5821
413,holes,-0.578451


In [41]:
# Positive words: the 20 terms with the highest weights.
word_weight.sort_values(by='가중치', ascending=False).iloc[:20]

Unnamed: 0,단어,가중치
529,love,0.713425
503,liked,0.694925
382,great,0.694204
594,nice,0.650863
530,loved,0.649958
976,wonderful,0.636237
292,excellent,0.622293
76,beautiful,0.594056
409,hilarious,0.580077
449,interesting,0.569849
