In [2]:
import keras
import tensorflow as tf
print(keras.__version__)
print(tf.__version__)

2.6.0
2.6.1


In [3]:
import numpy as np

# 하나의 원소가 샘플. 하나의 문장.
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

In [4]:
# 단어:인덱스
token_index = {}

for sample in samples:
    for word in sample.split():
        if word not in token_index:
            token_index[word] = len(token_index) + 1
            
max_length = 10
token_index

{'The': 1,
 'cat': 2,
 'sat': 3,
 'on': 4,
 'the': 5,
 'mat.': 6,
 'dog': 7,
 'ate': 8,
 'my': 9,
 'homework.': 10}

In [5]:
# 결과를 저장할 배열(0으로 이루어진 벡터)
# max_length는 사용할 단어수
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
print(results.shape, results)

(2, 10, 11) [[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]


In [6]:
token_index.get(word)

10

In [7]:
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = token_index.get(word)
        results[i, j, index] = 1.  # 행, 열, 인덱스를 1로 치환 

### 케라스 활용 원핫 인코딩

In [8]:
from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

In [9]:
# 가장 빈도가 높은 1,000개의 단어만 선택하도록 Tokenizer 객체를 만듭니다.
tokenizer = Tokenizer(num_words=1000)

# 단어 인덱스를 구축합니다.
tokenizer.fit_on_texts(samples)

# 문자열을 정수 인덱스의 리스트로 변환합니다.
sequences = tokenizer.texts_to_sequences(samples)
sequences

[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]

In [10]:
one_hot = tokenizer.texts_to_matrix(samples, mode='binary')
one_hot.shape, one_hot

((2, 1000),
 array([[0., 1., 1., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.]]))

In [11]:
# 계산된 단어 인덱스를 구합니다.
word_index = tokenizer.word_index
print(word_index)
print('Found %s unique tokens.' % len(word_index))

{'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5, 'dog': 6, 'ate': 7, 'my': 8, 'homework': 9}
Found 9 unique tokens.


In [12]:
# 계산된 단어 인덱스를 구합니다.
word_index = tokenizer.word_index
print(word_index)
print('Found %s unique tokens.' % len(word_index))

{'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5, 'dog': 6, 'ate': 7, 'my': 8, 'homework': 9}
Found 9 unique tokens.


In [14]:
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# 단어를 크기가 1,000인 벡터로 저장합니다.
# 1,000개(또는 그이상)의 단어가 있다면 해싱 충돌이 늘어나고 인코딩의 정확도가 감소될 것입니다
dimensionality = 1000
max_length = 10

In [15]:
results = np.zeros((len(samples), max_length, dimensionality))
print(results.shape)

for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        # 단어를 해싱하여 0과 1,000 사이의 랜덤한 정수 인덱스로 변환합니다.
        index = abs(hash(word)) % dimensionality
        results[i, j, index] = 1.

(2, 10, 1000)


In [16]:
from keras.layers import Embedding

In [17]:
embedding_layer = Embedding(1000, 64)

print( type(embedding_layer), embedding_layer)

<class 'keras.layers.embeddings.Embedding'> <keras.layers.embeddings.Embedding object at 0x000001E2E68EB7F0>


In [18]:
from keras.datasets import imdb
from keras import preprocessing

In [19]:
# 특성으로 사용할 단어의 수
max_features = 10000

# 정수 리스트로 데이터를 로드합니다.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [20]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((25000,), (25000,), (25000,), (25000,))

In [21]:
# 리스트를 (samples, maxlen) 크기의 2D 정수 텐서로 변환합니다.
maxlen = 50

X_train_n = preprocessing.sequence.pad_sequences(X_train, maxlen=maxlen)
X_test_n = preprocessing.sequence.pad_sequences(X_test, maxlen=maxlen)

In [22]:
print("변경 전 : " , X_train.shape, X_test.shape)
print("변경 후 : ", X_train_n.shape, X_test_n.shape)

변경 전 :  (25000,) (25000,)
변경 후 :  (25000, 50) (25000, 50)


In [23]:
X_train_n[0]

array([2071,   56,   26,  141,    6,  194, 7486,   18,    4,  226,   22,
         21,  134,  476,   26,  480,    5,  144,   30, 5535,   18,   51,
         36,   28,  224,   92,   25,  104,    4,  226,   65,   16,   38,
       1334,   88,   12,   16,  283,    5,   16, 4472,  113,  103,   32,
         15,   16, 5345,   19,  178,   32])

In [24]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding

In [26]:
model = Sequential()

model.add(Embedding(10000, 8, input_length=maxlen)) # None, 50, 8
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 8)             80000     
_________________________________________________________________
flatten (Flatten)            (None, 400)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 401       
Total params: 80,401
Trainable params: 80,401
Non-trainable params: 0
_________________________________________________________________


In [27]:
%%time 

# 분류기를 추가합니다.
model.compile(optimizer='rmsprop', 
              loss='binary_crossentropy', 
              metrics=['acc'])

history = model.fit(X_train_n, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 19.4 s


### 간단한 RNN 신경망 구현해보기

In [28]:
import tensorflow as tf
import keras
import numpy as np

In [35]:
for i in range(10):
    lst = list(range(i, i+5))
    print(lst)

[0, 1, 2, 3, 4]
[1, 2, 3, 4, 5]
[2, 3, 4, 5, 6]
[3, 4, 5, 6, 7]
[4, 5, 6, 7, 8]
[5, 6, 7, 8, 9]
[6, 7, 8, 9, 10]
[7, 8, 9, 10, 11]
[8, 9, 10, 11, 12]
[9, 10, 11, 12, 13]


In [36]:
X = []
Y = []

for i in range(10):
    lst = list( range(i, i+5) )
    X.append( [ [ c/10 ]  for c in lst ] ) # 입력
    Y.append( (i+5)/10 )                # 출력
    
X = np.array(X)
Y = np.array(Y)

print( X.shape, Y.shape )
print( X[0], Y[0])

(10, 5, 1) (10,)
[[0. ]
 [0.1]
 [0.2]
 [0.3]
 [0.4]] 0.5


In [37]:
for i in range(len(X)):
    print(X[i], Y[i])
print(X.shape, Y.shape)

[[0. ]
 [0.1]
 [0.2]
 [0.3]
 [0.4]] 0.5
[[0.1]
 [0.2]
 [0.3]
 [0.4]
 [0.5]] 0.6
[[0.2]
 [0.3]
 [0.4]
 [0.5]
 [0.6]] 0.7
[[0.3]
 [0.4]
 [0.5]
 [0.6]
 [0.7]] 0.8
[[0.4]
 [0.5]
 [0.6]
 [0.7]
 [0.8]] 0.9
[[0.5]
 [0.6]
 [0.7]
 [0.8]
 [0.9]] 1.0
[[0.6]
 [0.7]
 [0.8]
 [0.9]
 [1. ]] 1.1
[[0.7]
 [0.8]
 [0.9]
 [1. ]
 [1.1]] 1.2
[[0.8]
 [0.9]
 [1. ]
 [1.1]
 [1.2]] 1.3
[[0.9]
 [1. ]
 [1.1]
 [1.2]
 [1.3]] 1.4
(10, 5, 1) (10,)


In [42]:
model = tf.keras.Sequential( [
    tf.keras.layers.SimpleRNN(units=10, return_sequences=True, input_shape=[5,1]),
    tf.keras.layers.Dense(1)
])

In [43]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, 5, 10)             120       
_________________________________________________________________
dense_2 (Dense)              (None, 5, 1)              11        
Total params: 131
Trainable params: 131
Non-trainable params: 0
_________________________________________________________________


In [44]:
model.compile(optimizer='adam', loss='mse')
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, 5, 10)             120       
_________________________________________________________________
dense_2 (Dense)              (None, 5, 1)              11        
Total params: 131
Trainable params: 131
Non-trainable params: 0
_________________________________________________________________


In [45]:
model.fit(X, Y, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1e2f18b2610>

In [46]:
pred = model.predict(X)
pred

array([[[0.17564176],
        [0.28943437],
        [0.4432118 ],
        [0.53082097],
        [0.62173975]],

       [[0.23816256],
        [0.42067087],
        [0.6111151 ],
        [0.68228084],
        [0.74614877]],

       [[0.29971838],
        [0.5469744 ],
        [0.76468015],
        [0.81098646],
        [0.84392554]],

       [[0.35981113],
        [0.66716874],
        [0.90190244],
        [0.9177513 ],
        [0.9205146 ]],

       [[0.41798785],
        [0.78043604],
        [1.0223774 ],
        [1.0058389 ],
        [0.98249733]],

       [[0.47385272],
        [0.88628924],
        [1.1267792 ],
        [1.079233  ],
        [1.0352317 ]],

       [[0.5270754 ],
        [0.98452085],
        [1.2163645 ],
        [1.1415203 ],
        [1.0823399 ]],

       [[0.577395  ],
        [1.0751423 ],
        [1.2926161 ],
        [1.1954479 ],
        [1.125967  ]],

       [[0.62462074],
        [1.158325  ],
        [1.3570338 ],
        [1.2429218 ],
        [1.16720