<a href="https://colab.research.google.com/github/dhdbsrlw/Introduction-to-DL-for-NLP/blob/main/RNN_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import SimpleRNN, LSTM, Bidirectional

In [3]:
# Arbitrary input: 4 timesteps, each a 5-dimensional vector
train_X = [
    [0.1, 4.2, 1.5, 1.1, 2.8],
    [1.0, 3.1, 2.5, 0.7, 1.1],
    [0.3, 2.1, 1.5, 2.1, 0.1],
    [2.2, 1.4, 0.5, 0.9, 1.1],
]
print(np.shape(train_X))

# Promote the 2D data to a 3D float32 tensor by wrapping it in a leading
# batch axis of size 1
train_X = np.array([train_X], dtype=np.float32)

# The shape corresponds to (batch_size, timesteps, input_dim), where
# batch_size is the amount of data the RNN processes at once
print(train_X.shape)

(4, 5)
(1, 4, 5)


In [4]:
# SimpleRNN layer with 3 hidden units. return_sequences=True makes the layer
# return the hidden state of every timestep instead of only the last one.
rnn = SimpleRNN(3, return_sequences=True)
hidden_states = rnn(train_X)

# Show the per-timestep hidden states together with their shape
print('hidden states: {}, shape: {}'.format(hidden_states, hidden_states.shape))

hidden states: [[[ 0.47286037  0.9873893   0.9989129 ]
  [-0.12311106  0.98774064  0.99569094]
  [-0.9446929   0.9907793   0.9955858 ]
  [-0.6548631   0.9820767   0.9758106 ]]], shape: (1, 4, 3)


# LSTM

In [6]:
# LSTM layer with 3 units. return_sequences=True yields the hidden state of
# every timestep; return_state=True additionally returns the final hidden
# state and the final cell state.
lstm = LSTM(3, return_sequences=True, return_state=True)
hidden_state, last_state, last_cell_state = lstm(train_X)

# Print all three outputs along with their shapes
print('hidden state : {}, shape: {}'.format(hidden_state, hidden_state.shape))
print('last hidden state : {}, shape: {}'.format(last_state, last_state.shape))
print('last cell state : {}, shape: {}'.format(last_cell_state, last_cell_state.shape))

hidden state : [[[0.31004924 0.09567745 0.13988107]
  [0.53191364 0.26317057 0.33637616]
  [0.3652793  0.15274005 0.36170208]
  [0.5129372  0.22683617 0.3951583 ]]], shape: (1, 4, 3)
last hidden state : [[0.5129372  0.22683617 0.3951583 ]], shape: (1, 3)
last cell state : [[0.7573959  0.40783477 2.1225896 ]], shape: (1, 3)


In [7]:
# Bidirectional LSTM

# Use constant initializers so the printed hidden-state values are fixed
k_init = tf.keras.initializers.Constant(value=0.1)
b_init = tf.keras.initializers.Constant(value=0)
r_init = tf.keras.initializers.Constant(value=0.1)

# return_sequences=False: only the final hidden state of each direction is
# returned; return_state=True also exposes each direction's hidden/cell state.
bilstm = Bidirectional(LSTM(3, return_sequences=False, return_state=True,
                            kernel_initializer=k_init, bias_initializer=b_init, recurrent_initializer=r_init))

# The call returns five tensors: the layer output followed by the forward
# hidden/cell states and the backward hidden/cell states.
hidden_states, forward_h, forward_c, backward_h, backward_c = bilstm(train_X)
print('hidden states : {}, shape: {}'.format(hidden_states, hidden_states.shape))
print('forward state : {}, shape: {}'.format(forward_h, forward_h.shape))
print('backward state : {}, shape: {}'.format(backward_h, backward_h.shape))

hidden states : [[0.6303138 0.6303138 0.6303138 0.7038734 0.7038734 0.7038734]], shape: (1, 6)
forward state : [[0.6303138 0.6303138 0.6303138]], shape: (1, 3)
backward state : [[0.7038734 0.7038734 0.7038734]], shape: (1, 3)


# RNN을 이용한 텍스트 생성기

In [8]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [9]:
# Example corpus: three short Korean sentences.
# NOTE: every line ends with an escaped "\n" followed by a real line break,
# so splitting this string on '\n' also produces empty strings.
text = """경마장에 있는 말이 뛰고 있다\n
그의 말이 법이다\n
가는 말이 고와야 오는 말이 곱다\n"""

In [11]:
# Fit a tokenizer on the corpus to build the word -> integer index mapping
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
vocab_size = len(tokenizer.word_index) + 1 # +1 to account for the padding value
print('단어 집합의 크기: %d' % vocab_size)
print(tokenizer.word_index)

단어 집합의 크기: 12
{'말이': 1, '경마장에': 2, '있는': 3, '뛰고': 4, '있다': 5, '그의': 6, '법이다': 7, '가는': 8, '고와야': 9, '오는': 10, '곱다': 11}


In [19]:
# Build the training data

# Convert each line of the corpus into a list of integer token ids, then emit
# every prefix of length >= 2 as one training sample.
encoded_lines = tokenizer.texts_to_sequences(text.split('\n'))
sequences = [
    encoded[:end]
    for encoded in encoded_lines
    for end in range(2, len(encoded) + 1)
]

print('학습에 사용할 샘플의 개수: %d' % len(sequences))
print(sequences)


# Padding: left-pad every sample to the length of the longest one
max_len = max(map(len, sequences))
print('샘플의 최대 길이: {}'.format(max_len))
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')
print(sequences)

# Split off the last word of each sample as its label (next-word prediction,
# since this is a generation task)
sequences = np.array(sequences)
X, Y = sequences[:, :-1], sequences[:, -1]

# One-hot encode the labels
y = to_categorical(Y, num_classes=vocab_size)
print(y)

학습에 사용할 샘플의 개수: 11
[[2, 3], [2, 3, 1], [2, 3, 1, 4], [2, 3, 1, 4, 5], [6, 1], [6, 1, 7], [8, 1], [8, 1, 9], [8, 1, 9, 10], [8, 1, 9, 10, 1], [8, 1, 9, 10, 1, 11]]
샘플의 최대 길이: 6
[[ 0  0  0  0  2  3]
 [ 0  0  0  2  3  1]
 [ 0  0  2  3  1  4]
 [ 0  2  3  1  4  5]
 [ 0  0  0  0  6  1]
 [ 0  0  0  6  1  7]
 [ 0  0  0  0  8  1]
 [ 0  0  0  8  1  9]
 [ 0  0  8  1  9 10]
 [ 0  8  1  9 10  1]
 [ 8  1  9 10  1 11]]
[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [21]:
# Model design

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN

# Model characteristics
#
# - Hyperparameters: embedding vector dimension 10, hidden-state size 32.
# - Many-to-one RNN, i.e. a multi-class classification problem.
# - Accordingly uses a softmax activation and a cross-entropy loss.
# - Trained for 200 epochs.

embedding_dim = 10
hidden_units = 32

model = Sequential([
    Embedding(vocab_size, embedding_dim),
    SimpleRNN(hidden_units),
    Dense(vocab_size, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=200, verbose=2)


Epoch 1/200
1/1 - 2s - loss: 2.4818 - accuracy: 0.0909 - 2s/epoch - 2s/step
Epoch 2/200
1/1 - 0s - loss: 2.4689 - accuracy: 0.0909 - 9ms/epoch - 9ms/step
Epoch 3/200
1/1 - 0s - loss: 2.4558 - accuracy: 0.0909 - 8ms/epoch - 8ms/step
Epoch 4/200
1/1 - 0s - loss: 2.4425 - accuracy: 0.1818 - 8ms/epoch - 8ms/step
Epoch 5/200
1/1 - 0s - loss: 2.4288 - accuracy: 0.1818 - 9ms/epoch - 9ms/step
Epoch 6/200
1/1 - 0s - loss: 2.4148 - accuracy: 0.1818 - 9ms/epoch - 9ms/step
Epoch 7/200
1/1 - 0s - loss: 2.4002 - accuracy: 0.1818 - 9ms/epoch - 9ms/step
Epoch 8/200
1/1 - 0s - loss: 2.3852 - accuracy: 0.1818 - 8ms/epoch - 8ms/step
Epoch 9/200
1/1 - 0s - loss: 2.3695 - accuracy: 0.1818 - 9ms/epoch - 9ms/step
Epoch 10/200
1/1 - 0s - loss: 2.3532 - accuracy: 0.0909 - 8ms/epoch - 8ms/step
Epoch 11/200
1/1 - 0s - loss: 2.3362 - accuracy: 0.1818 - 9ms/epoch - 9ms/step
Epoch 12/200
1/1 - 0s - loss: 2.3185 - accuracy: 0.1818 - 9ms/epoch - 9ms/step
Epoch 13/200
1/1 - 0s - loss: 2.3001 - accuracy: 0.3636 - 8ms/e

<keras.callbacks.History at 0x7f3862415870>

In [23]:
# Sentence-generation helper used to evaluate the model
def sent_generation(model, tokenizer, current_word, n, max_len=5):
    """Generate up to ``n`` words that follow ``current_word``.

    At every step the text generated so far is integer-encoded, pre-padded to
    ``max_len`` tokens, fed to ``model`` and the highest-scoring vocabulary
    index is appended as the next word.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            scores per input sequence.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index`` (word -> integer index).
        current_word: Seed text the generated sentence starts with.
        n: Maximum number of words to generate.
        max_len: Length the encoded context is pre-padded (or truncated) to
            before prediction. Defaults to 5, the value previously hard-coded
            here; it should match the input length the model was trained on.

    Returns:
        The seed text followed by the generated words, separated by single
        spaces. Fewer than ``n`` words are produced if the model predicts an
        index that has no word in the vocabulary (e.g. the padding index 0).
    """
    # Invert the vocabulary once instead of scanning it on every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    context = current_word
    generated = []

    for _ in range(n):
        # Integer-encode the current context and pad it to the model's input length
        encoded = tokenizer.texts_to_sequences([context])[0]
        encoded = pad_sequences([encoded], maxlen=max_len, padding='pre')

        # Predict the next word; take a plain int so the lookup does not rely
        # on comparing an int against a NumPy array
        scores = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(scores, axis=1)[0])

        word = index_to_word.get(predicted_index)
        if word is None:
            # No vocabulary word for this index. Previously an arbitrary word
            # left over from the lookup loop was appended; stop instead, since
            # the unchanged context would yield the same prediction again.
            break

        generated.append(word)
        context = context + ' ' + word

    return ' '.join([current_word] + generated)

In [24]:
# Experiment: generate 4 words following the seed word
print(sent_generation(model, tokenizer, '경마장에', 4))

경마장에 있는 말이 뛰고 있다


# LSTM을 이용한 텍스트 생성기

In [2]:
import pandas as pd
import numpy as np
from string import punctuation

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


# Load the article metadata; the CSV path is relative to the working directory
df = pd.read_csv('ArticlesApril2018.csv')
# Preview the first rows of the DataFrame
df.head()

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,What’s it like to eat at the second incarnatio...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...
3,5adf40d2068401528a2aa619,626,By JULIE HIRSCHFELD DAVIS and PETER BAKER,article,Unknown,"['Macron, Emmanuel (1977- )', 'Trump, Donald J...",68,Washington,0,2018-04-24 14:35:57,Europe,President Trump welcomed President Emmanuel Ma...,The New York Times,News,https://www.nytimes.com/2018/04/24/world/europ...
4,5adf3d64068401528a2aa60f,815,By IAN AUSTEN and DAN BILEFSKY,article,Unknown,"['Toronto, Ontario, Attack (April, 2018)', 'Mu...",68,Foreign,0,2018-04-24 14:21:21,Canada,"Alek Minassian, 25, a resident of Toronto’s Ri...",The New York Times,News,https://www.nytimes.com/2018/04/24/world/canad...


In [5]:
# Print the number of columns and the column names
print('열의 개수: ', len(df.columns))
print(df.columns)

# Check whether the headline column contains any NULL values
print(df['headline'].isnull().values.any())

열의 개수:  15
Index(['articleID', 'articleWordCount', 'byline', 'documentType', 'headline',
       'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')
False


In [8]:
# Store the values of the headline column as a plain Python list
headline = list(df.headline.values)

# Remove noise entries (headlines recorded as "Unknown"), reporting the
# sample count before and after
print('노이즈 제거 전 샘플의 개수 : {}'.format(len(headline)))
headline = [title for title in headline if title != "Unknown"]
print('노이즈 제거 후 샘플의 개수 : {}'.format(len(headline)))
# Data preprocessing (remove punctuation and lowercase every word)
def repreprocessing(raw_sentence):
    """Return ``raw_sentence`` as lowercase ASCII text without punctuation.

    Non-ASCII characters are dropped first, then every character listed in
    ``string.punctuation`` is removed, and finally the text is lowercased.
    """
    ascii_only = raw_sentence.encode("utf8").decode("ascii", 'ignore')
    # Delete all punctuation characters in a single pass
    without_punctuation = ascii_only.translate(str.maketrans('', '', punctuation))
    return without_punctuation.lower()

# Apply the cleaning function to every headline, then preview the first five
preprocessed_headline = [repreprocessing(x) for x in headline]
preprocessed_headline[:5]

노이즈 제거 전 샘플의 개수 : 1324
노이즈 제거 후 샘플의 개수 : 1214


['former nfl cheerleaders settlement offer 1 and a meeting with goodell',
 'epa to unveil a new rule its effect less science in policymaking',
 'the new noma explained',
 'how a bag of texas dirt  became a times tradition',
 'is school a place for selfexpression']

In [12]:
# Build the vocabulary

tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_headline)
vocab_size = 1 + len(tokenizer.word_index)
print('단어 집합의 크기 : %d' % vocab_size)

# Integer-encode every headline and compose the training data: each prefix of
# length >= 2 of an encoded headline becomes one sample
sequences = [
    encoded[:end]
    for encoded in tokenizer.texts_to_sequences(preprocessed_headline)
    for end in range(2, len(encoded) + 1)
]

sequences[:11]

단어 집합의 크기 : 3494


[[99, 269],
 [99, 269, 371],
 [99, 269, 371, 1115],
 [99, 269, 371, 1115, 582],
 [99, 269, 371, 1115, 582, 52],
 [99, 269, 371, 1115, 582, 52, 7],
 [99, 269, 371, 1115, 582, 52, 7, 2],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116],
 [100, 3]]

In [16]:
# Word index: reverse lookup table used to turn an integer index back into its word
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

print('빈도수 상위 582번째 단어 : {}'.format(index_to_word[582]))

# Padding: left-pad every sample to the length of the longest one
max_len = max(map(len, sequences))
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')
print(sequences[:3])

# Separate the right-most word of each sample as its label
sequences = np.array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]

# One-hot encode the labels
y = to_categorical(y, num_classes=vocab_size)

빈도수 상위 582번째 단어 : offer
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0   99  269]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0   99  269  371]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0   99  269  371 1115]]


In [18]:
# Model design
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM

# - Hyperparameters: embedding vector dimension 10, hidden-state size 128.
# - A many-to-one LSTM is used.
# - The output layer is a fully connected layer with as many neurons as the
#   vocabulary size.
# - At the last timestep the model predicts one word out of all possible
#   words, i.e. it solves a multi-class classification problem.
# - Multi-class classification calls for softmax regression at the output
#   layer, so the activation is softmax and the loss is cross-entropy;
#   training runs for 200 epochs.

embedding_dim = 10
hidden_units = 128

model = Sequential([
    Embedding(vocab_size, embedding_dim),
    LSTM(hidden_units),
    Dense(vocab_size, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=200, verbose=2)


Epoch 1/200
244/244 - 14s - loss: 7.6356 - accuracy: 0.0285 - 14s/epoch - 56ms/step
Epoch 2/200
244/244 - 10s - loss: 7.1083 - accuracy: 0.0302 - 10s/epoch - 40ms/step
Epoch 3/200
244/244 - 10s - loss: 6.9722 - accuracy: 0.0356 - 10s/epoch - 43ms/step
Epoch 4/200
244/244 - 12s - loss: 6.8438 - accuracy: 0.0406 - 12s/epoch - 50ms/step
Epoch 5/200
244/244 - 11s - loss: 6.6921 - accuracy: 0.0438 - 11s/epoch - 44ms/step
Epoch 6/200
244/244 - 11s - loss: 6.5216 - accuracy: 0.0486 - 11s/epoch - 44ms/step
Epoch 7/200
244/244 - 9s - loss: 6.3305 - accuracy: 0.0510 - 9s/epoch - 39ms/step
Epoch 8/200
244/244 - 11s - loss: 6.1325 - accuracy: 0.0575 - 11s/epoch - 43ms/step
Epoch 9/200
244/244 - 11s - loss: 5.9406 - accuracy: 0.0604 - 11s/epoch - 44ms/step
Epoch 10/200
244/244 - 11s - loss: 5.7544 - accuracy: 0.0657 - 11s/epoch - 44ms/step
Epoch 11/200
244/244 - 11s - loss: 5.5831 - accuracy: 0.0714 - 11s/epoch - 44ms/step
Epoch 12/200
244/244 - 10s - loss: 5.4196 - accuracy: 0.0775 - 10s/epoch - 4

<keras.callbacks.History at 0x7fa1e00da3e0>