# LSTM vs GRU

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

seed = 2021
np.random.seed(seed)
tf.random.set_seed(seed)

- 전처리

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [3]:
df = pd.read_csv('ArticlesApril2017.csv',encoding='latin1')
df.head()

Unnamed: 0,abstract,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,,58def1347c459f24986d7c80,716,By STEPHEN HILTNER and SUSAN LEHMAN,article,Finding an Expansive View of a Forgotten Peop...,"['Photography', 'New York Times', 'Niger', 'Fe...",3,Insider,2,2017-04-01 00:15:41,Unknown,One of the largest photo displays in Times his...,The New York Times,News,https://www.nytimes.com/2017/03/31/insider/nig...
1,,58def3237c459f24986d7c84,823,By GAIL COLLINS,article,"And Now, the Dreaded Trump Curse","['United States Politics and Government', 'Tru...",3,OpEd,23,2017-04-01 00:23:58,Unknown,Meet the gang from under the bus.,The New York Times,Op-Ed,https://www.nytimes.com/2017/03/31/opinion/and...
2,,58def9f57c459f24986d7c90,575,By THE EDITORIAL BOARD,article,Venezuelaâs Descent Into Dictatorship,"['Venezuela', 'Politics and Government', 'Madu...",3,Editorial,22,2017-04-01 00:53:06,Unknown,A court ruling annulling the legislatureâs a...,The New York Times,Editorial,https://www.nytimes.com/2017/03/31/opinion/ven...
3,,58defd317c459f24986d7c95,1374,By MICHAEL POWELL,article,Stain Permeates Basketball Blue Blood,"['Basketball (College)', 'University of North ...",3,Sports,1,2017-04-01 01:06:52,College Basketball,"For two decades, until 2013, North Carolina en...",The New York Times,News,https://www.nytimes.com/2017/03/31/sports/ncaa...
4,,58df09b77c459f24986d7ca7,708,By DEB AMLEN,article,Taking Things for Granted,['Crossword Puzzles'],3,Games,0,2017-04-01 02:00:14,Unknown,In which Howard Barkin and Will Shortz teach u...,The New York Times,News,https://www.nytimes.com/2017/03/31/crosswords/...


In [4]:
headlines = [text for text in df.headline.values]
len(headlines)

886

In [5]:
# 노이즈 데이터 제거 - Unknown의 경우 
headlines = [text for text in headlines if text != "Unknown"]
len(headlines)

831

In [6]:
# 구둣점 제거, 소문자로 변환
from string import punctuation
def preprocessing(s):
    s = s.encode('utf8').decode('ascii','ignore')
    return ''.join(c for c in s if c not in punctuation).lower()

In [7]:
headlines = [preprocessing(s) for s in headlines]
headlines[:3]

['finding an expansive view  of a forgotten people in niger',
 'and now  the dreaded trump curse',
 'venezuelas descent into dictatorship']

In [8]:
t = Tokenizer()
t.fit_on_texts(headlines)
vocab_size = len(t.word_index) + 1
vocab_size

2422

In [9]:
# 학습에 사용될 샘플 시권스 
sequences = []
for line in headlines:
    encoded = t.texts_to_sequences([line])[0]
    for i in range(len(encoded)):
        sequence = encoded[:i+1]
        sequences.append(sequence)
sequences[:11]

[[169],
 [169, 17],
 [169, 17, 665],
 [169, 17, 665, 367],
 [169, 17, 665, 367, 4],
 [169, 17, 665, 367, 4, 2],
 [169, 17, 665, 367, 4, 2, 666],
 [169, 17, 665, 367, 4, 2, 666, 170],
 [169, 17, 665, 367, 4, 2, 666, 170, 5],
 [169, 17, 665, 367, 4, 2, 666, 170, 5, 667],
 [6]]

In [10]:
max_len = max(len(s) for s in sequences)
max_len

19

In [11]:
sequences = pad_sequences(sequences,maxlen=max_len, padding='pre')          # pre가 디폴트값
sequences

array([[   0,    0,    0, ...,    0,    0,  169],
       [   0,    0,    0, ...,    0,  169,   17],
       [   0,    0,    0, ...,  169,   17,  665],
       ...,
       [   0,    0,    0, ..., 2420,   57,  365],
       [   0,    0,    0, ...,   57,  365,   94],
       [   0,    0,    0, ...,  365,   94, 2421]])

In [12]:
X = sequences[:,:-1]
y = sequences[:,-1]
Y = to_categorical(y)

X.shape, Y.shape

((5637, 18), (5637, 2422))

### 모델 생성

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GRU

from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping

In [18]:
lstm_checkpointer = ModelCheckpoint(
    'model/best_lstm.h5', monitor='accuracy',verbose=0, save_best_only=True
)
gru_checkpointer = ModelCheckpoint(
    'model/best_gru.h5', monitor='accuracy',verbose=0, save_best_only=True
)

early_stopping = EarlyStopping(monitor = 'accuracy', verbose=0,patience=20)

In [19]:
def LSTM_model(n_embed,n_node=128):
    model = Sequential([
        Embedding(vocab_size,n_embed,input_length=max_len-1),
        LSTM(n_node),
        Dense(vocab_size, activation='softmax')
    ])
    model.compile(
        optimizer='adam', loss = 'categorical_crossentropy',metrics=['accuracy']
    )
    history = model.fit(X,Y,epochs=200,verbose=0,callbacks=[lstm_checkpointer,early_stopping])
    print("LSTM")
    print(f'n_embed = {n_embed}와 n_node = {n_node}의 정확도는 {history.history["accuracy"][-1]:4f}')


In [20]:
def GRU_model(n_embed,n_node=128):
    model = Sequential([
        Embedding(vocab_size,n_embed,input_length=max_len-1),
        GRU(n_node),
        Dense(vocab_size, activation='softmax')
    ])
    model.compile(
        optimizer='adam', loss = 'categorical_crossentropy',metrics=['accuracy']
    )
    history = model.fit(X,Y,epochs=200,verbose=0,callbacks=[gru_checkpointer,early_stopping])
    print("GRU")
    print(f'n_embed = {n_embed}와 n_node = {n_node}의 정확도는 {history.history["accuracy"][-1]:4f}')


### 모델 학습

In [21]:
for n_embed in [8,10,12]:
    LSTM_model(n_embed)
    GRU_model(n_embed)

LSTM
n_embed = 8와 n_node = 128의 정확도는 0.787298
GRU
n_embed = 8와 n_node = 128의 정확도는 0.789250
LSTM
n_embed = 10와 n_node = 128의 정확도는 0.789604
GRU
n_embed = 10와 n_node = 128의 정확도는 0.787121
LSTM
n_embed = 12와 n_node = 128의 정확도는 0.789959
GRU
n_embed = 12와 n_node = 128의 정확도는 0.785169
