# LSTM을 이용한 텍스트 생성 (이 노트북에서는 순환층으로 GRU를 사용)

- 데이터 다운로드 : 캐글 New York Times Comments

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

# Seed both NumPy's and TensorFlow's global random generators with the same value.
seed = 2021
np.random.seed(seed)
tf.random.set_seed(seed)

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

데이터 읽어오기

In [3]:
# Load the April 2017 articles CSV, decoding its bytes as Latin-1, and preview it.
# NOTE(review): headlines in the displayed output contain sequences such as
# 'â\x80\x99', which looks like UTF-8 text decoded as Latin-1 — confirm the
# file's actual encoding before relying on the raw text.
df = pd.read_csv('ArticlesApril2017.csv',encoding='latin1')
df.head()

Unnamed: 0,abstract,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,,58def1347c459f24986d7c80,716,By STEPHEN HILTNER and SUSAN LEHMAN,article,Finding an Expansive View of a Forgotten Peop...,"['Photography', 'New York Times', 'Niger', 'Fe...",3,Insider,2,2017-04-01 00:15:41,Unknown,One of the largest photo displays in Times his...,The New York Times,News,https://www.nytimes.com/2017/03/31/insider/nig...
1,,58def3237c459f24986d7c84,823,By GAIL COLLINS,article,"And Now, the Dreaded Trump Curse","['United States Politics and Government', 'Tru...",3,OpEd,23,2017-04-01 00:23:58,Unknown,Meet the gang from under the bus.,The New York Times,Op-Ed,https://www.nytimes.com/2017/03/31/opinion/and...
2,,58def9f57c459f24986d7c90,575,By THE EDITORIAL BOARD,article,Venezuelaâs Descent Into Dictatorship,"['Venezuela', 'Politics and Government', 'Madu...",3,Editorial,22,2017-04-01 00:53:06,Unknown,A court ruling annulling the legislatureâs a...,The New York Times,Editorial,https://www.nytimes.com/2017/03/31/opinion/ven...
3,,58defd317c459f24986d7c95,1374,By MICHAEL POWELL,article,Stain Permeates Basketball Blue Blood,"['Basketball (College)', 'University of North ...",3,Sports,1,2017-04-01 01:06:52,College Basketball,"For two decades, until 2013, North Carolina en...",The New York Times,News,https://www.nytimes.com/2017/03/31/sports/ncaa...
4,,58df09b77c459f24986d7ca7,708,By DEB AMLEN,article,Taking Things for Granted,['Crossword Puzzles'],3,Games,0,2017-04-01 02:00:14,Unknown,In which Howard Barkin and Will Shortz teach u...,The New York Times,News,https://www.nytimes.com/2017/03/31/crosswords/...


우리 관심 : 헤드라인

In [4]:
# Inspect the available column names.
df.columns

Index(['abstract', 'articleID', 'articleWordCount', 'byline', 'documentType',
       'headline', 'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')

### 데이터 전처리

In [5]:
# Only the headline column will be used; count its null values.
df.headline.isnull().sum()          # no null values

0

In [6]:
# Copy the headline column into a plain Python list and report how many there are.
headlines = list(df.headline.values)
len(headlines)

886

In [7]:
headlines[:5]

['Finding an Expansive View  of a Forgotten People in Niger',
 'And Now,  the Dreaded Trump Curse',
 'Venezuelaâ\x80\x99s Descent Into Dictatorship',
 'Stain Permeates Basketball Blue Blood',
 'Taking Things for Granted']

In [8]:
# Remove noise entries: headlines whose text is exactly "Unknown".
headlines = list(filter(lambda text: text != "Unknown", headlines))
len(headlines)

831

In [9]:
# Punctuation will be stripped and text lowercased; display the punctuation set used.
from string import punctuation
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
def preprocessing(s):
    """Normalize one headline string.

    Characters that cannot be represented in ASCII are dropped, every
    character listed in ``string.punctuation`` is removed, and the result
    is lowercased. Whitespace is left untouched, so removing a character
    between two spaces leaves both spaces in place.

    Args:
        s: The headline text (a ``str``).

    Returns:
        The cleaned, lowercased string.
    """
    ascii_only = s.encode('utf8').decode('ascii', 'ignore')
    # A translation table with only a "delete" set removes all punctuation in one pass.
    without_punct = ascii_only.translate(str.maketrans('', '', punctuation))
    return without_punct.lower()

In [11]:
# Clean every headline with preprocessing() and preview the first three.
headlines = list(map(preprocessing, headlines))
headlines[:3]

['finding an expansive view  of a forgotten people in niger',
 'and now  the dreaded trump curse',
 'venezuelas descent into dictatorship']

토큰화한 뒤, 가장 긴 시퀀스의 길이를 기준으로 패딩하고, 각 시퀀스의 마지막 단어를 y(레이블)로 분리하여 학습한다.

In [12]:
# Build the vocabulary from the cleaned headlines and check its size.
t = Tokenizer()
t.fit_on_texts(headlines)
# +1 — presumably because word indices start at 1, leaving index 0 for padding; confirm.
vocab_size = len(t.word_index) + 1
vocab_size

2422

In [13]:
# Build the training samples: every prefix of every integer-encoded headline.
# NOTE(review): prefixes start at length 1 (see [169] and [6] in the output),
# so once padded and split into context/target those samples have an all-zero
# context — confirm this is intended rather than starting at length 2.
sequences = []
for encoded in t.texts_to_sequences(headlines):
    sequences.extend(encoded[:end] for end in range(1, len(encoded) + 1))

sequences[:11]

[[169],
 [169, 17],
 [169, 17, 665],
 [169, 17, 665, 367],
 [169, 17, 665, 367, 4],
 [169, 17, 665, 367, 4, 2],
 [169, 17, 665, 367, 4, 2, 666],
 [169, 17, 665, 367, 4, 2, 666, 170],
 [169, 17, 665, 367, 4, 2, 666, 170, 5],
 [169, 17, 665, 367, 4, 2, 666, 170, 5, 667],
 [6]]

In [14]:
# Length of the longest prefix sequence; used as the padding width.
max_len = max(map(len, sequences))
max_len

19

In [15]:
# Pad every sequence to max_len; the displayed array shows the zeros on the left.
sequences = pad_sequences(sequences,maxlen=max_len, padding='pre')          # 'pre' is the default value
sequences

array([[   0,    0,    0, ...,    0,    0,  169],
       [   0,    0,    0, ...,    0,  169,   17],
       [   0,    0,    0, ...,  169,   17,  665],
       ...,
       [   0,    0,    0, ..., 2420,   57,  365],
       [   0,    0,    0, ...,   57,  365,   94],
       [   0,    0,    0, ...,  365,   94, 2421]])

In [16]:
# Split each padded sequence into its context (every token but the last)
# and its next-word target (the last token).
X = sequences[:, :-1]
y = sequences[:, -1]
# One-hot encode the targets; Y is therefore a much sparser matrix than X.
# num_classes is pinned to vocab_size so Y's width always equals the
# Dense(vocab_size) output layer. Without it the width is inferred as
# max(y) + 1, which only matches when the highest word index happens to
# occur as a target.
Y = to_categorical(y, num_classes=vocab_size)

X.shape, Y.shape

((5637, 18), (5637, 2422))

### 모델 정의 / 설정 / 학습
- Embedding
- GRU

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

In [18]:
# 10-dimensional embedding vectors, a GRU layer with 128 units, and a softmax
# layer over the whole vocabulary.
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_len-1))
model.add(GRU(128))
model.add(Dense(vocab_size, activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 18, 10)            24220     
_________________________________________________________________
gru (GRU)                    (None, 128)               53760     
_________________________________________________________________
dense (Dense)                (None, 2422)              312438    
Total params: 390,418
Trainable params: 390,418
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam', loss = 'categorical_crossentropy', metrics=['accuracy']
)

In [20]:
# Train on the full (X, Y) set for 200 epochs; the log below has one line per epoch.
history = model.fit(X,Y,epochs=200,verbose=2)

Epoch 1/200
177/177 - 2s - loss: 7.4269 - accuracy: 0.0424
Epoch 2/200
177/177 - 2s - loss: 6.9644 - accuracy: 0.0424
Epoch 3/200
177/177 - 2s - loss: 6.8487 - accuracy: 0.0429
Epoch 4/200
177/177 - 2s - loss: 6.7071 - accuracy: 0.0495
Epoch 5/200
177/177 - 2s - loss: 6.5110 - accuracy: 0.0593
Epoch 6/200
177/177 - 2s - loss: 6.2918 - accuracy: 0.0623
Epoch 7/200
177/177 - 2s - loss: 6.0583 - accuracy: 0.0656
Epoch 8/200
177/177 - 2s - loss: 5.8204 - accuracy: 0.0715
Epoch 9/200
177/177 - 2s - loss: 5.5702 - accuracy: 0.0797
Epoch 10/200
177/177 - 2s - loss: 5.3106 - accuracy: 0.0915
Epoch 11/200
177/177 - 2s - loss: 5.0554 - accuracy: 0.1043
Epoch 12/200
177/177 - 2s - loss: 4.8115 - accuracy: 0.1252
Epoch 13/200
177/177 - 2s - loss: 4.5687 - accuracy: 0.1517
Epoch 14/200
177/177 - 2s - loss: 4.3480 - accuracy: 0.1872
Epoch 15/200
177/177 - 2s - loss: 4.1354 - accuracy: 0.2233
Epoch 16/200
177/177 - 2s - loss: 3.9385 - accuracy: 0.2583
Epoch 17/200
177/177 - 2s - loss: 3.7587 - accura

### 모델 검증

In [21]:
from my_util import sentence_generation

In [22]:
# Generate one sample per seed word, passing 10 as the final argument each time.
for seed_word in ('i', 'epa', 'niger'):
    print(sentence_generation(model, t, max_len, seed_word, 10))

i survived a sarin gas attack the great box of china
epa the americans season 5 episode 6 recap im going home
niger the presidents generals of edition gorsuch benefit to end from
