# LSTM으로 텍스트 생성하기
## 글자 수준의 LSTM 텍스트 생성 모델 구현
* [dataset](https://s3.amazonaws.com/text-datasets/nietzsche.txt) : 19세기 후반 독일의 철학자 니체의 글을 사용하겠음(영어로 번역된 글임)
* 학습할 언어 모델은 일반적인 영어 모델이 아니라 니체의 문체와 특정 주제를 따르는 모델일 것임

## 데이터 전처리
* 먼저 말뭉치를 다운로드하고 소문자로 바꿈

In [1]:
import keras
import numpy as np

# path = keras.utils.get_file(
#     'nietzsche.txt',
#     origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')

path = './nietzsche.txt'
text = open(path).read().lower()
print('말뭉치 크기:', len(text))

Using TensorFlow backend.


말뭉치 크기: 600893


In [2]:
type(text)

str

* `maxlen` 길이를 가진 시퀀스를 중복하여 추출함
* 추출된 시퀀스를 원-핫 인코딩으로 변환하고 크기가 `(sequences, maxlen, unique_characters)`인 3D 넘파이 배열 `x`로 합침
* 동시에 훈련 샘플에 상응하는 타깃을 담은 배열 `y`를 준비함. 
* 타깃은 추출된 시퀀스 다음에 오는 원-핫 인코딩된 글자임

In [3]:
# 60개 글자로 된 시퀀스를 추출합니다.
maxlen = 60

# 세 글자씩 건너 뛰면서 새로운 시퀀스를 샘플링합니다.
step = 3

# 추출한 시퀀스를 담을 리스트
sentences = []

# 타깃(시퀀스 다음 글자)을 담을 리스트
next_chars = []

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('시퀀스 개수:', len(sentences))

시퀀스 개수: 200278


In [14]:
sentences[:2]

['preface\n\n\nsupposing that truth is a woman--what then? is the',
 'face\n\n\nsupposing that truth is a woman--what then? is there ']

In [4]:
# 말뭉치에서 고유한 글자를 담은 리스트
chars = sorted(list(set(text)))
print('고유한 글자:', len(chars))

# chars 리스트에 있는 글자와 글자의 인덱스를 매핑한 딕셔너리
char_indices = dict((char, chars.index(char)) for char in chars)

고유한 글자: 58


In [10]:
[char_indices.items()]

[dict_items([('\n', 0), (' ', 1), ('!', 2), ('"', 3), ("'", 4), ('(', 5), (')', 6), (',', 7), ('-', 8), ('.', 9), ('0', 10), ('1', 11), ('2', 12), ('3', 13), ('4', 14), ('5', 15), ('6', 16), ('7', 17), ('8', 18), ('9', 19), (':', 20), (';', 21), ('=', 22), ('?', 23), ('[', 24), (']', 25), ('_', 26), ('a', 27), ('b', 28), ('c', 29), ('d', 30), ('e', 31), ('f', 32), ('g', 33), ('h', 34), ('i', 35), ('j', 36), ('k', 37), ('l', 38), ('m', 39), ('n', 40), ('o', 41), ('p', 42), ('q', 43), ('r', 44), ('s', 45), ('t', 46), ('u', 47), ('v', 48), ('w', 49), ('x', 50), ('y', 51), ('z', 52), ('채', 53), ('챈', 54), ('챕', 55), ('챘', 56), ('횈', 57)])]

In [None]:
# 글자를 원-핫 인코딩하여 0과 1의 이진 배열로 바꿉니다.
print('벡터화...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

In [16]:
print(x.shape, y.shape)

(200278, 60, 58) (200278, 58)


## 네트워크 구성
* 네트워크는 하나의 `LSTM` 층과 그 뒤에 `Dense` 분류기가 뒤따름
* 분류기는 가능한 모든 글자에 대한 소프트맥스 출력을 만듬
* 순환 신경망이 시퀀스 데이터를 생성하는 유일한 방법은 아님. 최근에는 1D 컨브넷도 이런 작업에 아주 잘 들어 맞는다는 것이 밝혀짐

In [17]:
from keras import layers

model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(maxlen, len(chars))))  # input_shape:(60, 58)
model.add(layers.Dense(len(chars), activation='softmax'))



















In [18]:
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)













## 언어 모델 훈련과 샘플링

훈련된 모델과 시드로 쓰일 간단한 텍스트가 주어지면 다음과 같이 반복하여 새로운 텍스트를 생성할 수 있음

1.	지금까지 생성된 텍스트를 주입하여 모델에서 다음 글자에 대한 확률 분포를 뽑는다.
2.	특정 온도로 이 확률 분포의 가중치를 조정함
3.	가중치가 조정된 분포에서 무작위로 새로운 글자를 샘플링함
4.	새로운 글자를 생성된 텍스트의 끝에 추가함

* 샘플링 함수 : 모델에서 나온 원본 확률 분포의 가중치를 조정하고 새로운 글자의 인덱스를 추출함

In [97]:
# preds : (58,)
# temperature : 출력 분포의 엔트로피 양을 결정함
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)      # (58,)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [102]:
# for test by SOO
if 0:
    t_text = text[10: 10 + maxlen]    # maxlen:60
    print('seed_text:', t_text)
    t_sampled = np.zeros((1, maxlen, len(chars)))  # (1, 60, 58)
    for t, char in enumerate(t_text):
        t_sampled[0, t, char_indices[char]] = 1.
    # 다음 글자를 샘플링합니다
    t_preds = model.predict(t_sampled, verbose=0)[0]  # t_preds:(58,)
    print('-->t_preds:', t_preds)
    t_next_index = sample(t_preds, 0.2)  # 0.2 -> 'u', 0.5 -> 'u', 1.0 -> 'u' or 'm' or 'a', 1.2 -> 'a'
    t_next_char = chars[t_next_index]
    print('-->t_next_char:', t_next_char)

seed_text: supposing that truth is a woman--what then? is there not gro
-->t_preds: [2.04661768e-03 4.21298342e-03 1.58849753e-06 2.66286370e-05
 1.19353444e-06 1.31159453e-07 1.08049710e-06 3.41411127e-04
 1.34028975e-04 2.43176677e-04 3.39477054e-07 5.88920739e-05
 1.82197982e-05 3.38264022e-06 4.55037298e-06 6.74780551e-07
 1.25729423e-06 1.85078297e-05 2.92423210e-05 1.11045865e-05
 2.99066642e-06 6.37116443e-07 9.32267383e-08 1.33383378e-06
 4.34635439e-08 4.95219126e-07 4.18969591e-07 7.19812000e-03
 3.92625993e-03 3.39007005e-03 1.02746196e-01 3.07175089e-02
 6.44316152e-03 2.98748794e-03 6.77342177e-04 1.06499549e-02
 2.27596001e-06 1.22093747e-03 6.50091609e-03 9.71132368e-02
 6.24694452e-02 1.88762516e-01 4.29162942e-03 1.38727528e-05
 4.82208878e-02 5.57938963e-02 3.50911096e-02 2.19881117e-01
 5.03178686e-02 4.86321226e-02 5.46904957e-05 5.73320966e-03
 2.66457710e-06 9.65281046e-08 1.08385485e-07 1.10744757e-07
 1.38341605e-07 1.04095619e-07]
-->t_next_char: u


In [103]:
import random
import sys

random.seed(42)
start_index = random.randint(0, len(text) - maxlen - 1)

# 60 에포크 동안 모델을 훈련합니다
for epoch in range(1, 60):
    print('에포크', epoch)
    # 데이터에서 한 번만 반복해서 모델을 학습합니다
    model.fit(x, y, batch_size=128, epochs=1)

    # 무작위로 시드 텍스트를 선택합니다
    seed_text = text[start_index: start_index + maxlen]
    print('--- 시드 텍스트: "' + seed_text + '"')

    # 여러가지 샘플링 온도를 시도합니다
    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print('------ 온도:', temperature)
        generated_text = seed_text
        sys.stdout.write(generated_text)

        # 시드 텍스트에서 시작해서 400개의 글자를 생성합니다
        for i in range(400):
            # 지금까지 생성된 글자를 원-핫 인코딩으로 바꿉니다
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1.

            # 다음 글자를 샘플링합니다
            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]

            generated_text += next_char
            generated_text = generated_text[1:]

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

에포크 1
Epoch 1/1
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through for the spense of the speak and some more in the suffering to the conscious and prose of the stranger and all the spiritually to the discorring and the speak the present to the speak of the spiritually and the spense of the discorring and all the stranger to the spense of the spiritually to the constitute to the stronger and of the discornation of the constitute that it is all the suffering to the s
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for for the discornity of the sto presself to the moral and peaconing and self-constitional to the fundamentally
dound in the compresses of him to the dound the work still at refarred allist in some discornated and at himself appers to the will to the present, and in all the stronger of the dors of the discordentely, and profound and called to the very repro

problemsi-caliter,pahy and ainapears, takestyment of authoning: in like i gentcence, his predice views--what?
pros,
was abrarisys--i
new sprinating? whatid his heartous highity of
ypart and
"willurk-dod christioran impulse, or intreer ove more countrusings, though doe
에포크 9
Epoch 1/1
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through for the soul, and in the same the sense of the sense of the soul, and the same of the same of the same and conscience of the superities of the such a morality of the morality, and the sense of the sense of the soul, and in the sense of the same and the souls of the sentrant of the sentrant of the morality of the such a more in the sense of the constantly and soul, the conscience of the sense of the 
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for the demand to disermintation, and in the same of the same and in the liberty of the com

  """


her work, at she centuriduall to the lictlerager successty, and therefore, propents, and carncautions
id account should eow thils, the wiles when actions in morals by mens fast"s: "the effect a heart in every has not to things natural therebals, but may make that ronggure, f
------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through forward--the macifte the suw.-nother vicersitive once. "statiling.,  through the most grandim and threle eeperious sin  i woman of boldarness; 
son with
thet!--man, such diriny, "perhaps?kv. thereof--un, the grest mose and day which
ethe
precisely mus
necessary, how eurofy where--it see the viins a delicate than consulenth average were it is souspess,
youc on
yourldarby once german to which changers
에포크 10
Epoch 1/1
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through for the spirit and spirit and life and faith of a man of the present to the spirit 

deticion of the great the individual souls to t
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through for his most religies to great a prijunilate this dectah!"--to be even fellowss. it is at "god; and something
ought, the , as great charact on what voluntarily concturion in the
doaw"
our to the man of occhebes, it count as peopld us, sholls in theerst with man yet. how autt this dreadful stright
in
its banglings
and re's egnish at as
"gymman a secies the
"christianity--and "feels that which timin a
------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through for a backced,"
in
anying thir higher have generation--and
becausalss simikled,h--the work of this racely. in his half
. eurment a anlat!" heads, re"marablance and
mor, sul, and not be conscience"; pre"aration sid with but-""rephim of valunile origir, now they elability, and his fidace carefuner, the noblives"
fiftimately, to age anythinging,
plastt,
how rather: but it is enommens, . about contsent,


through for sense and beings and fire many capadarly, and almost a desire to have fond the person has the precisely wind something be has account and presence of the common--it is a bount of the ears, and no galving that they makes something itself with regards there is a human needs--but the contain before as the constraint, and also makes of the sense
that when i a hours that the best on the same sense an
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through for more unperacremation--side in does reffiecte
painful of secentfulian with late
a later rupanding as very
or does his here
formy his desectionationhece, there perhaps--whether, are the as,istion
and gregmand and 
only as loes when the become absguind frapted of
brainges in most feed as.

1r챕all and contrary and to trandling, lightrionardwite enddem instupidial
soul, are
beings his claimes a hastt
------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through forcignly, name ourse inborderhah--if th

--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through for instance, it is a person who sees higher and attain and possess of the spirit to and pricity, and in the present of the spirit to the comparious the compalison of the senses of the spirit to and comparistic and the comparing of the present the senses of the senses and desire to the sense of the senses of the senses of the senses of the self-instoold also the spirits of the senses and possible to
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through fore the stronger before the word and all the strict on being faculty and the neighbour of the spirits of the senses, who has kion pertance that the taste, and
something that a more then the case and nature of the reason at the germany similation of the spirit, and something modest all, and for the senses of elevate the self-reading and the stronger, the last exelcent and c

doo reeatriesoly onefareneel, wantenunevetcarest suffibable, altt, he tolenor iw s thit  hab6d nean a risechismocter donaliat of que that p"facurk, sucche reared, teecull mespet
cegainsly
1wanwerinev, theis hiso aityment iseaste frealitill
wich i itmertisrily and ,                      6        횈e챕]     챈ohex7hetonono횈d aggated, tha chatestat geme o thi
에포크 41
Epoch 1/1
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through fo챕 a챈s [n[

w 
t ti 2lusi  reite e챈6 ae
printis not an hiep   ao 챈stofracio a dore a ga챕 thaling챈sst  we preithit채채 and ti5 te lore to tallemco achism채 andititm챈 anourt5. ae8one ae. the sain 1 dall and ]lemishall bithejpelenc as in
a centi wing rereri] and oree4 th5 prih 챈s t             oo6kra0s_ to nofrionnat챈,챈s, a centemin andiso  sororler in[챕, ae bithlaratom_
채men[. -in the tin fev[qus a cexe
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through fo횈

through fo      
 a outa e
l    o  w cfollelnoo a   t  u  esoc-  os ycott h heti er lallent wheul  or birts   nla the  re i vind  tt    hes  f  hinen inmter ralat eifr wit o al vota a r whh fora tha e  iec   , hra olit t  ou  
i os
-es serarll i s  pearere te lgsttht  tson m ape rec th u t psre th wi  ot  os  thet   oho  lou  k  linenr iroot tlittr thos in nlite s  bi t  cupso te. a auliseho y     ad toeerg
에포크 45
Epoch 1/1
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through foy thne megna  ere t e h hr  ; atass ond an there o as the f athahesss hf o t, anddsf thess  d atdy  hem ga  e oan esoe kand ne o se twhts hpse t"ec t ethe  thestp hes thewh tononed a ther one frher th here bgind al h acagrerp han ain hle   the  we imce  hd bee one "e bsth tdhegev-ter tod  a ros th he vtt ee and inen thesth the
hhech wh t teste " the detn "the  ase th hor troet the es thec rde the 
------ 온도: 0.5
the sl

through fo                                                                                                                                                                                                                                                                                                                                                                                                                
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through fo                                                                                                                                                                                                                                                                                                                                                                                                                
------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through fo                                      

* 높은 온도에서 생성된 텍스트는 아주 흥미롭고 놀라우며 창의적이기도 합니다. 이따금 꽤 그럴싸하게 보이는 완전히 새로운 단어를 창조합니다(‘begarmed’와 ‘isharent’ 같은 단어입니다). 높은 온도에서는 국부적인 구조가 무너지기 시작합니다. 대부분의 단어가 어느정도 무작위한 문자열로 보입니다. 확실히 이 네트워크에서는 텍스트 생성에 가장 좋은 온도는 0.5입니다. 항상 다양한 샘플링 전략으로 실험해 봐야합니다! 학습된 구조와 무작위성 사이에 균형을 잘 맞추면 흥미로운 것을 만들 수 있습니다.