# 특정 문체를 흉내 내는 문장 생성하기

## 공개 도서에서 문장 추출하기

### Dataset Download

> https://pypi.python.org/pypi/Gutenberg 를 참고하여 Gutenberg를 Install

In [20]:
from gutenberg.acquire import load_etext
from gutenberg.query import get_etexts, get_metadata
from gutenberg.acquire import get_metadata_cache
from gutenberg.acquire.text import UnknownDownloadUriException
from gutenberg.cleanup import strip_headers
from gutenberg._domain_model.exceptions import CacheAlreadyExistsException

from keras.models import Input, Model
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras.layers.wrappers import TimeDistributed
from keras.optimizers import RMSprop

import numpy as np
import random

In [5]:
# 실행에 시간이 오래 걸리는 Cell
cache = get_metadata_cache()
try:
    cache.populate()
except CacheAlreadyExistsException:
    pass

In [6]:
for text_id in get_etexts('author', 'Shakespeare, William'):
    print(text_id, list(get_metadata('title', text_id))[0])

1536 The Life of Timon of Athens
1537 Pericles, Prince of Tyre
1538 Cymbeline
39939 Kuningas Henrik Viides
1527 Twelfth Night; Or, What You Will
1539 The Winter's Tale
1540 The Tempest
1541 The Life of Henry the Eighth
1528 The History of Troilus and Cressida
1543 A Lover's Complaint
1544 The Passionate Pilgrim
1545 The Passionate Pilgrim
43532 Miten haluatte
1529 All's Well That Ends Well
1546 Sonnets on Sundry Notes of Music
17930 Le songe d'une nuit d'été
24036 Sonnet 130
1041 Shakespeare's Sonnets
7185 Othello
7186 Was ihr wollt
1530 Measure for Measure
1045 Venus and Adonis
16893 Macbeth
1531 Othello, the Moor of Venice
1124 The History of Troilus and Cressida
1532 The Tragedy of King Lear
32797 Οθέλλος
Σαικσπείρου Τραγωδίαι Μέρος Β'
22045 La festa dels reis
Lo que vulgueu
22556 Cymbeline
44580 Loppu hyvä, kaikki hyvä
1127 The Tragedy of Othello, Moor of Venice
10281 Antony's Address over the Body of Caesar
From Julius Caesar
12842 A Fairy Tale in Two Acts Taken from Shakespeare (

1500 King Henry VI, First Part
1501 History of King Henry the Sixth, Second Part
1502 The History of King Henry the Sixth, Third Part
1503 The Tragedy of King Richard III
1504 The Comedy of Errors
1505 The Rape of Lucrece
26594 Twee Edellieden van Verona
1506 The Rape of Lucrece
1507 The Tragedy of Titus Andronicus
1508 The Taming of the Shrew
1509 The Two Gentlemen of Verona
1510 Love's Labour's Lost
1511 King John
1512 The Tragedy of King Richard the Second
1513 Romeo and Juliet
51691 De getemde feeks
1514 A Midsummer Night's Dream
1515 The Merchant of Venice
1516 King Henry IV, the First Part
1517 The Merry Wives of Windsor
1518 King Henry IV, Second Part
1519 Much Ado about Nothing
1520 Much Ado about Nothing
1521 The Life of King Henry V
1522 Julius Caesar
38901 Twelfth Night; or, What You Will
28150 Le Pèlerin amoureux
28151 Tout est bien qui finit bien
1523 As You Like It
1524 Hamlet, Prince of Denmark
49146 The Shakespeare Story-Book
1525 The Phoenix and the Turtle
1526 Twelfth

## 셰익스피어가 쓴 것 같은 문장 생성하기

### 데이터 수집 및 인코딩

In [7]:
shakespeare = strip_headers(load_etext(100))
plays = shakespeare.split('\nTHE END\n', 1)[-1] # 연극: [-1]

In [8]:
chars = list(sorted(set(plays)))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
len(chars)

94

### 문자 단위 RNN 모델 정의

In [9]:
def char_rnn_model(num_chars, num_layers, num_nodes = 512, dropout = 0.1):
    input = Input(shape=(None, num_chars), name='input')
    prev = input
    
    for i in range(num_layers):
        prev = LSTM(num_nodes, return_sequences=True)(prev)
    
    dense = TimeDistributed(Dense(num_chars, name='dense', activation='softmax'))(prev)
    model = Model(input=[input], output=[dense])
    optimizer = RMSprop(lr=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    return model

In [22]:
CHUNK_SIZE = 160

def data_generator(all_text, num_chars, batch_size):
    X = np.zeros((batch_size, CHUNK_SIZE, num_chars))
    y = np.zeros((batch_size, CHUNK_SIZE, num_chars))
    
    while True:
        for row in range(batch_size):
            idx = random.randrange(len(all_text) - CHUNK_SIZE - 1)
            chunk = np.zeros((CHUNK_SIZE + 1, num_chars))
            
            for i in range(CHUNK_SIZE + 1):
                chunk[i, char_to_idx[all_text[idx + 1]]] = 1
                
            X[row, :, :] = chunk[:CHUNK_SIZE]
            y[row, :, :] = chunk[1:]
            
        yield X, y

In [16]:
model = char_rnn_model(len(chars), num_layers=2, num_nodes=640, dropout=0)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, None, 94)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, None, 640)         1881600   
_________________________________________________________________
lstm_4 (LSTM)                (None, None, 640)         3279360   
_________________________________________________________________
time_distributed_2 (TimeDist (None, None, 94)          60254     
Total params: 5,221,214
Trainable params: 5,221,214
Non-trainable params: 0
_________________________________________________________________




### 모델 학습 시키기

In [None]:
model.fit_generator(data_generator(plays, len(chars), batch_size=64),
                   epochs=10,
                   steps_per_epoch=2 * len(plays) / (256 * CHUNK_SIZE),
                   verbose=2
                   )

Epoch 1/10
 - 505s - loss: 1.8327 - acc: 0.6636
Epoch 2/10


### 결과 확인

In [None]:
def generate_output(model, training_text, start_index=None, diversity=None, amount=400):
    if start_index is None:
        start_index = random.randint(0, len(training_text) - CHUNK_SIZE - 1)
    generated = training_text[start_index: start_index + CHUNK_SIZE]
    yield generated + '#'
    
    for i in range(amount):
        x = np.zeros((1, len(generated), len(chars)))
        for t, char in enumerate(generated):
            x[0, t, char_to_idx[char]] = 1.
        preds = model.predict(x, verbose=0)[0]
        if diversity is None:
            next_index = np.argmax(preds[len(generated) - 1])
        else:
            preds = np.asarray(preds[len(generated) - 1]).astype('float64')
            preds = np.log(preds) / diversity
            exp_preds = np.exp(preds)
            preds = exp_preds / np.sum(exp_preds)
            probas = np.random.multinomial(1, preds, 1)
            next_index = np.argmax(probas)     
        next_char = chars[next_index]
        yield next_char

        generated += next_char
    return generated

for ch in generate_output(model, plays):
    sys.stdout.write(ch)
    
print()