In [6]:
from glob import glob
import os
import re

txt_list = glob('./data/lyrics/*')
raw_corpus = []

for txt_file in txt_list:
    with open(txt_file, 'r') as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)

In [3]:
print("데이터 크기:", len(raw_corpus))
print("Examples:\n", raw_corpus[:3])

데이터 크기: 187088
Examples:
 ['They say get ready for the revolution', "I think it's time we find some sorta solution", "Somebody's caught up in the endless pollution"]


In [69]:
print(raw_corpus[:100])

['They say get ready for the revolution', "I think it's time we find some sorta solution", "Somebody's caught up in the endless pollution", 'They need to wake up, stop living illusions I know you need to hear this', "Why won't somebody feel this", 'This is my wish that we all feel connected', 'This is my wish that nobodies neglected Be like a rocket baby', 'Be like a rocket Take off', 'Just fly, away (ay, ay)', 'To find your space Take off', 'Just fly, away (ay, ay)', 'To find your place Take off You know what they say about mixing the races', 'And in the end we got the same faces', 'My mama told me got love yourself first', 'And if you disagree, get off this damn earth I want to feel connected', "Don't want to be neglected", 'This is my wish that we all find our places', 'This is my wish that we all escalate (yeah) Be like a rocket baby', 'Be like a rocket Take off', 'Just fly, away (ay, ay)', 'To find your space Take off', 'Just fly, away (ay, ay)', 'To find your space Take off', 'Ju

In [7]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,¿])", r" \1", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)
    sentence = sentence.strip()
    sentence = '<start> ' + sentence + ' <end>'
    return sentence
    

In [72]:
corpus =[]

for sentence in  raw_corpus:
    if len(sentence) == 0 or sentence[-1] == ']':
        continue
    preprocessed_sentence = preprocess_sentence(sentence)
    corpus.append(preprocessed_sentence)

In [44]:
import tensorflow as tf
from tensorflow.keras.layers import *
import numpy as np

In [85]:
def tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words = 12000,
        filters = ' ',
        oov_token = "<unk>" 
    )
    tokenizer.fit_on_texts(corpus)
    tensor = tokenizer.texts_to_sequences(corpus)
    tensors = []
    for i in range(len(tensor)):
        if len(tensor[i]) <= 15:
            tensors.append(tensor[i])
    tensors = np.array(tensors)
    tensors = tf.keras.preprocessing.sequence.pad_sequences(tensors, padding = 'post')
    return tensors, tokenizer

In [86]:
tensor, tokenizer = tokenize(corpus)

  tensors = np.array(tensors)


In [87]:
src_input = tensor[:, :-1]
tgt_input = tensor[:, 1:]

In [88]:
from sklearn.model_selection import train_test_split

In [89]:
enc_train, enc_val, dec_train, dec_val = train_test_split(src_input, tgt_input, test_size = 0.2, random_state = 42)

In [90]:
print("Source Train:", enc_train.shape)
print("Target Train:", dec_train.shape)

Source Train: (124124, 14)
Target Train: (124124, 14)


In [92]:
BUFFER_SIZE = len(enc_train)
BATCH_SIZE = 256
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

VOCAB_SIZE = tokenizer.num_words + 1

In [93]:
dataset =  tf.data.Dataset.from_tensor_slices((enc_train, dec_train))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset

<BatchDataset shapes: ((256, 14), (256, 14)), types: (tf.int32, tf.int32)>

In [94]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        
        self.embedding = Embedding(vocab_size, embedding_size)
        self.rnn_1 = LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = LSTM(hidden_size, return_sequences=True)
        self.linear = Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out

In [95]:
embedding_size = 512
hidden_size = 2048
model = TextGenerator(VOCAB_SIZE, embedding_size, hidden_size)

In [96]:
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
optimizer = tf.keras.optimizers.Adam()
model.compile(loss = loss, optimizer = optimizer)

In [97]:
model.fit(dataset, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f63b02638b0>

In [98]:
def generate_text(model, tokinizer, init_sentence = "<start>", max_len = 20):
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype = tf.int64)
    end_token = tokenizer.word_index["<end>"]
    
    while True:
        predict = model(test_tensor)
        predict_word = tf.argmax(tf.nn.softmax(predict, axis = -1), axis = -1)[:,-1]
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis = 0)], axis = -1)
        
        if predict_word.numpy()[0] == end_token or test_tensor.shape[1] >= max_len:
            break
            
    generated = ""
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + ' '
        
    return generated

In [105]:
model.evaluate(enc_val, dec_val)



2.1793322563171387

In [107]:
generate_text(model, tokenizer, init_sentence="<start> i love", max_len=20)

'<start> i love you <end> '

In [106]:
generate_text(model, tokenizer, init_sentence="<start> i am", max_len=20)

'<start> i am not throwing away my shot <end> '

In [109]:
generate_text(model, tokenizer, init_sentence="<start> you and ", max_len=20)

'<start> you and me together <end> '

In [110]:
model.save('text_generator')



INFO:tensorflow:Assets written to: text_generator/assets


INFO:tensorflow:Assets written to: text_generator/assets
