In [1]:
import glob
import os
import re
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [2]:
txt_file_path = os.getenv('HOME')+'/aiffel/exp4/lyricist/data/lyrics/*'

txt_list = glob.glob(txt_file_path)

raw_corpus = []

for txt_file in txt_list:
    with open(txt_file, "r") as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)

print("데이터 크기:", len(raw_corpus))
print("Examples:\n", raw_corpus[:3])

데이터 크기: 187088
Examples:
 ['[Hook]', "I've been down so long, it look like up to me", 'They look up to me']


In [3]:
for idx, sentence in enumerate(raw_corpus):

    if idx > 10: break    
    print(sentence)

[Hook]
I've been down so long, it look like up to me
They look up to me
I got fake people showin' fake love to me
Straight up to my face, straight up to my face
I've been down so long, it look like up to me
They look up to me
I got fake people showin' fake love to me
Straight up to my face, straight up to my face [Verse 1]
Somethin' ain't right when we talkin'
Somethin' ain't right when we talkin'


## 중괄호[ ]로 절과 후렴을 표시하므로 소문자로 정리한 뒤 해당 요소를 삭제

In [4]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub("\[.*\]|\s-\s.*", "", sentence)
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)
    sentence = sentence.strip()
    sentence = '<start> ' + sentence + ' <end>'
    return sentence

print(preprocess_sentence("This [Hook] [Verse 1 : Drake] @_is ;;;sample        sentence."))

<start> this is sample sentence . <end>


## 띄어쓰기를 세는 방식으로 start와 end를 포함하여 토큰이 15개 이하인 문장만 corpus에 추가

In [5]:
corpus_a = []
corpus = []

for sentence in raw_corpus:
    preprocessed_sentence = preprocess_sentence(sentence)
    count = 0
    for i in preprocessed_sentence:
        if i == " ":
            count += 1
    if count < 15:
        corpus_a.append(preprocessed_sentence)

corpus = list(set(corpus_a))
corpus[:10]

['<start> i swear to god <end>',
 '<start> promise of a better world to come <end>',
 '<start> for the childhood <end>',
 '<start> and learn to use my toys <end>',
 '<start> from the bottom so the tops <end>',
 '<start> heard muthafuckers talk , seen em drop <end>',
 '<start> sister confused , she went alone <end>',
 '<start> sometimes possibly maybe probably love <end>',
 '<start> so i ain t got a thing to lose . <end>',
 '<start> and went his way . <end>']

In [6]:
def check_space(corpus):
   
    c = 0
        
    for i in range(0, len(corpus)):
        if corpus[i] == " ":
            c +=1
    print("number of spaces ", c)
              

check_space(corpus[350])

number of spaces  14


In [7]:
def tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=12000, filters=' ',oov_token="<unk>")
    tokenizer.fit_on_texts(corpus)
    tensor = tokenizer.texts_to_sequences(corpus)   
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')      
    print(tensor,tokenizer)
    return tensor, tokenizer

tensor, tokenizer = tokenize(corpus)

[[   2    5  516 ...    0    0    0]
 [   2  633   18 ...    0    0    0]
 [   2   28    6 ...    0    0    0]
 ...
 [   2 7746   32 ...    0    0    0]
 [   2  121   26 ...   44    3    0]
 [   2 6002 8150 ... 6210    3    0]] <keras_preprocessing.text.Tokenizer object at 0x7fe46c6f7590>


In [8]:
enc_input = tensor[:,:-1]
dec_input = tensor[:,1:]

print(enc_input[1])
print(dec_input[1])

[  2 633  18   9 159 141  10  75   3   0   0   0   0   0]
[633  18   9 159 141  10  75   3   0   0   0   0   0   0]


## 노드 예제에는 따로 트레이닝 데이터와 테스트 데이터 분할을 하지 않았는데 과제에는 하라고 나와서 진행하긴 했습니다만 테스트 데이터가 필요한 과제인지 모르겠습니다.

In [9]:
enc_train, enc_test, dec_train, dec_test = train_test_split(enc_input, dec_input, test_size = 0.2, random_state = 3)

In [10]:
print("Source Train:", enc_train.shape)
print("Target Train:", dec_train.shape)
print("Source Test:", enc_test.shape)
print("Target Test:", dec_test.shape)

Source Train: (80567, 14)
Target Train: (80567, 14)
Source Test: (20142, 14)
Target Test: (20142, 14)


## 10epoch의 학습으로 좋은 결과물을 만들어야하는 만큼 배치사이즈를 줄이는 것이 합당해 보입니다.

## 배치 사이즈를 64로 설정

In [11]:
BUFFER_SIZE = len(enc_train)
BATCH_SIZE = 64
steps_per_epoch = len(enc_train) // BATCH_SIZE
VOCAB_SIZE = tokenizer.num_words + 1   
dataset = tf.data.Dataset.from_tensor_slices((enc_train, dec_train))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset_train = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset_train

<BatchDataset shapes: ((64, 14), (64, 14)), types: (tf.int32, tf.int32)>

In [12]:
BUFFER_SIZE = len(enc_test)
BATCH_SIZE = 64
steps_per_epoch = len(enc_test) // BATCH_SIZE
VOCAB_SIZE = tokenizer.num_words + 1   
dataset = tf.data.Dataset.from_tensor_slices((enc_test, dec_test))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset_test = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset_test

<BatchDataset shapes: ((64, 14), (64, 14)), types: (tf.int32, tf.int32)>

## 입력층 수 512, LSTM의 노드 수를 1024로 설정했습니다.
## LSTM을 여러겹 사용할 경우 각 층의 노드 숫자를 통일하는 것이 전반적으로 성능이 좋다고 나와서 일단은 두 LSTM층 모두 1024개의 노드로 유지 했습니다.
## 임베딩 사이즈를 키우는 것이 자연어처리에서의 섬세함과 연관된다고 하지만 은닉층의 노드보다 큰 숫자는 의미가 없기에 적절하게 512정도로 늘리기로 결정했습니다.

In [13]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out
    
embedding_size = 512
hidden_size = 1024
model = TextGenerator(tokenizer.num_words + 1, embedding_size , hidden_size)

In [17]:
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

model.compile(loss=loss, optimizer=optimizer)
model.fit(dataset_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fe46c322210>

## 운이 좋게도 첫 학습에서 로스가 1.31로 나왔습니다.

In [24]:
model.summary()

Model: "text_generator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  6144512   
_________________________________________________________________
lstm (LSTM)                  multiple                  6295552   
_________________________________________________________________
lstm_1 (LSTM)                multiple                  8392704   
_________________________________________________________________
dense (Dense)                multiple                  12301025  
Total params: 33,133,793
Trainable params: 33,133,793
Non-trainable params: 0
_________________________________________________________________


In [22]:
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20):
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

    while True:
        predict = model(test_tensor) 
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1] 
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated

In [23]:
generate_text(model, tokenizer, init_sentence="<start> i love", max_len=20)

'<start> i love you more than i love myself <end> '

## 만족스러운 첫 작사 결과물