In [199]:
import glob
import os
import re
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [200]:
txt_file_path = os.getenv('HOME')+'/aiffel/exp4/lyricist/data/lyrics/*'

txt_list = glob.glob(txt_file_path)

raw_corpus = []

for txt_file in txt_list:
    with open(txt_file, "r") as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)

print("데이터 크기:", len(raw_corpus))
print("Examples:\n", raw_corpus[:3])

데이터 크기: 187088
Examples:
 ['[Hook]', "I've been down so long, it look like up to me", 'They look up to me']


In [201]:
for idx, sentence in enumerate(raw_corpus):

    if idx > 10: break    
    print(sentence)

[Hook]
I've been down so long, it look like up to me
They look up to me
I got fake people showin' fake love to me
Straight up to my face, straight up to my face
I've been down so long, it look like up to me
They look up to me
I got fake people showin' fake love to me
Straight up to my face, straight up to my face [Verse 1]
Somethin' ain't right when we talkin'
Somethin' ain't right when we talkin'


## 중괄호[ ]로 절과 후렴을 표시하므로 소문자로 정리한 뒤 해당 요소를 삭제

In [202]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub("\[.*\]|\s-\s.*", "", sentence)
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)
    sentence = sentence.strip()
    sentence = '<start> ' + sentence + ' <end>'
    return sentence

print(preprocess_sentence("This [Hook] [Verse 1 : Drake] @_is ;;;sample        sentence."))

<start> this is sample sentence . <end>


## 띄어쓰기를 세는 방식으로 start와 end를 포함하여 토큰이 15개 이하인 문장만 corpus에 추가

In [205]:
corpus = []

for sentence in raw_corpus:
    preprocessed_sentence = preprocess_sentence(sentence)
    count = 0
    for i in preprocessed_sentence:
        if i == " ":
            count += 1
    if count < 15:
        corpus.append(preprocessed_sentence)

corpus[:10]

['<start>  <end>',
 '<start> i ve been down so long , it look like up to me <end>',
 '<start> they look up to me <end>',
 '<start> i got fake people showin fake love to me <end>',
 '<start> straight up to my face , straight up to my face <end>',
 '<start> i ve been down so long , it look like up to me <end>',
 '<start> they look up to me <end>',
 '<start> i got fake people showin fake love to me <end>',
 '<start> straight up to my face , straight up to my face <end>',
 '<start> somethin ain t right when we talkin <end>']

In [206]:
def check_space(corpus):
   
    c = 0
        
    for i in range(0, len(corpus)):
        if corpus[i] == " ":
            c +=1
    print("number of spaces ", c)
              

check_space(corpus[350])

number of spaces  4


In [207]:
def tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=12000, filters=' ',oov_token="<unk>")
    tokenizer.fit_on_texts(corpus)
    tensor = tokenizer.texts_to_sequences(corpus)   
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')      
    print(tensor,tokenizer)
    return tensor, tokenizer

tensor, tokenizer = tokenize(corpus)

[[  2   3   0 ...   0   0   0]
 [  2   4  95 ...  10  12   3]
 [  2  38 133 ...   0   0   0]
 ...
 [  2   3   0 ...   0   0   0]
 [  2   3   0 ...   0   0   0]
 [  2   3   0 ...   0   0   0]] <keras_preprocessing.text.Tokenizer object at 0x7f558942ce90>


In [208]:
tensor.shape

(167475, 15)

In [209]:
enc_input = tensor[:,:-1]
dec_input = tensor[:,1:]

print(enc_input[1])
print(dec_input[1])

[  2   4  95 105  58  31 167   5  11 133  23  29  10  12]
[  4  95 105  58  31 167   5  11 133  23  29  10  12   3]


## 8:2로 train 데이터와 validation 데이터를 분리했습니다.

In [210]:
enc_train, enc_val, dec_train, dec_val = train_test_split(enc_input, dec_input, test_size = 0.2, random_state = 12)

In [211]:
print("Source Train:", enc_train.shape)
print("Target Train:", dec_train.shape)
print("Source Val:", enc_val.shape)
print("Target Val:", dec_val.shape)

Source Train: (133980, 14)
Target Train: (133980, 14)
Source Val: (33495, 14)
Target Val: (33495, 14)


In [276]:
BUFFER_SIZE = len(enc_train)
BATCH_SIZE = 64
steps_per_epoch = len(enc_train) // BATCH_SIZE
VOCAB_SIZE = tokenizer.num_words + 1   
dataset = tf.data.Dataset.from_tensor_slices((enc_train, dec_train))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset_train = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset_train

<BatchDataset shapes: ((64, 14), (64, 14)), types: (tf.int32, tf.int32)>

In [277]:
BUFFER_SIZE = len(enc_val)
BATCH_SIZE = 64
steps_per_epoch = len(enc_val) // BATCH_SIZE
VOCAB_SIZE = tokenizer.num_words + 1   
dataset = tf.data.Dataset.from_tensor_slices((enc_val, dec_val))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset_val = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset_val

<BatchDataset shapes: ((64, 14), (64, 14)), types: (tf.int32, tf.int32)>

## LSTM을 기본으로 하이퍼파라미터튜닝으로는 과제에서 주어진 목표 val_loss에 근접했으나 2.2를 하회하는 결과값을 얻을 수 없었습니다.
## bidirectional LSTM과 Dropout등을 활용하여 모델을 만들어보았지만 val_loss는 획기적으로 줄여도 정작 모델 출력이 되지 않는 현상이 있었습니다.

In [278]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out
    
embedding_size = 512
hidden_size = 1024
model = TextGenerator(tokenizer.num_words + 1, embedding_size , hidden_size)

In [279]:
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

model.compile(loss=loss, optimizer=optimizer)
model.fit(dataset_train, epochs=10, validation_data=dataset_val, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f54021aa310>

## 

In [280]:
model.summary()

Model: "text_generator_73"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_73 (Embedding)     multiple                  6144512   
_________________________________________________________________
lstm_142 (LSTM)              multiple                  6295552   
_________________________________________________________________
lstm_143 (LSTM)              multiple                  8392704   
_________________________________________________________________
dense_71 (Dense)             multiple                  12301025  
Total params: 33,133,793
Trainable params: 33,133,793
Non-trainable params: 0
_________________________________________________________________


In [225]:
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20):
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

    while True:
        predict = model(test_tensor) 
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1] 
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated

## 만족스러운 첫 작사 결과물

In [23]:
generate_text(model, tokenizer, init_sentence="<start> i love", max_len=20)

'<start> i love you more than i love myself <end> '

## 두번째 모델 작사 결과물

In [284]:
generate_text(model, tokenizer, init_sentence="<start> i love", max_len=20)

'<start> i love you , i love you <end> '