# 프로젝트: 멋진 작사가 만들기

## 데이터 읽어오기

In [3]:
import re                  
import numpy as np         
import tensorflow as tf 
import glob
import os

txt_file_path = os.getenv('HOME')+'/aiffel/lyricist/data/lyrics/*'

txt_list = glob.glob(txt_file_path)

raw_corpus = []

for txt_file in txt_list:
    with open(txt_file, "r") as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)

print("데이터 크기:", len(raw_corpus))
print("Examples:\n", raw_corpus[:3])

데이터 크기: 187088
Examples:
 ["Now I've heard there was a secret chord", 'That David played, and it pleased the Lord', "But you don't really care for music, do you?"]


## 데이터 정제

In [4]:
for idx, sentence in enumerate(raw_corpus):
    if len(sentence) == 0: continue   
    if idx > 30: break   
    print(sentence)

Now I've heard there was a secret chord
That David played, and it pleased the Lord
But you don't really care for music, do you?
It goes like this
The fourth, the fifth
The minor fall, the major lift
The baffled king composing Hallelujah Hallelujah
Hallelujah
Hallelujah
Hallelujah Your faith was strong but you needed proof
You saw her bathing on the roof
Her beauty and the moonlight overthrew her
She tied you
To a kitchen chair
She broke your throne, and she cut your hair
And from your lips she drew the Hallelujah Hallelujah
Hallelujah
Hallelujah
Hallelujah You say I took the name in vain
I don't even know the name
But if I did, well really, what's it to you?
There's a blaze of light
In every word
It doesn't matter which you heard
The holy or the broken Hallelujah Hallelujah
Hallelujah
Hallelujah
Hallelujah I did my best, it wasn't much
I couldn't feel, so I tried to touch
I've told the truth, I didn't come to fool you
And even though


In [5]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()       
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)       
    sentence = re.sub(r'[" "]+', " ", sentence)              
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)  
    
    sentence = sentence.strip()

    sentence = '<start> ' + sentence + ' <end>'      
    
    if "verse" in sentence:
        sentence = sentence.replace("verse", "")
    if "chorus" in sentence:
        sentence = sentence.replace("chorus", "")
    
    
    return sentence

In [6]:
corpus = []

for sentence in raw_corpus:
    if len(sentence) == 0: continue
    if len(sentence) == 1: continue    
    corpus.append(preprocess_sentence(sentence))
corpus.remove('<start>  <end>')         

corpus[:30]
print(len(corpus))

175950


In [7]:
def tokenize(corpus):
    
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=15000,  
        filters=' ',    
        oov_token="<unk>"  
    )
    tokenizer.fit_on_texts(corpus)  

    tensor = tokenizer.texts_to_sequences(corpus)  
   
    for num in tensor:
        if len(num) >= 29:
            tensor = np.delete(tensor, num)
            
    print(len(tensor))
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post',maxlen=15)  

    print(tensor,tokenizer)
    return tensor, tokenizer

    
tensor, tokenizer = tokenize(corpus)

  arr = asarray(arr)


152501
[[   2   50    5 ...    0    0    0]
 [   2   92   12 ...    0    0    0]
 [   2   78  921 ...    0    0    0]
 ...
 [   5   22    9 ...   10 1013    3]
 [  37   15 9056 ...  876  642    3]
 [   2    7   34 ...    0    0    0]] <keras_preprocessing.text.Tokenizer object at 0x7f9a14c61550>


In [8]:
for idx in tokenizer.index_word:
    print(idx, ":", tokenizer.index_word[idx])

    if idx >= 10: break

1 : <unk>
2 : <start>
3 : <end>
4 : ,
5 : i
6 : the
7 : you
8 : and
9 : a
10 : to


In [12]:
src_input = tensor[:, :-1]  
tgt_input = tensor[:, 1:]    

print(src_input[0])
print(tgt_input[0])

[   2   50    5   91  296   64   57    9  968 6044    3    0    0    0]
[  50    5   91  296   64   57    9  968 6044    3    0    0    0    0]


## 평가 데이터셋 분리

In [10]:
from sklearn.model_selection import train_test_split

enc_train, enc_val, dec_train, dec_val = train_test_split(src_input,
                                                          tgt_input,
                                                          train_size = 0.8)

In [13]:
print("Source Train:", enc_train.shape)
print("Target Train:", dec_train.shape)

Source Train: (122000, 14)
Target Train: (122000, 14)


In [14]:
BUFFER_SIZE = len(src_input)         
BATCH_SIZE = 256                     
steps_per_epoch = len(src_input) // BATCH_SIZE

VOCAB_SIZE = tokenizer.num_words + 1    

dataset = tf.data.Dataset.from_tensor_slices((src_input, tgt_input)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset

<BatchDataset shapes: ((256, 14), (256, 14)), types: (tf.int32, tf.int32)>

## 인공지능 만들기

In [15]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super(TextGenerator, self).__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.drop  = tf.keras.layers.Dropout(0.5)
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.drop(out)
        out = self.linear(out)
        
        return out
    
embedding_size = 256
hidden_size = 1024
model = TextGenerator(tokenizer.num_words + 1, embedding_size , hidden_size)

In [16]:
for src_sample, tgt_sample in dataset.take(1): break
model(src_sample)

<tf.Tensor: shape=(256, 14, 15001), dtype=float32, numpy=
array([[[ 9.0868147e-05,  1.9450799e-04,  1.6385816e-04, ...,
         -2.8930910e-04, -4.0756033e-05,  4.3191649e-05],
        [-1.6667617e-04,  4.2466997e-04,  1.3034903e-04, ...,
         -3.7367956e-04, -4.3836848e-05,  3.6494841e-06],
        [-3.0145916e-04,  5.0106266e-04, -2.1126274e-05, ...,
         -3.1820615e-04, -1.8106023e-04, -1.3844798e-04],
        ...,
        [ 1.2750608e-04,  1.0015062e-03,  2.5199790e-04, ...,
         -1.8810046e-04, -5.1423500e-04, -7.7926525e-04],
        [ 3.1586282e-04,  1.0989244e-03,  2.1397055e-04, ...,
         -4.9886945e-05, -5.3765363e-04, -7.4894220e-04],
        [ 5.2299100e-04,  1.1794909e-03,  1.9736223e-04, ...,
          3.5670440e-05, -5.0396571e-04, -7.2890020e-04]],

       [[-9.1361301e-07, -1.2405004e-04, -7.4469426e-05, ...,
         -6.5202017e-05,  2.3403889e-04,  3.2277475e-04],
        [ 7.5359843e-05, -5.3671194e-04, -2.3804678e-04, ...,
         -5.6604167e-05, 

In [17]:
model.summary()

Model: "text_generator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  3840256   
_________________________________________________________________
lstm (LSTM)                  multiple                  5246976   
_________________________________________________________________
lstm_1 (LSTM)                multiple                  8392704   
_________________________________________________________________
dropout (Dropout)            multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  15376025  
Total params: 32,855,961
Trainable params: 32,855,961
Non-trainable params: 0
_________________________________________________________________


In [18]:
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none'
)

model.compile(loss=loss, optimizer=optimizer)
model.fit(dataset, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f99982c6250>

In [19]:
results = model.evaluate(enc_val,  dec_val, verbose=2)

print(results)

954/954 - 13s - loss: 1.1756
1.1755937337875366


In [20]:
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20):
    
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

    
    while True:
        predict = model(test_tensor)  
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1]   
        
        test_tensor = tf.concat([test_tensor, 
                                 tf.expand_dims(predict_word, axis=0)], axis=-1)

        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""
     
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated   

In [21]:
generate_text(model, tokenizer, init_sentence="<start> i love")

'<start> i love you , liberian girl <end> '

In [22]:
generate_text(model, tokenizer, init_sentence="<start> do you")


'<start> do you remember the time <end> '

In [23]:
generate_text(model, tokenizer, init_sentence="<start> only")


'<start> only the <unk> can disclose <end> '

In [36]:
generate_text(model, tokenizer, init_sentence="<start> why")

'<start> why you wanna get with me <end> '

In [37]:
generate_text(model, tokenizer, init_sentence="<start> who")

'<start> who s that casting devious stares in my direction ? <end> '