In [6]:
from __future__ import print_function
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io
import re



def get_text(txt_file):
    # Receive a txt file with html labels and return a string with all the content clean.

    with io.open(txt_file, encoding='utf-8') as f:
        text = f.read().lower()
    
    return text


def clean_text(text):
    chars_to_clean = ['\x13', '"', "'", '?', '\x92', '©', 'ª', '¬',
                      '®', '°', '³','¹', 'º', '¿', 'ç', 'ń', 'œ', 'š', 'ˆ', '̶', 'о', '—',
                      '′', '›', '‼', '♥', '♪', '\U0006192f', '\t', '!', '#', '$', '%',
                      '&', '*', '+', '-', '/', ':', ';', '<', '=', '>', '@', '\\', '^',
                      '_', '`', '~', '\x85', '¡', '¤', '¥', '¨', '\xad', '¯',
                      '±', '´', '·', '½', '×', 'ᮤ', '–', '‘', '’', '“', '”', '•', '…',
                      '\ufeff', '�', '(', ')','[', ']', '{', '}']
    
    a = ['à', 'â', 'ã', 'ä', 'å']
    e = ['è', 'ê', 'ë', 'ė']
    i = ['ì', 'î', 'ï',]
    o = ['ò', 'ô', 'õ', 'ö']
    u = ['ù', 'û', 'ü', ]
    y = ['ý', 'ÿ']
    
    # Remove html labels and add linebreaks
    text = text.replace('<p>', '')
    text = text.replace('</br></p>, ', ' \n ')
    text = text.replace('<br/>', ' \n ')
    text = text.replace('</p>, ', '</p>')
    text = text.replace('</p>', ' \n ')
    text = text.replace('<br>', ' \n ')
    text = text.replace(',', '')
    text = re.sub(r'(\(.*\)\s*)|(\[.*\]\s*)|(\{.*\}\s*)','',text)
    
    # Clean the text of strange characters
    for char in chars_to_clean:
        text = text.replace(char, '')
    for char in a:
        text = text.replace(char, 'á')
    for char in e:
        text = text.replace(char, 'é')
    for char in i:
        text = text.replace(char, 'í')
    for char in o:
        text = text.replace(char, 'ó')
    for char in u:
        text = text.replace(char, 'ú')
    for char in y:
        text = text.replace(char, 'y')
    
    text = text[:5000000]
    return text


def string_analysis(text):
    print("========== Corpus summary ==========")
    
    # Total number of characters
    print('Corpus length in characters:', len(text))
    
    # Total number of words
    text_in_words = [w for w in text.split(' ') if w.strip() != '' or w == '\n']
    print('Corpus length in words:', len(text_in_words))
    
    # Number of unique characters
    chars = sorted(list(set(text)))
    print('Total chars:', len(chars))
    
    return chars
    

def char_dictionaries(chars):
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))
    return char_indices, indices_char


def cut_text(text, MAXLEN, STEP):
    # Cut the text in semi-redundant sequences of maxlen characters
    print("\n========== Text cut ==========")
    sentences = []
    next_chars = []
    for i in range(0, len(text) - MAXLEN, STEP):
        sentences.append(text[i: i + MAXLEN])
        next_chars.append(text[i + MAXLEN])
    print('Number of sequences:', len(sentences))
    return sentences, next_chars


def vectorize(sentences, next_chars, chars, MAXLEN, char_indices):
    print('\n========== Vectorization... ==========')
    X = np.zeros((len(sentences), MAXLEN, len(chars)), dtype=np.bool)
    y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1
        y[i, char_indices[next_chars[i]]] = 1
    return X, y


def build_model(MAXLEN, chars):
    # Build the model: a single LSTM
    print('\n========== Building the model... ==========')
    
    # Define the Keras model
    model = Sequential()
    model.add(LSTM(128, input_shape=(MAXLEN, len(chars))))
    # model.add(Dropout(0.2)) # to prevent overfitting?
    model.add(Dense(len(chars), activation='softmax'))

    # Compile the Keras model
    optimizer = RMSprop(learning_rate=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    # model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    return model


def sample(preds, temperature=1.0):
    # Helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def on_epoch_end(epoch, _):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: {}'.format(epoch))
    with io.open("../Output/examples_file.txt", 'a', encoding='utf8') as examples_file:
        examples_file.write('\n----- Generating text after Epoch: {}\n'.format(epoch))

        start_index = random.randint(0, len(text) - MAXLEN - 1)

        for diversity in [0.2, 0.5, 1.0, 1.2]:
            generated = ''
            sentence = text[start_index: start_index + MAXLEN]
            generated += sentence
            print('----- diversity:', diversity)
            examples_file.write('\n----- Diversity:' + str(diversity) + '\n')
            print('----- Generating with seed: "' + sentence + '"')
            examples_file.write('----- Generating with seed: "' + sentence + '"\n\n')
            sys.stdout.write(generated)
            examples_file.write(generated)

            for i in range(400):
                x_pred = np.zeros((1, MAXLEN, len(chars)))
                for t, char in enumerate(sentence):
                    x_pred[0, t, char_indices[char]] = 1.

                # Make probability predictions with the model
                preds = model.predict(x_pred, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_char = indices_char[next_index]

                sentence = sentence[1:] + next_char

                sys.stdout.write(next_char)
                examples_file.write(next_char)

                sys.stdout.flush()

            examples_file.write('\n')
            print()
        examples_file.write('='*80 + '\n')
        examples_file.flush()
    


if __name__ == "__main__":

    corpus = "../Input/lyrics.txt"
    text = get_text(corpus)
    text = clean_text(text)
    chars = string_analysis(text)
    char_indices, indices_char = char_dictionaries(chars)
    
    # Parameters: change to experiment different configurations
    MAXLEN = 40
    STEP = 3
    BATCH_SIZE = 128
    
    sentences, next_chars = cut_text(text, MAXLEN, STEP)
    X, y = vectorize(sentences, next_chars, chars, MAXLEN, char_indices)
    model = build_model(MAXLEN, chars)
    
    file_path = "../Checkpoints/weights.{epoch:02d}.hdf5"
    checkpointer = ModelCheckpoint(file_path, monitor='val_loss', save_best_only=False)
    print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
    #early_stopping = EarlyStopping(monitor='val_acc', patience=5)
    callbacks_list = [checkpointer, print_callback]

    
    # Fit the keras model on the dataset
    model.fit(X, y,
              batch_size=BATCH_SIZE,
              epochs=25,
              callbacks=callbacks_list)

    # Save model and architecture to single file
    model.save("../Output/model.h5")
    print("\nSaved model to disk as model.h5")
    
    # Summarize model.
    print('\n\n========== Model summary ==========\n')
    print(model.summary())

Corpus length in characters: 14300327
Corpus length in words: 3089778
Total chars: 45

Number of sequences: 4766763


Epoch 1/25
  36480/4766763 [..............................] - ETA: 1:24:48 - loss: 2.2488 - accuracy: 0.3429

KeyboardInterrupt: 