It is highly recommended to use a powerful **GPU**; you can use one for free by uploading this notebook to [Google Colab](https://colab.research.google.com/notebooks/intro.ipynb).
<table align="center">
 <td align="center"><a target="_blank" href="https://colab.research.google.com/github/ezponda/intro_deep_learning/blob/main/class/RNN/Character-level_text_generation_with_RNN.ipynb">
        <img src="https://colab.research.google.com/img/colab_favicon_256px.png"  width="50" height="50" style="padding-bottom:5px;" />Run in Google Colab</a></td>
  <td align="center"><a target="_blank" href="https://github.com/ezponda/intro_deep_learning/blob/main/class/RNN/Character-level_text_generation_with_RNN.ipynb">
        <img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png"  width="50" height="50" style="padding-bottom:5px;" />View Source on GitHub</a></td>
</table>

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
import numpy as np
import random

### Load the data


In [None]:
## download the dataset
# quijote : https://www.gutenberg.org/files/2000/2000-0.txt
# The triple-quoted string below is a disabled alternative download (Nietzsche corpus);
# as a bare string expression it is never executed.
'''path = keras.utils.get_file(
    "nietzsche.txt", origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt"
)'''
# `path` is the value returned by get_file; the next cell opens it as the corpus file.
path = keras.utils.get_file(
    "quijote_spanish.txt", origin="https://www.gutenberg.org/files/2000/2000-0.txt"
)

In [None]:
# Read the whole corpus in lower case.
# The encoding is given explicitly so that the accented Spanish characters decode
# the same way on every platform (the Gutenberg "-0.txt" files are UTF-8), and the
# `with` block closes the file handle as soon as the text has been read.
with open(path, encoding="utf-8") as corpus_file:
    text = corpus_file.read().lower()
## don quijote
# Drop the first 39972 characters.
# NOTE(review): presumably this skips the Project Gutenberg front matter - confirm the offset.
text = text[39972:]
print('corpus length:', len(text))
print('corpus words:', len(text.split(' ')))
# text = text[:100000]

In [None]:
# Put the corpus on a single line: newlines become spaces, each double space is
# collapsed once (single pass), and leading/trailing whitespace is trimmed.
text = text.replace("\n", " ")
text = text.replace("  ", " ")
text = text.strip()

# Show the first characters of the processed corpus.
print()
print('processed texts:')
print()
print(text[:200])

### Text simple processing

In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
print("Total chars:", len(chars))

# Lookup tables in both directions: index -> character and character -> index.
indices_char = dict(enumerate(chars))
char_indices = {char: index for index, char in indices_char.items()}

In [None]:
from collections import Counter

# Count how often each character occurs and list the (char, count) pairs
# from the most frequent to the least frequent.
char_counts = Counter(text)
char_counts_sort = char_counts.most_common()
print('Most frequent characters:', char_counts_sort[:10])
print('less frequent characters:', char_counts_sort[-10:])

In [None]:
max_chars = 35
## We replace the less used characters with unknown_char
unknown_char = 'ò'

# The (max_chars - 1) most frequent characters, kept in frequency order.
frequent_chars = [ch for ch, count in char_counts_sort[:max_chars - 1]]
# Sanity check: True means the placeholder itself is one of the frequent characters.
print(unknown_char in frequent_chars)

# Index 0 is reserved for unknown_char; every other character gets its frequency
# rank (1, 2, ...). Building the mapping from an ordered list (instead of iterating
# a set) keeps the indices identical across kernel restarts, and skipping
# unknown_char here keeps the indices contiguous even when it is a frequent
# character, so no index can reach len(chars).
char_indices = {unknown_char: 0}
for rank, ch in enumerate((c for c in frequent_chars if c != unknown_char), start=1):
    char_indices[ch] = rank
indices_char = {i: c for c, i in char_indices.items()}

# `chars` stays a set, as before; later cells only use its size.
chars = set(char_indices)

In [None]:
# reduce the size: keep only the first 200000 characters of the corpus
# (the character counts above were computed before this truncation)
text = text[:200000]

Next we generate the input and output arrays:

The input will consist of sequences of a fixed length (maxlen), while the outputs will be the character that follows each sequence in the text.

So, if the text is "Welcome to deep learning course" with maxlen = 5, we will have:

Input = [ [w, e, l, c, o], [e, l, c, o, m], [l, c, o, m, e], ... ] Output = [ m, e, ' ', ... ]
In order to avoid overfitting (and improve performance) we can move the window by a step larger than one, so that with step = 3, for example:

Input = [ [w, e, l, c, o], [c, o, m, e, ' '], [m, e, ' ', t, o], ... ] Output = [ m, t, ' ', ... ]

In [None]:
# Slide a window of `maxlen` characters over the text, moving `step` characters
# at a time; each window is an input and the character right after it is the target.
maxlen = 40
step = 3
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print("Number of sequences:", len(sentences))

In [None]:
# Show the first six input windows.
sentences[:6]

In [None]:
# Show the target character for each of the first six windows.
next_chars[:6]

In [None]:
# One-hot encode the windows:
#   x[i, t, k] is True when character t of window i has vocabulary index k,
#   y[i] is the vocabulary index of the character that follows window i.
# Characters missing from char_indices map to index 0.
# The builtin `bool` is used as dtype: the `np.bool` alias was removed in
# NumPy 1.24 and raises AttributeError there.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros(len(sentences), dtype=np.int32)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices.get(char, 0)] = 1
    y[i] = char_indices.get(next_chars[i], 0)
print(x.shape, y.shape)

## Build the model: a single LSTM layer


In [None]:
# Single-LSTM character model: one-hot windows of shape (maxlen, vocabulary size)
# go in, a softmax distribution over the vocabulary comes out.
model = keras.Sequential()
model.add(keras.Input(shape=(maxlen, len(chars))))
model.add(layers.LSTM(128))
model.add(layers.Dense(len(chars), activation="softmax"))

# The targets in y are integer indices, hence the sparse cross-entropy loss.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
def sample(preds, temperature=0.2):
    """Sample an index from a probability array, reweighted by `temperature`.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: values below 1 sharpen the distribution, values above 1
            flatten it. A temperature of 0 or less selects the most probable
            index directly (greedy decoding) instead of dividing by zero.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype("float64")
    if temperature <= 0:
        # Limit of the reweighted distribution as temperature -> 0.
        return np.argmax(preds)
    # log(0) = -inf is fine here (it becomes probability 0), so silence the warning.
    with np.errstate(divide="ignore"):
        logits = np.log(preds) / temperature
    # Shifting by the maximum leaves the normalised result unchanged but keeps
    # exp() from underflowing to all zeros (0/0 = NaN) at very small temperatures.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    # Draw one sample; the result is a one-hot row, so argmax gives its index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
epochs = 300
batch_size = 256

# Train in rounds and generate sample text after every round.
# There are epochs // 10 = 30 rounds: the first four train a single epoch each,
# the remaining ones train 10 epochs each (4 + 26 * 10 = 264 epochs in total).
epoch = 0
for epoch_ind in range(epochs // 10):
    epochs_this_round = 1 if epoch_ind < 4 else 10
    epoch += epochs_this_round
    model.fit(x, y, batch_size=batch_size, epochs=epochs_this_round)
    print()
    print("Generating text after epoch: %d" % epoch)

    # One random seed window per round, shared by all diversity settings.
    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0]:
        print("...Diversity:", diversity)
        sentence = text[start_index: start_index + maxlen]
        print('...Generating with seed: "' + sentence + '"')
        generated = ""
        for i in range(150):
            # One-hot encode the current window (unknown characters -> index 0).
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices.get(char, 0)] = 1.0
            preds = model(x_pred).numpy()[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]
            # Record the sampled character and slide the window one step forward.
            generated += next_char
            sentence = sentence[1:] + next_char

        print("...Generated: ", generated)
        print()

### Sabina

In [None]:
#!pip install bs4

In [None]:
import requests
import bs4
from bs4 import BeautifulSoup

In [None]:
# Fetch the artist page that lists the songs.
url = 'https://www.letras.com/joaquin-sabina/'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly instead of parsing an HTTP error page.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')
print(url)

In [None]:
def check_is_paragraph(row):
    """Return True when every child of `row` is a `br` element or a plain NavigableString.

    A row with no children also counts as a paragraph.
    """
    for node in row.contents:
        if node.name == 'br':
            continue
        if type(node) == bs4.element.NavigableString:
            continue
        return False
    return True

def get_paragraph_text_0(row):
    """Collect the non-empty, stripped text fragments of the direct children of `row`.

    Plain NavigableString children contribute their stripped text, `br` children
    contribute their stripped `get_text()`, and any other child is ignored.
    """
    fragments = []
    for node in row.contents:
        if type(node) == bs4.element.NavigableString:
            fragment = node.strip()
        elif node.name == 'br':
            fragment = node.get_text().strip()
        else:
            fragment = ''
        if fragment:
            fragments.append(fragment)
    return fragments

def get_paragraph_text(row):
    """Collect the non-empty text lines of a paragraph element.

    Plain strings are stripped and kept. A child with more than one child of
    its own that itself passes check_is_paragraph is flattened into a single
    space-joined line. `br` children contribute their stripped `get_text()`.
    Any other child is ignored.
    """
    lines = []
    for node in row.contents:
        if type(node) == bs4.element.NavigableString:
            line = node.strip()
        elif len(node.contents) > 1 and check_is_paragraph(node):
            line = ' '.join(get_paragraph_text_0(node))
        elif node.name == 'br':
            line = node.get_text().strip()
        else:
            line = ''
        if line:
            lines.append(line)
    return lines

def get_song(song_soup):
    """Extract the lyrics of one song page as a newline-joined string.

    Walks the `<p>` elements in order, skips those before the first one that
    passes check_is_paragraph, collects the text of the consecutive ones that
    pass, and stops at the first one that fails after collecting has started.
    """
    lyrics = []
    started = False
    for row in song_soup.findAll('p'):
        if check_is_paragraph(row):
            started = True
            lyrics += get_paragraph_text(row)
        elif started:
            # First non-paragraph element after the lyrics: the lyrics block is over.
            break
    return '\n'.join(lyrics)

In [None]:
# Download and parse every song linked from the artist page.
complete_songs = []
# find_all is the current Beautiful Soup name for the deprecated findAll alias.
all_rows = soup.find_all('a', {'class': "song-name"}, href=True)
for row in all_rows:
    song_url = 'https://www.letras.com' + row['href']
    try:
        # The timeout stops one stalled request from blocking the whole scrape;
        # raise_for_status() keeps HTTP error pages from being parsed as lyrics.
        song_page = requests.get(song_url, timeout=30)
        song_page.raise_for_status()
    except requests.RequestException as err:
        # Best effort: report the failed song and carry on with the rest.
        print('Skipping', song_url, '->', err)
        continue
    song_soup = BeautifulSoup(song_page.text, 'html.parser')
    song = get_song(song_soup)
    print('######################')
    print(song_url)
    print(song)
    complete_songs.append(song)

In [None]:
# Join all lyrics into one lower-case string: songs are separated by a space,
# newlines become spaces, and each double space is collapsed once (single pass).
text_sabina = ' '.join(complete_songs).replace('\n', ' ').replace('  ', ' ').lower()

In [None]:
# Size of the Sabina corpus in characters.
len(text_sabina)

In [None]:
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40  # window length in characters (same value as for the Quijote windows above)
step = 3  # offset between the starts of two consecutive windows

def get_sentences(text, maxlen, step):
    """Cut `text` into overlapping windows and the character following each one.

    Args:
        text: the corpus string.
        maxlen: number of characters per window.
        step: offset between the starts of consecutive windows.

    Returns:
        (sentences, next_chars): the list of windows and, aligned with it,
        the list of the single character that comes right after each window.
    """
    starts = range(0, len(text) - maxlen, step)
    sentences = [text[start: start + maxlen] for start in starts]
    next_chars = [text[start + maxlen] for start in starts]
    print("Number of sequences:", len(sentences))
    return sentences, next_chars

def preprocess_text(sentences, chars, char_indices, next_chars=None, maxlen=None):
    """One-hot encode the windows and integer-encode their target characters.

    Args:
        sentences: list of input windows (strings).
        chars: the vocabulary; only its size is used.
        char_indices: mapping character -> vocabulary index. Characters that
            are missing from it are encoded as index 0.
        next_chars: target character for each window. When omitted, the
            notebook-level variable `next_chars` is used, as this function
            did before the parameter existed.
        maxlen: window length (second axis of `x`). When omitted, the
            notebook-level variable `maxlen` is used, as before.

    Returns:
        (x, y): `x` is a boolean array of shape
        (len(sentences), maxlen, len(chars)); `y` is an int32 array of shape
        (len(sentences),) holding the target indices.
    """
    # Explicit arguments avoid hidden state; fall back to the notebook globals
    # only to keep existing three-argument calls working.
    if next_chars is None:
        next_chars = globals()["next_chars"]
    if maxlen is None:
        maxlen = globals()["maxlen"]

    # The builtin `bool` replaces `np.bool`, which was removed in NumPy 1.24.
    x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
    y = np.zeros(len(sentences), dtype=np.int32)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            x[i, t, char_indices.get(char, 0)] = 1
        y[i] = char_indices.get(next_chars[i], 0)
    return x, y

In [None]:
# Window the Sabina lyrics and encode them with the vocabulary built from the
# Quijote corpus (`chars`, `char_indices`). This overwrites the Quijote
# `sentences`, `next_chars`, `x` and `y`.
sentences, next_chars = get_sentences(text_sabina, maxlen, step)
# NOTE(review): preprocess_text also reads the notebook-level `next_chars` assigned on the line above.
x, y = preprocess_text(sentences, chars, char_indices)

In [None]:
# Shapes of the encoded Sabina inputs and targets.
x.shape, y.shape

### Continue the songs of Sabina with the model trained on Don Quixote

In [None]:
def continue_sentence(model, sentence, sentence_length, char_indices, maxlen, chars, diversity=0.2, indices_char=None):
    """Generate `sentence_length` characters that continue `sentence`.

    Args:
        model: callable taking a (1, maxlen, len(chars)) one-hot array and
            returning an object whose `.numpy()[0]` is the next-character
            probability vector.
        sentence: seed text. Only its last `maxlen` characters are used.
        sentence_length: number of characters to generate.
        char_indices: mapping character -> vocabulary index. Characters that
            are missing from it are encoded as index 0.
        maxlen: window length expected by the model.
        chars: the vocabulary; only its size is used.
        diversity: sampling temperature passed to `sample`.
        indices_char: mapping vocabulary index -> character. When omitted it
            is built by inverting `char_indices`, so the function no longer
            depends on a notebook-level `indices_char` variable.

    Returns:
        The generated text (without the seed). It is also printed.
    """
    if indices_char is None:
        indices_char = {index: char for char, index in char_indices.items()}
    # A seed longer than the window would index past the one-hot array.
    sentence = sentence[-maxlen:]

    generated = ""
    for _ in range(sentence_length):
        # One-hot encode the current window.
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices.get(char, 0)] = 1.0
        preds = model(x_pred).numpy()[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]
        # Slide the window one character forward.
        sentence = sentence[1:] + next_char
        generated += next_char
    print("...Generated: ", generated)
    return generated

In [None]:
# Pick one of the windows at random to use as the generation seed, and display it.
ind = np.random.randint(len(sentences))
sentence = sentences[ind]
sentence

In [None]:
# Continue the seed for 50 characters with `model` at a low sampling temperature.
generated = continue_sentence(model, sentence, 50, char_indices, maxlen, chars, diversity=0.2)

In [None]:
# Build a second model with the same architecture as `model` and copy the
# current weights of `model` into it, then compile the copy with the same
# loss, optimizer and metric as the original.
# NOTE(review): presumably so that fine-tuning on the Sabina corpus leaves `model` untouched - confirm.
model_sabina= keras.models.clone_model(model)
model_sabina.set_weights(model.get_weights())
model_sabina.compile(loss="sparse_categorical_crossentropy",
              optimizer='adam', metrics=['accuracy'])

In [None]:
epochs = 100
batch_size = 128

# Fine-tune on the Sabina corpus in rounds and generate sample text after each round.
# There are int(epochs/5) = 20 rounds:
#   rounds 0-2   : one epoch each with a very large batch (1024 * 8),
#   rounds 3-9   : one epoch each with `batch_size`,
#   rounds 10-19 : five epochs each with `batch_size`.
epoch = 0
for epoch_ind in range(int(epochs/5)):
    if epoch_ind <= 2:
        # These rounds train an epoch too, so they must advance the counter;
        # otherwise the message below reports fewer epochs than were trained.
        epoch += 1
        model_sabina.fit(x, y, batch_size=1024 * 8, epochs=1)
    elif epoch_ind < 10:
        epoch += 1
        model_sabina.fit(x, y, batch_size=batch_size, epochs=1)
    else:
        epoch += 5
        model_sabina.fit(x, y, batch_size=batch_size, epochs=5)
    print()
    print("Generating text after epoch: %d" % epoch)

    # One random seed window per round, shared by all diversity settings.
    start_index = random.randint(0, len(text_sabina) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0]:
        print("...Diversity:", diversity)
        generated = ""
        sentence = text_sabina[start_index: start_index + maxlen]
        print('...Generating with seed: "' + sentence + '"')
        for i in range(250):
            # One-hot encode the current window (unknown characters -> index 0).
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices.get(char, 0)] = 1.0
            preds = model_sabina(x_pred).numpy()[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]
            # Slide the window one character forward and record the new character.
            sentence = sentence[1:] + next_char
            generated += next_char

        print("...Generated: ", generated)
        print()

## Practice: Create a model with regularization and compare the results only with the corpus of Sabina

In [None]:
# Exercise template: the `...` placeholders below are meant to be replaced
# before running this cell (a recurrent layer with regularization, and the
# activation of the output layer).
model_sabina = keras.Sequential()
model_sabina.add(keras.Input(shape=(maxlen, len(chars))))
# TODO(student): add the regularized layer(s) here.
model_sabina.add(...)
# TODO(student): choose the output activation; the output has one unit per vocabulary character.
model_sabina.add(layers.Dense(len(chars), activation=...))
model_sabina.compile(loss="sparse_categorical_crossentropy",
              optimizer='adam', metrics=['accuracy'])

In [None]:
epochs = 150
batch_size = 128

# Train in rounds and generate sample text after every round.
# There are epochs // 5 = 30 rounds: the first five train a single epoch each,
# the remaining ones train 5 epochs each (5 + 25 * 5 = 130 epochs in total).
epoch = 0
for epoch_ind in range(epochs // 5):
    epochs_this_round = 1 if epoch_ind < 5 else 5
    epoch += epochs_this_round
    model_sabina.fit(x, y, batch_size=batch_size, epochs=epochs_this_round)
    print()
    print("Generating text after epoch: %d" % epoch)

    # One random seed window per round, shared by all diversity settings.
    start_index = random.randint(0, len(text_sabina) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0]:
        print("...Diversity:", diversity)
        sentence = text_sabina[start_index: start_index + maxlen]
        print('...Generating with seed: "' + sentence + '"')
        generated = ""
        for i in range(250):
            # One-hot encode the current window (unknown characters -> index 0).
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices.get(char, 0)] = 1.0
            preds = model_sabina(x_pred).numpy()[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]
            # Record the sampled character and slide the window one step forward.
            generated += next_char
            sentence = sentence[1:] + next_char

        print("...Generated: ", generated)
        print()

### References
[https://keras.io/examples/generative/lstm_character_level_text_generation/](https://keras.io/examples/generative/lstm_character_level_text_generation/)