In [1]:
import tensorflow as tf
import numpy as np 
import docx2txt
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import re
import string

In [2]:
data = docx2txt.process('carlos-drummond-de-andrade-poesia-completapdf.docx')

In [3]:
with open("output.txt", "w") as text_file:
    print(data, file=text_file)

In [4]:
# Load the dataset
data = open('./output.txt').read()

# Lowercase and split the text
corpus = data.lower().split("\n")


# Preview the result
print(corpus)

['let événements m’ennuieni', '', 'p. valer\\’', '', '', '', '', '', '', '', '', '', '', '', '', '', 'a américo facâ', '', '\t', '/ entre lobo e cao', '', '', '', '', '', 'dissolusao', '', '', '', 'escurece, e nao me seduz tatear sequer uma lampada. pois que aprouve ao dia findar, aceito a noite.', '', '', '', 'e com ela aceito que brote uma ordem outra de seres e coisas nio figuradas.', '', 'bra os cruzados.', '', 'vazio de quanto amavamos, mais vasto é o céu. povoa§ñes surgem do vacuo.', '', 'habito alguma?', '', 'e nem destaco minha pele da confluente escuridio.', '', 'um finn unanime concentra-se e pousa no ar. hesitando.', '', 'e aquele agressivo espirito que o dia carreia consigo,', '', 'ja nño oprime. assim a paz, destro§ada.', '', '', '', 'vai durar mil anos, on extinguir-se na cor do galo? esta rosa é definitiva,', '', 'ainda que pobre.', '', '', '', 'imaginapao, falsa demente, ja te desprezo. e tu, palavra.', '', '', '', '', '', '', '', 'z48', '', '', '', '', '', '', '', '', 

In [5]:
#corpus.remove('CARLOS')

In [6]:
# Initialize the Tokenizer class
tokenizer = Tokenizer()

# Generate the word index dictionary
tokenizer.fit_on_texts(corpus)

# Define the total words. You add 1 for the index `0` which is just the padding token.
total_words = len(tokenizer.word_index) + 1

print(f'word index dictionary: {tokenizer.word_index}')
print(f'total words: {total_words}')

word index dictionary: {'e': 1, 'de': 2, 'a': 3, 'que': 4, 'o': 5, 'se': 6, 'em': 7, 'do': 8, 'um': 9, 'nao': 10, 'no': 11, 'é': 12, 'mais': 13, 'os': 14, 'na': 15, 'me': 16, 'da': 17, 'uma': 18, 'nos': 19, 'como': 20, 'as': 21, 'para': 22, 'mas': 23, 'ao': 24, 'nio': 25, 'amor': 26, 'sem': 27, 'ou': 28, 'nem': 29, 'i': 30, 'por': 31, 'dos': 32, 'dia': 33, 'tudo': 34, 'ja': 35, 'ser': 36, 'te': 37, 'tempo': 38, 'meu': 39, 'com': 40, 'mundo': 41, 'seu': 42, 'sobre': 43, '—': 44, 'nossa': 45, 'ha': 46, 'nada': 47, 'das': 48, 'nosso': 49, 'onde': 50, 'vai': 51, 'sua': 52, 'amar': 53, 'era': 54, 'sonho': 55, 'quem': 56, 'pois': 57, 'mim': 58, 'eu': 59, 'vida': 60, 'bem': 61, 'talvez': 62, 'born': 63, 'mesmo': 64, 'ele': 65, 'so': 66, 'todos': 67, 'assim': 68, 'esse': 69, 'essa': 70, 'la': 71, 'men': 72, 'sempre': 73, 'entre': 74, 'coisas': 75, 'tua': 76, 'lo': 77, 'morte': 78, 'ar': 79, 'teu': 80, 'foi': 81, 'tarde': 82, 'sombra': 83, 'terra': 84, 'noite': 85, 'si': 86, 'sei': 87, 'forma':

In [7]:
# Initialize the sequences list
input_sequences = []

# Loop over every line
for line in corpus:

	# Tokenize the current line
	token_list = tokenizer.texts_to_sequences([line])[0]

	# Loop over the line several times to generate the subphrases
	for i in range(1, len(token_list)):
		
		# Generate the subphrase
		n_gram_sequence = token_list[:i+1]

		# Append the subphrase to the sequences list
		input_sequences.append(n_gram_sequence)

# Get the length of the longest line
max_sequence_len = max([len(x) for x in input_sequences])

# Pad all sequences
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Create inputs and label by splitting the last token in the subphrases
xs, labels = input_sequences[:,:-1],input_sequences[:,-1]

# Convert the label into one-hot arrays
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [8]:
# Get sample sentence
sentence = corpus[10].split()
print(f'sample sentence: {sentence}')

# Initialize token list
token_list = []

# Look up the indices of each word and append to the list
for word in sentence: 
  token_list.append(tokenizer.word_index[word])

# Print the token list
print(token_list)

sample sentence: []
[]


In [9]:
token_list

[]

In [10]:
# Pick element
elem_number = 5

# Print token list and phrase
print(f'token list: {xs[elem_number]}')
print(f'decoded to text: {tokenizer.sequences_to_texts([xs[elem_number]])}')

# Print label
print(f'one-hot label: {ys[elem_number]}')
print(f'index of label: {np.argmax(ys[elem_number])}')

token list: [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   3 973]
decoded to text: ['a américo']
one-hot label: [0. 0. 0. ... 0. 0. 0.]
index of label: 974


In [11]:
# Pick element
elem_number = 4

# Print token list and phrase
print(f'token list: {xs[elem_number]}')
print(f'decoded to text: {tokenizer.sequences_to_texts([xs[elem_number]])}')

# Print label
print(f'one-hot label: {ys[elem_number]}')
print(f'index of label: {np.argmax(ys[elem_number])}')

token list: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3]
decoded to text: ['a']
one-hot label: [0. 0. 0. ... 0. 0. 0.]
index of label: 973


In [12]:
# Hyperparameters
embedding_dim = 100
lstm_units = 150
learning_rate = 0.01

# Build the model
model = Sequential([
          Embedding(total_words, embedding_dim, input_length=max_sequence_len-1),
          Bidirectional(LSTM(lstm_units)),
          Dense(total_words, activation='softmax')
])

# Use categorical crossentropy because this is a multi-class problem
model.compile(
    loss='categorical_crossentropy', 
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), 
    metrics=['accuracy']
    )

# Print the model summary
model.summary()

2022-11-12 12:50:37.596234: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 57, 100)           368800    
                                                                 
 bidirectional (Bidirectiona  (None, 300)              301200    
 l)                                                              
                                                                 
 dense (Dense)               (None, 3688)              1110088   
                                                                 
Total params: 1,780,088
Trainable params: 1,780,088
Non-trainable params: 0
_________________________________________________________________


In [14]:
epochs = 10

# Train the model
history = model.fit(xs, ys, epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
import matplotlib.pyplot as plt

# Plot utility
def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.show()

# Visualize the accuracy
plot_graphs(history, 'accuracy')

In [17]:
import gradio as gr

In [22]:
# Define seed text
#seed_text = "mundo mundo vasto mundo mas"
def predictor(seed_text):
# Define total words to predict
	next_words = 10

# Loop until desired length is reached
	for _ in range(next_words):

	# Convert the seed text to a token sequence
		token_list = tokenizer.texts_to_sequences([seed_text])[0]

	# Pad the sequence
		token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
	
	# Feed to the model and get the probabilities for each index
		probabilities = model.predict(token_list)

	# Get the index with the highest probability
		predicted = np.argmax(probabilities, axis=-1)[0]

	# Ignore if index is 0 because that is just the padding.
		if predicted != 0:
		
		# Look up the word associated with the index. 
			output_word = tokenizer.index_word[predicted]

		# Combine with the seed text
		seed_text += " " + output_word
	return seed_text

# Print the result	
#print(seed_text)

demo = gr.Interface(
    fn=predictor,
    inputs=gr.inputs.Textbox(lines=5, label="Input Text"),
    outputs=gr.outputs.Textbox(label="Generated Text"),
)

demo.launch(share=True)
  



Running on local URL:  http://127.0.0.1:7864
Running on public URL: https://30d0792e7f526be7.gradio.app

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


(<gradio.routes.App at 0x7f82f7f016a0>,
 'http://127.0.0.1:7864/',
 'https://30d0792e7f526be7.gradio.app')

In [None]:
# Define seed text
seed_text = "e agora Jose"

# Define total words to predict
next_words = 15

# Loop until desired length is reached
for _ in range(next_words):

	# Convert the seed text to a token sequence
  token_list = tokenizer.texts_to_sequences([seed_text])[0]

	# Pad the sequence
  token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
	
	# Feed to the model and get the probabilities for each index
  probabilities = model.predict(token_list)

  # Pick a random number from [1,2,3]
  choice = np.random.choice([1,2,3])
	
  # Sort the probabilities in ascending order 
  # and get the random choice from the end of the array
  predicted = np.argsort(probabilities)[0][-choice]

	# Ignore if index is 0 because that is just the padding.
  if predicted != 0:
		
		# Look up the word associated with the index. 
	  output_word = tokenizer.index_word[predicted]

		# Combine with the seed text
	  seed_text += " " + output_word

# Print the result	
print(seed_text)