## Generador de títulos con Redes Neuronales
El objetivo es hacer una pequeña prueba de concepto: entrenar una red neuronal que sugiera títulos a los editores de Webedia LATAM, con base en los títulos que han generado más vistas en los últimos meses.

### 1.1 Importamos las librerías

In [2]:
import pandas as pd
import string
import numpy as np
import json


In [3]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
import tensorflow.keras.utils as ku

In [4]:
import tensorflow as tf
# Seed TensorFlow's global random generator.
tf.random.set_seed(2)
from numpy.random import seed
# Seed NumPy's legacy global random generator.
seed(1)

### 1.2 Importamos la data necesaria

In [7]:
import re
import unicodedata

def clean_text(text):
    """Normalize a title to lowercase a-z letters, "ñ" and single spaces.

    Accented letters lose their accent ("canción" -> "cancion"), while "ñ"
    is kept as its own letter. Stripping accents with NFD + ASCII encoding
    over the whole string would also turn "ñ" into "n" ("año" -> "ano"),
    which defeats the "ñ" explicitly allowed by the filter below, so the
    accent stripping is applied per character and skips "ñ".

    Args:
        text: Raw title string.

    Returns:
        The cleaned string: lowercase, only a-z, "ñ" and single spaces, with
        no leading or trailing whitespace. May be empty.
    """
    # Compose first so a decomposed "n" + combining tilde becomes one "ñ".
    text = unicodedata.normalize('NFC', text).lower()
    # Drop accents (and any other non-ASCII character) except for "ñ".
    text = ''.join(
        ch if ch == 'ñ'
        else unicodedata.normalize('NFD', ch).encode('ascii', 'ignore').decode('ascii')
        for ch in text
    )
    # Remove digits, punctuation and anything else that is not a letter/space.
    text = re.sub(r"[^a-zñ\s]", "", text)
    # Collapse whitespace runs left behind by the removals above.
    text = re.sub(r"\s+", " ", text).strip()
    return text


# NOTE(review): absolute path on one developer's machine; point this at a
# local copy of the CSV before running the notebook elsewhere.
df = pd.read_csv('/Users/erickavendanogarcia/Downloads/titulos.csv')

# Work with the pageTitle column: coerce every value to str, then clean it.
# NOTE(review): astype(str) turns missing values into the literal string
# "nan", which would then enter the corpus — confirm the column has no nulls.
df['pageTitle'] = df['pageTitle'].astype(str).apply(clean_text)
# Plain Python list of the cleaned titles.
corpus = df['pageTitle'].tolist()

In [8]:
def preprocess_titles(titles):
    """Return a list with `clean_text` applied to every title, in order."""
    return list(map(clean_text, titles))

### 1.3 Definimos algunas funciones

In [9]:
# Tokenize and generate sequences.
# The tokenizer lives at module level; get_sequence_of_tokens fits it in place.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the module-level tokenizer and build n-gram prefix sequences.

    Every title with at least two tokens contributes one sequence per prefix
    of length 2..len(title); the last token of each prefix is the word to be
    predicted from the ones before it.

    Args:
        corpus: Iterable of title strings.

    Returns:
        A tuple (sequences, total_words): the list of token-id prefixes and
        the size of the fitted word index plus one.
    """
    tokenizer.fit_on_texts(corpus)
    vocab_size = len(tokenizer.word_index) + 1

    prefixes = []
    for encoded in tokenizer.texts_to_sequences(corpus):
        # Prefixes of length 2, 3, ..., len(encoded).
        prefixes.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return prefixes, vocab_size

# Fit the tokenizer on the cleaned titles and build the n-gram prefixes.
inp_sequences, total_words = get_sequence_of_tokens(corpus)

# Padding and categorical (one-hot) encoding
def generate_padded_sequences(input_sequences):
    """Pad the sequences to a common length and split inputs from targets.

    Sequences are left-padded to the length of the longest one. The last
    column becomes the target, one-hot encoded over the module-level
    `total_words` classes; the remaining columns are the predictors.

    Args:
        input_sequences: Non-empty list of token-id sequences.

    Returns:
        A tuple (predictors, label, max_sequence_len).
    """
    max_sequence_len = max(map(len, input_sequences))

    padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    padded = np.array(padded)

    predictors = padded[:, :-1]
    targets = padded[:, -1]
    label = ku.to_categorical(targets, num_classes=total_words)

    return predictors, label, max_sequence_len

# Build the training arrays and record the padded sequence length.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [57]:
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction model.

    Architecture: 10-dimensional embedding -> LSTM with 100 units ->
    dropout of 0.2 -> softmax over `total_words` classes. Compiled with
    categorical cross-entropy loss and the Adam optimizer.

    Args:
        max_sequence_len: Padded sequence length including the target token;
            the model input is one token shorter.
        total_words: Number of classes for the embedding input and the
            softmax output.

    Returns:
        The compiled Sequential model.
    """
    # The last token of each padded sequence is the target, not an input.
    context_len = max_sequence_len - 1

    # NOTE(review): recent Keras releases appear to report `input_length` as
    # deprecated — confirm against the installed version.
    model = Sequential([
        Embedding(total_words, 10, input_length=context_len),
        LSTM(100),
        Dropout(0.2),
        Dense(total_words, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model


# Build one model instance and print its architecture.
lstm_model = create_model(max_sequence_len, total_words)
lstm_model.summary()
# Create and train the model.
# NOTE(review): this is a second, independent instance; `lstm_model` above is
# not trained anywhere in the visible code.
model = create_model(max_sequence_len, total_words)
# NOTE(review): verbose=5 is outside the documented 0/1/2 values — confirm
# the intended logging mode.
model.fit(predictors, label, epochs=50, verbose=5)



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.history.History at 0x15eade990>

In [58]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend `seed_text` with up to `next_words` greedily predicted words.

    On every step the current text is tokenized with the module-level
    `tokenizer`, left-padded to the model's input length, and the index with
    the highest predicted score is appended as the next word.

    Args:
        seed_text: Starting text.
        next_words: Maximum number of words to append.
        model: Object whose `predict` returns one score per class index for
            each input row.
        max_sequence_len: Padded sequence length including the target token;
            the model input is padded to `max_sequence_len - 1`.

    Returns:
        The extended text, title-cased.
    """
    # Build the index -> word table once instead of scanning the whole
    # word index for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

        predicted_probs = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(predicted_probs, axis=-1)[0])

        output_word = index_to_word.get(predicted)
        if output_word is None:
            # The predicted index has no word (e.g. the padding index 0).
            # Appending nothing would leave the model input unchanged, so
            # every remaining step would only add another blank: stop here.
            break

        seed_text += " " + output_word
    return seed_text.title()


In [None]:
# Example: extend the seed "Marvel" with up to five predicted words.
generate_text('Marvel', 5, model, max_sequence_len)