## Generador de títulos con Redes Neuronales

### 1.1 Importamos las librerías necesarias 

In [9]:
import pandas as pd
import string
import numpy as np
import json

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
import tensorflow.keras.utils as ku

In [11]:
import tensorflow as tf
tf.random.set_seed(2)
from numpy.random import seed
seed(1)

### 1.2 Importar la data necesaria

In [12]:
#load all the datasets 
df1 = pd.read_csv('/Users/erickavendanogarcia/Downloads/USvideos.csv')
df2 = pd.read_csv('/Users/erickavendanogarcia/Downloads/CAvideos.csv')
df3 = pd.read_csv('/Users/erickavendanogarcia/Downloads/GBvideos.csv')

#load the datasets containing the category names
data1 = json.load(open('/Users/erickavendanogarcia/Downloads/US_category_id.json'))
data2 = json.load(open('/Users/erickavendanogarcia/Downloads/CA_category_id.json'))
data3 = json.load(open('/Users/erickavendanogarcia/Downloads/GB_category_id.json'))

### 1.3 Definición de algunas funciones para realizar la limpieza de datos

In [13]:
def category_extractor(data):
    i_d = [data['items'][i]['id'] for i in range(len(data['items']))]
    title = [data['items'][i]['snippet']["title"] for i in range(len(data['items']))]
    i_d = list(map(int, i_d))
    category = zip(i_d, title)
    category = dict(category)
    return category

#create a new category column by mapping the category names to their id
df1['category_title'] = df1['category_id'].map(category_extractor(data1))
df2['category_title'] = df2['category_id'].map(category_extractor(data2))
df3['category_title'] = df3['category_id'].map(category_extractor(data3))

#join the dataframes
df = pd.concat([df1, df2, df3], ignore_index=True)

#drop rows based on duplicate videos
df = df.drop_duplicates('video_id')

#collect only titles of entertainment videos
#feel free to use any category of video that you want
entertainment = df[df['category_title'] == 'Entertainment']['title']
entertainment = entertainment.tolist()


#remove punctuations and convert text to lowercase
# Función para limpiar texto: elimina signos de puntuación, pasa a minúsculas y elimina caracteres no ASCII
def clean_text(text):
    text = ''.join(e for e in text if e not in string.punctuation).lower()
    text = text.encode('utf8').decode('ascii', 'ignore')
    return text

corpus = [clean_text(e) for e in entertainment]

In [14]:
# Tokenizer global para ser usado en funciones
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)

    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)

In [15]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max(len(x) for x in input_sequences)
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes=total_words)

    return predictors, label, max_sequence_len
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [16]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1  # secuencia sin la palabra objetivo
    model = Sequential()

    # Capa de embedding: convierte tokens en vectores densos de 10 dimensiones
    model.add(Embedding(total_words, 10, input_length=input_len))

    # Capa LSTM oculta: con 100 unidades
    model.add(LSTM(100))
    model.add(Dropout(0.1))  # Previene overfitting

    # Capa de salida: predice la siguiente palabra entre total_words opciones
    model.add(Dense(total_words, activation='softmax'))

    # Compila el modelo con entropía cruzada y optimizador Adam
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model


lstm_model = create_model(max_sequence_len, total_words)
lstm_model.summary()
# Crear y entrenar el modelo
model = create_model(max_sequence_len, total_words)
model.fit(predictors, label, epochs=100, verbose=5)



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.history.History at 0x149bc5190>

In [17]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # Método actualizado para predecir la clase con Keras moderno
        predicted_probs = model.predict(token_list, verbose=0)
        predicted = np.argmax(predicted_probs, axis=-1)[0]

        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break

        seed_text += " " + output_word
    return seed_text.title()


In [23]:
generate_text('film', 5, model, max_sequence_len)

'Film Moms Lilly Vs Logan Not'