Este es el notebook donde se define y procesa el modelo de recomendación, sobre la base del uso de la distancia coseno

In [1]:
!pip install scikit-learn

[0m

In [12]:
# Imports
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download("stopwords")
import etl_flow as etlflow
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import os

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Variables de ambiente (Para procesamiento en ambiente de desarrollo)
%env DIRECTORIO_RAIZ=/com.docker.devenvironments.code

env: DIRECTORIO_RAIZ=/com.docker.devenvironments.code


In [4]:
# Recupero el movies dataset preprocesado
m_df = etlflow.obtener_df_preprocesado('m_df')

In [5]:
# Eliminar las columnas no utilizadas. Conservar solo id y overview
m_df = m_df[['id', 'overview']]

A continuación, las funciones que permiten el preprocesamiento del modelo de recomendacion

In [16]:
# Elimina signos de puntuacion de una lista tokenizada
def limpia_signos_de_puntuacion(lista_tokens: list):
    token_rta = []
    for palabra in lista_tokens:
        for letra in palabra:
            if letra in string.punctuation:
                palabra=palabra.replace(letra,"")
        token_rta.append(palabra)
    return token_rta

# Elimina numeros
def limpia_numeros(lista_tokens: list):
    token_rta = []
    for palabra in lista_tokens:
        for letra in palabra:
            if letra in string.digits:
                palabra=palabra.replace(letra,"")
        token_rta.append(palabra)
    return token_rta

# Elimina tokens vacios
def elimina_tokens_vacios(lista_tokens: list):
    token_rta = []
    for palabra in lista_tokens:
        if palabra != "":
            token_rta.append(palabra)
    return token_rta

# Pasa los tokens a minusculas
def pasa_tokens_a_minusculas(lista_tokens: list):
    token_rta = []
    for palabra in lista_tokens:
        token_rta.append(palabra.lower())
    return token_rta

# Elimina tokens cortos
def elimina_tokens_cortos(lista_tokens: list):
    token_rta = []
    for palabra in lista_tokens:
        if len(palabra)>=3:
            token_rta.append(palabra)
    return token_rta

# Elimina stop words
def elimina_stop_words(lista_tokens: list):
    a=set(stopwords.words('english'))
    token_rta = [palabra for palabra in lista_tokens if palabra not in a]
    return token_rta

# Tokeniza el texto y limpia la lista tokenizada
def tokenize_and_clean(texto: str):
    token = word_tokenize(texto) # Tokenizo
    token = limpia_signos_de_puntuacion(token) # Limpio signos de puntuacion
    token = limpia_numeros(token) # Limpio números
    token = elimina_tokens_vacios(token) # Elimino tokens vacios
    token = pasa_tokens_a_minusculas(token) # Paso los tokens a minusculas
    token = elimina_tokens_cortos(token) # Elimino tokens cortos
    token = elimina_stop_words(token) # Elimino stop words del Inglés, de la lista de tokens
    return token

# Tokeniza, limpia y vuelve a armar en forma de string
def tokenizar_limpiar_y_obtener_string(texto: str):
    token_limpio = tokenize_and_clean(texto)
    t_str = ' '.join(token_limpio)
    return t_str

# Serializador / Deserializador de objetos
serializados_d = {'mtx_tfidf' : ['src/preproc/mtx_tfidf.tsv', '\t']}

def serializar(nombre_objeto: str, objeto: any):
       
       if nombre_objeto in serializados_d.keys():

              # Obtengo el directorio raiz desde la variable de entorno DIRECTORIO_RAIZ
              dir_raiz = os.getenv("DIRECTORIO_RAIZ")

              # Escribo el objeto al archivo correspondiente
              path_archivo = os.path.join(dir_raiz, serializados_d[nombre_objeto][0])
              with open(path_archivo, "wb") as archivo:
                  pickle.dump(objeto, archivo)

       else:

              # Error. El serializable no esta en la lista de serializables
              print('Error: El serializable no esta en la lista de serializables preprocesados')

def deserealizar(nombre_objeto: str):

       if nombre_objeto in serializados_d.keys():

              # Obtengo el directorio raiz desde la variable de entorno DIRECTORIO_RAIZ
              dir_raiz = os.getenv("DIRECTORIO_RAIZ")

              # Obtengo el objeto desde el archivo correspondiente
              objeto = None
              path_archivo = os.path.join(dir_raiz, serializados_d[nombre_objeto][0])
              with open(path_archivo, "rb") as archivo:
                   objeto = pickle.load(archivo)

              return objeto
       
       else:

              # Error. El objeto no ha sido preprocesado
              print('Error: El objeto no esta en la lista de serializables preprocesados')
              return None

In [7]:
# Elimino registros con valores nulos en la columna de overview
m_df.dropna(subset=['overview'], inplace=True)
m_df.reset_index(drop=True, inplace=True)

In [8]:
# Tokenizo, limpio y rearmo los overviews
m_df['cleansed_overview'] = m_df['overview'].apply(tokenizar_limpiar_y_obtener_string)

In [9]:
# Instancio un count vectorizer
tfidf_vectorizer = TfidfVectorizer()

In [10]:
# Armo la matriz Tf-Idf
documentos = m_df['cleansed_overview']
tfidf_matrix = tfidf_vectorizer.fit_transform(documentos)

In [14]:
serializar('mtx_tfidf', tfidf_matrix)

In [17]:
tfidf_matrix2 = deserealizar('mtx_tfidf')

In [18]:
# Pruebo con un caso
# Step 5: Choose a query document
query_document = "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

# Step 6: Find similar documents
query_tfidf = tfidf_vectorizer.transform([query_document])
similarity_scores = cosine_similarity(query_tfidf, tfidf_matrix2)
similarity_scores = similarity_scores.flatten()  # Convert to 1D array
related_documents_indices = similarity_scores.argsort()[::-1]  # Sort indices in descending order

# Top N similar documents
top_n = 5
top_documents = [documentos[index] for index in related_documents_indices[:top_n]]

print(related_documents_indices)
print(top_documents)

[    0 15271  2976 ... 28123 28121 22210]
['led woody andy toys live happily room andy birthday brings buzz lightyear onto scene afraid losing place andy heart woody plots buzz circumstances separate buzz woody owner duo eventually learns put aside differences', 'woody buzz rest andy toys played years andy college gang find accidentally left nefarious day care center toys must band together escape return home andy', 'andy heads cowboy camp leaving toys devices things shift high gear obsessive toy collector named mcwhiggen owner toy barn kidnaps woody andy toys mount daring rescue mission buzz lightyear meets match woody decide heart truly belong', 'andy stitzer pleasant life nice apartment job stamping invoices electronics store age one thing andy done really bothering sexobsessed male coworkers andy still virgin determined help andy get laid guys make mission devirginize seems hopeless andy meets small business owner trish single mom', 'fast food restaurant mini variant buzz forcibly 

In [11]:
# Veo la matriz en formato pandas (BORRAR esta celda luego)
feature_names = tfidf_vectorizer.get_feature_names_out()
pd.DataFrame(tfidf_matrix.toarray(), columns = feature_names)

Unnamed: 0,aaa,aaaron,aabgamma,aachan,aachi,aackerlund,aadhavan,aadland,aaicha,aakash,...,ரமண,శమ,แพร,たけみかずち,ようなもの,患者さんとその世界,주식회사,첫사랑,ﬁrst,ﬁve
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44420,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
m_df

Unnamed: 0,id,overview,cleansed_overview
0,862,"Led by Woody, Andy's toys live happily in his ...",led woody andy toys live happily room andy bir...
1,8844,When siblings Judy and Peter discover an encha...,siblings judy peter discover enchanted board g...
2,15602,A family wedding reignites the ancient feud be...,family wedding reignites ancient feud nextdoor...
3,31357,"Cheated on, mistreated and stepped on, the wom...",cheated mistreated stepped women holding breat...
4,11862,Just when George Banks has recovered from his ...,george banks recovered daughter wedding receiv...
...,...,...,...
44417,30840,"Yet another version of the classic epic, with ...",yet another version classic epic enough variat...
44418,111109,An artist struggles to finish his work while a...,artist struggles finish work storyline cult pl...
44419,67758,"When one of her hits goes wrong, a professiona...",one hits goes wrong professional assassin ends...
44420,227506,"In a small town live two brothers, one a minis...",small town live two brothers one minister one ...


In [14]:
m_df.loc[0, 'overview']

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."