Instalación e importación de librerías

In [47]:
!pip install lightfm
!pip install psutil



In [None]:
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from sklearn.preprocessing import normalize
from scipy.sparse import coo_matrix, csr_matrix, lil_matrix
from sklearn.model_selection import train_test_split
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score
import kagglehub
from transformers import SiglipModel, SiglipProcessor
import torch
import requests
from PIL import Image
import os
import psutil
import time
import pickle
import joblib
from joblib import Parallel, delayed
from sklearn.decomposition import TruncatedSVD

Descargar datos y filtrarlos

In [None]:
path = kagglehub.dataset_download("threnjen/board-games-database-from-boardgamegeek/versions/4")
print("Path to dataset files:", path)

%cd /root/.cache/kagglehub/datasets/threnjen/board-games-database-from-boardgamegeek/versions/4

games_df = pd.read_csv("games.csv")
user_ratings_df = pd.read_csv("user_ratings.csv")

games_df_filtered = games_df[["BGGId", "Description", "ImagePath"]].dropna().head(100)
texts = games_df_filtered["Description"].tolist()
image_paths = games_df_filtered["ImagePath"].tolist()

print("Ejemplo de descripciones:")
print(texts[:5])
print("\nEjemplo de rutas de imágenes:")
print(image_paths[:5])

Path to dataset files: /root/.cache/kagglehub/datasets/threnjen/board-games-database-from-boardgamegeek/versions/4
/root/.cache/kagglehub/datasets/threnjen/board-games-database-from-boardgamegeek/versions/4
Ejemplo de descripciones:
['die macher game seven sequential political race different region germany player charge national political party manage limited resource help party victory win party victory point regional election different way score victory point regional election supply eighty victory point depend size region party second party win regional election medium influence region party receive mediacontrol victory point party national party membership grow game progress supply fair number victory point lastly party score victory point party platform match national opinion end gamethe   edition feature party old west germany support   player   edition support player reunite germany update feature rule     edition support player add short fiveround variant additional rule update

Generación de embeddings

In [50]:
model_name = "google/siglip-base-patch16-224"
model = SiglipModel.from_pretrained(model_name)
processor = SiglipProcessor.from_pretrained(model_name)

Cargar embeddings

In [52]:
print("Cargando embeddings pre-generados desde archivos .pt...")
siglip_text_embeds = torch.load("/content/SIGLIP_text_embeddings.pt")
siglip_image_embeds = torch.load("/content/SIGLIP_image_embeddings.pt")
siglip_multimodal_embeds = torch.load("/content/SIGLIP_multimodal_embeddings.pt")

print("Embeddings de texto cargados:", siglip_text_embeds.shape)
print("Embeddings de imágenes cargados:", siglip_image_embeds.shape)
print("Embeddings multimodales cargados:", siglip_multimodal_embeds.shape)

Cargando embeddings pre-generados desde archivos .pt...
Embeddings de texto cargados: torch.Size([100, 768])
Embeddings de imágenes cargados: torch.Size([100, 768])
Embeddings multimodales cargados: torch.Size([100, 768])


  siglip_text_embeds = torch.load("/content/text_embeddings.pt")
  siglip_image_embeds = torch.load("/content/image_embeddings.pt")
  siglip_multimodal_embeds = torch.load("/content/multimodal_embeddings.pt")


Normalizar embeddings

In [None]:
text_embeds = normalize(siglip_text_embeds.numpy()).astype(np.float32).copy() 
image_embeds = normalize(siglip_image_embeds.numpy()).astype(np.float32).copy() 
multimodal_embeds = normalize(siglip_multimodal_embeds.numpy()).astype(np.float32).copy() 

In [None]:
def normalize_embeddings(embeds, target_dim=128):
    svd = TruncatedSVD(n_components=target_dim, random_state=42)
    return svd.fit_transform(embeds)

text_embeds = normalize_embeddings(text_embeds)
image_embeds = normalize_embeddings(image_embeds)
multimodal_embeds = normalize_embeddings(multimodal_embeds)

Mapear juegos y usuarios a índices y consruir características de ítems alineadas con game mapping

In [None]:
game_mapping = {game: idx for idx, game in enumerate(games_df_filtered["BGGId"])}
user_mapping = {user: idx for idx, user in enumerate(user_ratings_df["Username"].unique())}

filtered_ratings = user_ratings_df[user_ratings_df["BGGId"].isin(games_df_filtered["BGGId"])]

interactions = coo_matrix((
    filtered_ratings["Rating"].values,
    (
        filtered_ratings["Username"].map(user_mapping),
        filtered_ratings["BGGId"].map(game_mapping)
    )
)).tocsr().astype(np.float32).copy()

In [None]:
def build_item_features(embeddings, game_mapping):
    num_items = len(game_mapping)
    embedding_dim = embeddings.shape[1]

    features = lil_matrix((num_items, embedding_dim), dtype=np.float32)
    for game_id, idx in game_mapping.items():
        if idx < len(embeddings):
            features[idx, :] = embeddings[idx]

    return csr_matrix(features).copy() 

text_features = build_item_features(text_embeds, game_mapping)
image_features = build_item_features(image_embeds, game_mapping)
multimodal_features = build_item_features(multimodal_embeds, game_mapping)

Definir funciones para el cálculo de métricas

In [67]:
game_themes_df = pd.read_csv("themes.csv")
user_ratings_df = pd.read_csv("user_ratings.csv")

category_columns = game_themes_df.columns[1:]

item_categories = {}
for _, row in game_themes_df.iterrows():
    game_id = row["BGGId"]
    categories = [category for category in category_columns if row[category] == 1]
    item_categories[game_id] = "|".join(categories)

print(f"Ejemplo de categorías de ítems: {list(item_categories.items())[:5]}")

def calculate_items_popularity(user_ratings_df):
    item_counts = Counter(user_ratings_df["BGGId"])
    total_ratings = sum(item_counts.values())
    items_popularity = {item: count / total_ratings for item, count in item_counts.items()}
    return items_popularity

items_popularity = calculate_items_popularity(user_ratings_df)

Ejemplo de categorías de ítems: [(1, 'Economic|Political'), (2, 'Fantasy'), (3, 'Medieval|Theme_Samurai'), (4, 'Ancient'), (5, 'Economic')]


In [None]:
def evaluate_user(model, user_id, interactions, item_features, k=10, index_to_item_id=None, item_categories=None, items_popularity=None):
    positive_items = interactions.tocsr()[user_id].indices
    if len(positive_items) == 0:
        return (0.0, 0.0, 0.0, 0.0, 0.0)

    user_id = int(user_id) 
    scores = model.predict(user_id, np.arange(interactions.shape[1]), item_features=item_features)
    ranked_items = np.argsort(-scores)[:k]

    top_k_item_ids = [index_to_item_id[item_idx] for item_idx in ranked_items]

    ap = 0.0
    ndcg = 0.0
    num_hits = 0.0
    for i, item in enumerate(ranked_items):
        if item in positive_items:
            num_hits += 1
            ap += num_hits / (i + 1)
            ndcg += 1 / np.log2(i + 2)
    ap /= min(k, len(positive_items))
    ndcg /= min(k, len(positive_items))
    recall = num_hits / len(positive_items)

    div = diversity(top_k_item_ids, item_categories)
    nov = novelty(top_k_item_ids, items_popularity)

    return (ap, ndcg, recall, div, nov)

def evaluate_ranking_lightfm_model(model, interactions, item_features=None, k=10, num_users_to_evaluate=50, num_jobs=-1):
    num_users, num_items = interactions.shape
    mean_ap = 0.0
    mean_ndcg = 0.0
    mean_recall = 0.0
    total_diversity = 0.0
    total_novelty = 0.0
    users_with_positive = 0
    index_to_item_id = {idx: item_id for item_id, idx in game_mapping.items()}
    if num_users > num_users_to_evaluate:
        sampled_user_ids = np.random.choice(num_users, size=num_users_to_evaluate, replace=False)
    else:
        sampled_user_ids = np.arange(num_users)

    def evaluate_single_user(user_id):
        return evaluate_user(model, user_id, interactions, item_features, k, index_to_item_id, item_categories, items_popularity)

    results = Parallel(n_jobs=num_jobs, backend='threading')(
    delayed(evaluate_single_user)(user_id) for user_id in sampled_user_ids
)
    for ap, ndcg, recall, div, nov in results:
        if ap > 0 or ndcg > 0 or recall > 0:
            mean_ap += ap
            mean_ndcg += ndcg
            mean_recall += recall
            total_diversity += div
            total_novelty += nov
            users_with_positive +=1

    if users_with_positive > 0:
        mean_ap /= users_with_positive
        mean_ndcg /= users_with_positive
        mean_recall /= users_with_positive
        avg_diversity = total_diversity / users_with_positive
        avg_novelty = total_novelty / users_with_positive
    else:
        mean_ap, mean_ndcg, mean_recall = 0.0, 0.0, 0.0
        avg_diversity, avg_novelty = 0.0, 0.0

    return mean_ap, mean_ndcg, mean_recall, avg_diversity, avg_novelty

def diversity(recommended_items, item_categories):
    unique_categories = set()
    for item in recommended_items:
        categories = item_categories.get(item, "unknown").split("|")
        unique_categories.update(categories)
    return len(unique_categories) / len(recommended_items) if recommended_items else 0

def novelty(recommended_items, items_popularity):
    novelty_score = 0.0
    for item in recommended_items:
        popularity = items_popularity.get(item, 1e-9)
        novelty_score += -np.log2(popularity)
    return novelty_score / len(recommended_items) if recommended_items else 0

Funciones para guardar y cargar modelo

In [59]:
def save_model(model, filename, drive_path="/content/drive/MyDrive/models/"):
    os.makedirs(drive_path, exist_ok=True)
    joblib.dump(model, os.path.join(drive_path, filename))
    print(f"Model saved to {drive_path}{filename}")

def load_model(filename, drive_path="/content/drive/MyDrive/models/"):
    return joblib.load(os.path.join(drive_path, filename))

Entrenamiento del modelo

In [None]:
def train_and_evaluate(interactions, item_features, name, save=True, num_users_to_evaluate=50):
    start_time = time.time()
    model = LightFM(loss='warp', random_state=42)
    model.fit(interactions, item_features=item_features, epochs=10, num_threads=4)
    training_time = time.time() - start_time
    process = psutil.Process(os.getpid())
    memory_info = process.memory_info()
    memory_used = memory_info.rss / (1024 * 1024) 
    cpu_usage = psutil.cpu_percent(interval=1)

    if save:
        save_model(model, f"{name}_model.pkl")

    precision = precision_at_k(model, interactions, k=10).mean()
    recall_lightfm = recall_at_k(model, interactions, k=10).mean()
    mean_ap, mean_ndcg, mean_recall, avg_diversity, avg_novelty = evaluate_ranking_lightfm_model(
        model, interactions, item_features, k=10, num_users_to_evaluate=num_users_to_evaluate, num_jobs=-1
    )

    print(f"{name} Metrics:")
    print(f"Training Time: {training_time:.2f} seconds")
    print(f"Memory Used: {memory_used:.2f} MB")
    print(f"CPU Usage: {cpu_usage:.2f}%")
    print(f"Precision@10: {precision}")
    print(f"Recall@10 (LightFM): {recall_lightfm}")
    print(f"MAP@10: {mean_ap}")
    print(f"NDCG@10: {mean_ndcg}")
    print(f"Recall@10 (Custom): {mean_recall}")
    print(f"Diversity@10: {avg_diversity}")
    print(f"Novelty@10: {avg_novelty}")
    print()


Entrenar  evaluar modelo

In [None]:
train_and_evaluate(interactions, text_features, "Siglip_TextEmbeddings", num_users_to_evaluate=50)
train_and_evaluate(interactions, image_features, "Siglip_ImageEmbeddings", num_users_to_evaluate=50)
train_and_evaluate(interactions, multimodal_features, "Siglip_MultimodalEmbeddings", num_users_to_evaluate=50)

Model saved to /content/drive/MyDrive/models/Siglip_TextEmbeddings_model.pkl
precidion y recall ok
Siglip_TextEmbeddings Metrics:
Training Time: 71.92 seconds
Memory Used: 3384.44 MB
CPU Usage: 14.60%
Precision@10: 0.034672897309064865
Recall@10 (LightFM): 0.10562329143111987
MAP@10: 0.09491798941798943
NDCG@10: 0.14972820810766632
Recall@10 (Custom): 0.34079365079365076
Diversity@10: 1.0
Novelty@10: 13.452978115941514

Model saved to /content/drive/MyDrive/models/Siglip_ImageEmbeddings_model.pkl
precidion y recall ok
Siglip_ImageEmbeddings Metrics:
Training Time: 72.24 seconds
Memory Used: 3401.87 MB
CPU Usage: 3.00%
Precision@10: 0.0352298803627491
Recall@10 (LightFM): 0.09963953854171333
MAP@10: 0.10615079365079365
NDCG@10: 0.11897457737129973
Recall@10 (Custom): 0.2205387205387205
Diversity@10: 1.0
Novelty@10: 13.452978115941514

Model saved to /content/drive/MyDrive/models/Siglip_MultimodalEmbeddings_model.pkl
precidion y recall ok
Siglip_MultimodalEmbeddings Metrics:
Training Tim

Bibliografía:

Referencia a la documentación oficial:

LightFM. LightFM Documentation. Disponible en: https://making.lyst.com/lightfm/docs/home.html

Referencia al repositorio en GitHub:

LightFM. LightFM GitHub Repository. Disponible en: https://github.com/lyst/lightfm

Referencia al repositorio de GitHub del práctico

https://github.com/PUC-RecSys-Class/RecSysPUC-2024-2/blob/master/practicos/pr%C3%A1ctico_m%C3%A9tricas.ipynb
