# 04 — Contenido y Embeddings de Sinopsis

Genera **embeddings de sinopsis** (si hay Internet y `sentence-transformers` está disponible).
Si no, recurre a **TF‑IDF** como fallback local.

In [None]:
import sys
from pathlib import Path
# Resolve the repository root: when the working directory is named
# "notebooks", the root is its parent; otherwise the working directory itself.
if Path.cwd().name == "notebooks":
    repo_root = Path().resolve().parent
else:
    repo_root = Path().resolve()
# Prepend the root to sys.path; position 0 gives it priority during imports.
sys.path.insert(0, str(repo_root))


In [None]:
import polars as pl
from pathlib import Path

# Load the processed content table and print its shape and first 8 column names.
content_path = repo_root / "data" / "processed" / "content.parquet"
content = pl.read_parquet(content_path)
print("content:", content.shape, content.columns[:8])
# Name of the synopsis column, or None when the table has no such column.
text_col = None
if "synopsis" in content.columns:
    text_col = "synopsis"


In [None]:
# Try sentence-transformers embeddings first; if that fails, fall back to TF-IDF.
# `emb` stays None when there is no synopsis column.
import numpy as np
emb = None
if text_col:
    # Extract the texts once, before the try block: a failure here is a data
    # problem and must not be reported as "sentence-transformers unavailable".
    # Null synopses are replaced by empty strings.
    texts = content.select(text_col).to_series().fill_null("").to_list()
    try:
        from sentence_transformers import SentenceTransformer
        m = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        emb = m.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
        print("ST embeddings:", emb.shape)
    except Exception as e:
        # Deliberately broad: a missing package, a failed model load/download
        # or an encoding error should all trigger the local fallback.
        emb = None
        print("ST no disponible, usando TF-IDF. Motivo:", e)
    if emb is None:
        # The fallback runs outside the except handler so that a TF-IDF failure
        # is raised on its own instead of being chained to the previous error.
        from sklearn.feature_extraction.text import TfidfVectorizer
        vec = TfidfVectorizer(max_features=20000)
        # NOTE(review): per the scikit-learn docs, fit_transform returns a
        # sparse matrix, so `emb` is not the same type as in the
        # sentence-transformers branch — confirm consumers handle both.
        emb = vec.fit_transform(texts).astype("float32")
        print("TF-IDF matrix:", emb.shape)
else:
    print("No hay columna de sinopsis; salta a siguiente paso.")


In [None]:
# Save a simple (lightweight) similarity index: only the embeddings / TF-IDF
# matrix are pickled, together with the name of the text column.
import pickle
import scipy
out_dir = repo_root / "models"
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "content_embeddings.pkl"
payload = {"emb": emb, "text_col": text_col}
with open(out_path, "wb") as f:
    pickle.dump(payload, f)
print("Embeddings de contenido guardados en:", out_path)
