In [4]:
import pandas as pd, numpy as np, json
from pathlib import Path

# -------- Config m√≠nimos --------
MIN_ROWS  = 1000
MIN_USERS = 25
MIN_ITEMS = 25
BATCH_SIZE = 100  # tama√±o de bloque para "agregar paulatinamente"

# -------- Helpers de rutas --------
def find_dir_up(name: str, start: Path | None = None, max_up: int = 6) -> Path:
    p = start or Path.cwd()
    for _ in range(max_up + 1):
        d = p / name
        if d.is_dir():
            return d.resolve()
        p = p.parent
    raise FileNotFoundError(f"No se encontr√≥ ./{name} hacia arriba de {Path.cwd()}")

DATA = find_dir_up("data")
BASE = (DATA / "personalize"); BASE.mkdir(parents=True, exist_ok=True)
MIN_DIR = (BASE / "min"); MIN_DIR.mkdir(parents=True, exist_ok=True)
OPT_DIR = (BASE / "opt"); OPT_DIR.mkdir(parents=True, exist_ok=True)

# -------- Carga fuente --------
src_csv  = DATA / "data_consumos_tarjetas.csv"
src_xlsx = DATA / "data_consumos_tarjetas.xlsx"

if src_csv.exists():
    consumos = pd.read_csv(src_csv)
elif src_xlsx.exists():
    consumos = pd.read_excel(src_xlsx)
else:
    raise FileNotFoundError("No encuentro data_consumos_tarjetas.csv ni .xlsx en ./data")

# -------- Validaciones b√°sicas --------
def to_epoch_col(series: pd.Series) -> pd.Series:
    ts = pd.to_datetime(series, utc=True, errors="coerce")
    return (ts.view("int64") // 10**9)

def require(df, cols, name):
    miss = [c for c in cols if c not in df.columns]
    if miss:
        raise ValueError(f"{name}: faltan columnas {miss}")

require(consumos, ["nro_documento", "Rubro", "fecha"], "data_consumos_tarjetas")

# -------- Construcci√≥n de interactions --------
cols_opt = []
if "Monto" in consumos.columns: cols_opt.append("AMOUNT")
if "TipoTransaccion" in consumos.columns: cols_opt.append("EVENT_TYPE")

interactions = consumos.rename(columns={
    "nro_documento": "USER_ID",
    "Rubro": "ITEM_ID",
    "fecha": "TIMESTAMP",
    "Monto": "AMOUNT",
    "TipoTransaccion": "EVENT_TYPE",
})[["USER_ID","ITEM_ID","TIMESTAMP"] + cols_opt].copy()

interactions["USER_ID"]   = interactions["USER_ID"].astype(str)
interactions["ITEM_ID"]   = interactions["ITEM_ID"].astype(str)
interactions["TIMESTAMP"] = to_epoch_col(interactions["TIMESTAMP"])

if "AMOUNT" in interactions.columns:
    interactions["AMOUNT"] = pd.to_numeric(interactions["AMOUNT"], errors="coerce").fillna(0.0)

# Limpieza y deduplicado
interactions = interactions.dropna(subset=["USER_ID","ITEM_ID","TIMESTAMP"])
interactions = interactions.drop_duplicates(subset=["USER_ID","ITEM_ID","TIMESTAMP"], keep="last").copy()
interactions["is_synth"] = False

def stats(df):
    return dict(
        rows=int(len(df)),
        users=int(df["USER_ID"].nunique() if len(df) else 0),
        items=int(df["ITEM_ID"].nunique() if len(df) else 0),
        ts_min=int(df["TIMESTAMP"].min()) if len(df) else None,
        ts_max=int(df["TIMESTAMP"].max()) if len(df) else None,
        synth=float(df["is_synth"].mean()) if "is_synth" in df.columns and len(df) else 0.0,
    )

initial = stats(interactions)

# -------- ‚ÄúAgregar paulatinamente‚Äù si falta --------
rng = np.random.default_rng(42)
DEDUP_KEYS = ["USER_ID","ITEM_ID","TIMESTAMP"]

def augment_batch(df: pd.DataFrame, n: int) -> pd.DataFrame:
    if df.empty: 
        return pd.DataFrame(columns=df.columns)
    users = df["USER_ID"].value_counts(normalize=True)
    items = df["ITEM_ID"].value_counts(normalize=True)
    ts_min, ts_max = int(df["TIMESTAMP"].min()), int(df["TIMESTAMP"].max())

    batch = pd.DataFrame({
        "USER_ID": rng.choice(users.index, size=n, p=users.values, replace=True),
        "ITEM_ID": rng.choice(items.index, size=n, p=items.values, replace=True),
        "TIMESTAMP": rng.integers(ts_min, ts_max + 1, size=n, dtype=np.int64),
        "is_synth": True
    })

    if "AMOUNT" in df.columns:
        base = df["AMOUNT"].dropna()
        if len(base) == 0:
            batch["AMOUNT"] = 0.0
        else:
            sampled = rng.choice(base.values, size=n, replace=True)
            noise = rng.normal(0.0, max(1.0, base.std(ddof=0) * 0.05), size=n)
            batch["AMOUNT"] = np.maximum(0.0, sampled + noise)

    if "EVENT_TYPE" in df.columns:
        base_evt = df["EVENT_TYPE"].astype(str).fillna("event")
        vals, probs = np.unique(base_evt, return_counts=True)
        probs = probs / probs.sum()
        batch["EVENT_TYPE"] = rng.choice(vals, size=n, p=probs, replace=True)

    return batch.reindex(columns=df.columns).drop_duplicates(subset=DEDUP_KEYS)

cur = interactions.copy()
while (
    (len(cur) < MIN_ROWS) or
    (cur["USER_ID"].nunique() < MIN_USERS) or
    (cur["ITEM_ID"].nunique() < MIN_ITEMS)
):
    add = augment_batch(cur, BATCH_SIZE)

    # Forzar variedad si faltan uniques
    if cur["USER_ID"].nunique() < MIN_USERS and len(add):
        need = MIN_USERS - cur["USER_ID"].nunique()
        base_users = cur["USER_ID"].value_counts().index.tolist()[:max(1, need)]
        for i in range(min(need, len(add))):
            add.loc[add.index[i], "USER_ID"] = f"{base_users[i % len(base_users)]}_new{rng.integers(1,10**6)}"

    if cur["ITEM_ID"].nunique() < MIN_ITEMS and len(add):
        need = MIN_ITEMS - cur["ITEM_ID"].nunique()
        base_items = cur["ITEM_ID"].value_counts().index.tolist()[:max(1, need)]
        for i in range(min(need, len(add))):
            add.loc[add.index[i], "ITEM_ID"] = f"{base_items[i % len(base_items)]}_alt{rng.integers(1,10**6)}"

    cur = pd.concat([cur, add], ignore_index=True)
    cur = cur.drop_duplicates(subset=DEDUP_KEYS, keep="last")

# Orden y recorte (no eliminar reales)
cur = cur.sort_values("TIMESTAMP").reset_index(drop=True)
target_rows = max(MIN_ROWS, len(interactions))
if len(cur) > target_rows:
    cur = cur.iloc[:target_rows].copy()

final = stats(cur)

# -------- Guardado MIN (obligatorio para Personalize) --------
interactions_path = MIN_DIR / "interactions_mvp_validated.csv"
cur.drop(columns=["is_synth"]).to_csv(interactions_path, index=False)

schema_interactions = {
  "type": "record", "name": "Interactions",
  "namespace": "com.amazonaws.personalize.schema",
  "fields": [
    {"name": "USER_ID", "type": "string"},
    {"name": "ITEM_ID", "type": "string"},
    {"name": "TIMESTAMP", "type": "long"},
    {"name": "AMOUNT", "type": ["null","float"], "default": None},
    {"name": "EVENT_TYPE", "type": ["null","string"], "default": None}
  ],
  "version": "1.0"
}
with open(MIN_DIR / "schema_interactions.json", "w", encoding="utf-8") as f:
    json.dump(schema_interactions, f, indent=2)

# -------- Guardado OPT (users/items opcionales) --------
pd.DataFrame({"USER_ID": cur["USER_ID"].unique()}).to_csv(OPT_DIR / "users_mvp_min.csv", index=False)
pd.DataFrame({
    "ITEM_ID": pd.unique(cur["ITEM_ID"]),
    "title":   pd.unique(cur["ITEM_ID"]),
    "domain":  ["rubro"] * cur["ITEM_ID"].nunique()
}).to_csv(OPT_DIR / "items_mvp_min.csv", index=False)

schema_users = {
  "type": "record", "name": "Users",
  "namespace": "com.amazonaws.personalize.schema",
  "fields": [
    {"name": "USER_ID", "type": "string"},
    {"name": "segment", "type": ["null","string"], "default": None},
    {"name": "age", "type": ["null","int"], "default": None},
    {"name": "department", "type": ["null","string"], "default": None},
    {"name": "province", "type": ["null","string"], "default": None},
    {"name": "district", "type": ["null","string"], "default": None}
  ],
  "version": "1.0"
}
with open(OPT_DIR / "schema_users.json", "w", encoding="utf-8") as f:
    json.dump(schema_users, f, indent=2)

schema_items = {
  "type": "record", "name": "Items",
  "namespace": "com.amazonaws.personalize.schema",
  "fields": [
    {"name": "ITEM_ID", "type": "string"},
    {"name": "title", "type": ["null","string"], "default": None},
    {"name": "domain", "type": ["null","string"], "default": None}
  ],
  "version": "1.0"
}
with open(OPT_DIR / "schema_items.json", "w", encoding="utf-8") as f:
    json.dump(schema_items, f, indent=2)

# -------- Verificaci√≥n y assert de m√≠nimos --------
print("Initial:", initial)
print("Final  :", final)

assert final["rows"]  >= MIN_ROWS,  f"rows {final['rows']} < {MIN_ROWS}"
assert final["users"] >= MIN_USERS, f"users {final['users']} < {MIN_USERS}"
assert final["items"] >= MIN_ITEMS, f"items {final['items']} < {MIN_ITEMS}"

print("\nListo ‚úÖ")
print("MIN ->", interactions_path, "y schema_interactions.json")
print("OPT -> users_mvp_min.csv, items_mvp_min.csv, schema_users.json, schema_items.json")

  return (ts.view("int64") // 10**9)


Initial: {'rows': 5477922, 'users': 100890, 'items': 285, 'ts_min': 1719792000, 'ts_max': 1753833600, 'synth': 0.0}
Final  : {'rows': 5477922, 'users': 100890, 'items': 285, 'ts_min': 1719792000, 'ts_max': 1753833600, 'synth': 0.0}

Listo ‚úÖ
MIN -> D:\repos-eddie\poc-recommendation-fb\data\personalize\min\interactions_mvp_validated.csv y schema_interactions.json
OPT -> users_mvp_min.csv, items_mvp_min.csv, schema_users.json, schema_items.json


In [5]:
import pandas as pd
from pathlib import Path

# Ruta al archivo de la carpeta MIN
path_csv = Path("data/personalize/min/interactions_mvp_validated.csv")

# Cargar dataset
df = pd.read_csv(path_csv)

# Mostrar informaci√≥n general
print("üìä Shape:", df.shape)
print("üß± Columnas:", df.columns.tolist())
print("\nPrimeras filas:")
display(df.head(5))

# Mostrar valores √∫nicos por columna (solo hasta 10 para no saturar)
print("\nüîπ Valores √∫nicos por campo (m√°x 10 muestras):")
for col in df.columns:
    uniques = df[col].dropna().unique()
    sample = uniques[:10] if len(uniques) > 10 else uniques
    print(f"\n{col}: {len(uniques)} √∫nicos")
    print("Ejemplo:", sample)

üìä Shape: (5477922, 5)
üß± Columnas: ['USER_ID', 'ITEM_ID', 'TIMESTAMP', 'AMOUNT', 'EVENT_TYPE']

Primeras filas:


Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP,AMOUNT,EVENT_TYPE
0,7548266,"Supermercados, abarrotes",1719792000,27.0,Consumos Credito
1,46564290,Tiendas de productos varios,1719792000,19.3,Consumos Debito
2,46823325,Llamadas a trav√©s del uso de tel√©fonos de lect...,1719792000,5.0,Consumos Debito
3,46823325,"Programaci√≥n de computadoras, procesamiento de...",1719792000,18.5,Consumos Debito
4,29535842,"Supermercados, abarrotes",1719792000,20.9,Consumos Debito



üîπ Valores √∫nicos por campo (m√°x 10 muestras):

USER_ID: 100890 √∫nicos
Ejemplo: ['07548266' '46564290' '46823325' '29535842' '42860950' '73615200'
 '46805779' '25745599' '42214622' '42705582']

ITEM_ID: 284 √∫nicos
Ejemplo: ['Supermercados, abarrotes' 'Tiendas de productos varios'
 'Llamadas a trav√©s del uso de tel√©fonos de lectura de banda magn√©tica.'
 'Programaci√≥n de computadoras, procesamiento de datos, sistemas integrados y servicios de dise√±o'
 'Tiendas de comida- almacenes y mercados especialidades.'
 'Pagos de impuestos' 'Panader√≠as' 'Farmacias y boticas'
 'Cargos de llamadas' 'Calzado comercial.']

TIMESTAMP: 395 √∫nicos
Ejemplo: [1719792000 1719878400 1719964800 1720051200 1720137600 1720224000
 1720310400 1720396800 1720483200 1720569600]

AMOUNT: 270977 √∫nicos
Ejemplo: [27.  19.3  5.  18.5 20.9 20.7 14.3 20.  12.  82.9]

EVENT_TYPE: 9 √∫nicos
Ejemplo: ['Consumos Credito' 'Consumos Debito' 'Retiro' 'Consulta' 'Transferencia'
 'Disposicion de Efectivo Debito' 'Co

In [8]:
import pandas as pd
from pathlib import Path
import numpy as np

# --------- Config m√≠nima ----------
SRC = Path("data/personalize/min/interactions_mvp_validated.csv")
DST = Path("data/personalize/min/interactions_mvp_filtered1000.csv")
TARGET_ROWS = 1000
MIN_USERS = 25
MIN_ITEMS = 25
ALLOWED_EVENTS = {"Consumos Credito", "Consumos Debito"}  # ajusta si quieres

# --------- Cargar ----------
df = pd.read_csv(SRC)
# Si existe EVENT_TYPE, filtra a los que aportan se√±al de consumo/pago
if "EVENT_TYPE" in df.columns:
    df = df[df["EVENT_TYPE"].isin(ALLOWED_EVENTS)].copy()

# Limpieza m√≠nima + unicidad por interacci√≥n
df = df.dropna(subset=["USER_ID","ITEM_ID","TIMESTAMP"])
df = df.drop_duplicates(subset=["USER_ID","ITEM_ID","TIMESTAMP"], keep="last")

# --------- Validar m√≠nimos de unicidad antes de muestrear ----------
u_users = df["USER_ID"].nunique()
u_items = df["ITEM_ID"].nunique()
if u_users < MIN_USERS or u_items < MIN_ITEMS:
    raise ValueError(f"Despu√©s del filtro no se cumplen m√≠nimos: users={u_users} (>= {MIN_USERS}), items={u_items} (>= {MIN_ITEMS}).")

# --------- Muestreo a exactamente 1000 filas (estratificado simple) ----------
# Preferimos balancear por EVENT_TYPE si existe; si no, muestreo aleatorio simple
rng = 42
if "EVENT_TYPE" in df.columns:
    k = df["EVENT_TYPE"].nunique()
    per = max(1, TARGET_ROWS // k)
    sample = (
        df.groupby("EVENT_TYPE", group_keys=False)
          .apply(lambda g: g.sample(n=min(per, len(g)), random_state=rng))
          .reset_index(drop=True)
    )
    # completar si faltan filas
    if len(sample) < TARGET_ROWS:
        need = TARGET_ROWS - len(sample)
        extra = df.drop(sample.index, errors="ignore")
        extra = extra.sample(n=min(need, len(extra)), random_state=rng)
        sample = pd.concat([sample, extra], ignore_index=True)
else:
    sample = df.sample(n=min(TARGET_ROWS, len(df)), random_state=rng)

# Si por alg√∫n motivo a√∫n falta, completar con aleatorio (sin reemplazo si se puede)
if len(sample) < TARGET_ROWS and len(df) > len(sample):
    need = TARGET_ROWS - len(sample)
    extra = df.drop(sample.index).sample(n=min(need, len(df)-len(sample)), random_state=rng)
    sample = pd.concat([sample, extra], ignore_index=True)

# Si sobra (raro), recorta
sample = sample.sample(n=TARGET_ROWS, random_state=rng) if len(sample) >= TARGET_ROWS else sample

# --------- Validaciones finales ----------
assert len(sample) == min(TARGET_ROWS, len(df)), f"Filas={len(sample)}"
assert sample["USER_ID"].nunique() >= MIN_USERS, "No se cumple m√≠nimo de usuarios."
assert sample["ITEM_ID"].nunique() >= MIN_ITEMS, "No se cumple m√≠nimo de √≠tems."

# --------- Guardar ----------
DST.parent.mkdir(parents=True, exist_ok=True)
sample.to_csv(DST, index=False)

print("‚úÖ Listo:", DST)
print("rows:", len(sample),
      "| users:", sample["USER_ID"].nunique(),
      "| items:", sample["ITEM_ID"].nunique(),
      "| event_types:", sample["EVENT_TYPE"].nunique() if "EVENT_TYPE" in sample.columns else "-")
print("Primeras filas:")
display(sample.head(5))

‚úÖ Listo: data\personalize\min\interactions_mvp_filtered1000.csv
rows: 1000 | users: 973 | items: 92 | event_types: 2
Primeras filas:


  .apply(lambda g: g.sample(n=min(per, len(g)), random_state=rng))


Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP,AMOUNT,EVENT_TYPE
521,40006125,"Supermercados, abarrotes",1742601600,38.92,Consumos Debito
737,40620726,Tiendas por departamento,1727654400,79.9,Consumos Debito
740,40289830,Restaurantes y lugares para comer.,1726012800,40.0,Consumos Debito
660,7624233,Tiendas por departamento,1736467200,80.33,Consumos Debito
411,46121117,Ventas de seguros y primas,1742428800,107.49,Consumos Credito
