In [1]:
from pathlib import Path
import re
import numpy as np
import pandas as pd



In [2]:

# ⬇️ AJUSTA UNA DE ESTAS OPCIONES

# Opción A: RUTA ABSOLUTA (tu caso)
DIA_DIR = Path(r"C:\Users\gerar\Desktop\random\web-scraping\Supermercats\Data\Dia")


# (Opcional) nombre del CSV final
OUT = DIA_DIR / "dia_merged_clean.csv"

# Opción B: relativa al cuaderno (si el .ipynb está en Supermercats/)
# BONPREU_DIR = Path.cwd() / "Data" / "Bonpreu"

print("Existe carpeta?:", DIA_DIR.exists())
print("Carpeta:", DIA_DIR)

# Lista TODO lo que hay
print("\nContenido de la carpeta:")
for p in DIA_DIR.iterdir():
    print(" -", p.name)

# Prueba de patrones
cand_all = sorted(DIA_DIR.glob("*.csv"))
cand_bp  = sorted(DIA_DIR.glob("dia_*.csv"))

print("\n*.csv encontrados:", len(cand_all))
for p in cand_all: print("  ·", p.name)

print("\ndia_*.csv encontrados:", len(cand_bp))
for p in cand_bp: print("  ·", p.name)


Existe carpeta?: True
Carpeta: C:\Users\gerar\Desktop\random\web-scraping\Supermercats\Data\Dia

Contenido de la carpeta:
 - arroces.csv
 - dia_merged_clean.csv
 - garbanzos.csv
 - lentejas.csv
 - pastas.csv

*.csv encontrados: 5
  · arroces.csv
  · dia_merged_clean.csv
  · garbanzos.csv
  · lentejas.csv
  · pastas.csv

dia_*.csv encontrados: 1
  · dia_merged_clean.csv


# 2) Helpers de parseo (precios y tamaños)

In [3]:


def euro_to_float(s):
    if s is None or (isinstance(s, float) and np.isnan(s)): 
        return np.nan
    m = re.search(r"(\d+(?:[.,]\d+)?)", str(s))
    return float(m.group(1).replace(",", ".")) if m else np.nan

def parse_weight_kg(size):
    """Devuelve peso total en KG o np.nan si no se puede inferir."""
    if not size:
        return np.nan
    s = str(size).lower().replace("\u00a0", " ")

    # packs tipo '2 x 125 g' / '3x0,2 kg'
    m = re.search(r"(\d+)\s*x\s*([\d.,]+)\s*(kg|g)", s)
    if m:
        units = int(m.group(1))
        qty   = float(m.group(2).replace(",", "."))
        unit  = m.group(3)
        grams = qty * (1000 if unit == "kg" else 1)
        return (units * grams) / 1000.0

    # simple '500 g' / '0,5 kg'
    m = re.search(r"([\d.,]+)\s*(kg|g)", s)
    if m:
        qty  = float(m.group(1).replace(",", "."))
        unit = m.group(2)
        grams = qty * (1000 if unit == "kg" else 1)
        return grams / 1000.0

    return np.nan

def ppu_to_float(ppu):
    if not ppu: 
        return np.nan
    s = str(ppu).lower().replace("\u00a0", " ")
    m = re.search(r"(\d+(?:[.,]\d+)?)\s*€\s*(?:/|per)\s*(?:kilo|kg|quilo)", s) \
        or re.search(r"(\d+(?:[.,]\d+)?)\s*€/kg", s)
    return float(m.group(1).replace(",", ".")) if m else np.nan


# 3) Cargar y unir todos los CSV de la carpeta

In [4]:
csv_paths = sorted(DIA_DIR.glob("*.csv"))
if not csv_paths:
    raise FileNotFoundError(f"No hay CSV en {DIA_DIR}")

frames = []
for p in csv_paths:
    df = pd.read_csv(p, encoding="utf-8-sig")
    # normaliza nombres de columnas
    df.columns = [c.strip().lower() for c in df.columns]

    # map flexible (por si vienen de distintos scrapers)
    col_name = next((c for c in df.columns if c in ("name","nombre","título","titulo")), None)
    col_price = next((c for c in df.columns if c in ("price","precio","price (€)","preu","preu (€)")), None)
    col_ppu   = next((c for c in df.columns if c in ("price_per_unit","ppu","price_per_kg","€/kg","per_kg","price per unit")), None)
    col_size  = next((c for c in df.columns if c in ("size","tamaño","tamano","format","peso")), None)

    if col_name is None or col_price is None:
        # si no están, salta ese archivo
        print(f"⚠️  Saltando {p.name}: columnas clave no encontradas.")
        continue

    tmp = pd.DataFrame({
        "name": df[col_name].astype(str),
        "price_raw": df[col_price].astype(str),
        "ppu_raw": df[col_ppu].astype(str) if col_ppu else None,
        "size": df[col_size].astype(str) if col_size else None,
    })
    tmp["price (€)"] = tmp["price_raw"].apply(euro_to_float)
    tmp["price_per_kg"] = tmp["ppu_raw"].apply(ppu_to_float)

    # Completar €/kg con price/weight solo donde falte y el peso sea válido
    idx = tmp.index[tmp["price_per_kg"].isna()]
    if len(idx):
        kg = tmp.loc[idx, "size"].apply(parse_weight_kg)
        valid = kg.notna() & (kg > 0) & tmp.loc[idx, "price (€)"].notna()
        tmp.loc[idx, "price_per_kg"] = np.where(
            valid, tmp.loc[idx, "price (€)"] / kg, np.nan
        )


    mask_missing = tmp["price_per_kg"].isna()
    if mask_missing.any():
        kg = tmp.loc[mask_missing, "size"].apply(parse_weight_kg)
        tmp.loc[mask_missing, "price_per_kg"] = (
            tmp.loc[mask_missing, "price (€)"].values / kg.values
        )

    frames.append(tmp[["name","price (€)","price_per_kg","size"]])

df_all = pd.concat(frames, ignore_index=True)
print(df_all.shape)
df_all.head(10)


(242, 4)


Unnamed: 0,name,price (€),price_per_kg,size
0,Arroz extra Arrozona de Dia paquete 1 Kg,1.3,1.3,
1,Arroz largo de primera categoría Arrozona de D...,1.35,1.35,
2,Arroz basmati Selección Mundial de Dia paquete...,2.2,2.2,
3,Arroz redondo Al Punto Dia vaso 2 x 125 g,1.19,4.76,
4,Arroz redondo SOS paquete 1 Kg,1.88,1.88,
5,Arroz basmati Brillante vaso 2 x 125 g,1.38,5.52,
6,Arroz redondo Brillante paquete 1 Kg,2.79,2.79,
7,Arroz vaporizado Arrozona de Dia paquete 1 Kg,1.65,1.65,
8,Arroz integral Brillante vaso 2 x 125 g,1.25,5.0,
9,Arroz integral Arrozona de Dia paquete 1 Kg,1.65,1.65,


# 4) Limpieza y columnas finales

In [5]:
# quita filas sin precio o sin €/kg calculable
df_all = df_all.dropna(subset=["price (€)","price_per_kg"]).copy()

# dedupe conservando el más barato por (name, size)
df_all = (
    df_all.sort_values(["name","size","price_per_kg"])
          .drop_duplicates(subset=["name","size"], keep="first")
          .reset_index(drop=True)
)

# Si no te interesa 'size' en el resultado final:
df_view = df_all[["name","price (€)","price_per_kg"]].copy()

# ordena por €/kg ascendente
df_view = df_view.sort_values("price_per_kg", ascending=True).reset_index(drop=True)
df_view.head(20)


Unnamed: 0,name,price (€),price_per_kg
0,Espagueti Al Diante Dia bolsa 1 Kg,1.2,1.2
1,Macarrón Al Diante Dia bolsa 1 Kg,1.2,1.2
2,Arroz extra Arrozona de Dia paquete 1 Kg,1.3,1.3
3,Arroz largo de primera categoría Arrozona de D...,1.35,1.35
4,Fideo entrefino Al Diante Dia bolsa 500 g,0.8,1.6
5,Piñones Al Diante Dia bolsa 500 g,0.8,1.6
6,Fideuá Al Diante Dia bolsa 500 g,0.8,1.6
7,Macarrón Al Diante Dia bolsa 500 g,0.8,1.6
8,Fideo cabellín Al Diante Dia bolsa 500 g,0.8,1.6
9,Sopa maravilla Al Diante Dia bolsa 500 g,0.8,1.6


In [6]:
df_view.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          121 non-null    object 
 1   price (€)     121 non-null    float64
 2   price_per_kg  121 non-null    float64
dtypes: float64(2), object(1)
memory usage: 3.0+ KB


# 5) Tipos y guardados

In [7]:
print(df_view.dtypes)
df_view.to_csv(OUT, index=False, encoding="utf-8-sig")
print(f"✅ Guardado: {OUT}  |  filas={len(df_view)}")


name             object
price (€)       float64
price_per_kg    float64
dtype: object
✅ Guardado: C:\Users\gerar\Desktop\random\web-scraping\Supermercats\Data\Dia\dia_merged_clean.csv  |  filas=121
