In [4]:
from __future__ import annotations
from pathlib import Path
from datetime import datetime, timedelta
from collections import defaultdict
from typing import Dict, List
from random import Random
import unicodedata, hashlib, json, re, uuid
import pandas as pd

# Input files (relative paths, so they resolve against the working directory).
PATH_TICKETS = Path("tickets_sinteticos.json")    # base tickets, read by load_base_tickets
PATH_PEAJES  = Path("peajes_ligeros.csv")        # toll tariff table, read by load_peajes

# Output files written by generar_y_exportar_JSON.
OUT_JSON = Path("tickets_peaje.json")             
OUT_CSV  = Path("tickets_peaje.csv")              


def strip_accents(s: str) -> str:
    """Return *s* without combining marks (accents, diaeresis, tilde...).

    The text is decomposed with NFKD first, so every accented letter becomes
    a base letter followed by combining characters, which are then dropped.
    ``None`` is mapped to the empty string.
    """
    if s is None:
        return ""
    kept = []
    for ch in unicodedata.normalize("NFKD", s):
        if unicodedata.combining(ch):
            continue
        kept.append(ch)
    return "".join(kept)

# Alternate province spellings -> key used by the toll lookup tables.
# Keys are upper-case, accent-free and have no spaces around "/".
_PROV_SYNONYMS = {
    "VIZCAYA": "BIZKAIA",
    "GUIPUZCOA": "GIPUZKOA",
    "GUIPUZKOA": "GIPUZKOA",
    "LA CORUNA": "A CORUNA",
    "CORUNA": "A CORUNA",
    "CORUNA (A)": "A CORUNA",
    "GERONA": "GIRONA",
    "LERIDA": "LLEIDA",
    "ALAVA/ARABA": "ALAVA",
    "ARABA/ALAVA": "ALAVA",
    "ARABA": "ALAVA",
    # "RIOJA (LA)" loses its "(LA)" suffix below and must end up as "LA RIOJA".
    "RIOJA": "LA RIOJA",
    "VALENCIA/VALENCIA": "VALENCIA",
    "ALICANTE/ALACANT": "ALICANTE",
    "ALACANT": "ALICANTE",
    "CASTELLON/CASTELLO": "CASTELLON",
    "CASTELLO": "CASTELLON",
}


def norm_prov(s: str) -> str:
    """Normalise a province name to the key used by the toll lookup tables.

    The name is trimmed, upper-cased and stripped of accents; bilingual
    suffixes (" / VALENCIA", " / CASTELLON", " / CASTELLO") and the article
    markers "(LAS)" / "(LA)" are removed; whitespace is collapsed; finally
    known alternate spellings are mapped to one canonical form.

    Args:
        s: Raw province name; ``None`` or empty is accepted.

    Returns:
        The canonical upper-case province key, or the cleaned-up input when
        no synonym applies ("" for ``None``/empty input).
    """
    s = strip_accents((s or "").strip().upper())
    # Accents are already gone here, so only accent-free suffixes can match.
    s = (s.replace(" / VALENCIA", "")
          .replace(" / CASTELLON", "").replace(" / CASTELLO", "")
          .replace("(LAS)", "").replace("(LA)", ""))
    s = re.sub(r"\s+", " ", s).strip(" /")
    # Look synonyms up ignoring spaces around "/", but return unmatched names
    # exactly as before so unknown inputs keep their previous output.
    lookup_key = re.sub(r"\s*/\s*", "/", s)
    return _PROV_SYNONYMS.get(lookup_key, s)

def split_tramo(tramo: str) -> tuple[str, str]:
    """Split a toll-segment label into ``(origin, destination)``.

    The separator is a hyphen or en dash surrounded by whitespace. Only a
    label that splits into exactly two pieces is treated as a pair; anything
    else is returned whole as the origin with an empty destination.
    Non-string input yields ``("", "")``.
    """
    if not isinstance(tramo, str):
        return "", ""
    label = tramo.strip()
    try:
        # Unpacking fails unless the split produced exactly two parts.
        first, second = re.split(r"\s+[–-]\s+|\s+-\s+", label)
    except ValueError:
        return label, ""
    return first.strip(), second.strip()

def eur_str(v: float) -> str:
    """Format *v* as a euro amount: two decimals, decimal comma, " €" suffix."""
    amount = format(v, "0.2f")
    # Swap the decimal point for a comma.
    return ",".join(amount.split(".")) + " €"

def gen_ref(prefix: str = "PASO") -> str:
    """Return a reference "<prefix>-XXXXXXXX" with 8 random upper-case hex digits.

    The digits come from a fresh UUID4, so every call gives a new value.
    """
    token = uuid.uuid4().hex.upper()[:8]
    return "{}-{}".format(prefix, token)

def _hash_to_minutes(key: str, span_min: int = 180) -> int:
    """Map *key* deterministically to a minute offset.

    The first four bytes of the SHA-256 digest of *key* are reduced to the
    range ``[-span_min, span_min]``. An offset of exactly 0 is replaced by 7,
    so the result is never zero. An empty or ``None`` key hashes as
    "fallback".
    """
    if not key:
        key = "fallback"
    digest = hashlib.sha256(key.encode("utf-8")).digest()
    bucket = int.from_bytes(digest[:4], "big")
    offset = bucket % (2 * span_min + 1) - span_min
    return offset or 7

def hora_desplazada(fecha: str, hora: str, key: str) -> str:
    """Return *hora* shifted by a key-dependent offset, kept on the same day.

    *fecha* ("YYYY-MM-DD") and *hora* ("HH:MM:SS") are parsed together, the
    offset from ``_hash_to_minutes(key, 180)`` is added, and the result is
    limited to the window 00:05:00-23:55:00 of *fecha*.

    Returns:
        The shifted time formatted as "HH:MM:SS".
    """
    stamp_format = "%Y-%m-%d %H:%M:%S"
    original = datetime.strptime(f"{fecha} {hora}", stamp_format)
    shifted = original + timedelta(minutes=_hash_to_minutes(key, 180))
    # Clamp within the day so the shift never spills into another date.
    earliest = datetime.strptime(f"{fecha} 00:05:00", stamp_format)
    latest = datetime.strptime(f"{fecha} 23:55:00", stamp_format)
    clamped = min(max(shifted, earliest), latest)
    return clamped.strftime("%H:%M:%S")


def load_base_tickets(path: Path) -> pd.DataFrame:
    """Load the base tickets JSON into a flat DataFrame.

    The file is expected to hold a list of ticket objects. For each one the
    fields idTicket, idEmpresa, empresaNombre, idUsuario, fechaEmision,
    horaEmision and metodoPago are read, plus ``estacion.provincia``. Absent
    fields become ``None``.

    Args:
        path: Location of the UTF-8 encoded JSON file.

    Returns:
        A DataFrame with columns idTicket, idEmpresa, empresaNombre,
        idUsuario, provincia, provincia_norm, fecha, hora and metodoPago.
        Rows without a province are dropped. An empty input list gives an
        empty DataFrame that still has these columns.
    """
    columns = [
        "idTicket", "idEmpresa", "empresaNombre", "idUsuario",
        "provincia", "provincia_norm", "fecha", "hora", "metodoPago",
    ]
    data = json.loads(path.read_text(encoding="utf-8"))
    rows = []
    for t in data:
        est = t.get("estacion") or {}
        provincia = est.get("provincia")
        rows.append({
            "idTicket": t.get("idTicket"),
            "idEmpresa": t.get("idEmpresa"),
            "empresaNombre": t.get("empresaNombre"),
            "idUsuario": t.get("idUsuario"),
            "provincia": provincia,
            "provincia_norm": norm_prov(provincia),
            "fecha": t.get("fechaEmision"),
            "hora": t.get("horaEmision"),
            "metodoPago": t.get("metodoPago"),
        })
    # Explicit columns keep dropna(subset=...) from raising KeyError when
    # the input list is empty and the frame would otherwise have no columns.
    return pd.DataFrame(rows, columns=columns).dropna(subset=["provincia"])

# Toll motorway code -> normalised provinces assigned to it.
# load_peajes tags every segment of a motorway with all of its provinces.
AUTOPISTA_PROVINCIAS = {
    "AP-1": ["BURGOS", "ALAVA"],
    "AP-2": ["ZARAGOZA", "LLEIDA", "BARCELONA"],
    "AP-4": ["SEVILLA", "CADIZ"],
    "AP-6": ["MADRID", "SEGOVIA", "AVILA"],
    "AP-7": [
        "GIRONA", "BARCELONA", "TARRAGONA", "CASTELLON",
        "VALENCIA", "ALICANTE", "MALAGA", "CADIZ",
    ],
    "AP-8": ["BIZKAIA", "GIPUZKOA"],
    "AP-9": ["A CORUNA", "PONTEVEDRA"],
    "AP-41": ["MADRID", "TOLEDO"],
    "AP-42": ["MADRID", "TOLEDO"],
    "AP-46": ["MALAGA"],
    "AP-51": ["AVILA", "SEGOVIA"],
    "AP-53": ["A CORUNA", "PONTEVEDRA"],
    "AP-61": ["SEGOVIA", "MADRID"],
    "AP-66": ["ASTURIAS", "LEON"],
    "AP-68": ["BIZKAIA", "ALAVA", "LA RIOJA", "ZARAGOZA"],
    "AP-71": ["LEON"],
    "AP-36": ["TOLEDO", "CUENCA", "ALBACETE"],
}

# Normalised province -> place names searched for in a segment label.
# load_peajes uses this as a fallback when the motorway code is not listed
# in AUTOPISTA_PROVINCIAS.
PROV_KEYWORDS = {
    "MADRID": ["MADRID", "VILLALBA", "SAN RAFAEL", "RIVAS"],
    "SEGOVIA": ["SEGOVIA", "SAN RAFAEL", "VILLACASTIN", "ADANERO"],
    "AVILA": ["AVILA", "VILLACASTIN", "ADANERO"],
    "LEON": ["LEON", "LA MAGDALENA", "OBLANCA", "ASTORGA", "VILLADANGOS"],
    "ASTURIAS": ["CAMPOMANES", "OVIEDO", "MIERES"],
    "A CORUNA": ["SANTIAGO", "A CORUNA", "RIBADULLA"],
    "PONTEVEDRA": ["SILLEDA", "LALIN", "BANDEIRA", "PONTEVEDRA", "VIGO"],
    "BIZKAIA": ["BILBAO", "ARRIGORRIAGA", "BARAKALDO"],
    "ALAVA": ["LLODIO", "ALTUBE", "SUBIJANA", "ZAMBRANA", "VITORIA"],
    "LA RIOJA": ["LOGRONO", "HARO", "CENICERO", "ALFARO"],
    "ZARAGOZA": ["ZARAGOZA", "GALLUR", "ALAGON", "TUDELA"],
}

def load_peajes(path: Path) -> pd.DataFrame:
    """Load the toll tariff CSV and tag every segment with its provinces.

    Spanish column headers (Concesionaria, Autopista, Tramo, "Precio (€)",
    Fecha) are renamed to the internal names; already-internal headers are
    accepted as they are. Each row then gets:

    * ``origen`` / ``destino``: the two ends of ``tramo`` (see split_tramo);
    * ``precio_ligeros``: the price as a float; a decimal comma, a "€" sign
      and surrounding whitespace are accepted;
    * ``autopista_norm``: the motorway code upper-cased without spaces;
    * ``provincias_norm``: the provinces listed for that motorway in
      AUTOPISTA_PROVINCIAS or, when there are none, the provinces whose
      PROV_KEYWORDS appear in the segment label.

    Args:
        path: Location of the UTF-8 encoded CSV file.

    Returns:
        A copy of the table restricted to rows with at least one province.

    Raises:
        ValueError: If a required column is missing after renaming.
    """
    df = pd.read_csv(path, encoding="utf-8")
    colmap = {"Concesionaria":"concesionaria","Autopista":"autopista","Tramo":"tramo","Precio (€)":"precio_ligeros","Fecha":"fecha_tarifa"}
    # rename() ignores headers that are not present, so one call is enough.
    df = df.rename(columns=colmap)
    needed = {"concesionaria","autopista","tramo","precio_ligeros"}
    # Sorted so the error message does not depend on set iteration order.
    missing = sorted(needed.difference(df.columns))
    if missing:
        raise ValueError(f"Faltan columnas en peajes CSV: {missing}. Esperadas: {sorted(needed)}")
    for c in ("concesionaria", "autopista", "tramo"):
        df[c] = df[c].astype(str).str.strip()

    pares = [split_tramo(t) for t in df["tramo"]]
    df["origen"] = [origen for origen, _ in pares]
    df["destino"] = [destino for _, destino in pares]

    df["precio_ligeros"] = (
        df["precio_ligeros"].astype(str)
        .str.replace("€", "", regex=False)
        .str.replace(",", ".", regex=False)
        .str.strip()
        .astype(float)
    )
    df["autopista_norm"] = df["autopista"].str.upper().str.replace(" ", "", regex=False)

    # Normalise the keywords once instead of once per row and keyword.
    keywords = {
        prov: [strip_accents(kw) for kw in kws]
        for prov, kws in PROV_KEYWORDS.items()
    }

    def detect_from_tramo(tramo_txt: str) -> list:
        """Return the sorted provinces whose keywords occur in the label."""
        txt = strip_accents((tramo_txt or "").upper())
        return sorted(
            prov for prov, kws in keywords.items()
            if any(kw in txt for kw in kws)
        )

    # The motorway table wins; the keyword scan is only a fallback. Building
    # the column in one pass avoids assigning a Series of lists through .loc.
    df["provincias_norm"] = pd.Series(
        [
            list(AUTOPISTA_PROVINCIAS.get(ap, ())) or detect_from_tramo(tramo)
            for ap, tramo in zip(df["autopista_norm"], df["tramo"])
        ],
        index=df.index,
        dtype=object,
    )
    # astype(bool) keeps the mask boolean even when the table is empty.
    con_provincia = df["provincias_norm"].map(bool).astype(bool)
    return df.loc[con_provincia].copy()

def build_index_por_prov(df_peajes: pd.DataFrame) -> Dict[str, List[dict]]:
    """Group toll segments by province.

    Every row of *df_peajes* becomes one dict (concesionaria, autopista,
    origen, destino, precio_ligeros) that is appended under each province in
    the row's ``provincias_norm`` list. The same dict object is shared by all
    provinces of a row. ``origen`` and ``destino`` default to "" when the
    column is absent.

    Returns:
        A ``defaultdict(list)`` mapping province -> list of segment dicts.
    """
    por_provincia = defaultdict(list)
    for registro in df_peajes.to_dict("records"):
        segmento = {
            "concesionaria": registro["concesionaria"],
            "autopista": registro["autopista"],
            "origen": registro.get("origen", ""),
            "destino": registro.get("destino", ""),
            "precio_ligeros": float(registro["precio_ligeros"]),
        }
        for provincia in registro["provincias_norm"]:
            por_provincia[provincia].append(segmento)
    return por_provincia

# Shared random generator with a fixed seed; generar_ticket_peaje draws the
# toll segment for each ticket from it.
_rng = Random(42)

def generar_ticket_peaje(row: pd.Series, idx_peajes: Dict[str, List[dict]]) -> dict | None:
    """Build one toll ticket for a base-ticket row.

    A toll segment is drawn at random (module-level ``_rng``) among those
    indexed under the row's ``provincia_norm``. The ticket time is the row's
    time shifted by an offset derived from the ticket id, falling back to the
    user id and then to "<province>-<date>-<time>".

    Args:
        row: Base ticket with at least provincia_norm, fecha and hora; the
            remaining fields are read with ``get`` and may be absent.
        idx_peajes: Province -> candidate segments, as produced by
            build_index_por_prov.

    Returns:
        The ticket as a dict, or ``None`` when the province has no segments.
    """
    prov = row["provincia_norm"]
    candidatos = idx_peajes.get(prov, [])
    if not candidatos:
        return None
    tramo = _rng.choice(candidatos)

    # Pick the first usable id; NaN is truthy, so it is filtered explicitly.
    # The key is stringified because the hash helper encodes it as text and
    # would fail on a numeric id.
    key = f"{prov}-{row['fecha']}-{row['hora']}"
    for campo in ("idTicket", "idUsuario"):
        valor = row.get(campo)
        if valor and not pd.isna(valor):
            key = str(valor)
            break
    hora_alt = hora_desplazada(row["fecha"], row["hora"], key)

    origen, destino = tramo["origen"], tramo["destino"]
    # A segment without destination is shown alone, not as "X – ".
    tramo_txt = f"{origen} – {destino}" if destino else origen
    return {
        "tipoDocumento": "Peaje",
        "concesionaria": tramo["concesionaria"],
        "autopista": tramo["autopista"],
        "localizacion": {
            "tramo": tramo_txt,
            "entrada": origen,
            "salida":  destino,
        },
        "fechaHora": f"{row['fecha']} {hora_alt}",
        "categoriaVehiculo": "B",
        "importe": eur_str(tramo["precio_ligeros"]),
        "ivaIncluido": "IVA incluido",
        "formaPago": row.get("metodoPago") or "Tarjeta crédito",
        "referencia": gen_ref("PASO"),
        "idEmpresa": row.get("idEmpresa"),
        "empresaNombre": row.get("empresaNombre"),
        "idUsuario": row.get("idUsuario"),
        "provincia": row.get("provincia"),
    }

def flatten_for_csv(tk: dict) -> dict:
    """Return a flat copy of ticket *tk* suitable for a CSV row.

    The nested "localizacion" entry is removed and its "tramo", "entrada"
    and "salida" values are lifted to top-level keys (``None`` when absent).
    *tk* itself is left unchanged.
    """
    plano = {campo: valor for campo, valor in tk.items() if campo != "localizacion"}
    localizacion = tk.get("localizacion") or {}
    for campo in ("tramo", "entrada", "salida"):
        plano[campo] = localizacion.get(campo)
    return plano

def generar_y_exportar_JSON(base_df: pd.DataFrame, idx_peajes: dict,
                            out_json: Path, out_csv: Path | None = None):
    """Generate toll tickets for every base row and write them to disk.

    Rows whose province has no toll segment produce no ticket and are only
    counted. The tickets are written as a UTF-8 JSON array and, when
    *out_csv* is given, also as a flattened CSV. Missing parent directories
    of both outputs are created. A summary is printed to stdout.

    Args:
        base_df: Base tickets, as produced by load_base_tickets.
        idx_peajes: Province -> candidate segments.
        out_json: Destination of the JSON file.
        out_csv: Optional destination of the CSV file.

    Returns:
        The list of generated ticket dicts.
    """
    tickets, omitidos = [], 0
    for _, row in base_df.iterrows():
        tk = generar_ticket_peaje(row, idx_peajes)
        if tk is None:
            omitidos += 1
        else:
            tickets.append(tk)

    out_json.parent.mkdir(parents=True, exist_ok=True)
    with out_json.open("w", encoding="utf-8") as f:
        json.dump(tickets, f, ensure_ascii=False, indent=2)

    if out_csv is not None:
        # The CSV may live in a different directory than the JSON.
        out_csv.parent.mkdir(parents=True, exist_ok=True)
        pd.DataFrame([flatten_for_csv(t) for t in tickets]).to_csv(out_csv, index=False, encoding="utf-8")

    print(f"Generados: {len(tickets)} | Omitidos (sin peaje): {omitidos}")
    print("JSON guardado en:", out_json.resolve())
    if out_csv is not None:
        print("CSV  guardado en:", out_csv.resolve())
    return tickets

# Pipeline: load both inputs, then index the toll segments by province.
base_df  = load_base_tickets(PATH_TICKETS)
peaje_df = load_peajes(PATH_PEAJES)
IDX_PEAJES = build_index_por_prov(peaje_df)

# Generate the toll tickets and write the JSON and CSV outputs.
tickets_generados = generar_y_exportar_JSON(base_df, IDX_PEAJES, OUT_JSON, OUT_CSV)
# The paths above are relative, so show which directory they resolved against.
print("Working directory:", Path.cwd().resolve())


Generados: 622 | Omitidos (sin peaje): 728
JSON guardado en: C:\Users\Jon\Documents\Repositorios\reto1-bridge-data\Script_generador_peajes\tickets_peaje.json
CSV  guardado en: C:\Users\Jon\Documents\Repositorios\reto1-bridge-data\Script_generador_peajes\tickets_peaje.csv
Working directory: C:\Users\Jon\Documents\Repositorios\reto1-bridge-data\Script_generador_peajes
