In [None]:
import pandas as pd
import numpy as np
import re, json
from pathlib import Path

In [95]:
# Input ticket files, given as paths relative to the working directory.
EV_PATH  = Path(r"data/tickets_ev_sinteticos.json")
ICE_PATH = Path(r"data/tickets_sinteticos.json")

# Emission factors applied per litre of fuel (gasoline / diesel).
KGCO2_PER_L_GASOLINA = 2.35
KGCO2_PER_L_DIESEL   = 2.69
# Electricity factor used whenever no supplier-specific value is known.
GRID_KGCO2_PER_KWH   = 0.283   # generic mix (without GdO — presumably guarantees of origin; confirm)
# Multiplier applied to the kWh figure when EV emissions are computed.
LOSS_FACTOR_TD       = 1.096   # T&D losses ~9.6 %

def _printable(s):
    s = "" if s is None else str(s)
    s = s.replace("\n"," ").replace("\r"," ")
    return re.sub(r"\s+", " ", s).strip()

def _norm(s: str) -> str:
    s = (str(s) if s is not None else "").strip().upper()
    rep = str.maketrans("ÁÉÍÓÚÜÑ", "AEIOUUN")
    return s.translate(rep)


In [96]:
# Charge-point operator -> electricity supplier, both stored as _norm() keys.
# Consulted by _factor_por_operador, which is used when a row carries no supplier.
# Suppliers named here that have no entry in SUPPLIER_FACTOR_KG_PER_KWH
# (e.g. the REPSOL one) end up with GRID_KGCO2_PER_KWH in _supplier_factor.
CPO_TO_SUPPLIER = {
    _norm("ENDESA X WAY S.L."): _norm("ENDESA ENERGIA S.A.U."),
    _norm("IBERDROLA CLIENTES S.A.U"): _norm("IBERDROLA CLIENTES S.A.U."),
    _norm("REPSOL SOLUCIONES ENERGETICAS SA"): _norm("REPSOL ELECTRICIDAD Y GAS, S.L.U."),
    _norm("REPSOL COMERCIAL DE PRODUCTOS PETROLIFEROS SA"): _norm("REPSOL ELECTRICIDAD Y GAS, S.L.U."),
    _norm("WENEA"): _norm("ENDESA ENERGIA S.A.U."),
}
# Suppliers that always receive the generic grid factor; _supplier_factor checks
# this table before the supplier-specific one.
# NOTE(review): the names suggest regulated / reference retailers — confirm.
COR_SUPPLIERS = {
    _norm("CURENERGIA COMERCIALIZADOR DE ULTIMO RECURSO S.A.U."): GRID_KGCO2_PER_KWH,
    _norm("ENERGIA XXI COMERCIALIZADORA DE REFERENCIA, S.L.U."): GRID_KGCO2_PER_KWH,
    _norm("BASER COMERCIALIZADORA DE REFERENCIA, S.A."): GRID_KGCO2_PER_KWH,
    _norm("COMERCIALIZADORA REGULADA, GAS & POWER, S.A."): GRID_KGCO2_PER_KWH,
    _norm("REGSITI COMERCIALIZADORA REGULADA, S.L.U."): GRID_KGCO2_PER_KWH,
    _norm("TERAMELCOR, S.L."): GRID_KGCO2_PER_KWH,
}
# Supplier-specific electricity factors, keyed by _norm() supplier name.
SUPPLIER_FACTOR_KG_PER_KWH = {
    _norm("ENDESA ENERGIA S.A.U."): 0.258,
    _norm("IBERDROLA CLIENTES S.A.U."): 0.120,
    _norm("EDP COMERCIALIZADORA, S.A.U."): 0.240,
}

In [97]:
def _supplier_factor(supplier: str|None) -> float:
    """Look up the electricity emission factor for a supplier name.

    The name is normalized with ``_norm``. ``COR_SUPPLIERS`` is consulted
    first, then ``SUPPLIER_FACTOR_KG_PER_KWH``. A falsy name, or a name found
    in neither table, yields ``GRID_KGCO2_PER_KWH``.
    """
    if supplier:
        key = _norm(supplier)
        if key in COR_SUPPLIERS:
            return float(COR_SUPPLIERS[key])
        specific = SUPPLIER_FACTOR_KG_PER_KWH.get(key, None)
        if specific is not None:
            return float(specific)
    return GRID_KGCO2_PER_KWH

def _factor_por_operador(operador: str|None) -> float:
    """Resolve the electricity emission factor from a charge-point operator name.

    The operator is mapped to a supplier through ``CPO_TO_SUPPLIER`` (using the
    ``_norm`` key) and the result is handed to ``_supplier_factor``; an unknown
    operator therefore passes ``None`` on. A falsy operator returns
    ``GRID_KGCO2_PER_KWH`` directly.
    """
    if operador:
        mapped_supplier = CPO_TO_SUPPLIER.get(_norm(operador))
        return _supplier_factor(mapped_supplier)
    return GRID_KGCO2_PER_KWH

def _factor_combustible(fuel) -> float:
    """Return the per-litre emission factor for a fuel description.

    Parameters
    ----------
    fuel : any
        Free-text fuel / product name. ``None`` and float NaN are treated as
        "unknown".

    Returns
    -------
    float
        ``KGCO2_PER_L_DIESEL`` when the lower-cased text contains one of the
        diesel keywords, otherwise ``KGCO2_PER_L_GASOLINA`` (also the default
        for unknown fuel).
    """
    if fuel is None or (isinstance(fuel, float) and np.isnan(fuel)):
        return KGCO2_PER_L_GASOLINA
    s = str(fuel).strip().lower()
    # Both the accented and unaccented Spanish spellings must be listed:
    # "gasóleo" does not contain the substring "gasoleo".
    diesel_markers = ("gasoleo", "gasóleo", "gasoil", "diesel", "diésel")
    if any(marker in s for marker in diesel_markers):
        return KGCO2_PER_L_DIESEL
    return KGCO2_PER_L_GASOLINA

In [98]:
def _alias_norm(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    lower = {c.lower(): c for c in out.columns}
    def rn(cands, target):
        for a in cands:
            if a in lower: out.rename(columns={lower[a]: target}, inplace=True); return
    rn(["idempresa","empresa","company","id_empresa","empresa_id"], "idEmpresa")
    rn(["idusuario","usuario","user","user_id","userid","usuarioid"], "idUsuario")
    rn(["idvehiculo","vehiculo","vehiculo_id","vehicle_id"], "idVehiculo")
    rn(["propulsion","tipo","tipo_vehiculo","powertrain"], "propulsion")
    rn(["mes","month","period"], "mes")
    rn(["kwh","energia","energy_kwh"], "kwh")
    rn(["litros","liters","litres","l"], "litros")
    rn(["fuel","combustible","carburante"], "fuel")
    rn(["supplier","comercializadora","retailer","marketer"], "supplier")
    rn(["operador","cpo","operador_carga","operador_ev"], "operador")
    if "mes" in out.columns: out["mes"] = out["mes"].astype(str)
    return out

In [99]:
def _ensure_idVehiculo(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    for c, val in [("idEmpresa","NA"),("idUsuario","NA"),("propulsion","EV")]:
        if c not in out.columns: out[c] = val
    if "idVehiculo" not in out.columns or out["idVehiculo"].isna().any():
        base = out.apply(lambda r: f"{_printable(r.get('idEmpresa'))}-{_printable(r.get('idUsuario'))}-{_printable(r.get('propulsion'))}", axis=1)
        out["idVehiculo"] = out.get("idVehiculo", base).fillna(base)
    return out

In [100]:
def _compute_emissions_if_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with emission factors and emission columns added.

    The frame must already contain a ``propulsion`` column (it is read
    unconditionally). Columns written:

    * ``kwh`` / ``litros`` — created as NaN when absent.
    * ``factor_ele_kg_per_kwh`` — per row: the supplier's factor when the row
      has a non-blank ``supplier``, otherwise the factor derived from
      ``operador``.
    * ``loss_TD`` — ``LOSS_FACTOR_TD`` on every row.
    * ``factor_comb_kg_per_l`` — from ``fuel`` when that column exists,
      otherwise the gasoline factor on ICE rows and NaN elsewhere.
    * ``kgCO2e_ev``, ``kgCO2_ice``, ``kgCO2_total`` — only computed when the
      respective column is not already present. ``kgCO2_total`` is NaN when
      both components are NaN.
    """
    def _is_blank(value) -> bool:
        # NaN must be tested explicitly: a tuple-membership test such as
        # `value in (None, "", np.nan)` only recognises NaN by object identity,
        # so a NaN that is not the np.nan singleton would look like a real name.
        if value is None or value is pd.NA:
            return True
        if isinstance(value, float):
            return bool(np.isnan(value))
        return isinstance(value, str) and value == ""

    out = df.copy()
    if "kwh" not in out.columns:
        out["kwh"] = np.nan
    if "litros" not in out.columns:
        out["litros"] = np.nan

    row_count = len(out)
    suppliers = out["supplier"] if "supplier" in out.columns else [None] * row_count
    operators = out["operador"] if "operador" in out.columns else [None] * row_count
    fac_ele = []
    for supplier, operator in zip(suppliers, operators):
        if _is_blank(supplier):
            factor = _factor_por_operador(operator)
        else:
            factor = _supplier_factor(supplier)
        fac_ele.append(GRID_KGCO2_PER_KWH if factor is None else float(factor))
    out["factor_ele_kg_per_kwh"] = pd.Series(fac_ele, index=out.index)
    out["loss_TD"] = float(LOSS_FACTOR_TD)

    if "fuel" in out.columns:
        out["factor_comb_kg_per_l"] = out["fuel"].apply(_factor_combustible)
    else:
        out["factor_comb_kg_per_l"] = np.where(
            out["propulsion"].astype(str).str.upper().eq("ICE"),
            KGCO2_PER_L_GASOLINA, np.nan
        )

    is_ev  = out["propulsion"].astype(str).str.upper().eq("EV")
    is_ice = out["propulsion"].astype(str).str.upper().eq("ICE")
    kwh_num    = pd.to_numeric(out["kwh"], errors="coerce")
    litros_num = pd.to_numeric(out["litros"], errors="coerce")

    if "kgCO2e_ev" not in out.columns:
        # EV emissions = kWh x T&D loss multiplier x electricity factor.
        out["kgCO2e_ev"] = np.where(
            is_ev & kwh_num.notna(),
            kwh_num.astype(float) * out["loss_TD"].astype(float) * out["factor_ele_kg_per_kwh"].astype(float),
            np.nan
        )
    if "kgCO2_ice" not in out.columns:
        # ICE emissions = litres x fuel factor.
        out["kgCO2_ice"] = np.where(
            is_ice & litros_num.notna(),
            litros_num.astype(float) * out["factor_comb_kg_per_l"].astype(float),
            np.nan
        )
    if "kgCO2_total" not in out.columns:
        out["kgCO2_total"] = out[["kgCO2e_ev","kgCO2_ice"]].sum(axis=1, skipna=True)
        # sum(skipna=True) gives 0 for all-NaN rows; restore NaN there.
        out.loc[out[["kgCO2e_ev","kgCO2_ice"]].isna().all(axis=1), "kgCO2_total"] = np.nan
    return out


In [101]:
def build_company_user_vehicle_df(df_in: pd.DataFrame) -> pd.DataFrame:
    """Aggregate energy, fuel and emissions per company / user / vehicle.

    The input is passed through ``_alias_norm``, ``_ensure_idVehiculo`` and
    ``_compute_emissions_if_missing`` and then grouped by ``idEmpresa``,
    ``idUsuario``, ``idVehiculo`` and ``propulsion`` (null keys are kept).
    Every total is a NaN-ignoring sum of the numerically coerced column.
    ``None`` or an empty frame yields an empty frame with the output columns.
    """
    keys = ["idEmpresa","idUsuario","idVehiculo","propulsion"]
    if df_in is None or df_in.empty:
        return pd.DataFrame(columns=keys + [
            "total_kwh_ev","total_litros_ice","ev_kgCO2e_total","ice_kgCO2_total","kgCO2_total"
        ])

    def _nan_total(values):
        return float(np.nansum(pd.to_numeric(values, errors="coerce")))

    frame = _alias_norm(df_in)
    frame = _ensure_idVehiculo(frame)
    frame = _compute_emissions_if_missing(frame)
    for key in keys:
        if key not in frame.columns:
            frame[key] = np.nan

    grouped = frame.groupby(keys, dropna=False)
    totals = grouped.agg(
        total_kwh_ev=("kwh", _nan_total),
        total_litros_ice=("litros", _nan_total),
        ev_kgCO2e_total=("kgCO2e_ev", _nan_total),
        ice_kgCO2_total=("kgCO2_ice", _nan_total),
        kgCO2_total=("kgCO2_total", _nan_total),
    ).reset_index()
    ordered = totals.sort_values(["idEmpresa","idUsuario","idVehiculo"])
    return ordered.reset_index(drop=True)

def build_company_user_vehicle_month_df(df_in: pd.DataFrame) -> pd.DataFrame:
    """Aggregate energy, fuel and emissions per company / user / vehicle / month.

    Works like the non-monthly builder but adds ``mes`` to the grouping keys.
    When no ``mes`` column exists after alias normalization it is taken from
    the first seven characters of ``fechaEmision``, or set to ``None`` if that
    column is missing too. ``None`` or an empty frame yields an empty frame
    with the output columns.
    """
    keys = ["idEmpresa","idUsuario","idVehiculo","propulsion","mes"]
    if df_in is None or df_in.empty:
        return pd.DataFrame(columns=keys + [
            "ev_kwh_mes","ice_litros_mes","ev_kgCO2e_mes","ice_kgCO2_mes","kgCO2_mes_total"
        ])

    def _nan_total(values):
        return float(np.nansum(pd.to_numeric(values, errors="coerce")))

    frame = _alias_norm(df_in)
    if "mes" not in frame.columns:
        has_date = "fechaEmision" in frame.columns
        frame["mes"] = frame["fechaEmision"].astype(str).str.slice(0,7) if has_date else None
    frame = _ensure_idVehiculo(frame)
    frame = _compute_emissions_if_missing(frame)

    for key in keys:
        if key not in frame.columns:
            frame[key] = np.nan

    grouped = frame.groupby(keys, dropna=False)
    monthly = grouped.agg(
        ev_kwh_mes=("kwh", _nan_total),
        ice_litros_mes=("litros", _nan_total),
        ev_kgCO2e_mes=("kgCO2e_ev", _nan_total),
        ice_kgCO2_mes=("kgCO2_ice", _nan_total),
        kgCO2_mes_total=("kgCO2_total", _nan_total),
    ).reset_index()
    ordered = monthly.sort_values(["idEmpresa","idUsuario","idVehiculo","mes"])
    return ordered.reset_index(drop=True)

In [None]:
def export_company_df(company_df: pd.DataFrame, out_dir: Path | str = "data", base_name: str = "company_sustainability"):
    if company_df is None or company_df.empty:
        raise ValueError("company_df está vacío: nada que exportar.")
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    records = []
    for _, row in company_df.iterrows():
        d = row.to_dict()
        for k, v in list(d.items()):
            if isinstance(v, float) and np.isnan(v):
                d[k] = None
        records.append(d)
    p_json  = out_dir / f"{base_name}.json"
    p_jsonl = out_dir / f"{base_name}.jsonl"
    with p_json.open("w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False)
    with p_jsonl.open("w", encoding="utf-8") as f:
        for d in records:
            f.write(json.dumps(d, ensure_ascii=False) + "\n")

def _read_json(path: Path) -> pd.DataFrame:
    if not Path(path).exists(): return pd.DataFrame()
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return pd.DataFrame(data)

def _sum_ev_kwh_from_lineas(lineas):
    if not isinstance(lineas, (list,tuple)): return np.nan
    t = 0.0; ok = False
    for it in lineas:
        if not isinstance(it, dict): continue
        prod = str(it.get("producto","")).upper()
        if "ELECTRIC" in prod:
            k = it.get("kwh")
            if k is None: continue
            try:
                t += float(str(k).replace(",", "."))
                ok = True
            except:
                pass
    return t if ok else np.nan

def _sum_ice_litros_from_lineas(lineas):
    if not isinstance(lineas, (list,tuple)): return np.nan
    t = 0.0; ok = False
    for it in lineas:
        if not isinstance(it, dict): continue
        prod = str(it.get("producto","")).upper()
        uni  = str(it.get("unidad","")).upper()
        cant = it.get("cantidad")
        if cant is None: continue
        try: cant = float(str(cant).replace(",", "."))
        except: continue
        if "L" == uni or "LIT" in uni or "GASOL" in prod or "DIE" in prod:
            t += cant; ok = True
    return t if ok else np.nan

def _first_non_null(s: pd.Series):
    for v in s:
        if pd.notna(v) and v != "": return v
    return None

def _prep_ev(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty: return df
    out = df.copy()
    if "idEmpresa" not in out.columns and "empresa" in out.columns:
        out.rename(columns={"empresa":"idEmpresa"}, inplace=True)
    if "fechaEmision" in out.columns:
        out["mes"] = out["fechaEmision"].astype(str).str.slice(0,7)
    if "kwh" not in out.columns:
        out["kwh"] = out["lineas"].apply(_sum_ev_kwh_from_lineas) if "lineas" in out.columns else np.nan
    if "operador" not in out.columns:
        out["operador"] = None
    out["propulsion"] = "EV"
    keep = ["idEmpresa","idUsuario","propulsion","mes","kwh","operador","supplier"]
    for c in keep:
        if c not in out.columns: out[c] = None
    out = out[keep]
    out["kwh"] = pd.to_numeric(out["kwh"], errors="coerce")
    g = (out.groupby(["idEmpresa","idUsuario","propulsion","mes"], dropna=False)
             .agg(kwh=("kwh","sum"),
                  operador=("operador", _first_non_null),
                  supplier=("supplier", _first_non_null))
             .reset_index())
    return g

def _prep_ice(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty: return df
    out = df.copy()
    if "idEmpresa" not in out.columns and "empresa" in out.columns:
        out.rename(columns={"empresa":"idEmpresa"}, inplace=True)
    if "fechaEmision" in out.columns:
        out["mes"] = out["fechaEmision"].astype(str).str.slice(0,7)
    if "litros" not in out.columns:
        out["litros"] = out["lineas"].apply(_sum_ice_litros_from_lineas) if "lineas" in out.columns else np.nan
    if "fuel" not in out.columns:
        out["fuel"] = out.get("lineas", pd.Series([None]*len(out))).apply(
            lambda ls: _first_non_null(pd.Series([str(it.get("producto","")).lower() for it in ls])) if isinstance(ls, list) else None
        )
    out["propulsion"] = "ICE"
    keep = ["idEmpresa","idUsuario","propulsion","mes","litros","fuel"]
    for c in keep:
        if c not in out.columns: out[c] = None
    out = out[keep]
    out["litros"] = pd.to_numeric(out["litros"], errors="coerce")
    g = (out.groupby(["idEmpresa","idUsuario","propulsion","mes"], dropna=False)
             .agg(litros=("litros","sum"),
                  fuel=("fuel", _first_non_null))
             .reset_index())
    return g

def main():
    """Run the full pipeline and return the per-vehicle summary frame.

    Reads the EV and ICE ticket files, aggregates each, concatenates them,
    builds the company / user / vehicle totals and exports them to the
    ``data`` directory as ``company_sustainability.json`` / ``.jsonl``.
    Prints the directory name and the exported file names.
    """
    data_dir = Path("data")
    data_dir.mkdir(parents=True, exist_ok=True)

    ev_raw = _read_json(EV_PATH)
    ice_raw = _read_json(ICE_PATH)
    aggregated = [_prep_ev(ev_raw), _prep_ice(ice_raw)]

    combined = pd.concat(aggregated, ignore_index=True, sort=False)
    df_in = combined.fillna({"kwh":np.nan,"litros":np.nan})

    company_df = build_company_user_vehicle_df(df_in)
    export_company_df(company_df, out_dir="data", base_name="company_sustainability")

    print("data")
    exported = [p.name for p in Path("data").glob("company_sustainability.*")]
    print(exported)
    return company_df

def _coerce_num(x):
    try:
        return float(str(x).replace(",", "."))
    except:
        return np.nan

def _is_fuel_product(txt: str) -> bool:
    t = (txt or "").upper()
    return any(k in t for k in ["GASOL", "DIESEL", "DIÉSEL", "GASOIL", "GASÓLEO", "GASOLINA"])

def _is_liter_unit(txt: str) -> bool:
    t = (txt or "").strip().lower()
    return t in {"l","lt","ltr","litro","litros","liter","liters"}

def _sum_ice_litros_from_lineas(lineas):
    if not isinstance(lineas, (list,tuple)):
        return np.nan
    total = 0.0
    ok = False
    for it in lineas:
        if not isinstance(it, dict):
            continue
        prod = it.get("producto")
        uni  = it.get("unidad")
        cant = _coerce_num(it.get("cantidad"))
        lit  = _coerce_num(it.get("litros"))
        vol  = _coerce_num(it.get("volumen"))
        imp  = _coerce_num(it.get("importe"))
        pvu  = _coerce_num(it.get("precioUnitario") or it.get("precio_unitario") or it.get("precio"))
        got = np.nan
        if not np.isnan(lit):
            got = lit
        elif not np.isnan(vol):
            got = vol
        elif not np.isnan(cant) and (_is_liter_unit(uni) or _is_fuel_product(prod)):
            got = cant
        elif (not np.isnan(imp)) and (not np.isnan(pvu)) and pvu > 0 and _is_fuel_product(prod):
            got = imp / pvu
        if not np.isnan(got):
            total += got
            ok = True
    return total if ok else np.nan

def _infer_fuel_from_lineas(lineas):
    if not isinstance(lineas, (list,tuple)):
        return None
    for it in lineas:
        if not isinstance(it, dict):
            continue
        prod = str(it.get("producto","")).lower()
        if any(k in prod for k in ["diesel","diésel","gasoil","gasóleo"]):
            return "gasoleo"
        if "gasolin" in prod:
            return "gasolina"
    return None

def _prep_ice(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty:
        return df
    out = df.copy()
    if "idEmpresa" not in out.columns and "empresa" in out.columns:
        out.rename(columns={"empresa":"idEmpresa"}, inplace=True)
    if "fechaEmision" in out.columns:
        out["mes"] = out["fechaEmision"].astype(str).str.slice(0,7)
    if "litros" not in out.columns:
        out["litros"] = out["lineas"].apply(_sum_ice_litros_from_lineas) if "lineas" in out.columns else np.nan
    if "fuel" not in out.columns:
        out["fuel"] = out.get("lineas", pd.Series([None]*len(out))).apply(_infer_fuel_from_lineas)
    out["propulsion"] = "ICE"
    keep = ["idEmpresa","idUsuario","propulsion","mes","litros","fuel"]
    for c in keep:
        if c not in out.columns:
            out[c] = None
    out = out[keep]
    out["litros"] = pd.to_numeric(out["litros"], errors="coerce")
    g = (out.groupby(["idEmpresa","idUsuario","propulsion","mes"], dropna=False)
             .agg(litros=("litros","sum"),
                  fuel=("fuel", lambda s: next((v for v in s if pd.notna(v) and v!=""), None)))
             .reset_index())
    return g

# Load both ticket files, aggregate each per company / user / propulsion / month,
# and stack the EV and ICE rows into a single input frame.
ev_raw  = _read_json(EV_PATH)
ice_raw = _read_json(ICE_PATH)
ev_agg  = _prep_ev(ev_raw)
ice_agg = _prep_ice(ice_raw)
# NOTE(review): filling NaN with NaN looks like a no-op — confirm it can be dropped.
df_in   = pd.concat([ev_agg, ice_agg], ignore_index=True, sort=False).fillna({"kwh":np.nan,"litros":np.nan})
# Per-vehicle totals, exported as company_sustainability.json / .jsonl under data/.
company_df = build_company_user_vehicle_df(df_in)
export_company_df(company_df, out_dir="data", base_name="company_sustainability")
# NOTE(review): a bare expression is only displayed when it is the last statement of a cell — confirm the cell boundary.
company_df.head()

# NOTE(review): main() repeats the load / aggregate / export steps above and rebinds company_df.
company_df = main()

# Monthly breakdown, built from the module-level df_in assembled above
# (main() keeps its own df_in local and only returns company_df).
company_month_df = build_company_user_vehicle_month_df(df_in)
export_company_df(company_month_df, out_dir="data", base_name="company_sustainability_month")


data
['company_sustainability.json', 'company_sustainability.jsonl']
