# 02 — RAW → CLEAN (DuckDB) · OpenBDAP Saldi storici

Metodo **DataCivicLab Lab**:

- Legge il CSV salvato in `data/raw/<PROJECT>/<RUN_ID>/..._raw.csv`
- Crea tabella `raw` in DuckDB
- Crea tabella `clean` con:
  - **rename** a snake_case (e mapping semantico)
  - **cast** numerici a `DOUBLE`, anno a `INTEGER`
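  - nessuna correzione/interpretazione dei valori (solo normalizzazione dei separatori decimali/migliaia; i valori non convertibili diventano `NULL`)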
- Export `Parquet` in `data/clean/<PROJECT>/<RUN_ID>/saldi_storico.parquet`
- Salva `columns_mapping_raw_to_clean.json`, `profile_clean.json`, `clean_manifest.json`

> Dal preview: 22 righe (anni) e 22 colonne con nomi già semantici (ANNO, SALDO_NETTO, …)

In [None]:
# --- Colab: mount Google Drive ---
# The paths configured below live under /content/drive, so the Drive must be
# mounted before any of them is touched.
from google.colab import drive
drive.mount('/content/drive')

# --- CONFIG ---
from pathlib import Path
from datetime import datetime, timezone
import json, hashlib
import duckdb

# Project folder name used under data/raw and data/clean.
PROJECT = "openbdap_rendiconto_saldi_storico"
# Expected prefix of the raw CSV file name ("<DATASET_SLUG>_raw.csv").
DATASET_SLUG = "rendiconto_pubblicato_serie_storica_saldi"

ROOT = Path("/content/drive/MyDrive/DataCivicLab") # Add DataCivicLab as a Drive shortcut
RAW_ROOT = ROOT / "data" / "raw" / PROJECT
CLEAN_ROOT = ROOT / "data" / "clean" / PROJECT # Replace ROOT with -> Path("/content/drive/MyDrive/YourFolder") to save to your own Drive

RAW_RUN_ID = None   # or an explicit run id, e.g. "20260224_221500"
CLEAN_RUN_ID = None # default: RAW_RUN_ID

# CSV dialect passed to DuckDB's read_csv.
DELIM = ";"
ENCODING = "utf-8"

def latest_run_dir(root: Path) -> Path:
    """Return the sub-directory of *root* whose name sorts last.

    Only directories are considered; plain files inside *root* are ignored.
    Candidates are compared by name alone.

    Raises:
        FileNotFoundError: if *root* contains no sub-directory.
    """
    candidates = [entry for entry in root.iterdir() if entry.is_dir()]
    if candidates:
        return max(candidates, key=lambda entry: entry.name)
    raise FileNotFoundError(f"No run dirs in: {root}")

# Pick the RAW run to clean: the newest run directory unless RAW_RUN_ID pins one.
raw_run_dir = latest_run_dir(RAW_ROOT) if RAW_RUN_ID is None else (RAW_ROOT / RAW_RUN_ID)
RAW_RUN_ID = raw_run_dir.name

# Preferred input is "<DATASET_SLUG>_raw.csv" inside the run directory.
raw_csv = raw_run_dir / f"{DATASET_SLUG}_raw.csv"
if not raw_csv.exists():
    # Fallback: any "*_raw.csv" in the run directory. glob() yields entries in
    # filesystem order, so sort them to make the choice reproducible.
    cands = sorted(raw_run_dir.glob("*_raw.csv"))
    if not cands:
        raise FileNotFoundError(f"No *_raw.csv in {raw_run_dir}")
    raw_csv = cands[0]

# The clean run reuses the raw run id unless one was configured explicitly.
CLEAN_RUN_ID = RAW_RUN_ID if CLEAN_RUN_ID is None else CLEAN_RUN_ID
CLEAN_DIR = CLEAN_ROOT / CLEAN_RUN_ID
CLEAN_DIR.mkdir(parents=True, exist_ok=True)

OUT_PARQUET = CLEAN_DIR / "saldi_storico.parquet"

Mounted at /content/drive


In [2]:
# --- DUCKDB: load raw + macro parse_num ---
con = duckdb.connect()

# The path is embedded in a SQL string literal: double any single quote so a
# folder name containing an apostrophe cannot break (or alter) the statement.
raw_csv_literal = str(raw_csv).replace("'", "''")

# all_varchar=true keeps every column as text; typing happens later, in the
# clean step.
con.execute(f"""
CREATE OR REPLACE TABLE raw AS
SELECT * FROM read_csv(
  '{raw_csv_literal}',
  delim='{DELIM}',
  header=true,
  all_varchar=true,
  encoding='{ENCODING}'
);
""")

# Macro: defined once, then reused on every numeric column.
#
# parse_num(x) turns a text value into DOUBLE:
#   - NULL stays NULL;
#   - the value is trimmed and inner spaces are removed;
#   - if both ',' and '.' occur, the first comma is compared with the first dot:
#       comma after dot  -> dots are dropped and the comma becomes the decimal point
#       comma before dot -> commas are dropped
#   - if only ',' occurs, it is turned into the decimal point;
#   - otherwise the text is cast as it is.
# TRY_CAST yields NULL (instead of raising) when the resulting text is not a
# valid DOUBLE.
con.execute(r"""
CREATE OR REPLACE MACRO parse_num(x) AS (
  TRY_CAST(
    CASE
      WHEN x IS NULL THEN NULL
      ELSE
        CASE
          WHEN INSTR(TRIM(x), ',') > 0 AND INSTR(TRIM(x), '.') > 0 THEN
            CASE
              WHEN INSTR(TRIM(x), ',') > INSTR(TRIM(x), '.') THEN
                REPLACE(REPLACE(REPLACE(TRIM(x), ' ', ''), '.', ''), ',', '.')
              ELSE
                REPLACE(REPLACE(TRIM(x), ' ', ''), ',', '')
            END
          WHEN INSTR(TRIM(x), ',') > 0 THEN
            REPLACE(REPLACE(TRIM(x), ' ', ''), ',', '.')
          ELSE
            REPLACE(TRIM(x), ' ', '')
        END
    END
  AS DOUBLE)
);
""")

<duckdb.duckdb.DuckDBPyConnection at 0x7d996506f330>

In [None]:
# --- mapping + clean SELECT ---
import re, unicodedata

def to_snake(s: str) -> str:
    """Convert an arbitrary header into an ASCII snake_case identifier.

    Accents are stripped via NFKD decomposition, non-ASCII characters are
    dropped, the text is lower-cased, and every run of non-word characters
    collapses to a single underscore. An empty result becomes "col"; a result
    starting with a digit is prefixed with "c_".
    """
    ascii_text = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii')
    lowered = ascii_text.strip().lower()

    slug = re.sub(r"[^\w]+", "_", lowered)
    slug = re.sub(r"_+", "_", slug).strip("_")

    if not slug:
        return "col"
    if slug[0].isdigit():
        return "c_" + slug
    return slug

# Raw CSV header -> column name used in the clean table.
# Headers that are not listed here fall back to to_snake(header).
SEMANTIC_MAP = {
  "ANNO": "esercizio_finanziario",
  "RISPARMIO_PUBBLICO": "risparmio_pubblico",
  "SALDO_NETTO": "saldo_netto_da_finanziare",
  "INDEBITAMENTO_NETTO": "indebitamento_netto",
  "RICORSO_MERCATO": "ricorso_al_mercato",
  "AVANZO_PRIMARIO": "avanzo_primario",
  "SPESE_CORRENTI": "spese_correnti",
  "SPESE_INTERESSI": "spese_per_interessi",
  "SPESE_CONTO_CAPITALE": "spese_in_conto_capitale",
  "SPESE_ACQ_ATT_FINE": "spese_acquisizione_attivita_finanziarie",
  "SPESE_RIMBORSO_PRESTITI": "spese_per_rimborso_prestiti",
  "SPESE_COMPLESSIVE": "spese_complessive",
  "SPESE_FINALI": "spese_finali",
  "SPESE_FIN_NETTO_ATT_FIN": "spese_finali_netto_att_fin",
  "ENTRATE_TRIBUTARIE": "entrate_tributarie",
  "ENTRATE_EXTRA_TRIBUTARIE": "entrate_extra_tributarie",
  "ENTR_ALIEN_PATR_RISCOS": "entrate_alienazioni_patrimoniali_e_riscossioni",
  "RISCOSSIONE_CREDITI": "riscossione_crediti",
  "ENTR_ACCENSIONE_PRESTITI": "entrate_accensione_prestiti",
  "ENTRATE_FINALI": "entrate_finali",
  "ENTR_FIN_NETTO_RISCO_CRED": "entrate_fin_netto_riscossione_crediti",
  "ENTRATE_CORRENTI": "entrate_correnti",
}

raw_cols = [r[1] for r in con.execute("PRAGMA table_info('raw')").fetchall()]

# Every mapped header is a numeric candidate; the year column is special-cased
# in the loop below and cast to INTEGER instead.
NUMERIC_COLS = set(SEMANTIC_MAP.keys())


def _header_key(name: str) -> str:
    """Normalize a raw header for lookups: trimmed and upper-cased."""
    return name.strip().upper()


def _sql_ident(name: str) -> str:
    """Quote *name* as a SQL identifier, doubling embedded double quotes."""
    return '"' + name.replace('"', '""') + '"'


used = set()
final_map = {}
select_exprs = []

YEAR_COL = next(
    (c for c in raw_cols if _header_key(c) in {"ANNO", "ESERCIZIO", "ESERCIZIO_FINANZIARIO"}),
    None,
)

for c in raw_cols:
    # Exact header first, then the normalized form, so that "Anno" or
    # " SALDO_NETTO" are mapped and typed the same way the year column is
    # detected (case- and whitespace-insensitively).
    key = _header_key(c)
    if c in SEMANTIC_MAP:
        base = SEMANTIC_MAP[c]
    else:
        base = SEMANTIC_MAP.get(key, to_snake(c))

    # De-duplicate target names by appending _2, _3, ...
    new = base
    if new in used:
        i = 2
        while f"{new}_{i}" in used:
            i += 1
        new = f"{new}_{i}"
    used.add(new)
    final_map[c] = new

    src = _sql_ident(c)
    dst = _sql_ident(new)
    if YEAR_COL and c == YEAR_COL:
        expr = f'TRY_CAST(TRIM({src}) AS INTEGER) AS {dst}'
    elif c in NUMERIC_COLS or key in NUMERIC_COLS:
        expr = f'parse_num(TRIM({src})) AS {dst}'
    else:
        expr = f'TRIM(CAST({src} AS VARCHAR)) AS {dst}'
    select_exprs.append(expr)

clean_sql = "CREATE OR REPLACE TABLE clean AS\nSELECT\n  " + ",\n  ".join(select_exprs) + "\nFROM raw;"
con.execute(clean_sql)

# The output path sits inside a SQL string literal: double single quotes.
out_parquet_literal = str(OUT_PARQUET).replace("'", "''")
con.execute(f"COPY clean TO '{out_parquet_literal}' (FORMAT PARQUET);")

# Preview (last expression of the cell, so the notebook renders it).
con.execute("SELECT * FROM clean ORDER BY 1 LIMIT 5").df()


Unnamed: 0,esercizio_finanziario,risparmio_pubblico,saldo_netto_da_finanziare,indebitamento_netto,ricorso_al_mercato,avanzo_primario,spese_correnti,spese_per_interessi,spese_in_conto_capitale,spese_acquisizione_attivita_finanziarie,...,spese_finali,spese_finali_netto_att_fin,entrate_tributarie,entrate_extra_tributarie,entrate_alienazioni_patrimoniali_e_riscossioni,riscossione_crediti,entrate_accensione_prestiti,entrate_finali,entrate_fin_netto_riscossione_crediti,entrate_correnti
0,2003,-23527110000.0,-76608440000.0,-70589530000.0,-333428900000.0,-469519800.0,404491000000.0,76138920000.0,61602070000.0,7116577000.0,...,466093100000.0,458976500000.0,355706600000.0,25257290000.0,8520742000.0,1097664000.0,323025500000.0,389484700000.0,388387000000.0,380963900000.0
1,2004,-30341180000.0,-63777780000.0,-57927610000.0,-298880400000.0,11342170000.0,423863000000.0,75119950000.0,48548640000.0,6051619000.0,...,472411600000.0,466360000000.0,364955500000.0,28566320000.0,15112040000.0,201446900.0,291025400000.0,408633900000.0,408432400000.0,393521800000.0
2,2005,-27020370000.0,-58445410000.0,-55169110000.0,-259929500000.0,17967720000.0,437872600000.0,76413130000.0,47811090000.0,6557305000.0,...,485683700000.0,479126400000.0,381431200000.0,29421020000.0,16386050000.0,3281003000.0,259561000000.0,427238300000.0,423957300000.0,410852200000.0
3,2006,-8512306000.0,-45004370000.0,-45602970000.0,-234128600000.0,30689370000.0,439557800000.0,75693730000.0,39823930000.0,868270200.0,...,479381700000.0,478513400000.0,402347500000.0,28697940000.0,3331873000.0,1466878000.0,235594700000.0,434377300000.0,432910400000.0,431045400000.0
4,2007,15959890000.0,-32446030000.0,-32180000000.0,-223639700000.0,44089970000.0,457766600000.0,76536000000.0,54773460000.0,1020687000.0,...,512540100000.0,511519400000.0,440510300000.0,33216200000.0,6367540000.0,754660400.0,211314100000.0,480094000000.0,479339400000.0,473726500000.0


In [None]:
# --- profile + manifest (senza df in RAM) ---
from datetime import datetime, timezone
import json, hashlib

def sha256_file(p: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *p*.

    The file is read in 1 MiB blocks, so it is never held in memory whole.
    """
    digest = hashlib.sha256()
    block_size = 1024*1024
    with open(p, "rb") as stream:
        while True:
            block = stream.read(block_size)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()

n_rows = con.execute("SELECT COUNT(*) FROM clean").fetchone()[0]
cols = [r[1] for r in con.execute("PRAGMA table_info('clean')").fetchall()]

# Null counts computed in SQL, one aggregate per column (built dynamically).
null_exprs = ", ".join([f'SUM(CASE WHEN "{c}" IS NULL THEN 1 ELSE 0 END) AS "{c}"' for c in cols])
nulls_row = con.execute(f"SELECT {null_exprs} FROM clean").fetchone()
# SUM over an empty table yields NULL, not 0: coerce so an empty clean table
# produces a profile instead of a TypeError.
nulls = {c: int(v or 0) for c, v in zip(cols, nulls_row)}

# Fully duplicated rows = total rows minus distinct rows.
dup_rows = n_rows - con.execute("SELECT COUNT(*) FROM (SELECT DISTINCT * FROM clean)").fetchone()[0]

# A few rows stored as {column: value} dicts for a quick look at the data.
sample_rows = con.execute("SELECT * FROM clean LIMIT 10").fetchall()
sample_rows = [dict(zip(cols, r)) for r in sample_rows]

profile_clean = {
    "project": PROJECT,
    "raw_run_id": RAW_RUN_ID,
    "clean_run_id": CLEAN_RUN_ID,
    "raw_csv": str(raw_csv),
    "clean_parquet": str(OUT_PARQUET),
    "n_rows": int(n_rows),
    "n_cols": int(len(cols)),
    "columns": cols,
    "nulls": nulls,
    "duplicate_rows": int(dup_rows),
    "sample_rows": sample_rows,
}

(CLEAN_DIR / "columns_mapping_raw_to_clean.json").write_text(json.dumps(final_map, ensure_ascii=False, indent=2), encoding="utf-8")
(CLEAN_DIR / "profile_clean.json").write_text(json.dumps(profile_clean, ensure_ascii=False, indent=2), encoding="utf-8")

# The manifest ties the output to its input through SHA-256 digests of both files.
manifest = {
    "project": PROJECT,
    "raw_run_id": RAW_RUN_ID,
    "clean_run_id": CLEAN_RUN_ID,
    "created_utc": datetime.now(timezone.utc).isoformat(),
    "inputs": {"raw_csv": {"path": str(raw_csv), "sha256": sha256_file(raw_csv)}},
    "outputs": {"clean_parquet": {"path": str(OUT_PARQUET), "sha256": sha256_file(OUT_PARQUET)}},
}
(CLEAN_DIR / "clean_manifest.json").write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")

722

Output in Drive (tutti i file nella stessa cartella `data/clean/<PROJECT>/<RUN_ID>/`):

- `data/clean/<PROJECT>/<RUN_ID>/saldi_storico.parquet`
- `columns_mapping_raw_to_clean.json`
- `profile_clean.json`
- `clean_manifest.json`
