Este notebook preprocesa los microdatos del Censo 2024 de Chile. 

Lee los archivos de hogares, personas y viviendas en formato Parquet, convierte los datos numéricos a enteros para facilitar el mapeo de categorías.


Nota: este notebook necesita el archivo `viv_hog_per_censo2024.zip` descomprimido en el directorio `data/raw/`.

```
$ mkdir -p data/raw
$ cd data/raw
$ curl -O https://storage.googleapis.com/bktdescargascenso2024/viv_hog_per_censo2024.zip
$ unzip viv_hog_per_censo2024.zip
```

In [1]:
from pathlib import Path
import pandas as pd

RAW_DIR = Path("../data/raw")
PROCESSED_DIR = Path("../data/processed")

data_files = [
    RAW_DIR / "hogares_censo2024.parquet",
    RAW_DIR / "personas_censo2024.parquet",
    RAW_DIR / "viviendas_censo2024.parquet",
]
output_files = [
    PROCESSED_DIR / "hogares_clean.parquet",
    PROCESSED_DIR / "personas_clean.parquet",
    PROCESSED_DIR / "viviendas_clean.parquet",
]

In [2]:
def to_int_if_possible(series: pd.Series) -> pd.Series:
    try:
        series = series.fillna(pd.NA)
        series = series.astype("Int32")
        return series
    except ValueError:
        print(
            f"Conversion to Int32 failed for series '{series.name}'; returning original."
        )
        return series


to_int_if_possible(pd.Series(["1", "2", "3", None, "blah"], name="example_series")).map(
    lambda x: type(x)
)
to_int_if_possible(pd.Series([1, 2, 3, None])).map(lambda x: type(x))

Conversion to Int32 failed for series 'example_series'; returning original.


0    <class 'float'>
1    <class 'float'>
2    <class 'float'>
3    <class 'float'>
dtype: object

In [3]:
def read_parquet(filepath, filters=None):
    df = pd.read_parquet(filepath, filters=filters, read_dictionary=None)
    for col in df.select_dtypes(include="number").columns:
        df[col] = to_int_if_possible(df[col])
    return df


test = read_parquet(
    data_files[0],
    # filters=[[("region", "==", 13)]],
    filters=[[("comuna", "==", 5802)]],
)
test

Unnamed: 0,id_vivienda,id_hogar,region,provincia,comuna,comuna_bajo_umbral,area,tipo_operativo,p12_tenencia_viv,p13_comb_cocina,p14_comb_calefaccion,p15a_serv_tel_movil,p15b_serv_compu,p15c_serv_tablet,p15d_serv_internet_fija,p15e_serv_internet_movil,p15f_serv_internet_satelital,tipologia_hogar
0,1,1,5,58,5802,2,1,2,4,1,8,1,2,2,2,1,2,5
1,598,1,5,58,5802,2,1,2,4,1,6,1,2,2,1,2,2,1
2,1100,1,5,58,5802,2,1,2,4,1,8,1,1,1,1,1,2,2
3,1346,1,5,58,5802,2,1,2,1,1,3,1,1,1,1,1,2,5
4,1847,1,5,58,5802,2,1,2,3,1,6,1,2,2,1,1,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19911,7662922,1,5,58,5802,2,1,2,1,1,7,1,2,2,2,1,1,3
19912,7663845,1,5,58,5802,2,1,2,1,1,1,1,1,2,2,1,2,4
19913,7664248,1,5,58,5802,2,1,2,1,1,8,2,2,2,2,2,2,1
19914,7664257,1,5,58,5802,2,1,2,1,1,3,1,1,2,1,1,2,6


In [4]:
for input_file, output_file in zip(data_files, output_files):
    print(f"Processing {input_file}...")
    df = read_parquet(input_file)
    df.to_parquet(output_file, index=False, compression="zstd")
    print(f"Saved processed data to {output_file}.")

Processing ../data/raw/hogares_censo2024.parquet...
Saved processed data to ../data/processed/hogares_clean.parquet.
Processing ../data/raw/personas_censo2024.parquet...
Saved processed data to ../data/processed/personas_clean.parquet.
Processing ../data/raw/viviendas_censo2024.parquet...
Saved processed data to ../data/processed/viviendas_clean.parquet.
