In [1]:
import re

import geopandas as gpd
import numpy as np
import pandas as pd

In [2]:
def read_raw_data(data_filepath):
    """Load the zipped raw CSV and shorten coded column headers.

    The file is read as a zip-compressed, ``;``-separated CSV that uses ``,``
    as the decimal mark and ``.`` as the thousands separator.

    Headers shaped like ``"<CODE> - <description>"`` (where CODE consists of
    digits, uppercase ``A-Z`` letters and underscores) are replaced by the
    CODE part alone. Every other header is left untouched.

    Parameters
    ----------
    data_filepath : path-like or file-like
        Location of the zip archive, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table with the shortened column names.
    """
    coded_header = re.compile(r"^([\dA-Z_]+) - (.*)")

    def get_variable_code(label):
        # Keep only the leading code when the header follows the coded layout.
        found = coded_header.match(label)
        return found.group(1) if found else label

    raw = pd.read_csv(
        data_filepath,
        sep=";",
        decimal=",",
        thousands=".",
        compression="zip",
    )
    return raw.rename(columns=get_variable_code)


def read_shp(shp_filepath):
    """Read a geospatial file with geopandas and cast ``CD_MUN`` to int64.

    Parameters
    ----------
    shp_filepath : path-like
        Location handed to ``geopandas.read_file``.

    Returns
    -------
    The object produced by ``geopandas.read_file`` with its ``CD_MUN`` column
    replaced by an ``int64`` version of itself.
    """
    geo = gpd.read_file(shp_filepath)
    # assign() returns a new frame; the frame read from disk is not mutated.
    geo = geo.assign(CD_MUN=geo["CD_MUN"].astype("int64"))
    return geo


def read_ipca(ipca_filepath, start_year=1995, end_year=2021, base_year=2021):
    """Build a yearly deflator table from a monthly index CSV.

    The CSV is read skipping its first line, keeping only the
    ``"Mês (Código)"`` (parsed as ``%Y%m``) and ``"Valor"`` columns. Each
    ``Valor`` is divided by 100 and turned into a factor ``1 + Valor / 100``
    (presumably a monthly percentage change -- confirm against the data
    source). Factors are multiplied within each calendar year, accumulated
    across years, and finally divided by the accumulated value of
    ``base_year`` so that the deflator of ``base_year`` equals 1.

    Rows whose month code cannot be parsed are discarded. Values that cannot
    be parsed as a number become NaN and are therefore skipped by the yearly
    product.

    Parameters
    ----------
    ipca_filepath : path-like or file-like
        CSV location, passed to ``pandas.read_csv``.
    start_year, end_year : int, default 1995 and 2021
        Inclusive range of calendar years kept before aggregating.
    base_year : int, default 2021
        Year whose deflator is normalised to 1.

    Returns
    -------
    pandas.DataFrame
        Columns ``data`` (calendar year) and ``deflator``.

    Raises
    ------
    KeyError
        If ``base_year`` is not present among the aggregated years.
    """

    def convert_float(x):
        # float() is kept so parsed values stay exactly as before; only the
        # conversion failures are caught instead of every possible exception.
        try:
            return float(x)
        except (TypeError, ValueError):
            return np.nan

    raw = pd.read_csv(
        ipca_filepath,
        skiprows=1,
        usecols=["Mês (Código)", "Valor"],
        dtype=str,
    )

    monthly = pd.DataFrame(
        {
            # errors="coerce" maps footers/notes in the code column to NaT.
            "data": pd.to_datetime(
                raw["Mês (Código)"], format="%Y%m", errors="coerce"
            ),
            "fator": 1 + raw["Valor"].apply(convert_float) / 100,
        }
    )

    # NaT rows have a NaN year, so between() drops them together with the
    # months that fall outside the requested window.
    in_window = monthly["data"].dt.year.between(start_year, end_year)

    yearly_factor = (
        monthly.loc[in_window].set_index("data")["fator"].resample("YS").prod()
    )
    accumulated = yearly_factor.cumprod()

    # Exact Timestamp lookup: always yields a scalar, independent of how
    # pandas resolves string labels on a DatetimeIndex.
    base = accumulated.loc[pd.Timestamp(year=base_year, month=1, day=1)]

    ipca = (
        (accumulated / base)
        .rename("deflator")
        .reset_index()
        .assign(data=lambda x: x["data"].dt.year)
    )
    return ipca


def read_municipio(municipio_filepath):
    """Load the municipality lookup table from a gzip-compressed CSV.

    Only the columns ``id_municipio``, ``id_municipio_6``, ``sigla_uf`` and
    ``nome_regiao`` are read; rows that are exact duplicates across those
    columns are removed (the first occurrence is kept).

    Parameters
    ----------
    municipio_filepath : path-like or file-like
        Location of the ``.csv.gz`` file, passed to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        De-duplicated table restricted to the four columns above.
    """
    wanted_columns = ["id_municipio", "id_municipio_6", "sigla_uf", "nome_regiao"]
    table = pd.read_csv(
        municipio_filepath,
        usecols=wanted_columns,
        compression="gzip",
    )
    return table.drop_duplicates()

In [3]:
# Input locations.
data_filepath = "data/agua-esgoto-desagregado.zip"
ipca_filepath = "data/ipca.csv"
municipio_filepath = "data/municipio.csv.gz"

# READ
df = read_raw_data(data_filepath)
ipca = read_ipca(ipca_filepath)
municipio = read_municipio(municipio_filepath)

# PROCESS
# Attach the municipality attributes; the left join keeps every raw row.
df = pd.merge(
    df,
    municipio,
    how="left",
    left_on=["codigo_municipio", "sigla_uf"],
    right_on=["id_municipio_6", "sigla_uf"],
)

# Every aggregate below is keyed by reference year and municipality id.
group_keys = ["ano_referencia", "id_municipio"]


def _sum_by_year_and_municipio(frame, column):
    """Sum ``column`` over all rows sharing the same year and municipality."""
    return frame[group_keys + [column]].groupby(group_keys).sum().reset_index()


# POP_TOT: exact duplicate rows and rows with a missing value are dropped
# first, then whatever is left is averaged per key.
populacao_municipio = (
    df[group_keys + ["POP_TOT"]]
    .drop_duplicates()
    .dropna()
    .groupby(group_keys)
    .mean()
    .reset_index()
)

ag001 = _sum_by_year_and_municipio(df, "AG001")
es001 = _sum_by_year_and_municipio(df, "ES001")

# Place POP_TOT, AG001 and ES001 side by side, aligned on the shared keys.
populacao_municipio_atendida = pd.concat(
    [
        frame.set_index(group_keys)
        for frame in (populacao_municipio, ag001, es001)
    ],
    axis=1,
).reset_index()

volume_agua_consumido = _sum_by_year_and_municipio(df, "AG010")

# SAVE PROCESSED DATA
outputs = (
    (ipca, "data/processed_ipca.csv"),
    (municipio, "data/processed_municipio.csv"),
    (populacao_municipio_atendida, "data/processed_populacao_atendida.csv"),
    (volume_agua_consumido, "data/processed_volume_agua_consumido.csv"),
)
for table, destination in outputs:
    table.to_csv(destination, index=False)

  df = pd.read_csv(
