# Ingestion Notebook
---
In this notebook we build a complete pipeline to ingest data from public APIs.

## Importing Libraries:

In [1]:
import os
from dotenv import load_dotenv
from pathlib import Path
import json
from pathlib import Path
from typing import Any, Dict, Optional, Union, List
import pandas as pd
import requests
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
load_dotenv()

True

## Helper Functions:

In [2]:
# Token read from the environment (empty string when the variable is unset),
# with surrounding whitespace removed.
# NOTE(review): presumably the API key for Portal da Transparência -- confirm.
PORTAL_TRANSPARENCIA_TOKEN = os.environ.get("PORTAL_TRANSPARENCIA_TOKEN", "").strip()

# Baseline HTTP headers: a User-Agent string identifying this pipeline.
DEFAULT_HEADERS = {"User-Agent": "mvp-ingestao-dados/0.1 (+https://example.com)"}

# Shape of a JSON payload handled by the helpers: one object or a list of objects.
JSONType = Union[Dict[str, Any], List[Dict[str, Any]]]

class HttpError(RuntimeError):
    """Error raised when an HTTP response carries a status code of 400 or higher."""

class _RetryableHttpError(HttpError):
    """HTTP error status that may succeed on a later attempt (5xx, 408, 429)."""


# Client-side status codes that are transient: request timeout and rate limiting.
_RETRYABLE_CLIENT_STATUSES = frozenset({408, 429})


@retry(
    reraise=True,
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=1, max=30),
    # Only transient failures are retried; a plain HttpError (e.g. 404) fails fast.
    retry=retry_if_exception_type((_RetryableHttpError, requests.RequestException)),
)
def safe_get(url: str, *, params: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None, timeout: int = 30) -> requests.Response:
    """Perform an HTTP GET with retries and return the successful response.

    Args:
        url: Address to request.
        params: Optional query-string parameters passed to ``requests.get``.
        headers: Optional request headers. They are merged on top of
            ``DEFAULT_HEADERS``, so caller-supplied keys win.
        timeout: Timeout passed to ``requests.get``.

    Returns:
        The ``requests.Response`` whose status code is below 400.

    Raises:
        HttpError: When the status code is 400 or higher. Server errors (5xx),
            408 and 429 are retried up to 5 attempts with exponential backoff
            before the error is re-raised; other 4xx codes are raised at once
            because repeating the same request would not change the outcome.
        requests.RequestException: When the request itself keeps failing after
            all attempts.
    """
    merged_headers = {**DEFAULT_HEADERS, **(headers or {})}
    resp = requests.get(url, params=params, headers=merged_headers, timeout=timeout)
    status = resp.status_code
    if status >= 400:
        # Keep only the start of the body so the message stays readable.
        message = f"GET {url} -> {status} {resp.text[:200]}"
        if status >= 500 or status in _RETRYABLE_CLIENT_STATUSES:
            raise _RetryableHttpError(message)
        raise HttpError(message)
    return resp

def save_raw(payload: Union[str, bytes], path: Path) -> None:
    """Write a raw payload to ``path``, creating parent directories as needed.

    Args:
        payload: Content to store. ``bytes``/``bytearray`` are written in
            binary mode; anything else is written as text.
        path: Destination file; an existing file is overwritten.

    Text is always encoded as UTF-8 so the result does not depend on the
    platform's default encoding (matching ``save_json``).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    if isinstance(payload, (bytes, bytearray)):
        with open(path, "wb") as f:
            f.write(payload)
    else:
        with open(path, "w", encoding="utf-8") as f:
            f.write(payload)

def save_json(payload: JSONType, path: Path) -> None:
    """Serialize ``payload`` as indented UTF-8 JSON at ``path``.

    Missing parent directories are created first. Non-ASCII characters are
    written as-is rather than escaped.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with path.open(mode="w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=2, ensure_ascii=False)

def save_parquet(df: pd.DataFrame, path: Path) -> None:
    """Write ``df`` to a Parquet file at ``path`` without the index.

    Object-dtype columns are cast to pandas' ``string`` dtype before writing.
    The cast is applied to a converted copy, so the caller's DataFrame keeps
    its original dtypes.

    Args:
        df: Frame to persist; it is not modified.
        path: Destination file; missing parent directories are created.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    object_cols = df.select_dtypes(include=["object"]).columns
    if len(object_cols) > 0:
        # astype returns a new frame, leaving the input untouched.
        out = df.astype({col: "string" for col in object_cols})
    else:
        out = df
    out.to_parquet(path, index=False)


## Territórios do Brasil - IPEA:

---

In [3]:
# DOC: https://www.ipeadata.gov.br/api/
# Download the IPEA "Territorios" collection and tabulate its "value" records.
BASE_URL = "http://www.ipeadata.gov.br/api/odata4/Territorios"
response = safe_get(BASE_URL)
response_json = response.json()
data = (
    pd.DataFrame(response_json["value"])
    .iloc[1:]  # skip the first record -- presumably a placeholder row; TODO confirm
    .reset_index(drop=True)
    # Area as float, rounded to 3 decimal places.
    .assign(TERAREA=lambda frame: frame["TERAREA"].astype(float).round(3))
)
data.head()

Unnamed: 0,NIVNOME,TERCODIGO,TERNOME,TERNOMEPADRAO,TERCAPITAL,TERAREA,NIVAMC
0,Brasil,0,Brasil,BRASIL,False,8531507.6,False
1,Regiões,1,Região Norte,REGIAO NORTE,False,3869637.9,False
2,Estados,11,Rondônia,RONDONIA,False,238512.8,False
3,Municípios,1100015,Alta Floresta D'Oeste,ALTA FLORESTA D'OESTE,False,7111.8,False
4,Municípios,1100023,Ariquemes,ARIQUEMES,False,4995.3,False


## Cotações - BANCO CENTRAL DO BRASIL (BCB)
---

### SELIC - Cotação diária:

In [4]:
# DOC: https://dadosabertos.bcb.gov.br/
# Daily SELIC (SGS series code 11) over the trailing 30 days.
# Dates are formatted as dd/mm/YYYY for the query string.
BASE_URL = "https://api.bcb.gov.br/dados/serie/bcdata.sgs.{codigo_serie}/dados?formato=json&dataInicial={dataInicial}&dataFinal={dataFinal}"
today = pd.Timestamp.now().normalize().strftime("%d/%m/%Y")
last_30_days = (pd.Timestamp.now().normalize() - pd.Timedelta(days=30)).strftime("%d/%m/%Y")
selic_diaria_url = BASE_URL.format(
    codigo_serie="11",
    dataInicial=last_30_days,
    dataFinal=today,
)

response = safe_get(selic_diaria_url)
response_json = response.json()
selic_diaria = pd.DataFrame(data=response_json)
selic_diaria.head(n=5)

Unnamed: 0,data,valor
0,20/08/2025,0.055131
1,21/08/2025,0.055131
2,22/08/2025,0.055131
3,25/08/2025,0.055131
4,26/08/2025,0.055131


### SELIC - Cotação mensal:

In [5]:
# DOC: https://dadosabertos.bcb.gov.br/
# Monthly SELIC (SGS series code 4390) over the trailing 365 days.
# Dates are formatted as dd/mm/YYYY for the query string.
BASE_URL = "https://api.bcb.gov.br/dados/serie/bcdata.sgs.{codigo_serie}/dados?formato=json&dataInicial={dataInicial}&dataFinal={dataFinal}"
today = pd.Timestamp.now().normalize().strftime("%d/%m/%Y")
last_year = (pd.Timestamp.now() - pd.Timedelta(days=365)).normalize().strftime("%d/%m/%Y")
selic_mensal_url = BASE_URL.format(codigo_serie="4390", dataInicial=last_year, dataFinal=today)

response = safe_get(selic_mensal_url)
response_json = response.json()
selic_mensal = pd.DataFrame(response_json)

# Single trailing expression: only the last expression of a cell is displayed.
selic_mensal.head()

Unnamed: 0,data,valor
0,01/09/2024,0.84
1,01/10/2024,0.93
2,01/11/2024,0.79
3,01/12/2024,0.93
4,01/01/2025,1.01


### Moedas - Símbolos e Nomes:

In [6]:
# DOC: https://olinda.bcb.gov.br/olinda/servico/PTAX/versao/v1/swagger-ui3#/
# List of currencies from the PTAX "Moedas" endpoint (records live under "value").
BASE_URL = "https://olinda.bcb.gov.br/olinda/servico/PTAX/versao/v1/odata/Moedas"

response = safe_get(BASE_URL)
response_json = response.json()
moedas = pd.DataFrame(data=response_json["value"])
moedas.head(n=5)

Unnamed: 0,simbolo,nomeFormatado,tipoMoeda
0,AUD,Dólar australiano,B
1,CAD,Dólar canadense,A
2,CHF,Franco suíço,A
3,DKK,Coroa dinamarquesa,A
4,EUR,Euro,B


### Dólar - Cotação por Período:

In [7]:
# DOC: https://olinda.bcb.gov.br/olinda/servico/PTAX/versao/v1/swagger-ui3#/
# Dollar quotes for the trailing 30 days from the PTAX "CotacaoDolarPeriodo" endpoint.
# Dates are formatted as mm-dd-YYYY for this endpoint's parameters.
BASE_URL = "https://olinda.bcb.gov.br/olinda/servico/PTAX/versao/v1/odata/CotacaoDolarPeriodo(dataInicial=@dataInicial,dataFinalCotacao=@dataFinalCotacao)?@dataInicial='{dataInicial}'&@dataFinalCotacao='{dataFinalCotacao}'&$format=json"

today = pd.Timestamp.now().normalize().strftime("%m-%d-%Y")
last_30_days = (pd.Timestamp.now().normalize() - pd.Timedelta(days=30)).strftime("%m-%d-%Y")
dolar_periodo_url = BASE_URL.format(
    dataInicial=last_30_days,
    dataFinalCotacao=today,
)

response = safe_get(dolar_periodo_url)
response_json = response.json()
dolar_periodo = pd.DataFrame(data=response_json["value"])
dolar_periodo.head(n=5)

Unnamed: 0,cotacaoCompra,cotacaoVenda,dataHoraCotacao
0,5.472,5.4726,2025-08-20 13:12:26.131
1,5.4822,5.4828,2025-08-21 13:10:49.662
2,5.4386,5.4392,2025-08-22 13:06:18.381
3,5.4168,5.4174,2025-08-25 13:06:50.314
4,5.4212,5.4218,2025-08-26 13:06:46.354


### Outras Moedas - Cotação por Período:

In [8]:
# DOC: https://olinda.bcb.gov.br/olinda/servico/PTAX/versao/v1/swagger-ui3#/
# Quotes for another currency (EUR here) over the trailing 30 days, from the
# PTAX "CotacaoMoedaPeriodo" endpoint. Dates are formatted as mm-dd-YYYY.
# The results get their own names so the dollar DataFrame `dolar_periodo`
# built in the previous cell is not overwritten with euro data.
BASE_URL = "https://olinda.bcb.gov.br/olinda/servico/PTAX/versao/v1/odata/CotacaoMoedaPeriodo(moeda=@moeda,dataInicial=@dataInicial,dataFinalCotacao=@dataFinalCotacao)?@moeda='{moeda}'&@dataInicial='{dataInicial}'&@dataFinalCotacao='{dataFinalCotacao}'&$format=json"

today = pd.Timestamp.now().normalize().strftime("%m-%d-%Y")
last_30_days = (pd.Timestamp.now() - pd.Timedelta(days=30)).normalize().strftime("%m-%d-%Y")
euro_periodo_url = BASE_URL.format(moeda='EUR', dataInicial=last_30_days, dataFinalCotacao=today)

response = safe_get(euro_periodo_url)
response_json = response.json()
euro_periodo = pd.DataFrame(response_json['value'])
euro_periodo.head()

Unnamed: 0,paridadeCompra,paridadeVenda,cotacaoCompra,cotacaoVenda,dataHoraCotacao,tipoBoletim
0,1.167,1.1672,6.3852,6.387,2025-08-20 10:08:27.613,Abertura
1,1.1658,1.1659,6.3824,6.3837,2025-08-20 11:02:27.104,Intermediário
2,1.1662,1.1664,6.3799,6.3817,2025-08-20 12:06:26.755,Intermediário
3,1.1648,1.1649,6.3729,6.3741,2025-08-20 13:12:26.122,Intermediário
4,1.1648,1.1649,6.3738,6.375,2025-08-20 13:12:26.131,Fechamento
