In [13]:
# Libraries
from pathlib import Path
from datetime import datetime
import shutil
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm

In [None]:
# Webscraping
def _comexstat_urls(start_year: int, end_year: int) -> list[str]:
    """Return the yearly IMP/EXP CSV URLs followed by the lookup-table URLs."""
    root = "https://balanca.economia.gov.br/balanca/bd"
    tables = (
        "NCM",
        "NCM_SH",
        "PAIS",
        "PAIS_BLOCO",
        "VIA",
        "URF",
        "UF_MUN",
        "UF",
    )

    urls = []
    for year in range(start_year, end_year + 1):
        urls.append(f"{root}/comexstat-bd/ncm/IMP_{year}.csv")
        urls.append(f"{root}/comexstat-bd/ncm/EXP_{year}.csv")
    urls.extend(f"{root}/tabelas/{table}.csv" for table in tables)
    return urls


def _wait_for_download(base_dir: Path, before: set[str], timeout: float) -> bool:
    """Poll *base_dir* until a new, fully written file appears.

    Args:
        base_dir: Directory Chrome is downloading into.
        before: Names of the files that existed before the request was sent.
        timeout: Maximum number of seconds to wait.

    Returns:
        True once at least one new file exists and none of the new files is
        still a partial download; False if *timeout* elapses first.
    """
    # ".crdownload" marks an in-progress Chrome download. ".tmp" is included
    # because some Chrome builds appear to stage the file under that suffix
    # first -- NOTE(review): confirm for the Chrome version in use.
    partial_suffixes = (".crdownload", ".tmp")

    # monotonic() cannot jump when the system clock is adjusted.
    deadline = time.monotonic() + timeout
    while True:
        # Only files that appeared after the request are considered, so a
        # partial file abandoned by an earlier timed-out download does not
        # stall every subsequent wait.
        new_names = {f.name for f in base_dir.iterdir() if f.is_file()} - before
        in_progress = any(name.endswith(partial_suffixes) for name in new_names)

        if new_names and not in_progress:
            return True

        if time.monotonic() > deadline:
            return False

        time.sleep(0.3)


def download_comexstat(
    directory: str = "data",
    start_year: int = 1997,
    end_year: int | None = None,
    wait_timeout: int = 120,
) -> None:
    """
    Download ComexStat CSV files using Selenium + Chrome with tqdm progress bar.

    WARNING: everything already inside *directory* (files, symlinks and
    sub-directories) is deleted before downloading starts.

    Args:
        directory: Target directory; created if missing, then emptied.
        start_year: First year of IMP/EXP files to download.
        end_year: Last year (inclusive). Defaults to the current year.
        wait_timeout: Seconds to wait for each individual download.

    Downloads that do not finish within *wait_timeout* are reported through
    ``tqdm.write`` and skipped; the remaining URLs are still attempted.
    """

    base_dir = Path(directory).resolve()
    base_dir.mkdir(parents=True, exist_ok=True)

    # Clean directory
    for item in base_dir.iterdir():
        if item.is_file() or item.is_symlink():
            item.unlink()
        else:
            shutil.rmtree(item)

    if end_year is None:
        end_year = datetime.now().year

    downloads = _comexstat_urls(start_year, end_year)

    # Configure Chrome
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    prefs = {
        "download.default_directory": str(base_dir),
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
    }
    options.add_experimental_option("prefs", prefs)

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    timed_out = []
    try:
        for url in tqdm(downloads, desc="Downloading ComexStat files", unit="file"):
            # Snapshot existing files before the request so the new one can
            # be recognised afterwards.
            before = {f.name for f in base_dir.iterdir() if f.is_file()}

            driver.get(url)

            if not _wait_for_download(base_dir, before, wait_timeout):
                timed_out.append(url)
                # tqdm.write prints without corrupting the progress bar.
                tqdm.write(f"Timed out after {wait_timeout}s waiting for {url}")
    finally:
        # Always release the browser, even if a request raised.
        driver.quit()

    if timed_out:
        tqdm.write(
            f"{len(timed_out)} of {len(downloads)} downloads did not complete."
        )

# Run with the defaults: empties the "data" directory (relative to the current
# working directory), then downloads the IMP/EXP files from 1997 through the
# current year plus the lookup tables.
download_comexstat()

Downloading ComexStat files: 100%|██████████| 66/66 [08:06<00:00,  7.37s/file]
