## WITS TradeStats (World Bank)
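*Client* mínimo para puxar trade flows por parceiro. Nota: a implementação abaixo não chama a API TradeStats; ela faz scraping leve das páginas públicas do WITS (UN Comtrade via WITS), que aceitam produto em nível HS6.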

Foco do exemplo:
- Importação e exportação de Ureia (HS6=310210) da Índia (reporter=ind) com todos os parceiros (partner=all).
- Indicadores: MPRT-TRD-VL (import value) e XPRT-TRD-VL (export value).

Referências (API oficial):
- Base URL: `https://wits.worldbank.org/API/V1`
- TradeStats data request (SDMX/V21/datasource/...): ver [WITS API User Guide](https://wits.worldbank.org/data/public/WITSAPI_UserGuide.pdf).

In [None]:
import time
from dataclasses import dataclass
from typing import Iterable, Literal, Optional
import io
import pandas as pd
import requests
import numpy as np
import os

TradeFlow = Literal["Imports", "Exports"]

In [None]:
def _safe_mkdir(path: str) -> None:
    """Create directory *path* (and any missing parents).

    An already existing directory is not treated as an error.
    """
    os.makedirs(name=path, exist_ok=True)

@dataclass
class WITSComtradeClient:
    """
    Client that extracts trade data (UN Comtrade via WITS) by HS6 product and partner,
    using the public WITS pages of the form:

        https://wits.worldbank.org/trade/comtrade/en/country/{REPORTER}/year/{YEAR}/tradeflow/{FLOW}/partner/{PARTNER}/product/{HS6}

    This is useful when:
    - the official WITS API (TradeStats) does not accept a leaf-level HS6 code (e.g. 310210);
    - you want to quickly obtain the "by partner" table shown on the website.

    Notes
    -----
    - WITS renders an HTML table with columns such as:
      Reporter, TradeFlow, ProductCode, Product Description, Year, Partner, Trade Value 1000USD, Quantity, Quantity Unit
    - This client does light scraping (requests + pandas.read_html). Use it sparingly
      (it sleeps between requests).
    """

    base_url: str = "https://wits.worldbank.org"
    timeout_s: int = 60  # per-request timeout handed to requests.get
    max_retries: int = 4  # retries after the first attempt (max_retries + 1 attempts in total)
    backoff: float = 1.7  # wait before retry k is backoff ** k seconds
    sleep_between_calls_s: float = 0.4  # courtesy pause so the server is not hammered

    # Un-annotated class attributes: shared constants, not dataclass fields.
    # HTTP statuses treated as transient and therefore retried.
    _RETRYABLE_STATUS = frozenset({429, 500, 502, 503, 504})
    # WITS table headers -> standardized column names.
    _RENAME_MAP = {
        "Reporter": "reporter",
        "TradeFlow": "tradeflow",
        "ProductCode": "hs6",
        "Product Description": "product_desc",
        "Year": "year",
        "Partner": "partner",
        "Trade Value 1000USD": "value_1000usd",
        "Quantity": "quantity",
        "Quantity Unit": "quantity_unit",
    }
    # Columns that must exist after renaming (created as all-NA when absent).
    _BASE_COLUMNS = (
        "reporter", "tradeflow", "hs6", "product_desc", "year", "partner",
        "value_1000usd", "quantity", "quantity_unit",
    )
    # Column order of the frame returned by fetch_partner_table.
    _OUTPUT_COLUMNS = (
        "reporter", "tradeflow", "hs6", "product_desc", "year", "partner",
        "value_1000usd", "value_usd", "quantity", "quantity_unit",
    )

    def build_url(
        self,
        *,
        reporter_iso3: str,
        year: int,
        tradeflow: "TradeFlow",
        partner: str = "ALL",
        hs6: str,
        language: str = "en",
    ) -> str:
        """
        Build the URL of the WITS Comtrade page.

        Parameters
        ----------
        reporter_iso3:
            ISO3 code of the reporting country (e.g. "IND"); upper-cased and stripped.
        year:
            Year (e.g. 2023).
        tradeflow:
            "Imports" or "Exports" (exactly this plural form).
        partner:
            "ALL" for every partner (or "WLD" for world); upper-cased and stripped.
        hs6:
            HS6 code (e.g. "310210" for urea).
        language:
            Language segment of the URL (default "en").

        Returns
        -------
        str
            URL ready for a GET request.
        """
        reporter_iso3 = reporter_iso3.upper().strip()
        partner = partner.upper().strip()
        hs6 = str(hs6).strip()

        return (
            f"{self.base_url}/trade/comtrade/{language}/country/{reporter_iso3}"
            f"/year/{int(year)}/tradeflow/{tradeflow}/partner/{partner}/product/{hs6}"
        )

    def _get_with_retries(self, url: str) -> str:
        """
        GET *url*, retrying with exponential backoff on transient failures only.

        Transport errors raised by ``requests``, the statuses in
        ``_RETRYABLE_STATUS`` and an empty HTTP 200 body are retried. Any other
        status (e.g. 404) fails immediately, since repeating the request would
        only add load and delay.

        Returns
        -------
        str
            Body of the first successful (HTTP 200, non-empty) response.

        Raises
        ------
        RuntimeError
            On a non-retryable status, or when every attempt failed. The
            underlying error is chained as ``__cause__``.
        """
        last_err: Optional[Exception] = None
        for attempt in range(self.max_retries + 1):
            try:
                r = requests.get(url, timeout=self.timeout_s, headers={"User-Agent": "Mozilla/5.0"})
            except requests.RequestException as e:
                last_err = e
            else:
                status = r.status_code
                if status == 200 and r.text:
                    return r.text
                if status in self._RETRYABLE_STATUS:
                    last_err = requests.HTTPError(f"HTTP {status}")
                else:
                    last_err = requests.HTTPError(f"HTTP {status}: {r.text[:300]}")
                    if status != 200:
                        # Permanent failure: do not burn the retry budget on it.
                        raise RuntimeError(f"Falha ao baixar: {url}") from last_err
            if attempt < self.max_retries:
                time.sleep(self.backoff ** attempt)
        raise RuntimeError(f"Falha ao baixar: {url}") from last_err

    @staticmethod
    def _to_number(values: pd.Series) -> pd.Series:
        """Parse a column of numbers that may use ',' as thousands separator; unparsable cells become NaN."""
        text = values.astype(str).str.replace(",", "", regex=False)
        return pd.to_numeric(text, errors="coerce")

    @classmethod
    def _normalize_table(
        cls,
        raw: pd.DataFrame,
        *,
        reporter_iso3: str,
        tradeflow: "TradeFlow",
        hs6: str,
    ) -> pd.DataFrame:
        """Rename, complete and type the columns of a raw WITS table (the input is not modified)."""
        # rename() only touches labels that are present and returns a new frame.
        df = raw.rename(columns=cls._RENAME_MAP).copy()

        # Guarantee the expected columns even when the page omitted some of them.
        for col in cls._BASE_COLUMNS:
            if col not in df.columns:
                df[col] = pd.NA

        df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")
        df["value_1000usd"] = cls._to_number(df["value_1000usd"])
        # The source column is expressed in thousands of USD.
        df["value_usd"] = df["value_1000usd"] * 1000.0
        df["quantity"] = cls._to_number(df["quantity"])

        # Fill metadata gaps with the request parameters, normalized like build_url does.
        df["tradeflow"] = df["tradeflow"].fillna(tradeflow)
        df["reporter"] = df["reporter"].fillna(reporter_iso3.upper().strip())
        df["hs6"] = df["hs6"].fillna(str(hs6).strip())

        return df[list(cls._OUTPUT_COLUMNS)]

    def fetch_partner_table(
        self,
        *,
        reporter_iso3: str,
        year: int,
        tradeflow: "TradeFlow",
        hs6: str,
        partner: str = "ALL",
    ) -> pd.DataFrame:
        """
        Download and parse the "by partner" table for one (reporter, year, flow, hs6).

        Parameters
        ----------
        reporter_iso3:
            ISO3 code of the reporting country (e.g. "IND").
        year:
            Year of the query.
        tradeflow:
            "Imports" or "Exports".
        hs6:
            HS6 code (e.g. "310210").
        partner:
            "ALL" (every partner) is the usual choice here.

        Returns
        -------
        pd.DataFrame
            Frame with standardized, typed columns:
            reporter, tradeflow, hs6, product_desc, year, partner, value_1000usd,
            value_usd, quantity, quantity_unit

        Raises
        ------
        RuntimeError
            If the page could not be downloaded.
        ValueError
            If the page contains no HTML table.

        Notes
        -----
        - Some rows may come without quantity (e.g. Partner = World / totals).
        - NOTE(review): pandas.read_html infers dtypes, so a numeric ProductCode
          presumably loses leading zeros; confirm before relying on the ``hs6``
          column for codes that start with 0.
        """
        url = self.build_url(
            reporter_iso3=reporter_iso3,
            year=year,
            tradeflow=tradeflow,
            partner=partner,
            hs6=hs6,
        )
        html = self._get_with_retries(url)

        try:
            tables = pd.read_html(io.StringIO(html), header=0)
        except ValueError as err:
            # read_html raises ValueError itself when the page has no table;
            # re-raise with the URL so the failing request can be identified.
            raise ValueError(f"Nenhuma tabela encontrada em: {url}") from err
        if not tables:
            raise ValueError(f"Nenhuma tabela encontrada em: {url}")

        # Prefer the first table that has a Partner column; otherwise keep the
        # first table on the page.
        raw = next((t for t in tables if "Partner" in t.columns), tables[0])
        df = self._normalize_table(raw, reporter_iso3=reporter_iso3, tradeflow=tradeflow, hs6=hs6)

        time.sleep(self.sleep_between_calls_s)
        return df

    @staticmethod
    def _add_unit_values(df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *df* with float ``quantity_tonnes`` and ``unit_value_usd_per_tonne`` columns."""
        out = df.copy()
        is_kg = out["quantity_unit"].astype(str).str.lower().eq("kg")
        quantity = pd.to_numeric(out["quantity"], errors="coerce").astype("float64")

        # kg -> tonnes; rows in any other unit stay NaN so the column remains numeric.
        out["quantity_tonnes"] = (quantity / 1000.0).where(is_kg)

        # Implied price (USD/tonne); a zero quantity yields NaN instead of inf.
        out["unit_value_usd_per_tonne"] = out["value_usd"].div(out["quantity_tonnes"].replace(0, np.nan))
        return out

    def fetch_india_urea_all_partners(
        self,
        *,
        start_year: int,
        end_year: int,
        hs6: str = "310210",
        include_imports: bool = True,
        include_exports: bool = True,
    ) -> pd.DataFrame:
        """
        Convenience wrapper: India (IND) + urea HS6 + all partners over a range of years.

        Parameters
        ----------
        start_year, end_year:
            Inclusive interval [start_year, end_year].
        hs6:
            HS6 code of the product (urea = "310210").
        include_imports, include_exports:
            Select which flows are downloaded.

        Returns
        -------
        pd.DataFrame
            Long frame with one row per (year, partner, flow): the columns of
            ``fetch_partner_table`` plus ``quantity_tonnes`` and
            ``unit_value_usd_per_tonne``. When nothing is requested (empty year
            range or no flow selected) an empty frame with those same columns
            is returned and no request is made.
        """
        flows = [
            flow
            for flow, wanted in (("Imports", include_imports), ("Exports", include_exports))
            if wanted
        ]

        frames = [
            self.fetch_partner_table(
                reporter_iso3="IND",
                year=year,
                tradeflow=flow,
                hs6=hs6,
                partner="ALL",
            )
            for year in range(int(start_year), int(end_year) + 1)
            for flow in flows
        ]

        if not frames:
            return pd.DataFrame(
                columns=[*self._OUTPUT_COLUMNS, "quantity_tonnes", "unit_value_usd_per_tonne"]
            )

        return self._add_unit_values(pd.concat(frames, ignore_index=True))

In [None]:
# Absolute path of the "data" folder (resolved against the current working
# directory); created up front so the export cell can write into it.
DATA_DIR = os.path.abspath("data")
_safe_mkdir(DATA_DIR)

# Scraper instance that pauses 0.5 s after each page it fetches.
client = WITSComtradeClient(sleep_between_calls_s=0.5)

# India, urea (HS6 310210), imports and exports, all partners, 1995-2025.
df = client.fetch_india_urea_all_partners(
    hs6="310210",
    start_year=1995,
    end_year=2025,
    include_exports=True,
    include_imports=True,
)

# Last expression of the cell: renders the frame in the notebook.
df

    year tradeflow     value_usd quantity_tonnes unit_value_usd_per_tonne
50  2020    Export  5.546154e+07      193334.767               286.867907
51  2020    Import  5.873366e+09    11152627.206               526.635169
52  2021    Export  2.763060e+07        28479.81               970.182034
53  2021    Import  8.336641e+09     8113297.385              1027.528119
54  2022    Export  9.759656e+07       128227.83               761.118394
55  2022    Import  1.413022e+10     10101038.57              1398.887659
56  2023    Export  8.084033e+07      149736.225               539.884921
57  2023    Import  6.874598e+09      8598248.72               799.534682
58  2024    Export  4.182406e+07      104588.601               399.891189
59  2024    Import  4.326880e+09     6519838.009               663.648444


In [None]:
# Persist the extracted table in both formats, without the positional index.
parquet_path = os.path.join(DATA_DIR, "india_urea_hs6_by_partner_wits.parquet")
csv_path = os.path.join(DATA_DIR, "india_urea_hs6_by_partner_wits.csv")
df.to_parquet(parquet_path, index=False)
df.to_csv(csv_path, index=False)