# 02 EDA and core indicators
This notebook covers descriptive statistics, distributions, and
concentration metrics for PIB, PIB per capita, PIB per km2, and IAE.
It uses existing processed artifacts to avoid long compute time.
Data coverage matches `config.yaml` (default 2007–2018).

In [None]:
import subprocess
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

# Project root: the notebook is expected to run from a directory one level below it.
ROOT = Path("..").resolve()
CONFIG = ROOT / "config.yaml"

# Interpreter used to launch the pipeline. Prefer the project's virtualenv
# (POSIX layout first, then the Windows layout); if neither exists, fall back
# to the interpreter running this kernel so the auto-run does not fail on a
# missing executable.
_VENV_PYTHON_CANDIDATES = (
    ROOT / ".venv" / "bin" / "python",
    ROOT / ".venv" / "Scripts" / "python.exe",
)
PYTHON = next(
    (candidate for candidate in _VENV_PYTHON_CANDIDATES if candidate.exists()),
    Path(sys.executable or "python"),
)

# When True, missing artifacts trigger a full pipeline run (can be slow).
AUTO_RUN = False

# Assume a plain Python process unless an active IPython shell is found.
_IN_NOTEBOOK = False
try:
    from IPython import get_ipython

    _IN_NOTEBOOK = get_ipython() is not None
except Exception:
    # IPython is missing or unusable: keep the plain-process default.
    pass

if not _IN_NOTEBOOK:
    # No interactive shell, so select the non-interactive Agg backend.
    import matplotlib

    matplotlib.use("Agg")


def run_cmd(args):
    """Run ``args`` as a subprocess with the project root as working directory.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero status.
    """
    subprocess.run(args, cwd=ROOT, check=True)


def ensure_artifacts(paths, auto_run=True):
    """Check that artifacts exist, optionally running the pipeline to build them.

    Args:
        paths: Iterable of path objects exposing ``.exists()``.
        auto_run: When true and at least one path is missing, launch the
            pipeline (``<PYTHON> -m run all --config <CONFIG>``) through
            ``run_cmd``.

    Returns:
        List of the paths that are still missing when the function returns.
        Without an auto-run this is simply every path that does not exist;
        after an auto-run, paths the pipeline created are no longer reported.

    Raises:
        subprocess.CalledProcessError: propagated from ``run_cmd`` when the
            pipeline exits with a non-zero status.
    """
    # Materialise once so one-shot iterables (generators) are handled too.
    paths = list(paths)
    missing = [p for p in paths if not p.exists()]
    if missing and auto_run:
        run_cmd([str(PYTHON), "-m", "run", "all", "--config", str(CONFIG)])
        # Re-check: the pipeline may have produced some or all of the artifacts.
        missing = [p for p in missing if not p.exists()]
    return missing


# Processed inputs this notebook reads.
indicadores_path = ROOT / "data" / "processed" / "indicadores_core.parquet"
concentracion_path = ROOT / "outputs" / "tables" / "concentracion_topshares.csv"
ensure_artifacts([indicadores_path, concentracion_path], auto_run=AUTO_RUN)

# Fail fast with an actionable message instead of letting a later read call
# surface the problem. Existence is re-checked here so the verdict reflects
# the state after any pipeline run.
_still_missing = [
    p for p in (indicadores_path, concentracion_path) if not p.exists()
]
if _still_missing:
    raise FileNotFoundError(
        "Missing processed artifacts: "
        + ", ".join(str(p) for p in _still_missing)
        + ". Set AUTO_RUN = True or run the pipeline "
        + f"('python -m run all --config {CONFIG}') from {ROOT}."
    )

## Load indicators

In [None]:
ind = pd.read_parquet(indicadores_path)

# The cells below index these columns directly; check the schema here so a
# stale or malformed artifact fails with a clear message.
_required_cols = {"anio", "ubigeo", "pib", "pib_pc", "pib_km2", "iae"}
_missing_cols = _required_cols - set(ind.columns)
assert not _missing_cols, (
    f"indicadores_core.parquet is missing columns: {sorted(_missing_cols)}"
)
ind.head()

## Latest year summaries
The latest year is a convenient snapshot for ranking and distribution analysis.

In [None]:
# Most recent year present in the data, and an independent copy of its rows.
year = int(ind["anio"].max())
ind_y = ind.loc[ind["anio"].eq(year)].copy()
ind_y.loc[:, ["ubigeo", "pib_pc", "pib_km2", "iae"]].head(5)

### Top and bottom territories by PIB per capita
These rankings are used later for tail analysis and policy targeting.

In [None]:
# Ten rows with the highest PIB per capita in the latest year.
ind_y[["ubigeo", "pib_pc"]].nlargest(n=10, columns="pib_pc")

In [None]:
# Ten rows with the lowest PIB per capita in the latest year.
ind_y[["ubigeo", "pib_pc"]].nsmallest(n=10, columns="pib_pc")

## Distributions
Histograms give a quick view of inequality and heavy tails.

In [None]:
# Draw on an explicit, fresh figure rather than pyplot's implicit "current"
# figure, so this plot cannot share axes with one from another cell.
fig, ax = plt.subplots()
# Drop missing values explicitly instead of relying on the plotting library's
# NaN handling.
ax.hist(ind_y["pib_pc"].dropna(), bins=20)
ax.set_title("PIB per capita distribution")
ax.set_xlabel("pib_pc")
ax.set_ylabel("count")
plt.show()

## Concentration shares
These are the shares of total PIB captured by top percentiles.

In [None]:
# Top-share concentration table read from the pipeline's CSV output.
conc = pd.read_csv(filepath_or_buffer=concentracion_path)
conc.head(5)

## PIB growth summary
A quick time series of total PIB across years.

In [None]:
# Sum PIB over all rows of each year.
series = ind.groupby("anio")["pib"].sum().reset_index()

# Use a fresh figure: with the non-interactive Agg backend plt.show() leaves
# earlier figures open, so an implicit plt.plot would draw onto the previous
# cell's histogram axes.
fig, ax = plt.subplots()
ax.plot(series["anio"], series["pib"], marker="o")
ax.set_title("Total PIB by year")
ax.set_xlabel("anio")
ax.set_ylabel("pib")
plt.show()