In [1]:
import asyncio
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import fsspec
from osgeo import gdal
from pyproj import CRS, Transformer
from tqdm.notebook import tqdm

gdal.UseExceptions()
res_folder = "../data/SpatialData/SoilGrids/"
Path(res_folder).mkdir(parents=True, exist_ok=True)
# ----------------------------------------------------------------------
# SoilGrids 250 m v2.0 native CRS = Goode Interrupted Homolosine
SOILGRIDS_CRS = CRS.from_proj4(
    "+proj=igh +lon_0=0 +datum=WGS84 +units=m +no_defs"
)  # equivalent to EPSG:152160 but portable

WGS84 = CRS.from_epsg(4326)
TRANS = Transformer.from_crs(WGS84, SOILGRIDS_CRS, always_xy=True)
# ----------------------------------------------------------------------


def bbox_lonlat_to_homolosine(
    north: float, west: float, south: float, east: float, *, for_gdal_projwin: bool = True
) -> tuple[float, float, float, float]:
    """
    Convert [N, W, S, E] geographic degrees to SoilGrids Homolosine metres.

    Returns (min_x, max_y, max_x, min_y) if *for_gdal_projwin*,
    otherwise (min_x, min_y, max_x, max_y).

    Raises
    ------
    ValueError – if the AOI crosses a Homolosine lobe (rare; split AOI yourself)
    """
    # Transform the two diagonal corners
    x_ul, y_ul = TRANS.transform(west, north)  # upper-left
    x_lr, y_lr = TRANS.transform(east, south)  # lower-right

    # If the AOI straddles an interruption the x-ordering flips → width<0
    if x_lr < x_ul:
        raise ValueError("AOI crosses an interrupted lobe – split it in two.")

    min_x, max_x = x_ul, x_lr
    max_y, min_y = y_ul, y_lr

    if for_gdal_projwin:  # GDAL's -projwin (ulx uly lrx lry)
        return min_x, max_y, max_x, min_y
    else:  # GDAL's -te  (minx miny maxx maxy)
        return min_x, min_y, max_x, max_y


# Example AOI in lat/lon
aoi = (70.0, 10.0, 42.0, 45.0)
bb = bbox_lonlat_to_homolosine(*aoi)
bb

(2323203.209506912, 7437681.476252508, 4589519.30641518, 4674913.235357148)

In [2]:
BASE = "https://files.isric.org/soilgrids/latest/data"
fs = fsspec.filesystem("https")  # anonymous HTTPS

# ---------- what to keep / change -------------------------------------------
ALLOWED_PROPERTIES = {
    "bdod",  # Bulk density (kg/m3)
    "cec",  # Cation exchange capacity (cmol/kg)
    "cfvo",  # Coarse fragments volume (%)
    "clay",  # Clay content (%)
    "nitrogen",  # Total nitrogen content (g/kg)
    "ocd",  # Organic carbon density (kg/m2)
    "ocs",  # Organic carbon stock (ton/ha)
    "phh2o",  # pH in H2O
    "sand",  # Sand content (%)
    "silt",  # Silt content (%)
    "soc",  # Soil organic carbon content (g/kg)
}
ALLOWED_DEPTHS = None  # e.g. {"0-5cm", "5-15cm", "15-30cm", "30-60cm", "60-100cm", "100-200cm"}
ALLOWED_STATS = {"mean"}  # e.g. {
#     "mean",       # point prediction (expected value)
#     "Q0.05",      # lower 5 % conditional quantile
#     "Q0.5",       # median (50 % quantile)
#     "Q0.95",      # upper 95 % conditional quantile
#     "uncertainty" # model-based prediction interval width
# }

coverages: dict[str, list[str]] = defaultdict(list)  # result container

for prop_url in fs.ls(BASE, detail=False):
    prop = prop_url.rstrip("/").split("/")[-1]

    # skip categorical layers and not-wanted properties early
    if prop in {"landmask", "wrb"} or (ALLOWED_PROPERTIES and prop not in ALLOWED_PROPERTIES):
        continue

    for fn in fs.ls(prop_url, detail=False):
        if not fn.endswith(".vrt"):
            continue

        name = fn.split("/")[-1].removesuffix(".vrt")
        prop_, depth_, stat_ = name.split("_", 2)

        if (ALLOWED_DEPTHS is None or depth_ in ALLOWED_DEPTHS) and (
            ALLOWED_STATS is None or stat_ in ALLOWED_STATS
        ):
            coverages[prop_].append(name)

In [3]:
IGH = "+proj=igh +lat_0=0 +lon_0=0 +datum=WGS84 +units=m +no_defs"
RES = 250
SG_URL = f"/vsicurl?max_retry=3&retry_delay=1&list_dir=no&url={BASE}"
kwargs = {
    "format": "GTiff",
    "projWin": bb,
    "projWinSRS": IGH,
    "xRes": RES,
    "yRes": RES,
    "creationOptions": ["TILED=YES", "COMPRESS=DEFLATE", "PREDICTOR=2", "BIGTIFF=YES"],
}


def process_layer(var: str, layer: str, var_folder: Path, SG_URL: str, kwargs: dict) -> None:
    """
    Download and reproject a single SoilGrids layer.

    Args:
        var (str): Soil property name.
        layer (str): Layer identifier.
        var_folder (Path): Output directory for the variable.
        SG_URL (str): Base SoilGrids URL.
        kwargs (dict): GDAL translate options.

    Raises:
        Exception: If GDAL processing fails.
    """
    try:
        out_path = var_folder / f"{layer}.tif"
        ds = gdal.Translate(str(out_path), SG_URL + f"/{var}/{layer}.vrt", **kwargs)
        del ds  # flush contents
        ds = gdal.Warp(str(out_path), str(out_path), dstSRS="EPSG:4326")
        del ds
    except Exception as e:
        print(f"Error processing {var}/{layer}: {e}")


for var, layers in coverages.items():
    var_folder = Path(res_folder) / var
    var_folder.mkdir(parents=True, exist_ok=True)

    tasks = [(var, layer, var_folder, SG_URL, kwargs) for layer in layers]

    with (
        ThreadPoolExecutor(max_workers=len(layers)) as executor,
        tqdm(total=len(tasks), desc=f"Processing {var}", unit="layer") as pbar,
    ):
        futures = [executor.submit(process_layer, *task) for task in tasks]
        for future in as_completed(futures):
            exc = future.exception()
            if exc:
                print(f"Layer processing failed: {exc}")
            pbar.update(1)


Processing bdod:   0%|          | 0/6 [00:00<?, ?layer/s]

Processing cec:   0%|          | 0/6 [00:00<?, ?layer/s]

Processing cfvo:   0%|          | 0/6 [00:00<?, ?layer/s]



Processing clay:   0%|          | 0/6 [00:00<?, ?layer/s]



Processing nitrogen:   0%|          | 0/6 [00:00<?, ?layer/s]



Processing ocd:   0%|          | 0/6 [00:00<?, ?layer/s]



Processing ocs:   0%|          | 0/1 [00:00<?, ?layer/s]

Processing phh2o:   0%|          | 0/6 [00:00<?, ?layer/s]



Processing sand:   0%|          | 0/6 [00:00<?, ?layer/s]



Processing silt:   0%|          | 0/6 [00:00<?, ?layer/s]



Processing soc:   0%|          | 0/6 [00:00<?, ?layer/s]



In [None]:
igh = "+proj=igh +lat_0=0 +lon_0=0 +datum=WGS84 +units=m +no_defs"
res = 250

sg_url = f"/vsicurl?max_retry=3&retry_delay=1&list_dir=no&url={BASE}"
kwargs = {
    "format": "GTiff",
    "projWin": bb,
    "projWinSRS": igh,
    "xRes": res,
    "yRes": res,
    "creationOptions": ["TILED=YES", "COMPRESS=DEFLATE", "PREDICTOR=2", "BIGTIFF=YES"],
}
var_folder = Path(res_folder) / "ocs"
var_folder.mkdir(parents=True, exist_ok=True)

ds = gdal.Translate(f"{var_folder}/crop_roi_igh_py.tif", sg_url + "bdod/bdod_0-5cm_mean.vrt", **kwargs)
del ds  # flush contents
ds = gdal.Warp(
    f"{var_folder}/crop_roi_igh_py.tif", f"{var_folder}/crop_roi_igh_py.tif", dstSRS="EPSG:4326"
)
del ds


In [None]:
asyncio.run(
    download_soilgrids(
        properties={"bdod"},  # only bulk density
        stats={"mean"},  # only mean surfaces
        mode="cog",  # pull full GeoTIFFs
        concurrent=3,  # keep the object store happy
        target_dir="soilgrids_bdod_mean",
    )
)

In [None]:
# Create a SoilGrids instance
"""
    layers : list of str
        SoilGrids layers to get. Available options are:
        ``bdod_*``, ``cec_*``, ``cfvo_*``, ``clay_*``, ``nitrogen_*``, ``ocd_*``,
        ``ocs_*``, ``phh2o_*``, ``sand_*``, ``silt_*``, and ``soc_*``
        represents depths in cm and can be one of ``5`` (0-5 cm), ``15``
        (5-15 cm), ``30`` (15-30 cm), ``60`` (30-60 cm), ``100`` (60-100 cm), or ``200``
        (100-200 cm). For example, ``bdod_5`` is the mean bulk density of
        the fine earth fraction at 0-5 cm depth, and ``bdod_200`` is the mean bulk
        density of the fine earth fraction at 100-200 cm depth.
"""

layer_of_interest = [
    f"{fraction}_{depth}_{tag}"
    for fraction in [
        "bdod",
        "cec",
        "cfvo",
        "clay",
        "nitrogen",
        "ocd",
        "ocs",
        "phh2o",
        "sand",
        "silt",
        "soc",
    ]
    for depth in ["0-5cm", "5-15cm", "15-30cm", "30-60cm", "60-100cm", "100-200cm"]
    for tag in ["mean"]
]


['bdod_0-5cm_mean',
 'bdod_5-15cm_mean',
 'bdod_15-30cm_mean',
 'bdod_30-60cm_mean',
 'bdod_60-100cm_mean',
 'bdod_100-200cm_mean',
 'cec_0-5cm_mean',
 'cec_5-15cm_mean',
 'cec_15-30cm_mean',
 'cec_30-60cm_mean',
 'cec_60-100cm_mean',
 'cec_100-200cm_mean',
 'cfvo_0-5cm_mean',
 'cfvo_5-15cm_mean',
 'cfvo_15-30cm_mean',
 'cfvo_30-60cm_mean',
 'cfvo_60-100cm_mean',
 'cfvo_100-200cm_mean',
 'clay_0-5cm_mean',
 'clay_5-15cm_mean',
 'clay_15-30cm_mean',
 'clay_30-60cm_mean',
 'clay_60-100cm_mean',
 'clay_100-200cm_mean',
 'nitrogen_0-5cm_mean',
 'nitrogen_5-15cm_mean',
 'nitrogen_15-30cm_mean',
 'nitrogen_30-60cm_mean',
 'nitrogen_60-100cm_mean',
 'nitrogen_100-200cm_mean',
 'ocd_0-5cm_mean',
 'ocd_5-15cm_mean',
 'ocd_15-30cm_mean',
 'ocd_30-60cm_mean',
 'ocd_60-100cm_mean',
 'ocd_100-200cm_mean',
 'ocs_0-5cm_mean',
 'ocs_5-15cm_mean',
 'ocs_15-30cm_mean',
 'ocs_30-60cm_mean',
 'ocs_60-100cm_mean',
 'ocs_100-200cm_mean',
 'phh2o_0-5cm_mean',
 'phh2o_5-15cm_mean',
 'phh2o_15-30cm_mean',
 'ph

In [18]:
# Define the AOI (Area of Interest) with [North, West, South, East] coordinates
aoi = (70.0, 20.0, 42.0, 45.0)

# Create a SoilGrids instance
"""
    layers : list of str
        SoilGrids layers to get. Available options are:
        ``bdod_*``, ``cec_*``, ``cfvo_*``, ``clay_*``, ``nitrogen_*``, ``ocd_*``,
        ``ocs_*``, ``phh2o_*``, ``sand_*``, ``silt_*``, and ``soc_*``
        represents depths in cm and can be one of ``5`` (0-5 cm), ``15``
        (5-15 cm), ``30`` (15-30 cm), ``60`` (30-60 cm), ``100`` (60-100 cm), or ``200``
        (100-200 cm). For example, ``bdod_5`` is the mean bulk density of
        the fine earth fraction at 0-5 cm depth, and ``bdod_200`` is the mean bulk
        density of the fine earth fraction at 100-200 cm depth.
"""
layer_of_interest = [
    f"{fraction}_{depth}"
    for fraction in [
        "bdod",
        "cec",
        "cfvo",
        "clay",
        "nitrogen",
        "ocd",
        "ocs",
        "phh2o",
        "sand",
        "silt",
        "soc",
    ]
    for depth in [5, 15, 30, 60, 100, 200]
]
layer_of_interest

['bdod_5',
 'bdod_15',
 'bdod_30',
 'bdod_60',
 'bdod_100',
 'bdod_200',
 'cec_5',
 'cec_15',
 'cec_30',
 'cec_60',
 'cec_100',
 'cec_200',
 'cfvo_5',
 'cfvo_15',
 'cfvo_30',
 'cfvo_60',
 'cfvo_100',
 'cfvo_200',
 'clay_5',
 'clay_15',
 'clay_30',
 'clay_60',
 'clay_100',
 'clay_200',
 'nitrogen_5',
 'nitrogen_15',
 'nitrogen_30',
 'nitrogen_60',
 'nitrogen_100',
 'nitrogen_200',
 'ocd_5',
 'ocd_15',
 'ocd_30',
 'ocd_60',
 'ocd_100',
 'ocd_200',
 'ocs_5',
 'ocs_15',
 'ocs_30',
 'ocs_60',
 'ocs_100',
 'ocs_200',
 'phh2o_5',
 'phh2o_15',
 'phh2o_30',
 'phh2o_60',
 'phh2o_100',
 'phh2o_200',
 'sand_5',
 'sand_15',
 'sand_30',
 'sand_60',
 'sand_100',
 'sand_200',
 'silt_5',
 'silt_15',
 'silt_30',
 'silt_60',
 'silt_100',
 'silt_200',
 'soc_5',
 'soc_15',
 'soc_30',
 'soc_60',
 'soc_100',
 'soc_200']