Recommended way to use the observations datasets for training:
 
1) Install the package: `pip install git+https://github.com/ecmwf/anemoi-datasets@observations`

2) Set up the dataset path in your config file `~/.config/anemoi/settings.toml` (see https://anemoi.readthedocs.io/projects/datasets/en/latest/using/configuration.html):

```
    $ cat ~/.config/anemoi/settings.toml 
    [datasets] 
    path = ["/path/to/datasets"]
```
 
3) Check that the dataset exists: ls /path/to/datasets/observations-ea-ofb-0001-2007-2021-metop-a-iasi-radiances-v1.zarr
 

In [1]:
# (C) Copyright 2025 European Centre for Medium-Range Weather Forecasts.
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

from anemoi.datasets import open_dataset

import numpy as np

In [2]:
# Open the dataset by name only (no directory, no ".zarr" suffix); no frequency is passed here.
ds = open_dataset("observations-ea-ofb-0001-2007-2021-metop-a-iasi-radiances-v1")

Frequency not provided, using the one from the dataset: None
Frequency not provided in the dataset, using the default : 6h


TRACE: ObservationsZarr(<zarr.hierarchy.Group '/' read-only>, None, None)
TRACE: Using experimental dataset ObsDataset(['/home/mlx/ai-ml/datasets/observations-ea-ofb-0001-2007-2021-metop-a-iasi-radiances-v1.zarr', 20070227180000, 20210908060000], {'len_hrs': 6, 'step_hrs': 6})


In [3]:
# The dataset contains len(ds) time windows
print(f"Observations Dataset with {len(ds)} items")

Observations Dataset with 21226 items


In [4]:
# Each window is associated with a given date, available in ds.dates
print(f"Dates: {ds.dates[0]}, {ds.dates[1]}, ..., {ds.dates[-2]}, {ds.dates[-1]}")
# Number of dates (compare with len(ds) printed in the previous cell)
len(ds.dates)

Dates: 2007-02-28T00:00:00, 2007-02-28T06:00:00, ..., 2021-09-08T00:00:00, 2021-09-08T06:00:00


21226 (21_226)

In [5]:
# ds.variables lists the names of the variables stored in the dataset
print(ds.variables)

['healpix_idx_8', 'seqno', 'lat', 'lon', 'zenith', 'solar_zenith', 'avhrr_mean_vis_0', 'rawbt_16', 'rawbt_63', 'rawbt_138', 'rawbt_170', 'rawbt_185', 'rawbt_224', 'rawbt_249', 'rawbt_271', 'rawbt_445', 'rawbt_756', 'rawbt_867', 'rawbt_921', 'rawbt_2907', 'rawbt_2991', 'rawbt_3093', 'rawbt_3160', 'rawbt_5383', 'cos_julian_day', 'sin_julian_day', 'cos_local_time', 'sin_local_time', 'cos_sza', 'cos_latitude', 'sin_latitude', 'cos_longitude', 'sin_longitude', 'cos_vza']


In [6]:
# ds.name_to_index maps each variable name to its integer index;
# later cells use it to index both ds.statistics[...] and ds[i]
print(ds.name_to_index)
len(ds.name_to_index)

{'healpix_idx_8': 0, 'seqno': 1, 'lat': 2, 'lon': 3, 'zenith': 4, 'solar_zenith': 5, 'avhrr_mean_vis_0': 6, 'rawbt_16': 7, 'rawbt_63': 8, 'rawbt_138': 9, 'rawbt_170': 10, 'rawbt_185': 11, 'rawbt_224': 12, 'rawbt_249': 13, 'rawbt_271': 14, 'rawbt_445': 15, 'rawbt_756': 16, 'rawbt_867': 17, 'rawbt_921': 18, 'rawbt_2907': 19, 'rawbt_2991': 20, 'rawbt_3093': 21, 'rawbt_3160': 22, 'rawbt_5383': 23, 'cos_julian_day': 24, 'sin_julian_day': 25, 'cos_local_time': 26, 'sin_local_time': 27, 'cos_sza': 28, 'cos_latitude': 29, 'sin_latitude': 30, 'cos_longitude': 31, 'sin_longitude': 32, 'cos_vza': 33}


34 (34)

Global statistics for the dataset have been precomputed and are available through `ds.statistics`:

In [7]:
# Mean of each variable; a later cell indexes this array with ds.name_to_index[...]
ds.statistics['mean']

array([3.8300708e+02, 5.4755575e+06, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 3.3789816e+05, 2.3150133e+02,
       2.1546123e+02, 2.2681549e+02, 2.2386728e+02, 2.1591220e+02,
       2.2223653e+02, 2.2975639e+02, 2.4038684e+02, 2.5628555e+02,
       2.6909610e+02, 2.6933527e+02, 2.6938358e+02, 2.5693881e+02,
       2.5370573e+02, 2.4154004e+02, 2.4708250e+02, 2.6279996e+02,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00], dtype=float32)

In [8]:
# Standard deviation of each variable (one entry per variable)
ds.statistics['stdev']

array([2.3034642e+02, 3.8735608e+06, 9.0000000e+01, 1.8000000e+02,
       9.0000000e+01, 1.8000000e+02, 5.1043806e+05, 9.6277647e+00,
       8.9814367e+00, 9.7006111e+00, 9.1120253e+00, 8.0488205e+00,
       5.7508364e+00, 6.5173740e+00, 9.6341305e+00, 1.4390362e+01,
       2.1499352e+01, 2.1563625e+01, 2.1560154e+01, 1.3033051e+01,
       1.1873526e+01, 8.5672741e+00, 1.0070569e+01, 1.4734804e+01,
       1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00,
       1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00,
       1.0000000e+00, 1.0000000e+00], dtype=float32)

In [9]:
# Minimum of each variable (one entry per variable)
ds.statistics['minimum']

array([ 0.0000000e+00,  2.4200000e+03, -8.9995308e+01, -1.8000000e+02,
        0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  1.9445790e+02,
        1.8093401e+02,  1.9000102e+02,  1.8855136e+02,  1.8340872e+02,
        1.8487903e+02,  1.8518295e+02,  1.8492542e+02,  1.8350490e+02,
        1.8160480e+02,  1.8158250e+02,  1.8249533e+02,  1.8580470e+02,
        1.8482233e+02,  1.8496103e+02,  1.8495709e+02,  1.6212875e+02,
       -1.0000000e+00, -1.0000000e+00, -1.0000000e+00, -1.0000000e+00,
        0.0000000e+00,  8.1853068e-05, -1.0000000e+00, -1.0000000e+00,
       -1.0000000e+00,  5.0934184e-01], dtype=float32)

In [10]:
# Maximum of each variable (one entry per variable)
ds.statistics['maximum']

array([7.6700000e+02, 2.1695148e+07, 8.9834442e+01, 1.8000000e+02,
       5.9380001e+01, 1.6022000e+02, 5.1310000e+06, 2.9388623e+02,
       2.9386945e+02, 2.9347015e+02, 2.9338644e+02, 2.9335785e+02,
       2.9340256e+02, 2.9337936e+02, 2.9336716e+02, 2.9418332e+02,
       3.3613821e+02, 3.3544128e+02, 3.3491034e+02, 3.0393958e+02,
       2.9816287e+02, 2.9311325e+02, 2.9315689e+02, 3.1640573e+02,
       1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00,
       9.9860054e-01, 1.0000000e+00, 9.9999583e-01, 1.0000000e+00,
       1.0000000e+00, 1.0000000e+00], dtype=float32)

In [11]:
# The actual data is read by integer index: ds[i] returns the data of the i-th time window.
# Only the first five windows are shown here.
for i in range(len(ds))[:5]:
    data = ds[i]
    print(f"✅ Got item {i} : {type(data)} {data.shape}")
    print(f"     associated with the 'validity datetime' {ds.dates[i]}")
    # The window is printed as ending at ds.dates[i] and starting one ds.frequency earlier.
    print(f"     contains the data for the time window [{ds.dates[i] - np.timedelta64(ds.frequency)}  -  {ds.dates[i]}]")
    print()

✅ Got item 0 : <class 'numpy.ndarray'> (34, 25075)
     associated with the 'validity datetime' 2007-02-28T00:00:00
     contains the data for the time window [2007-02-27T18:00:00.000000  -  2007-02-28T00:00:00]

✅ Got item 1 : <class 'numpy.ndarray'> (34, 24842)
     associated with the 'validity datetime' 2007-02-28T06:00:00
     contains the data for the time window [2007-02-28T00:00:00.000000  -  2007-02-28T06:00:00]

✅ Got item 2 : <class 'numpy.ndarray'> (34, 24908)
     associated with the 'validity datetime' 2007-02-28T12:00:00
     contains the data for the time window [2007-02-28T06:00:00.000000  -  2007-02-28T12:00:00]

✅ Got item 3 : <class 'numpy.ndarray'> (34, 24925)
     associated with the 'validity datetime' 2007-02-28T18:00:00
     contains the data for the time window [2007-02-28T12:00:00.000000  -  2007-02-28T18:00:00]

✅ Got item 4 : <class 'numpy.ndarray'> (34, 23175)
     associated with the 'validity datetime' 2007-03-01T00:00:00
     contains the data for the time window [2007-02-28T18:00:00.000000  -  2007-03-01T00:00:00]

In [12]:
# Pick one variable and one window index; `i` is only used in the next cell.
variable = 'rawbt_249'
i = 100
# name_to_index gives the position of the variable inside the statistics array.
print(f"For variable {variable}, the mean is {ds.statistics['mean'][ds.name_to_index[variable]]} ")

For variable rawbt_249, the mean is 229.7563934326172 


In [13]:
# Same lookup applied to the data of window i: ds[i] is indexed by the variable's position.
print(f"For time window #{i}, ending at {ds.dates[i]} (and variable {variable}), the observations data is {ds[i][ds.name_to_index[variable]]}")

For time window #100, ending at 2007-03-25T00:00:00 (and variable rawbt_249), the observations data is [218.99956 218.93092 219.31906 ... 228.26152 227.31386 225.95712]


# More datasets

In [14]:
def str_(t):
    """Summarise a (possibly nested) value for debug printing.

    Not needed, but useful for debugging.

    - a NumPy array is summarised by its shape (the shape tuple itself is
      returned, as before);
    - a list or tuple is rendered as ``[a , b , ...]`` with every element
      summarised recursively;
    - a dict is rendered as ``{k: v , ...}`` with every value summarised
      recursively;
    - anything else is converted with ``str``.
    """
    if isinstance(t, np.ndarray):
        return t.shape
    if isinstance(t, (list, tuple)):
        # An array element comes back as a shape tuple, and str.join() only
        # accepts strings, so every element is converted explicitly.
        return "[" + " , ".join(str(str_(e)) for e in t) + "]"
    if isinstance(t, dict):
        return "{" + " , ".join(f"{k}: {str_(v)}" for k, v in t.items()) + "}"
    return str(t)

def show(ds, filter=None, max_items=10):
    """Print a summary of a dataset, then a few of its items.

    The summary covers the dataset itself, its tree, its length, the first
    and last two dates, the frequency, the variables, the name-to-index
    mapping and every entry of ``ds.statistics``.

    Parameters
    ----------
    ds
        Dataset object as returned by ``open_dataset``. It must provide
        ``tree()``, ``dates``, ``frequency``, ``variables``,
        ``name_to_index``, ``statistics``, ``len()`` and integer indexing.
    filter : str, optional
        When given, only items whose date, converted with ``str``, starts
        with this prefix are loaded and printed.
    max_items : int or None, optional
        Maximum number of items to load and print (default 10, the value
        that used to be hard-coded). ``None`` removes the limit.
    """
    print(ds)
    print(ds.tree())
    print(f"✅ Initialized Observations Dataset with {len(ds)} items")
    print(f"Dates: {ds.dates[0]}, {ds.dates[1]}, ..., {ds.dates[-2]}, {ds.dates[-1]}")
    print(f"Frequency: {ds.frequency}")
    print(f"Variables: {ds.variables}")
    print(f"Name to index: {ds.name_to_index}")
    print("Statistics:")
    for k, v in ds.statistics.items():
        print(f"  {k}: {','.join(str(value) for value in v)}")

    shown = 0
    for i in range(len(ds)):
        # Stop before touching the dataset again once enough items were shown.
        if max_items is not None and shown >= max_items:
            break

        date = ds.dates[i]
        if filter and not str(date).startswith(filter):
            # Skip without loading the data: only the date is read here.
            continue

        data = ds[i]
        shown += 1

        print(f"✅ Got item {i}, associated with the 'validity datetime' {date} : {str_(data)}")
        print(f"✅     item {i} contains the data for the time window [{date - np.timedelta64(ds.frequency)}  -  {date}]")

In [55]:
# Names of further observations datasets that can be inspected with show().
DATASETS = [
    "observations-od-ai-0001-2013-2023-amsr2-h180-v2",
    "observations-ea-ofb-0001-1998-2023-noaa-15-amsua-radiances-v1",
    "observations-ea-ofb-0001-2005-2023-noaa-18-amsua-radiances-v1",
    "observations-ea-ofb-0001-2006-2021-metop-a-amsua-radiances-v1",
    "observations-ea-ofb-0001-2012-2023-metop-b-amsua-radiances-v1",
    "observations-ea-ofb-0001-2007-2021-metop-a-iasi-radiances-v1",
    "observations-ea-ofb-0001-2013-2023-metop-b-iasi-radiances-v1",
    "observations-ea-ofb-0001-2019-2023-metop-c-iasi-radiances-v1",
    "observations-ea-ofb-0001-2002-2023-aqua-airs-radiances-v1",
    "observations-ea-ofb-0001-2009-2023-dmsp-17-ssmis-radiances-all-sky-v1",
    "observations-ea-ofb-0001-2012-2023-npp-atms-radiances-v2",
    "observations-ea-ofb-0001-2018-2023-noaa-20-atms-radiances-v1",
    "observations-ea-ofb-0001-2012-2023-npp-cris-radiances-v1",
    "observations-ea-ofb-0001-2014-2023-saral-ralt-wave-v1",
    "observations-od-ec-0001-2007-2012-meteosat-9-seviri-v3",
    "observations-od-ai-0001-2012-2018-meteosat-10-seviri-v3",
    "observations-od-ai-0001-2017-2022-meteosat-8-iodc-seviri-v3",
    "observations-od-ec-0001-2004-2007-meteosat-8-seviri-v3",
    "observations-od-ai-0001-2018-2023-meteosat-11-seviri-v1",
    "observations-ea-ofb-0001-2008-2021-metop-a-gpsro-v2-sort",
    "observations-ea-ofb-0001-2012-2023-metop-b-gpsro-v2-sort",
    "observations-ea-ofb-0001-2018-2023-metop-c-gpsro-v2-sort",
    "observations-ea-ofb-0001-2007-2021-metop-a-ascat-v1",
    "observations-ea-ofb-0001-2013-2023-metop-b-ascat-v1",
    "observations-ea-ofb-0001-2020-2023-metop-c-ascat-v1",
    "observations-ea-ofb-0001-1979-2023-combined-surface-v2",
    "observations-od-ofb-0001-2014-2023-combined-snow-depth-v1",
    "observations-ea-ofb-0001-1979-2023-combined-upper-air-v1",
    "observations-od-ai-0001-2013-2024-nexrad-h220-v1",
]

# Opt-in switch: nothing is opened by default. Set it to True to open and
# summarise every dataset listed above (each one must be reachable through
# the configured dataset path).
SHOW_ALL_DATASETS = False

for name in DATASETS if SHOW_ALL_DATASETS else []:
    print('------------------------------------')
    ds = open_dataset(name)
    show(ds)