Xarray provides two data structures: the DataArray and Dataset

The Core Data Structures
- DataArray: This is the building block. It wraps a multi-dimensional array
- .data: The raw numerical data.
- .dims: Named dimensions (e.g., 'time', 'lat', 'lon').
- .coords: Coordinates that label each point along a dimension (e.g., the specific dates for the 'time' dimension).
- .attrs: A dictionary for metadata, like units ('degK') or a description of the data.

In [1]:
import numpy as np
import xarray as xr
import pandas as pd

In [2]:
# dataset : dictionary like containers of DataArrays
ds = xr.tutorial.load_dataset("air_temperature")
ds

In [3]:
ds['air']

In [4]:
# html : notebooks, text. 
with xr.set_options(display_style = "html"):
    display(ds)

In [5]:
with xr.set_options(display_style="text"):
    display(ds)

In [6]:
# DataArray : consist of an array and its associated dimension names, labels, and attributes

da = ds["air"]
da


In [7]:
# string representations
with xr.set_options(display_style="html"):
    display(da)

In [8]:
with xr.set_options(display_style="text"):
    display(da)

In [9]:
ds["air"].data

array([[[241.2 , 242.5 , 243.5 , ..., 232.8 , 235.5 , 238.6 ],
        [243.8 , 244.5 , 244.7 , ..., 232.8 , 235.3 , 239.3 ],
        [250.  , 249.8 , 248.89, ..., 233.2 , 236.39, 241.7 ],
        ...,
        [296.6 , 296.2 , 296.4 , ..., 295.4 , 295.1 , 294.7 ],
        [295.9 , 296.2 , 296.79, ..., 295.9 , 295.9 , 295.2 ],
        [296.29, 296.79, 297.1 , ..., 296.9 , 296.79, 296.6 ]],

       [[242.1 , 242.7 , 243.1 , ..., 232.  , 233.6 , 235.8 ],
        [243.6 , 244.1 , 244.2 , ..., 231.  , 232.5 , 235.7 ],
        [253.2 , 252.89, 252.1 , ..., 230.8 , 233.39, 238.5 ],
        ...,
        [296.4 , 295.9 , 296.2 , ..., 295.4 , 295.1 , 294.79],
        [296.2 , 296.7 , 296.79, ..., 295.6 , 295.5 , 295.1 ],
        [296.29, 297.2 , 297.4 , ..., 296.4 , 296.4 , 296.6 ]],

       [[242.3 , 242.2 , 242.3 , ..., 234.3 , 236.1 , 238.7 ],
        [244.6 , 244.39, 244.  , ..., 230.3 , 232.  , 235.7 ],
        [256.2 , 255.5 , 254.2 , ..., 231.2 , 233.2 , 238.2 ],
        ...,
        [295

In [10]:
# Named dimensions
# .dims : named axes of my data. we have 2 spatial dim and 1 temporal dim
da.dims

('time', 'lat', 'lon')

In [11]:
# coordinates
# .coords : dict-like data container
da.coords

Coordinates:
  * lat      (lat) float32 100B 75.0 72.5 70.0 67.5 65.0 ... 22.5 20.0 17.5 15.0
  * lon      (lon) float32 212B 200.0 202.5 205.0 207.5 ... 325.0 327.5 330.0
  * time     (time) datetime64[ns] 23kB 2013-01-01 ... 2014-12-31T18:00:00

In [12]:
# Attributes
# .attrs : dictionary that contain arbitrary python object.
da.attrs

{'long_name': '4xDaily Air temperature at sigma level 995',
 'units': 'degK',
 'precision': np.int16(2),
 'GRIB_id': np.int16(11),
 'GRIB_name': 'TMP',
 'var_desc': 'Air temperature',
 'dataset': 'NMC Reanalysis',
 'level_desc': 'Surface',
 'statistic': 'Individual Obs',
 'parent_stat': 'Other',
 'actual_range': array([185.16, 322.1 ], dtype=float32)}

In [13]:
# dataarray and dataset objects are frequently created by converting from other libraries such as pandas / NetCDF or zarr
# To conver from/to pandas, we can use the to_xarray methods or to_pandas

series = pd.Series(np.ones((10,)), index = list("abcdefghij"))
series

a    1.0
b    1.0
c    1.0
d    1.0
e    1.0
f    1.0
g    1.0
h    1.0
i    1.0
j    1.0
dtype: float64

In [14]:
arr = series.to_xarray()
print(arr)

<xarray.DataArray (index: 10)> Size: 80B
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
Coordinates:
  * index    (index) object 80B 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j'


In [15]:
arr.to_pandas()

index
a    1.0
b    1.0
c    1.0
d    1.0
e    1.0
f    1.0
g    1.0
h    1.0
i    1.0
j    1.0
dtype: float64

In [16]:
# to_series : convert DataArray objects to pandas Series
da.to_series()

time                 lat   lon  
2013-01-01 00:00:00  75.0  200.0    241.20
                           202.5    242.50
                           205.0    243.50
                           207.5    244.00
                           210.0    244.10
                                     ...  
2014-12-31 18:00:00  15.0  320.0    297.39
                           322.5    297.19
                           325.0    296.49
                           327.5    296.19
                           330.0    295.69
Name: air, Length: 3869000, dtype: float64

In [17]:
# to dataframe
da.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,air
time,lat,lon,Unnamed: 3_level_1
2013-01-01 00:00:00,75.0,200.0,241.20
2013-01-01 00:00:00,75.0,202.5,242.50
2013-01-01 00:00:00,75.0,205.0,243.50
2013-01-01 00:00:00,75.0,207.5,244.00
2013-01-01 00:00:00,75.0,210.0,244.10
...,...,...,...
2014-12-31 18:00:00,15.0,320.0,297.39
2014-12-31 18:00:00,15.0,322.5,297.19
2014-12-31 18:00:00,15.0,325.0,296.49
2014-12-31 18:00:00,15.0,327.5,296.19


In [18]:
rng = np.random.default_rng(seed=0)  # we'll use this later

In [19]:
array = da.data
array

array([[[241.2 , 242.5 , 243.5 , ..., 232.8 , 235.5 , 238.6 ],
        [243.8 , 244.5 , 244.7 , ..., 232.8 , 235.3 , 239.3 ],
        [250.  , 249.8 , 248.89, ..., 233.2 , 236.39, 241.7 ],
        ...,
        [296.6 , 296.2 , 296.4 , ..., 295.4 , 295.1 , 294.7 ],
        [295.9 , 296.2 , 296.79, ..., 295.9 , 295.9 , 295.2 ],
        [296.29, 296.79, 297.1 , ..., 296.9 , 296.79, 296.6 ]],

       [[242.1 , 242.7 , 243.1 , ..., 232.  , 233.6 , 235.8 ],
        [243.6 , 244.1 , 244.2 , ..., 231.  , 232.5 , 235.7 ],
        [253.2 , 252.89, 252.1 , ..., 230.8 , 233.39, 238.5 ],
        ...,
        [296.4 , 295.9 , 296.2 , ..., 295.4 , 295.1 , 294.79],
        [296.2 , 296.7 , 296.79, ..., 295.6 , 295.5 , 295.1 ],
        [296.29, 297.2 , 297.4 , ..., 296.4 , 296.4 , 296.6 ]],

       [[242.3 , 242.2 , 242.3 , ..., 234.3 , 236.1 , 238.7 ],
        [244.6 , 244.39, 244.  , ..., 230.3 , 232.  , 235.7 ],
        [256.2 , 255.5 , 254.2 , ..., 231.2 , 233.2 , 238.2 ],
        ...,
        [295

In [20]:
# we do this using the DataArray constructor
xr.DataArray(array)

In [21]:
# 추천 X

lon_values = np.arange(200, 331, 2.5)

xr.DataArray(array, dims = ("time", "lat", "lon"), coords = {"lon": lon_values})

In [22]:
# 추천

#lat_2d = xr.DataArray(array, dims=('y', 'x'))
#lon_2d = xr.DataArray(array, dims=('y', 'x'))
#lon_da = xr.DataArray(lon_values, dims = "lon")

#da_1 = xr.DataArray(array,
                    #dims=('y', 'x'),
                    #coords={'latitude': lat_2d, 'longitude': lon_2d})

lon_values = np.arange(200, 331, 2.5)                    
lon_da = xr.DataArray(lon_values, dims = "lon")
da = xr.DataArray(array, dims = ("time", "lat", "lon"), coords = {"lon": lon_da})
print(da)

<xarray.DataArray (time: 2920, lat: 25, lon: 53)> Size: 31MB
array([[[241.2 , 242.5 , 243.5 , ..., 232.8 , 235.5 , 238.6 ],
        [243.8 , 244.5 , 244.7 , ..., 232.8 , 235.3 , 239.3 ],
        [250.  , 249.8 , 248.89, ..., 233.2 , 236.39, 241.7 ],
        ...,
        [296.6 , 296.2 , 296.4 , ..., 295.4 , 295.1 , 294.7 ],
        [295.9 , 296.2 , 296.79, ..., 295.9 , 295.9 , 295.2 ],
        [296.29, 296.79, 297.1 , ..., 296.9 , 296.79, 296.6 ]],

       [[242.1 , 242.7 , 243.1 , ..., 232.  , 233.6 , 235.8 ],
        [243.6 , 244.1 , 244.2 , ..., 231.  , 232.5 , 235.7 ],
        [253.2 , 252.89, 252.1 , ..., 230.8 , 233.39, 238.5 ],
        ...,
        [296.4 , 295.9 , 296.2 , ..., 295.4 , 295.1 , 294.79],
        [296.2 , 296.7 , 296.79, ..., 295.6 , 295.5 , 295.1 ],
        [296.29, 297.2 , 297.4 , ..., 296.4 , 296.4 , 296.6 ]],

       [[242.3 , 242.2 , 242.3 , ..., 234.3 , 236.1 , 238.7 ],
        [244.6 , 244.39, 244.  , ..., 230.3 , 232.  , 235.7 ],
        [256.2 , 255.5 , 25

In [23]:
da.coords["lat"] = np.arange(75, 14.9, -2.5)
da

In [24]:
# Attributes .attrs
da.attrs["attribute"] = "hello"
da

In [25]:
da2 = xr.DataArray(
    array, dims = ("time", "lat", "lon"),
    coords = {"lon":lon_da},
    attrs = {"attribute": "wonseok"}
)

In [26]:
da2

In [27]:
# Non-dimension coordinates : Sometimes we want to attach coordinate variables along an existing dimension
# itime : not bold and has a name "itime" that is different from the dimension name "time".

da.coords["itime"] = ("time", np.arange(2920), {"name": "value"})
da

In [28]:
xr.DataArray(rng.random((180, 360)) * 400,
             dims = ("latitude", "longitude"),
             name = "height")

In [29]:
xr.DataArray(
    rng.random((180, 360))*400,
    dims = ("latitude", "longitude"),
    coords = {"latitude" : np.arange(-90, 90, 1),
              "longitude" : np.arange(-180, 180, 1)}
)

In [30]:
xr.DataArray(
    # 1. 데이터: 180x360 크기의 0~400 사이 랜덤 숫자
    rng.random((180, 360))*400,
    # 2. 차원 이름: 각 축에 'latitude', 'longitude' 이름 붙이기
    dims = ("latitude", "longitude"),
    # 3. 좌표: 각 차원의 축에 실제 좌표 값과 정보(속성) 할당
    coords = {
        "latitude" : ("latitude", np.arange(-90, 90, 1), {"type" : "geodetic"}),
        "longitude" : ("longitude", np.arange(-180, 180, 1), {"prime_meridian":"greenwich"})
    },
    # 4. 데이터 전체의 속성: 이 데이터가 'ellipsoid' 타입임을 명시
    attrs = {"type" : "ellipsoid"},
    # 5. 데이터 이름: 이 DataArray의 변수명을 'height'로 지정
    name = "height"
)

In [31]:
# Dataset : collect multiple data variables.
# - data_vars : dict - like mapping names to values,
# - DataArray : objects or defined with tuples
# - coords : 
# - attrs :
xr.Dataset()

ds = xr.Dataset({"air" : da, "air2" : da2})
ds["air3"] = da
ds

In [32]:
xr.Dataset(
    {"air" : da, "air2" : da2},
    coords = {"time" : pd.date_range("2013-01-01", "2014-12-31 18:00", freq = "6H")}
)

  coords = {"time" : pd.date_range("2013-01-01", "2014-12-31 18:00", freq = "6H")}


In [33]:
ds.coords["time"] = pd.date_range("2013-01-01", "2014-12-31 18:00", freq = "6H")
ds

  ds.coords["time"] = pd.date_range("2013-01-01", "2014-12-31 18:00", freq = "6H")


In [34]:
# Attributes
xr.Dataset(
    {"air" : da, "air2" : da2},
    coords = {"time": pd.date_range("2013-01-01", "2014-12-31 18:00", freq="6H")},
    attrs = {"key0" : "value0"}
)

  coords = {"time": pd.date_range("2013-01-01", "2014-12-31 18:00", freq="6H")},


In [35]:
ds.attrs["key"] = "value"

In [36]:
ds

### Exercises

1. create a Dataset with two variables along `latitude` and `longitude`:
   `height` and `gravity_anomaly`

In [37]:
height = rng.random((180, 360)) * 400
gravity_anomaly = rng.random((180, 360)) * 400 - 200

xr.Dataset(
    {"height" : (("latitude", "longitude"), height),
     "gravity_anomaly": (("latitude", "longitude"), gravity_anomaly)}
)

2. add coordinates to `latitude` and `longitude`:

- `latitude`: from -90 to 90 with step size 1
- `longitude`: from -180 to 180 with step size 1

In [38]:
xr.Dataset(
    {"height" : (("latitude", "longitude"), height),
     "gravity_anomaly": (("latitude", "longitude"), gravity_anomaly)},
    coords = {
        "latitude" : ("latitude", np.arange(-90, 90, 1)),
        "longitude" : ("longitude", np.arange(-180, 180, 1))
    }   
)

3. add metadata to coordinates and variables:

- `latitude`: "type": "geodetic"
- `longitude`: "prime_meridian": "greenwich"
- `height`: "ellipsoid": "wgs84"
- `gravity_anomaly`: "ellipsoid": "grs80"

In [39]:
xr.Dataset(
    {"height" : (("latitude", "longitude"), height),
     "gravity_anomaly": (("latitude", "longitude"), gravity_anomaly)},
    coords = {
        "latitude" : ("latitude", np.arange(-90, 90, 1), 
                      {"type": "geodetic"}),
        "longitude" : ("longitude", np.arange(-180, 180, 1),
                       {"prime_meridian" : "greenwich"})
    }   
)

In [40]:
import os
print(os.getcwd())

c:\Users\dnjst\Documents\GitHub\python for asml\xarray\mycode


In [41]:
# reading and writing files by "open_dataset and open_mfdataset(NetCDF and Zarr)"

# Xarray reads and writes to NetCDF files using the open_dataset and open_dataarray functions and the to_netcdf method.

import numpy as np
import xarray as xr

np.random.seed(0)

import pathlib
import shutil

datadir = pathlib.Path('../tutorial/data/io-tutorial')
if datadir.exists():
    shutil.rmtree(datadir)
else:
    datadir.mkdir()

The constructor of `Dataset` takes three parameters:

- `data_vars`: dict-like mapping names to values. Values are either `DataArray` objects
  or defined with tuples consisting of of dimension names and arrays.
- `coords`: same as for `DataArray`
- `attrs`: same as for `DataArray`

In [42]:
datadir2 = pathlib.Path('../tutorial/data')
ds1 = xr.Dataset(
    data_vars={
        "a" : (("x", "y"), np.random.randn(4, 2)),
        "b" : (("z", "x"), np.random.randn(6, 4))},
    coords = {
        "x" : np.arange(4),
        "y" : np.arange(-2, 0),
        "z" : np.arange(-3, 3)
    }
)
ds1.to_netcdf(datadir2/"ds1.nc")


ds2 = xr.Dataset(
    data_vars={
        "a" : (("x", "y"), np.random.randn(7, 3)),
        "b" : (("z", "x"), np.random.randn(2, 7))},
    coords = {
        "x" : np.arange(6, 13),
        "y" : np.arange(3),
        "z" : np.arange(3, 5)
    }
)
ds2.to_netcdf(datadir2/"ds2.nc")

ds1.a.to_netcdf(datadir2/"da1.nc")

In [43]:
xr.open_dataset(datadir2/"ds1.nc")

In [44]:
xr.open_dataarray(datadir2/"da1.nc")

In [45]:
#zarr : data format providing an implementation of chunked, compressed, N-dimensional arrays. 
ds1.to_zarr(datadir2 / "ds1.zarr", mode = "w")

<xarray.backends.zarr.ZarrStore at 0x19aa7e60a40>

In [46]:
# or we can read the created file with..
xr.open_zarr(datadir2/"ds1.zarr", chunks=None)

In [47]:
# Mutable Mapping interface
mystore = {}
ds1.to_zarr(store=mystore)
mystore

{'.zgroup': b'{\n    "zarr_format": 2\n}',
 '.zattrs': b'{}',
 'z/.zarray': b'{\n    "chunks": [\n        6\n    ],\n    "compressor": {\n        "blocksize": 0,\n        "clevel": 5,\n        "cname": "lz4",\n        "id": "blosc",\n        "shuffle": 1\n    },\n    "dtype": "<i8",\n    "fill_value": null,\n    "filters": null,\n    "order": "C",\n    "shape": [\n        6\n    ],\n    "zarr_format": 2\n}',
 'z/.zattrs': b'{\n    "_ARRAY_DIMENSIONS": [\n        "z"\n    ]\n}',
 'z/0': b'\x02\x013\x080\x00\x00\x000\x00\x00\x00@\x00\x00\x00\xfd\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00',
 'a/.zarray': b'{\n    "chunks": [\n        4,\n        2\n    ],\n    "compressor": {\n        "blocksize": 0,\n        "clevel": 5,\n        "cname": "lz4",\n        "id": "blosc",\n        "shuffle": 1\n    },\n    "dtype": "<f8",\n    "fill_value": "NaN",\

In [48]:
da = xr.DataArray(
    data=ds1.a.data,
    coords={
        "y": np.linspace(47.5, 47.8, 4),
        "x": np.linspace(-122.9, -122.7, 2),
    },
)

# Add Geospatial Coordinate Reference https://epsg.io/4326
# this is stored as a 'spatial_ref' coordinate
da.rio.write_crs("epsg:4326", inplace=True)
da

In [49]:
# Let's open up a precipation dataset.

precipitation = xr.tutorial.open_datatree('precipitation.nc4')

In [50]:
# nodes : groups in a NetCDF4 or HDF5 file in the DataTree model are represented as "nodes" in the DataTree model. We can list all of groups with .groups

precipitation.groups

('/', '/observed', '/reanalysis')

In [51]:
# Accessing variables in a nested groups.
precipitation["observed"]

In [52]:
precipitation["/observed/precipitation"]

In [54]:
precipitation["reanalysis"]["precipitation"]