# Minimal IO tests

These tests illustrate the IO libraries we're planning to use for the GEOMAR challenge. They are not representative of the whole approach we may end up with.

In [None]:
# parameters
# Number of random (x, y) positions generated for the IO round-trip tests.
N = 10 ** 6

## Spin up a local Dask cluster

In [None]:
from dask.distributed import Client

# Local cluster with one worker process, two threads and a memory limit
# of 2e9 bytes.
client = Client(
    memory_limit=2e9,
    threads_per_worker=2,
    n_workers=1,
)
# Last expression of the cell: renders the client summary in the notebook.
client

## Generate random positions

In [None]:
import numpy as np

In [None]:
# Draw N standard-normal samples per coordinate; x is drawn first, then y,
# so the random stream is consumed in the same order as two separate calls.
random_x, random_y = (np.random.normal(size=(N,)) for _ in range(2))

## Xarray dataset

In [None]:
import xarray as xr

In [None]:
# Bundle both coordinates into one dataset; each variable is one-dimensional
# along the shared "step" dimension.
positions = xr.Dataset(
    {
        "x": xr.DataArray(random_x, dims=("step",)),
        "y": xr.DataArray(random_y, dims=("step",)),
    }
)

display(positions)

In [None]:
# Scatter plot of y against x with mostly transparent markers (alpha=0.1).
# The trailing semicolon suppresses the textual cell output.
positions.plot.scatter(
    alpha=0.1,
    x="x",
    y="y",
);

### Write to local netCDF file

Write netCDF file and ensure equality of data after reading back into memory.

In [None]:
# Create the output directory with the standard library instead of the
# `!mkdir -p` shell escape, so the cell also works without a POSIX shell and
# in plain-Python exports of the notebook. `exist_ok=True` keeps the cell
# safe to re-run.
from pathlib import Path

Path("test_data").mkdir(parents=True, exist_ok=True)

In [None]:
# Persist the in-memory dataset as a single local netCDF file.
positions.to_netcdf(path="test_data/positions.nc")

In [None]:
# Read the file back fully into memory. The context manager closes the
# underlying netCDF file handle once the data has been loaded; a handle left
# open may prevent the write cell from overwriting the same path on re-run.
with xr.open_dataset("test_data/positions.nc") as ds:
    positions_reread = ds.load()
display(positions_reread)

In [None]:
# The netCDF round trip must be lossless: the largest absolute deviation
# between the written and the re-read values has to be exactly zero.
assert abs(positions["x"] - positions_reread["x"]).max() == 0.0, "x values don't match"
assert abs(positions["y"] - positions_reread["y"]).max() == 0.0, "y values don't match"

### Write to local Zarr store

Write Zarr store, reread and ensure equality of data.

_**Note** that we'll have to chunk the data._

In [None]:
# Split the "step" dimension into chunks of N // 100 elements (100 chunks when
# N is a multiple of 100). Floor division keeps the size an exact integer, and
# the lower bound of 1 avoids a zero-sized chunk when N < 100.
positions_chunked = positions.chunk({"step": max(1, N // 100)})
positions_chunked

In [None]:
# Write the chunked dataset to a local Zarr store; mode="w" replaces any
# store left over from a previous run.
positions_chunked.to_zarr(store="test_data/positions.zarr/", mode="w")

In [None]:
# Re-open the Zarr store that was just written and show its structure.
positions_chunked_reread = xr.open_zarr(store="test_data/positions.zarr/")
display(positions_chunked_reread)

In [None]:
# The Zarr round trip must be lossless. `.compute()` evaluates the reduction
# so that the comparison is made on a concrete value.
assert (
    abs(positions_chunked["x"] - positions_chunked_reread["x"]).max().compute() == 0.0
), "x values don't match"
assert (
    abs(positions_chunked["y"] - positions_chunked_reread["y"]).max().compute() == 0.0
), "y values don't match"

## Dask Dataframe

In [None]:
from dask import dataframe as ddf
from dask import array as darr

In [None]:
# Elements per chunk, shared by both columns so their partitions line up.
# Floor division avoids the float round trip of int(N / 100), and the lower
# bound of 1 prevents a zero-sized chunk when N < 100.
rows_per_chunk = max(1, N // 100)

# Build the dataframe column by column from chunked 1-D Dask arrays.
positions_ddf = ddf.from_array(darr.asarray(random_x, chunks=rows_per_chunk)).to_frame(name="x")
positions_ddf["y"] = ddf.from_array(darr.asarray(random_y, chunks=rows_per_chunk))
positions_ddf

In [None]:
# Materialize the Dask dataframe via .compute(), then scatter-plot y against
# x. The trailing semicolon suppresses the textual cell output.
(
    positions_ddf.compute()
    .plot.scatter(x="x", y="y", alpha=0.1)
);

### Write to parquet

In [None]:
# Write the Dask dataframe to a local parquet dataset.
positions_ddf.to_parquet(path="test_data/positions.pq")

In [None]:
# Read the parquet dataset back as a Dask dataframe and show its structure.
positions_ddf_reread = ddf.read_parquet(path="test_data/positions.pq")
display(positions_ddf_reread)

In [None]:
# The parquet round trip must be lossless: the largest absolute deviation per
# column, evaluated with .compute(), has to be exactly zero.
assert (
    abs(positions_ddf["x"] - positions_ddf_reread["x"]).max().compute() == 0.0
), "x values don't match"
assert (
    abs(positions_ddf["y"] - positions_ddf_reread["y"]).max().compute() == 0.0
), "y values don't match"