In [1]:
import fsspec
import xarray as xr
import warnings
# Suppress every warning of category UserWarning for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)

# Local helper module used throughout this notebook (file listing, virtual
# dataset creation, icechunk store setup, validation).
import helpers

# Todos for this notebook

**NOTE:** This work depends on https://github.com/zarr-developers/VirtualiZarr/pull/369

- [ ] Estimate cost
- [ ] Validate all data has correct `_FillValue` in encoding
- [ ] Implement distributed zarr write (see https://icechunk.io/icechunk-python/examples/dask_write)

Nice to have:
- [ ] Add arraylake option

# 1. Start a dask cluster

The dask cluster helps parallelize both the generation of references and the computation used for validation.

In [None]:
from dask.distributed import Client, LocalCluster
# Start a local cluster with dask's default sizing and connect a client to it.
cluster = LocalCluster()
client = Client(cluster)
# Then ask the cluster for 8 workers.
cluster.scale(8)
# Last expression of the cell: display the client summary.
client

# 2. Initialize file stores for reading and writing

## 2a. Initialize a filesystem for accessing the MUR SST data files.

In [2]:
# S3 filesystem with anonymous access disabled (anon=False); reused below to
# open source files and passed to the validation helper.
fs = fsspec.filesystem("s3", anon=False)

## 2b. Initialize the store we are writing to (icechunk).

**NOTE:** If you are only appending to an existing store, set `overwrite=False`.

If overwriting an existing s3 store, you need to run the following lines:

<code>
!pip install awscli
!aws s3 rm --recursive s3://nasa-veda-scratch/icechunk/{store_name}
</code>

In [3]:
# Find or create the icechunk store on S3; no overwrite is requested here.
store = helpers.find_or_create_icechunk_store(
    store_name="MUR-JPL-L4-GLOB-v4.1-virtual",
    store_type="s3",
    overwrite=False,
)
# Last expression of the cell: display the store object.
store

<icechunk.IcechunkStore at 0x7f993673f350>

# 3. Create initial store with data from 2002

## 3a. List, virtualize and concatenate datasets

This step uses the dmrpp reader of VirtualiZarr. This reader makes this process very fast since we don't actually have to open and read any of the original files.

In [None]:
# List the 2002 files (the range starts on 2002-06-01), append ".dmrpp" to
# every listed path, and build a virtual dataset from those paths.
mur_sst_files_2002 = helpers.list_mur_sst_files(
    start_date="2002-06-01",
    end_date="2002-12-31",
)
mur_sst_dmrpps_2002 = [path + ".dmrpp" for path in mur_sst_files_2002]
virtual_ds_2002 = helpers.create_virtual_ds(dmrpps=mur_sst_dmrpps_2002)

In [None]:
# Sanity check: number of .dmrpp paths listed for 2002.
len(mur_sst_dmrpps_2002)

## 3b. Write to icechunk

In [None]:
%%time
# Initial write: no append_dim is passed because this is the first data in the store.
virtual_ds_2002.virtualize.to_icechunk(store)
# Commit the write with a descriptive message.
store.commit("Wrote 2002 data")

## 3c. Validate

In [None]:
# Run the validation helper over the 2002 date range (its checks are defined in helpers).
helpers.validate_data(store, dates=["2002-06-01", "2002-12-31"], fs=fs)

# 4. Append 2003

One file in 2003 (2003-09-11) has a different encoding, so the list of 2003 files is split into 3 lists. All dates apart from the date with the different encoding are written as virtual stores. The problematic data is written as zarr.

See and run `helpers.get_codecs` with a list of virtual datasets to check all codecs are the same.

## 4a. List files from 2003

And split that list by the date with the different encoding.

In [None]:
# First part of 2003: everything up to the day before the 2003-09-11 file.
mur_sst_files_2003_1 = helpers.list_mur_sst_files(start_date="2003-01-01", end_date="2003-09-10")

## 4b. Write first set of files as virtual datasets using the DMRPP reader

In [None]:
# Append ".dmrpp" to every listed path, then build the virtual dataset.
mur_sst_files_2003_1_dmrpps = [path + ".dmrpp" for path in mur_sst_files_2003_1]
virtual_ds_2003_1 = helpers.create_virtual_ds(
    dmrpps=mur_sst_files_2003_1_dmrpps,
)

In [None]:
# Append along the time dimension, then commit.
# NOTE(review): re-running this cell would presumably append the same dates again — run once.
virtual_ds_2003_1.virtualize.to_icechunk(store, append_dim='time')
store.commit("Wrote first part of 2003 data")

## 4c. Write data with different encoding as zarr

In [None]:
# This takes about a minute and a lot of memory (nearly 40GB).
# `problematic_file` is a list of paths; only its first entry is read and written.
problematic_file = helpers.list_mur_sst_files(start_date="2003-09-11", end_date="2003-09-11")
# Using chunks={} or chunks='auto' to initialize the dataset with dask arrays fails,
# throwing an error that the store is in read-only mode. This has not been investigated.
# Open the remote file and the dataset as context managers so both are closed
# once the data has been written to the store.
with fs.open(problematic_file[0]) as remote_file, xr.open_dataset(remote_file) as ds:
    ds.to_zarr(store, append_dim='time')
# Record the file path itself in the commit message, not the repr of the list.
store.commit(f"Wrote {problematic_file[0]} in zarr")

## 4d. Write the rest of 2003 as virtual data

In [None]:
# Remainder of 2003, starting the day after the 2003-09-11 file.
mur_sst_files_2003_2 = helpers.list_mur_sst_files(
    start_date="2003-09-12",
    end_date="2003-12-31",
)
mur_sst_files_2003_2_dmrpps = [path + ".dmrpp" for path in mur_sst_files_2003_2]
virtual_ds_2003_2 = helpers.create_virtual_ds(
    dmrpps=mur_sst_files_2003_2_dmrpps,
)

In [None]:
# Append the rest of 2003 along time, then commit.
virtual_ds_2003_2.virtualize.to_icechunk(
    store,
    append_dim="time",
)
# Plain literal: the message has no placeholders to format.
store.commit("Wrote to end of 2003.")

## 4e. Validate

In [None]:
# Run the validation helper over all of 2003, including the date written as zarr.
helpers.validate_data(store, dates=["2003-01-01", "2003-12-31"], fs=fs)

# 5. Append 2004

## 5a. List files

In [None]:
# [start, end] for 2004; the validation cell of this section reuses `dates`.
dates = ["2004-01-01", "2004-12-31"]
mur_sst_files_2004 = helpers.list_mur_sst_files(
    start_date=dates[0],
    end_date=dates[1],
)
# Append ".dmrpp" to every listed path.
mur_sst_files_2004_dmrpps = [path + ".dmrpp" for path in mur_sst_files_2004]

In [None]:
# Number of .dmrpp paths listed for 2004.
len(mur_sst_files_2004_dmrpps)

## 5b. Write data

In [None]:
# Virtualize the 2004 .dmrpp paths, append along time, then commit.
virtual_ds_2004 = helpers.create_virtual_ds(
    dmrpps=mur_sst_files_2004_dmrpps,
)
virtual_ds_2004.virtualize.to_icechunk(store, append_dim="time")
# Plain literal: the message has no placeholders to format.
store.commit("Wrote 2004 to store.")

## 5c. Validate data

In [None]:
%%time
# Uses `dates` as set by this section's listing cell (2004); re-run that cell first if `dates` has changed.
helpers.validate_data(store, dates=dates, fs=fs)

# 6. Let's try 2 years! 2005-2006

## 6a. List files

In [None]:
# [start, end] for 2005-2006; the validation cell of this section reuses `dates`.
dates = ["2005-01-01", "2006-12-31"]
mur_sst_files_2005_2006 = helpers.list_mur_sst_files(
    start_date=dates[0],
    end_date=dates[1],
)
# Append ".dmrpp" to every listed path.
mur_sst_files_2005_2006_dmrpps = [path + ".dmrpp" for path in mur_sst_files_2005_2006]

In [None]:
# Number of .dmrpp paths listed for 2005-2006.
len(mur_sst_files_2005_2006_dmrpps)

## 6b. Write data

In [None]:
# Virtualize the 2005-2006 .dmrpp paths, append along time, then commit.
virtual_ds_2005_2006 = helpers.create_virtual_ds(
    dmrpps=mur_sst_files_2005_2006_dmrpps,
)
virtual_ds_2005_2006.virtualize.to_icechunk(store, append_dim="time")
# Plain literal: the message has no placeholders to format.
store.commit("Wrote 2005-2006 to store.")

## 6c. Validate data

In [None]:
%%time
# Uses `dates` as set by this section's listing cell (2005-2006).
helpers.validate_data(store, dates=dates, fs=fs)

# 7. Let's try 5 years! 2007 through end of 2011

## 7a. List files

In [None]:
# [start, end] for 2007-2011; the validation cell of this section reuses `dates`.
dates = ["2007-01-01", "2011-12-31"]
mur_sst_files_2007_2011 = helpers.list_mur_sst_files(
    start_date=dates[0],
    end_date=dates[1],
)
# Append ".dmrpp" to every listed path.
mur_sst_files_2007_2011_dmrpps = [path + ".dmrpp" for path in mur_sst_files_2007_2011]

In [None]:
# Number of .dmrpp paths listed for 2007-2011.
len(mur_sst_files_2007_2011_dmrpps)

## 7b. Write data

In [None]:
# Virtualize the 2007-2011 .dmrpp paths, append along time, then commit.
virtual_ds_2007_2011 = helpers.create_virtual_ds(
    dmrpps=mur_sst_files_2007_2011_dmrpps,
)
virtual_ds_2007_2011.virtualize.to_icechunk(store, append_dim="time")
# Plain literal: the message has no placeholders to format.
store.commit("Wrote 2007-2011 to store.")

## 7c. Validate data

In [None]:
%%time
# Uses `dates` as set by this section's listing cell (2007-2011).
helpers.validate_data(store, dates=dates, fs=fs)

# 8. 2012

## 8a. List files

In [None]:
# [start, end] for 2012; the validation cell of this section reuses `dates`.
dates = ["2012-01-01", "2012-12-31"]
mur_sst_files_2012 = helpers.list_mur_sst_files(
    start_date=dates[0],
    end_date=dates[1],
)
# Append ".dmrpp" to every listed path.
mur_sst_files_2012_dmrpps = [path + ".dmrpp" for path in mur_sst_files_2012]

In [None]:
# Number of .dmrpp paths listed for 2012.
len(mur_sst_files_2012_dmrpps)

## 8b. Write data

In [None]:
# Build the 2012 virtual dataset from the .dmrpp paths; it is written in the next cell.
virtual_ds_2012 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2012_dmrpps)

In [None]:
# Append 2012 along time, then commit.
virtual_ds_2012.virtualize.to_icechunk(
    store,
    append_dim="time",
)
# Plain literal: the message has no placeholders to format.
store.commit("Wrote 2012 to store.")

## 8c. Validate data

In [None]:
%%time
# Uses `dates` as set by this section's listing cell (2012).
helpers.validate_data(store, dates=dates, fs=fs)

# 9. 2013

## 9a. List files

In [None]:
# [start, end] for 2013.
dates = ["2013-01-01", "2013-12-31"]
mur_sst_files_2013 = helpers.list_mur_sst_files(
    start_date=dates[0],
    end_date=dates[1],
)
# Append ".dmrpp" to every listed path.
mur_sst_files_2013_dmrpps = [path + ".dmrpp" for path in mur_sst_files_2013]

In [None]:
# Number of .dmrpp paths listed for 2013.
len(mur_sst_files_2013_dmrpps)

## 9b. Write data

In [None]:
# Build the 2013 virtual dataset from the .dmrpp paths; it is written in the next cell.
virtual_ds_2013 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2013_dmrpps)

In [None]:
# Append 2013 along time, then commit.
virtual_ds_2013.virtualize.to_icechunk(
    store,
    append_dim="time",
)
# Plain literal: the message has no placeholders to format.
store.commit("Wrote 2013 to store.")

## 9c. Validate data

In [None]:
%%time
# `dates` is re-declared here, so this cell does not depend on the listing cell having run last.
dates = ['2013-01-01', '2013-12-31']
helpers.validate_data(store, dates=dates, fs=fs)

# 10. 2014

## 10a. List files

In [None]:
# [start, end] for 2014; the validation cell of this section reuses `dates`.
dates = ["2014-01-01", "2014-12-31"]
mur_sst_files_2014 = helpers.list_mur_sst_files(
    start_date=dates[0],
    end_date=dates[1],
)
# Append ".dmrpp" to every listed path.
mur_sst_files_2014_dmrpps = [path + ".dmrpp" for path in mur_sst_files_2014]

In [None]:
# Number of .dmrpp paths listed for 2014.
len(mur_sst_files_2014_dmrpps)

## 10b. Write data

In [None]:
# Build the 2014 virtual dataset from the .dmrpp paths; it is written in the next cell.
virtual_ds_2014 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2014_dmrpps)

In [None]:
# Append 2014 along time, then commit.
virtual_ds_2014.virtualize.to_icechunk(
    store,
    append_dim="time",
)
# Plain literal: the message has no placeholders to format.
store.commit("Wrote 2014 to store.")

## 10c. Validate data

In [None]:
%%time
# Uses `dates` as set by this section's listing cell (2014).
helpers.validate_data(store, dates=dates, fs=fs)

# 11. 2015

## 11a. List files

In [None]:
# [start, end] for 2015; the validation cell of this section reuses `dates`.
dates = ["2015-01-01", "2015-12-31"]
mur_sst_files_2015 = helpers.list_mur_sst_files(
    start_date=dates[0],
    end_date=dates[1],
)
# Append ".dmrpp" to every listed path.
mur_sst_files_2015_dmrpps = [path + ".dmrpp" for path in mur_sst_files_2015]

In [None]:
# Number of .dmrpp paths listed for 2015.
len(mur_sst_files_2015_dmrpps)

## 11b. Write data

In [None]:
# Build the 2015 virtual dataset from the .dmrpp paths; it is written in the next cell.
virtual_ds_2015 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2015_dmrpps)

In [None]:
# Append 2015 along time, then commit.
virtual_ds_2015.virtualize.to_icechunk(
    store,
    append_dim="time",
)
# Plain literal: the message has no placeholders to format.
store.commit("Wrote 2015 to store.")

## 11c. Validate data

In [None]:
%%time
# Uses `dates` as set by this section's listing cell (2015).
helpers.validate_data(store, dates=dates, fs=fs)

# 12. 2016

## 12a. List files

In [None]:
# [start, end] for 2016.
dates = ["2016-01-01", "2016-12-31"]
mur_sst_files_2016 = helpers.list_mur_sst_files(
    start_date=dates[0],
    end_date=dates[1],
)
# Append ".dmrpp" to every listed path.
mur_sst_files_2016_dmrpps = [path + ".dmrpp" for path in mur_sst_files_2016]

In [None]:
# Number of .dmrpp paths listed for 2016.
len(mur_sst_files_2016_dmrpps)

## 12b. Write data

In [None]:
# Build the 2016 virtual dataset from the .dmrpp paths; it is written in the next cell.
virtual_ds_2016 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2016_dmrpps)

In [None]:
# Append 2016 along time, then commit.
# NOTE(review): unlike the other years, this section has no validation cell — confirm that is intended.
virtual_ds_2016.virtualize.to_icechunk(
    store,
    append_dim="time",
)
# Plain literal: the message has no placeholders to format.
store.commit("Wrote 2016 to store.")

# 13. 2017

## 13a. List files

In [None]:
%%time
dates = ['2017-01-01', '2017-12-31']
mur_sst_files_2017 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2017_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2017]

In [None]:
# Number of .dmrpp paths listed for 2017.
len(mur_sst_files_2017_dmrpps)

## 13b. Write data

In [None]:
%%time
virtual_ds_2017 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2017_dmrpps)
virtual_ds_2017.virtualize.to_icechunk(store, append_dim='time')
store.commit(f"Wrote 2017 to store.")

## 13c. Validate data

In [None]:
%%time
# Uses `dates` as set by this section's listing cell (2017).
helpers.validate_data(store, dates=dates, fs=fs)

# 14. 2018

## 14a. List files

In [None]:
%%time
dates = ['2018-01-01', '2018-12-31']
mur_sst_files_2018 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2018_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2018]

In [None]:
# Number of .dmrpp paths listed for 2018.
len(mur_sst_files_2018_dmrpps)

## 14b. Write data

In [None]:
%%time
virtual_ds_2018 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2018_dmrpps)
virtual_ds_2018.virtualize.to_icechunk(store, append_dim='time')
store.commit(f"Wrote 2018 to store.")

## 14c. Validate data

In [None]:
%%time
# Uses `dates` as set by this section's listing cell (2018).
helpers.validate_data(store, dates=dates, fs=fs)

# 15. 2019

## 15a. List files

In [None]:
%%time
dates = ['2019-01-01', '2019-12-31']
mur_sst_files_2019 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2019_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2019]

In [None]:
# Number of .dmrpp paths listed for 2019.
len(mur_sst_files_2019_dmrpps)

## 15b. Write data

In [None]:
%%time
virtual_ds_2019 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2019_dmrpps)
virtual_ds_2019.virtualize.to_icechunk(store, append_dim='time')
store.commit(f"Wrote 2019 to store.")

## 15c. Validate data

In [None]:
%%time
# `dates` is re-declared here, so this cell does not depend on the listing cell having run last.
dates = ['2019-01-01', '2019-12-31']
helpers.validate_data(store, dates=dates, fs=fs)

# 16. 2020

## 16a. List files

In [None]:
%%time
dates = ['2020-01-01', '2020-12-31']
mur_sst_files_2020 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2020_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2020]

In [None]:
# Number of .dmrpp paths listed for 2020.
len(mur_sst_files_2020_dmrpps)

## 16b. Write data

In [None]:
%%time
virtual_ds_2020 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2020_dmrpps)
virtual_ds_2020.virtualize.to_icechunk(store, append_dim='time')
store.commit(f"Wrote 2020 to store.")

## 16c. Validate data

In [None]:
%%time
# Uses `dates` as set by this section's listing cell (2020).
helpers.validate_data(store, dates=dates, fs=fs)

# 17. 2021

Two files in 2021 (2021-02-20 and 2021-02-21) and then 7 files (2021-12-24 to 2021-12-31) have a different encoding, so the list of 2021 files is split into 4 parts. All dates apart from the dates with the different encoding are written as virtual stores. The problematic data is written as zarr.

See and run `helpers.get_codecs` with a list of virtual datasets to check all codecs are the same.

```python
from virtualizarr import open_virtual_dataset
vdss = [open_virtual_dataset(f, indexes={}, filetype='dmrpp') for f in mur_sst_files_dmrpps]
helpers.check_codecs(vdss)
```

## 17a. List files from first period

In [None]:
%%time
dates = ['2021-01-01', '2021-02-19']
mur_sst_files_2021_1 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2021_1_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2021_1]

In [None]:
# Number of .dmrpp paths listed for 2021-01-01 to 2021-02-19.
len(mur_sst_files_2021_1_dmrpps)

## 17b. Write data

In [None]:
%%time
virtual_ds_2021_1 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2021_1_dmrpps)
virtual_ds_2021_1.virtualize.to_icechunk(store, append_dim='time')
store.commit(f"Wrote 2021-01-01-2021-02-19 to store.")

## 17c. Write 2 special days as zarr

In [None]:
# This takes about a minute and a lot of memory (nearly 40GB).
problematic_files = helpers.list_mur_sst_files(
    start_date="2021-02-20",
    end_date="2021-02-21",
)
# Open both files as one dataset, leaving out two variables.
ds = xr.open_mfdataset(
    [fs.open(path) for path in problematic_files],
    drop_variables=["dt_1km_data", "sst_anomaly"],
)

# Rechunk each kept variable explicitly, one time step per chunk.
ds["analysed_sst"] = ds["analysed_sst"].chunk(dict(time=1, lat=1023, lon=2047))
ds["analysis_error"] = ds["analysis_error"].chunk(dict(time=1, lat=1023, lon=2047))
ds["mask"] = ds["mask"].chunk(dict(time=1, lat=1447, lon=2895))
ds["sea_ice_fraction"] = ds["sea_ice_fraction"].chunk(dict(time=1, lat=1447, lon=2895))

In [None]:
# Display the rechunked dataset before writing it.
ds

In [None]:
# Write the two days as zarr data, appended along time, then commit.
ds.to_zarr(
    store,
    append_dim="time",
)
# Plain literal: the message has no placeholders to format.
store.commit("Wrote 2021-02-20 to 02-21 in zarr")

## 17d. List files for 2021-02-22 to 2021-12-23

In [None]:
%%time
dates = ['2021-02-22', '2021-12-23']
mur_sst_files_2021_2 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2021_2_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2021_2]

In [None]:
# Number of .dmrpp paths listed for 2021-02-22 to 2021-12-23.
len(mur_sst_files_2021_2_dmrpps)

## 17e. Write data

In [None]:
%%time
virtual_ds_2021_2 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2021_2_dmrpps)
virtual_ds_2021_2.virtualize.to_icechunk(store, append_dim='time')
store.commit(f"Wrote 2021-02-22 to 2021-12-23 to store.")

## 17f. Write the rest of the days as Zarr

In [None]:
# Files for the final days of 2021.
problematic_files = helpers.list_mur_sst_files(
    start_date="2021-12-24",
    end_date="2021-12-31",
)
# Open them as one dataset, leaving out two variables.
ds = xr.open_mfdataset(
    [fs.open(path) for path in problematic_files],
    drop_variables=["dt_1km_data", "sst_anomaly"],
)

# Rechunk each kept variable explicitly, one time step per chunk.
ds["analysed_sst"] = ds["analysed_sst"].chunk(dict(time=1, lat=1023, lon=2047))
ds["analysis_error"] = ds["analysis_error"].chunk(dict(time=1, lat=1023, lon=2047))
ds["mask"] = ds["mask"].chunk(dict(time=1, lat=1447, lon=2895))
ds["sea_ice_fraction"] = ds["sea_ice_fraction"].chunk(dict(time=1, lat=1447, lon=2895))

In [None]:
# Display the rechunked dataset before writing it.
ds

In [None]:
# NOTE(review): the zarr write for 2021-12-24 to 2021-12-31 is commented out here, while
# the validation cell that follows covers dates through 2021-12-31. Confirm whether this
# write was already committed or still needs to be run.
# %%time
# # ds.to_zarr(store, append_dim='time')
# # store.commit(f"Wrote 2021-12-24 to 2021-12-31 in zarr")

## 17g. Validate data

In [None]:
%%time
# Validate the whole of 2021; `dates` is reset here because earlier cells in this section set it to sub-ranges.
dates = ['2021-01-01', '2021-12-31']
helpers.validate_data(store, dates=dates, fs=fs)

# Post-script: Checking the store and how long it takes to open it.

In [4]:
%%time
# Time how long it takes to open the full store with xarray (consolidated metadata disabled).
xds = xr.open_zarr(store, consolidated=False)

CPU times: user 24.1 s, sys: 7.69 s, total: 31.8 s
Wall time: 2min 19s


In [6]:
# Display the opened dataset (array shapes, chunk sizes, dtypes).
xds

Unnamed: 0,Array,Chunk
Bytes,33.70 TiB,31.96 MiB
Shape,"(7148, 17999, 36000)","(1, 1447, 2895)"
Dask graph,1208012 chunks in 2 graph layers,1208012 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 33.70 TiB 31.96 MiB Shape (7148, 17999, 36000) (1, 1447, 2895) Dask graph 1208012 chunks in 2 graph layers Data type float64 numpy.ndarray",36000  17999  7148,

Unnamed: 0,Array,Chunk
Bytes,33.70 TiB,31.96 MiB
Shape,"(7148, 17999, 36000)","(1, 1447, 2895)"
Dask graph,1208012 chunks in 2 graph layers,1208012 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,33.70 TiB,15.98 MiB
Shape,"(7148, 17999, 36000)","(1, 1023, 2047)"
Dask graph,2315952 chunks in 2 graph layers,2315952 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 33.70 TiB 15.98 MiB Shape (7148, 17999, 36000) (1, 1023, 2047) Dask graph 2315952 chunks in 2 graph layers Data type float64 numpy.ndarray",36000  17999  7148,

Unnamed: 0,Array,Chunk
Bytes,33.70 TiB,15.98 MiB
Shape,"(7148, 17999, 36000)","(1, 1023, 2047)"
Dask graph,2315952 chunks in 2 graph layers,2315952 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,16.85 TiB,15.98 MiB
Shape,"(7148, 17999, 36000)","(1, 1447, 2895)"
Dask graph,1208012 chunks in 2 graph layers,1208012 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 16.85 TiB 15.98 MiB Shape (7148, 17999, 36000) (1, 1447, 2895) Dask graph 1208012 chunks in 2 graph layers Data type float32 numpy.ndarray",36000  17999  7148,

Unnamed: 0,Array,Chunk
Bytes,16.85 TiB,15.98 MiB
Shape,"(7148, 17999, 36000)","(1, 1447, 2895)"
Dask graph,1208012 chunks in 2 graph layers,1208012 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,33.70 TiB,15.98 MiB
Shape,"(7148, 17999, 36000)","(1, 1023, 2047)"
Dask graph,2315952 chunks in 2 graph layers,2315952 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 33.70 TiB 15.98 MiB Shape (7148, 17999, 36000) (1, 1023, 2047) Dask graph 2315952 chunks in 2 graph layers Data type float64 numpy.ndarray",36000  17999  7148,

Unnamed: 0,Array,Chunk
Bytes,33.70 TiB,15.98 MiB
Shape,"(7148, 17999, 36000)","(1, 1023, 2047)"
Dask graph,2315952 chunks in 2 graph layers,2315952 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [17]:
# Number of chunks along the second dimension (index 1) of `mask`.
len(xds.mask.chunks[1])

13