In [1]:
import fsspec
import xarray as xr
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import dask_write_zarr as dwz
import helpers

In [2]:
# Optional (left disabled): install the VirtualiZarr branch with icechunk support.
#!pip install git+https://github.com/zarr-developers/VirtualiZarr.git@ab/upgrade-icechunk#egg=VirtualiZarr[icechunk]
# Print the installed versions of the four packages this notebook relies on.
!pip list | grep -E '^(virtualizarr|icechunk|zarr|xarray)\s'

icechunk                  0.1.0a12
virtualizarr              1.2.0
xarray                    2025.1.1
zarr                      3.0.0


# Todos for this notebook

- [x] Use dask for distributed writing of zarr (https://icechunk.io/icechunk-python/examples/dask_write/)
- [ ] Add information on the datasets with different encodings
- [ ] Estimate total time to create 2004-2024 and associated cost
- [ ] Remove old stores (local and s3)

# 1. Start a dask cluster

The dask cluster parallelizes reference generation and the computation used for validation.

In [3]:
from dask.distributed import Client

# Local cluster layout used for the Zarr writes: four workers, one thread each.
# Alternative layout: Client(n_workers=8)
client = Client(threads_per_worker=1, n_workers=4)
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: /user/abarciauskas-bgse/proxy/8787/status,

0,1
Dashboard: /user/abarciauskas-bgse/proxy/8787/status,Workers: 4
Total threads: 4,Total memory: 60.62 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:41479,Workers: 4
Dashboard: /user/abarciauskas-bgse/proxy/8787/status,Total threads: 4
Started: Just now,Total memory: 60.62 GiB

0,1
Comm: tcp://127.0.0.1:40893,Total threads: 1
Dashboard: /user/abarciauskas-bgse/proxy/37679/status,Memory: 15.16 GiB
Nanny: tcp://127.0.0.1:38949,
Local directory: /tmp/dask-scratch-space/worker-t6frc7o3,Local directory: /tmp/dask-scratch-space/worker-t6frc7o3

0,1
Comm: tcp://127.0.0.1:37685,Total threads: 1
Dashboard: /user/abarciauskas-bgse/proxy/43061/status,Memory: 15.16 GiB
Nanny: tcp://127.0.0.1:34609,
Local directory: /tmp/dask-scratch-space/worker-ydpze_fj,Local directory: /tmp/dask-scratch-space/worker-ydpze_fj

0,1
Comm: tcp://127.0.0.1:42951,Total threads: 1
Dashboard: /user/abarciauskas-bgse/proxy/43411/status,Memory: 15.16 GiB
Nanny: tcp://127.0.0.1:32851,
Local directory: /tmp/dask-scratch-space/worker-klv7vval,Local directory: /tmp/dask-scratch-space/worker-klv7vval

0,1
Comm: tcp://127.0.0.1:40693,Total threads: 1
Dashboard: /user/abarciauskas-bgse/proxy/38917/status,Memory: 15.16 GiB
Nanny: tcp://127.0.0.1:35903,
Local directory: /tmp/dask-scratch-space/worker-3kvngwbo,Local directory: /tmp/dask-scratch-space/worker-3kvngwbo


In [4]:
#client.shutdown()

# 2. Initialize file stores for reading and writing

## 2a. Initialize a filesystem for accessing the MUR SST data files.

In [4]:
# S3 filesystem handle with anonymous access disabled; passed as `fs` to the
# validation and Zarr-writing helpers below.
fs = fsspec.filesystem("s3", anon=False)

## 2b. Initialize the store we are writing to (icechunk).

**NOTE:** If you are only appending to the store, `overwrite` should be `False`.

If overwriting an existing s3 store, you need to run the following lines:

<code>
!pip install awscli
!aws s3 rm --recursive s3://nasa-veda-scratch/icechunk/{store_name}
</code>

In [5]:
# Open the icechunk repo on S3 (or create it if missing); overwrite stays False
# because the rest of the notebook appends to the existing store.
repo = helpers.find_or_create_icechunk_repo(
    store_type="s3",
    store_name="MUR-JPL-L4-GLOB-v4.1-virtual-v3",
    overwrite=False,
)

# Check the current state of the store

## Verify the store by opening it with xarray

Note how long it takes to open as well.

In [6]:
%%time
import xarray as xr
session = repo.readonly_session(branch="main")
xds = xr.open_zarr(session.store, consolidated=False)

CPU times: user 13.4 s, sys: 3.18 s, total: 16.6 s
Wall time: 1min 59s


In [7]:
xds

Unnamed: 0,Array,Chunk
Bytes,35.45 TiB,31.96 MiB
Shape,"(7520, 17999, 36000)","(1, 1447, 2895)"
Dask graph,1270880 chunks in 2 graph layers,1270880 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 35.45 TiB 31.96 MiB Shape (7520, 17999, 36000) (1, 1447, 2895) Dask graph 1270880 chunks in 2 graph layers Data type float64 numpy.ndarray",36000  17999  7520,

Unnamed: 0,Array,Chunk
Bytes,35.45 TiB,31.96 MiB
Shape,"(7520, 17999, 36000)","(1, 1447, 2895)"
Dask graph,1270880 chunks in 2 graph layers,1270880 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,35.45 TiB,15.98 MiB
Shape,"(7520, 17999, 36000)","(1, 1023, 2047)"
Dask graph,2436480 chunks in 2 graph layers,2436480 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 35.45 TiB 15.98 MiB Shape (7520, 17999, 36000) (1, 1023, 2047) Dask graph 2436480 chunks in 2 graph layers Data type float64 numpy.ndarray",36000  17999  7520,

Unnamed: 0,Array,Chunk
Bytes,35.45 TiB,15.98 MiB
Shape,"(7520, 17999, 36000)","(1, 1023, 2047)"
Dask graph,2436480 chunks in 2 graph layers,2436480 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,35.45 TiB,15.98 MiB
Shape,"(7520, 17999, 36000)","(1, 1023, 2047)"
Dask graph,2436480 chunks in 2 graph layers,2436480 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 35.45 TiB 15.98 MiB Shape (7520, 17999, 36000) (1, 1023, 2047) Dask graph 2436480 chunks in 2 graph layers Data type float64 numpy.ndarray",36000  17999  7520,

Unnamed: 0,Array,Chunk
Bytes,35.45 TiB,15.98 MiB
Shape,"(7520, 17999, 36000)","(1, 1023, 2047)"
Dask graph,2436480 chunks in 2 graph layers,2436480 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,17.73 TiB,15.98 MiB
Shape,"(7520, 17999, 36000)","(1, 1447, 2895)"
Dask graph,1270880 chunks in 2 graph layers,1270880 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 17.73 TiB 15.98 MiB Shape (7520, 17999, 36000) (1, 1447, 2895) Dask graph 1270880 chunks in 2 graph layers Data type float32 numpy.ndarray",36000  17999  7520,

Unnamed: 0,Array,Chunk
Bytes,17.73 TiB,15.98 MiB
Shape,"(7520, 17999, 36000)","(1, 1447, 2895)"
Dask graph,1270880 chunks in 2 graph layers,1270880 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


## Check the snapshots

In [7]:
# Walk the history of `main`, printing each commit message with its snapshot id.
for ancestor in repo.ancestry(branch="main"):
    print(f"{ancestor.message}, snapshot_id: {ancestor.id}")

Distributed commit for 2022-01-25 09:00 to 2022-01-26 09:00, snapshot_id: RWMD9RB1B05PR1291MVG
Distributed commit for 2022-01-23 09:00 to 2022-01-24 09:00, snapshot_id: M3GNE9PR923J1134ZH0G
Distributed commit for 2022-01-21 09:00 to 2022-01-22 09:00, snapshot_id: 70DBK3R75W94MDJNQF5G
Distributed commit for 2022-01-19 09:00 to 2022-01-20 09:00, snapshot_id: 47Q5PJWNGJ2HW7C4PA90
Distributed commit for 2022-01-17 09:00 to 2022-01-18 09:00, snapshot_id: NZ844D86CQJY5YRGSWSG
Distributed commit for 2022-01-15 09:00 to 2022-01-16 09:00, snapshot_id: W4SFVBT7YY8295F1J1A0
Distributed commit for 2022-01-13 09:00 to 2022-01-14 09:00, snapshot_id: 4JSBPANNT7DQE56CM0FG
Distributed commit for 2022-01-11 09:00 to 2022-01-12 09:00, snapshot_id: QNSJZ8F2GZYG774NF4GG
Distributed commit for 2022-01-09 09:00 to 2022-01-10 09:00, snapshot_id: 41D11AMF6WWHGQ0HVGH0
Distributed commit for 2022-01-07 09:00 to 2022-01-08 09:00, snapshot_id: DZKG4F3C1NQ9XKQXP800
Distributed commit for 2022-01-05 09:00 to 2022-01

## Other useful tips

### Resetting the repo to a previous snapshot

In [23]:
# DESTRUCTIVE: reset_branch moves the tip of `main` back to the given snapshot.
# The target stays None by default so that Restart & Run All can never rewind
# the branch by accident; set it to a snapshot id only when a reset is intended.
reset_to_snapshot_id = None  # e.g. 'PHXYBRNBJ7T70SS6DSVG'
if reset_to_snapshot_id is not None:
    repo.reset_branch(branch="main", snapshot_id=reset_to_snapshot_id)

## Inspecting a store with async generators

In [None]:
# async def fn():
#     return [item async for item in store.list_dir('/')]

# await fn()

# 3. Create initial store with data from 2002

## 3a. List, virtualize and concatenate datasets

This step uses the dmrpp reader of VirtualiZarr. This reader makes this process very fast since we don't actually have to open and read any of the original files.

In [None]:
# List the source files for 2002-06-01 .. 2002-12-31, derive each file's DMR++
# path (source path + ".dmrpp"), and build one virtual dataset from those paths.
mur_sst_files_2002 = helpers.list_mur_sst_files(
    start_date="2002-06-01",
    end_date="2002-12-31",
)
mur_sst_dmrpps_2002 = [nc_path + '.dmrpp' for nc_path in mur_sst_files_2002]
virtual_ds_2002 = helpers.create_virtual_ds(dmrpps=mur_sst_dmrpps_2002)

In [None]:
# sanity check
len(mur_sst_dmrpps_2002)

## 3b. Write to icechunk

In [None]:
%%time
virtual_ds_2002.virtualize.to_icechunk(store)
session.commit("Wrote 2002 data")

## 3c. Validate

In [None]:
# Validate through a fresh read-only session so this cell does not depend on a
# `store` variable left behind by another cell.
session = repo.readonly_session(branch="main")
store = session.store
helpers.validate_data(store, dates=["2002-06-01", "2002-12-31"], fs=fs)

# 4. 2003

One file in 2003 (2003-09-11) has a different encoding, so the list of 2003 files is split into 3 lists. All dates apart from the date with the different encoding are written as virtual datasets. The problematic date is written as Zarr.

Run `helpers.check_codecs` with a list of virtual datasets to check that all codecs are the same.

## 4a. Discover files with different codecs

In [30]:
# Open every 2003 DMR++ file as its own virtual dataset, then hand the list to
# check_codecs, which reports the files whose codecs differ (see output below).
mur_sst_files_2003 = helpers.list_mur_sst_files(
    start_date="2003-01-01",
    end_date="2003-12-31",
)
mur_sst_files_2003_dmrpps = [nc_path + '.dmrpp' for nc_path in mur_sst_files_2003]
vdss = [helpers.open_virtual(dmrpp_path) for dmrpp_path in mur_sst_files_2003_dmrpps]
helpers.check_codecs(vdss)

Codec(compressor=None, filters=[{'id': 'zlib', 'level': 6}])
s3://podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/20030911090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc



**Encoding report:**
There's just one different encoding this year: 2003-09-11, which doesn't have the expected shuffle operation. We will write that file as Zarr.

## 4b. Write first set of files as virtual datasets using the DMRPP reader

In [None]:
# First 2003 segment: everything up to the day before the differently-encoded file.
mur_sst_files_2003_1 = helpers.list_mur_sst_files(
    start_date="2003-01-01",
    end_date="2003-09-10",
)
mur_sst_files_2003_1_dmrpps = [nc_path + '.dmrpp' for nc_path in mur_sst_files_2003_1]
virtual_ds_2003_1 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2003_1_dmrpps)

In [None]:
# Append the first 2003 segment along `time` in a new writable session on
# `main`, then commit the change.
session = repo.writable_session("main")
store = session.store
virtual_ds_2003_1.virtualize.to_icechunk(store, append_dim='time')
session.commit("Wrote first part of 2003 data")

In [None]:
%%time
session = repo.readonly_session(branch="main")
store = session.store
helpers.validate_data(store, dates=["2003-01-01", "2003-09-10"], fs=fs)

## 4c. Write data with different encoding as zarr

In [8]:
%%time
dwz.update(
    repo=repo,
    start_date="2003-09-11 09:00",
    end_date="2003-09-11 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 12.2 s, sys: 1.14 s, total: 13.3 s
Wall time: 1min 40s


In [10]:
%%time
session = repo.readonly_session(branch="main")
store = session.store
helpers.validate_data(store, dates=["2003-09-11", "2003-09-11"], fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 285.29159611231097
Opening original files...
Computing original files result
Result from original files: 285.29159611231097
CPU times: user 1.19 s, sys: 188 ms, total: 1.38 s
Wall time: 14 s


## 4d. Write the rest of 2003 as virtual data

In [10]:
mur_sst_files_2003_2 = helpers.list_mur_sst_files(start_date="2003-09-12", end_date="2003-12-31")
mur_sst_files_2003_2_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2003_2]
virtual_ds_2003_2 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2003_2_dmrpps)

In [11]:
# Append the remaining 2003 dates (2003-09-12 onward) along `time` and commit.
# The commit message has no placeholders, so it is a plain string, not an f-string.
session = repo.writable_session("main")
store = session.store
virtual_ds_2003_2.virtualize.to_icechunk(store, append_dim='time')
session.commit("Wrote to end of 2003.")

'7H07YWWBS487QMNP6Z00'

## 4e. Validate

In [12]:
%%time
helpers.validate_data(store, dates=["2003-01-01", "2003-12-31"], fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 284.21366120003546
Opening original files...
Computing original files result
Result from original files: 284.21366120003546
CPU times: user 16.5 s, sys: 2.07 s, total: 18.6 s
Wall time: 6min 21s


# 5. Append 2004

## 5a. List files

In [13]:
dates = ['2004-01-01', '2004-12-31']
mur_sst_files_2004 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2004_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2004]

In [14]:
len(mur_sst_files_2004_dmrpps)

366

## 5b. Write data

In [15]:
# Build the 2004 virtual dataset from its DMR++ files, append it along `time`
# in a new writable session, and commit (plain string: no placeholders).
virtual_ds_2004 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2004_dmrpps)
session = repo.writable_session("main")
store = session.store
virtual_ds_2004.virtualize.to_icechunk(store, append_dim='time')
session.commit("Wrote 2004 to store.")

'YPQNWBXGRKNYEAD962W0'

## 5c. Validate data

In [16]:
%%time
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 284.13816998533554
Opening original files...
Computing original files result
Result from original files: 284.13816998533554
CPU times: user 16.9 s, sys: 2.33 s, total: 19.2 s
Wall time: 6min 50s


# 6. Let's try 2 years! 2005-2006

## 6a. List files

In [18]:
dates = ['2005-01-01', '2006-12-31']
mur_sst_files_2005_2006 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2005_2006_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2005_2006]

In [19]:
len(mur_sst_files_2005_2006_dmrpps)

730

## 6b. Write data

In [None]:
virtual_ds_2005_2006 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2005_2006_dmrpps)
session = repo.writable_session("main")
store = session.store
virtual_ds_2005_2006.virtualize.to_icechunk(store, append_dim='time')

In [22]:
# Commit the 2005-2006 append made in the previous cell.
# The message has no placeholders, so it is a plain string, not an f-string.
session.commit("Wrote 2005-2006 to store.")

'55KTAZV8G9DCYBB28WY0'

## 6c. Validate data

In [23]:
%%time
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 283.7321897241782
Opening original files...
Computing original files result
Result from original files: 283.7321897241782
CPU times: user 1min 10s, sys: 8.4 s, total: 1min 19s
Wall time: 6min 4s


# 7. Let's try 5 years! 2007 through end of 2011

## 7a. List files

In [24]:
dates = ['2007-01-01', '2011-12-31']
mur_sst_files_2007_2011 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2007_2011_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2007_2011]

In [25]:
len(mur_sst_files_2007_2011_dmrpps)

1826

## 7b. Write data

In [26]:
# Build the 2007-2011 virtual dataset from its DMR++ files, append it along
# `time` in a new writable session, and commit (plain string: no placeholders).
virtual_ds_2007_2011 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2007_2011_dmrpps)
session = repo.writable_session("main")
store = session.store
virtual_ds_2007_2011.virtualize.to_icechunk(store, append_dim='time')
session.commit("Wrote 2007-2011 to store.")

'QDAJRZ75KJXV2S24WV40'

## 7c. Validate data

In [27]:
%%time
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 283.1182451840052
Opening original files...
Computing original files result




Result from original files: 283.1182451840052
CPU times: user 2min 55s, sys: 19.9 s, total: 3min 15s
Wall time: 14min 40s


# 8. 2012

## 8a. List files

In [28]:
dates = ['2012-01-01', '2012-12-31']
mur_sst_files_2012 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2012_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2012]

In [29]:
len(mur_sst_files_2012_dmrpps)

366

## 8b. Write data

In [35]:
virtual_ds_2012 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2012_dmrpps)

In [36]:
# Append the 2012 virtual dataset along `time` in a new writable session and
# commit (plain string: the message has no placeholders).
session = repo.writable_session("main")
store = session.store
virtual_ds_2012.virtualize.to_icechunk(store, append_dim='time')
session.commit("Wrote 2012 to store.")

'46XR1D48Y7Q2FR7VDZ90'

## 8c. Validate data

In [37]:
%%time
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 283.2342023237321
Opening original files...
Computing original files result
Result from original files: 283.2342023237321
CPU times: user 41.5 s, sys: 4.25 s, total: 45.7 s
Wall time: 3min 50s


# 9. 2013

## 9a. List files

In [38]:
dates = ['2013-01-01', '2013-12-31']
mur_sst_files_2013 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2013_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2013]

In [39]:
len(mur_sst_files_2013_dmrpps)

365

## 9b. Write data

In [40]:
virtual_ds_2013 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2013_dmrpps)

In [41]:
# Append the 2013 virtual dataset along `time` in a new writable session and
# commit (plain string: the message has no placeholders).
session = repo.writable_session("main")
store = session.store
virtual_ds_2013.virtualize.to_icechunk(store, append_dim='time')
session.commit("Wrote 2013 to store.")

'JG3WFQPYKV4RH2HBNCHG'

## 9c. Validate data

In [42]:
%%time
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 283.56705489733423
Opening original files...
Computing original files result
Result from original files: 283.56705489733423
CPU times: user 42 s, sys: 4.18 s, total: 46.2 s
Wall time: 3min 52s


# 10. 2014

## 10a. List files

In [43]:
dates = ['2014-01-01', '2014-12-31']
mur_sst_files_2014 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2014_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2014]

In [44]:
len(mur_sst_files_2014_dmrpps)

365

## 10b. Write data

In [45]:
virtual_ds_2014 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2014_dmrpps)

In [52]:
session = repo.writable_session("main")
store = session.store
virtual_ds_2014.virtualize.to_icechunk(store, append_dim='time')

In [53]:
# Commit the 2014 append made in the previous cell.
# The message has no placeholders, so it is a plain string, not an f-string.
session.commit("Wrote 2014 to store.")

'J5PX800H86D7HHPFHH0G'

In [56]:
dates

['2014-01-01', '2014-12-31']

## 10c. Validate data

In [57]:
%%time
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 284.34210170419243
Opening original files...
Computing original files result
Result from original files: 284.34210170419243
CPU times: user 41.3 s, sys: 4.69 s, total: 46 s
Wall time: 3min 45s


# 11. 2015

## 11a. List files

In [9]:
dates = ['2015-01-01', '2015-12-31']
mur_sst_files_2015 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2015_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2015]

In [10]:
len(mur_sst_files_2015_dmrpps)

365

## 11b. Write data

In [11]:
virtual_ds_2015 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2015_dmrpps)

In [None]:
session = repo.writable_session("main")
store = session.store
virtual_ds_2015.virtualize.to_icechunk(store, append_dim='time')

In [13]:
# Commit the 2015 append made in the previous cell.
# The message has no placeholders, so it is a plain string, not an f-string.
session.commit("Wrote 2015 to store.")

'HFACAWG51XVHB89DSB7G'

## 11c. Validate data

In [14]:
%%time
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 284.6076772648599
Opening original files...
Computing original files result
Result from original files: 284.6076772648599
CPU times: user 21.8 s, sys: 3.12 s, total: 24.9 s
Wall time: 3min 11s


# 12. 2016

## 12a. List files

In [15]:
dates = ['2016-01-01', '2016-12-31']
mur_sst_files_2016 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2016_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2016]

In [16]:
len(mur_sst_files_2016_dmrpps)

366

## 12b. Write data

In [17]:
virtual_ds_2016 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2016_dmrpps)

In [18]:
# Append the 2016 virtual dataset along `time` in a new writable session and
# commit (plain string: the message has no placeholders).
session = repo.writable_session("main")
store = session.store
virtual_ds_2016.virtualize.to_icechunk(store, append_dim='time')
session.commit("Wrote 2016 to store.")

'VQ9M648NNE0PS17YJBP0'

In [19]:
%%time
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 284.3394771278134
Opening original files...
Computing original files result
Result from original files: 284.3394771278134
CPU times: user 22.1 s, sys: 2.76 s, total: 24.9 s
Wall time: 3min 8s


# 13. 2017

## 13a. List files

In [20]:
%%time
dates = ['2017-01-01', '2017-12-31']
mur_sst_files_2017 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2017_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2017]

CPU times: user 1.75 ms, sys: 0 ns, total: 1.75 ms
Wall time: 1.73 ms


In [22]:
len(mur_sst_files_2017_dmrpps)

365

## 13b. Write data

In [23]:
%%time
virtual_ds_2017 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2017_dmrpps)
session = repo.writable_session("main")
store = session.store
virtual_ds_2017.virtualize.to_icechunk(store, append_dim='time')
session.commit(f"Wrote 2017 to store.")

CPU times: user 21 s, sys: 545 ms, total: 21.6 s
Wall time: 27.6 s


'XEBRSF1K42P8KMWFTPG0'

## 13c. Validate data

In [24]:
%%time
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 283.6460856097813
Opening original files...
Computing original files result
Result from original files: 283.6460856097813
CPU times: user 22.8 s, sys: 3.06 s, total: 25.9 s
Wall time: 3min 5s


# 14. 2018

## 14a. List files

In [25]:
%%time
dates = ['2018-01-01', '2018-12-31']
mur_sst_files_2018 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2018_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2018]

CPU times: user 5.41 ms, sys: 102 μs, total: 5.52 ms
Wall time: 5.44 ms


In [26]:
len(mur_sst_files_2018_dmrpps)

365

## 14b. Write data

In [27]:
%%time
virtual_ds_2018 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2018_dmrpps)
session = repo.writable_session("main")
store = session.store
virtual_ds_2018.virtualize.to_icechunk(store, append_dim='time')
session.commit(f"Wrote 2018 to store.")

CPU times: user 22.5 s, sys: 1.43 s, total: 23.9 s
Wall time: 29.9 s


'SXQDT5TEM80YH82J6H7G'

## 14c. Validate data

In [28]:
%%time
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 283.8178223298026
Opening original files...
Computing original files result
Result from original files: 283.8178223298026
CPU times: user 23.5 s, sys: 3.02 s, total: 26.5 s
Wall time: 3min 24s


# 15. 2019

## 15a. List files

In [29]:
%%time
dates = ['2019-01-01', '2019-12-31']
mur_sst_files_2019 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2019_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2019]

CPU times: user 1.72 ms, sys: 0 ns, total: 1.72 ms
Wall time: 1.71 ms


In [30]:
len(mur_sst_files_2019_dmrpps)

365

## 15b. Write data

In [31]:
%%time
virtual_ds_2019 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2019_dmrpps)
session = repo.writable_session("main")
store = session.store
virtual_ds_2019.virtualize.to_icechunk(store, append_dim='time')
session.commit(f"Wrote 2019 to store.")

CPU times: user 23.8 s, sys: 634 ms, total: 24.4 s
Wall time: 30.9 s


'BGW7PZPP4B37FG32ABJ0'

## 15c. Validate data

In [32]:
%%time
dates = ['2019-01-01', '2019-12-31']
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 283.99393675478564
Opening original files...
Computing original files result
Result from original files: 283.99393675478564
CPU times: user 23.8 s, sys: 3.06 s, total: 26.9 s
Wall time: 3min 25s


# 16. 2020

## 16a. List files

In [33]:
%%time
dates = ['2020-01-01', '2020-12-31']
mur_sst_files_2020 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2020_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2020]

CPU times: user 1.71 ms, sys: 177 μs, total: 1.89 ms
Wall time: 1.82 ms


In [34]:
len(mur_sst_files_2020_dmrpps)

366

## 16b. Write data

In [35]:
%%time
virtual_ds_2020 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2020_dmrpps)
session = repo.writable_session("main")
store = session.store
virtual_ds_2020.virtualize.to_icechunk(store, append_dim='time')
session.commit(f"Wrote 2020 to store.")

CPU times: user 25.2 s, sys: 699 ms, total: 25.9 s
Wall time: 32.5 s


'9XVGY1DRN818QQ14P8RG'

## 16c. Validate data

In [36]:
%%time
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 283.80075408287
Opening original files...
Computing original files result




Result from original files: 283.80075408287
CPU times: user 24.5 s, sys: 2.89 s, total: 27.4 s
Wall time: 3min 28s


# 17. 2021

## 17a. List files with different encodings

In [31]:
# Open every 2021 DMR++ file as its own virtual dataset, then hand the list to
# check_codecs, which reports the files whose codecs differ (see output below).
mur_sst_files_2021 = helpers.list_mur_sst_files(
    start_date="2021-01-01",
    end_date="2021-12-31",
)
mur_sst_files_2021_dmrpps = [nc_path + '.dmrpp' for nc_path in mur_sst_files_2021]
vdss = [helpers.open_virtual(dmrpp_path) for dmrpp_path in mur_sst_files_2021_dmrpps]
helpers.check_codecs(vdss)

Codec(compressor=None, filters=[{'id': 'shuffle', 'elementsize': 2}, {'id': 'zlib', 'level': 7}])
s3://podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/20210220090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc

Codec(compressor=None, filters=[{'id': 'shuffle', 'elementsize': 2}, {'id': 'zlib', 'level': 7}])
s3://podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/20210221090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc

Codec(compressor=None, filters=[{'id': 'zlib', 'level': 6}, {'id': 'shuffle', 'elementsize': 2}])
s3://podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/20211224090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc

Codec(compressor=None, filters=[{'id': 'zlib', 'level': 6}, {'id': 'shuffle', 'elementsize': 2}])
s3://podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/20211225090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc

Codec(compressor=None, filters=[{'id': 'zlib', 'level': 6}, {'id': 'shuffle', 'elementsize': 2}])
s3://podaac-ops-cumulus-protected/MUR-JPL-

**Encoding report:**
* 2021-02-20 and 2021-02-21 use `zlib` level `7`.
* 2021-12-24 to 2021-12-31 implement `shuffle` after `zlib` (the wrong order).

These dates will be written as zarr.

## 17a. List files from first period

In [6]:
%%time
dates = ['2021-01-01', '2021-02-19']
mur_sst_files_2021_1 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2021_1_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2021_1]

CPU times: user 592 μs, sys: 0 ns, total: 592 μs
Wall time: 586 μs


In [7]:
len(mur_sst_files_2021_1_dmrpps)

50

## 17b. Write data

In [8]:
%%time
virtual_ds_2021_1 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2021_1_dmrpps)
session = repo.writable_session("main")
store = session.store
virtual_ds_2021_1.virtualize.to_icechunk(store, append_dim='time')
session.commit(f"Wrote 2021-01-01 to 2021-02-19 to store.")

CPU times: user 18.6 s, sys: 1.58 s, total: 20.1 s
Wall time: 24.2 s


'PHXYBRNBJ7T70SS6DSVG'

In [11]:
%%time
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 281.3917981101511
Opening original files...
Computing original files result
Result from original files: 281.3917981101511
CPU times: user 14.5 s, sys: 2.71 s, total: 17.2 s
Wall time: 2min 25s


## 17c. Write 2 special days as zarr

In [50]:
%%time
dwz.update(
    repo=repo,
    start_date="2021-02-20 09:00",
    end_date="2021-02-21 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 40.5 s, sys: 2.87 s, total: 43.3 s
Wall time: 2min 36s


In [51]:
%%time
session = repo.readonly_session(branch="main")
store = session.store
helpers.validate_data(store, dates=["2021-02-20", "2021-02-21"], fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 280.6473787796976
Opening original files...
Computing original files result
Result from original files: 280.6473787796976


## 17d. List files for 2021-02-22 to 2021-12-23

In [9]:
%%time
dates = ['2021-02-22', '2021-12-23']
mur_sst_files_2021_2 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2021_2_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2021_2]

CPU times: user 1.94 ms, sys: 0 ns, total: 1.94 ms
Wall time: 1.92 ms


In [10]:
len(mur_sst_files_2021_2_dmrpps)

305

## 17e. Write data

In [11]:
%%time
virtual_ds_2021_2 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2021_2_dmrpps)
session = repo.writable_session("main")
store = session.store
virtual_ds_2021_2.virtualize.to_icechunk(store, append_dim='time')
session.commit(f"Wrote 2021-02-22 to 2021-12-23 to store.")

CPU times: user 25.4 s, sys: 1.93 s, total: 27.3 s
Wall time: 33.4 s


'FXM1B5MWKS5RWE5DG000'

In [12]:
%%time
session = repo.readonly_session(branch="main")
store = session.store
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 284.06476043444394
Opening original files...
Computing original files result
Result from original files: 284.06476043444394
CPU times: user 28.1 s, sys: 4.35 s, total: 32.4 s
Wall time: 5min 2s


## 17f. Write the rest of the days as Zarr

In [13]:
%%time
dwz.update(
    repo=repo,
    start_date="2021-12-30 09:00",
    end_date="2021-12-31 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 37.2 s, sys: 2.25 s, total: 39.5 s
Wall time: 2min 59s


* 1 day at a time: With 4 workers, each having 1 thread and 15GB of memory. The maximum amount of memory used I saw was 25%. Some workers did use more than 100% CPU. It took about 2 minutes.
* 2 days at a time: Same worker configuration, maximum memory I saw used was 40%. Often workers exceeded 100% CPU. Took 3 minutes.
* 3 days at a time: Same worker configuration, maximum memory reached 80%, this definitely seems the upper limit. 2 days at a time for 15GB per worker is probably safer.
  * 2 more days after this caused many warnings of memory usage >80%. This is probably because some memory is not released back to the OS: https://distributed.dask.org/en/stable/worker-memory.html#memory-not-released-back-to-the-os.

Each array is at most 5.2GB (17999 * 36000 * 8 bytes), so not sure why workers are using so much memory.

## 17g. Validate data

In [16]:
%%time
dates = ['2021-12-24', '2021-12-31']
session = repo.readonly_session(branch="main")
store = session.store
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 281.23813660907126
Opening original files...
Computing original files result
Result from original files: 281.23813660907126
CPU times: user 12.8 s, sys: 1.62 s, total: 14.4 s
Wall time: 2min 17s


# 18. 2022

## 18a. List files

In [19]:
%%time
dates = ['2022-01-01', '2022-12-31']
mur_sst_files_2022 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2022_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2022]

CPU times: user 2.1 ms, sys: 29 μs, total: 2.12 ms
Wall time: 2.1 ms


In [20]:
len(mur_sst_files_2022_dmrpps)

365

## 18b. Find files with different encodings

In [33]:
# vdss = [helpers.open_virtual(f) for f in mur_sst_files_2022_dmrpps]
# helpers.check_codecs(vdss)

**Encoding report:**
* 2022-01-01 to 2022-01-26 has `shuffle` after `zlib` (wrong order)
* 2022-11-09 has `zlib` level `7` (`zlib` level `6` is the standard)

## 18c. Write 2022-01-01 to 2022-01-26 as Zarr

In [9]:
%%time
dates = ["2022-01-01", "2022-01-02"]
dwz.update(
    repo=repo,
    start_date=f"{dates[0]} 09:00",
    end_date=f"{dates[1]} 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 38 s, sys: 2.62 s, total: 40.6 s
Wall time: 3min 11s


In [10]:
%%time
session = repo.readonly_session(branch="main")
store = session.store
helpers.validate_data(store, dates=dates, fs=fs)

CPU times: user 3 μs, sys: 0 ns, total: 3 μs
Wall time: 8.11 μs
Open icechunk store...
Computing icechunk store result...
Icechunk store result: 280.6321140658747
Opening original files...
Computing original files result
Result from original files: 280.6321140658747


In [13]:
%%time
# 18c batch: write 2022-01-03 and 2022-01-04 as Zarr via the dask client.
dates = ["2022-01-03", "2022-01-04"]
dwz.update(
    repo=repo,
    start_date=f"{dates[0]} 09:00",
    end_date=f"{dates[1]} 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 43.3 s, sys: 3.24 s, total: 46.6 s
Wall time: 3min 32s


In [14]:
%%time
# Validate the batch just written (`dates` is still 2022-01-03..2022-01-04):
# the icechunk result and the original-files result printed below should match.
session = repo.readonly_session(branch="main")
store = session.store
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 280.7513176295896
Opening original files...
Computing original files result
Result from original files: 280.7513176295896
CPU times: user 11.7 s, sys: 1.68 s, total: 13.3 s
Wall time: 1min 42s


In [17]:
# Trim memory on the dask workers before the next write batch. The displayed
# dict is keyed by worker address (see helpers.trim_dask_worker_memory for the
# meaning of the per-worker value).
helpers.trim_dask_worker_memory(client)

{'tcp://127.0.0.1:35699': 1,
 'tcp://127.0.0.1:37293': 1,
 'tcp://127.0.0.1:40995': 1,
 'tcp://127.0.0.1:41687': 1}

In [22]:
%%time
# 18c batch: write 2022-01-05 and 2022-01-06 as Zarr via the dask client.
dates = ["2022-01-05", "2022-01-06"]
dwz.update(
    repo=repo,
    start_date=f"{dates[0]} 09:00",
    end_date=f"{dates[1]} 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 42.7 s, sys: 1.81 s, total: 44.5 s
Wall time: 3min 22s


In [24]:
%%time
# 18c batch: write 2022-01-07 and 2022-01-08 as Zarr via the dask client.
dates = ["2022-01-07", "2022-01-08"]
dwz.update(
    repo=repo,
    start_date=f"{dates[0]} 09:00",
    end_date=f"{dates[1]} 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 38 s, sys: 2.46 s, total: 40.5 s
Wall time: 2min 36s


In [25]:
# Trim dask worker memory between Zarr write batches; output is keyed by
# worker address.
helpers.trim_dask_worker_memory(client)

{'tcp://127.0.0.1:33487': 1,
 'tcp://127.0.0.1:35811': 1,
 'tcp://127.0.0.1:41353': 1,
 'tcp://127.0.0.1:43873': 1}

In [26]:
%%time
# 18c batch: write 2022-01-09 and 2022-01-10 as Zarr via the dask client.
dates = ["2022-01-09", "2022-01-10"]
dwz.update(
    repo=repo,
    start_date=f"{dates[0]} 09:00",
    end_date=f"{dates[1]} 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 40.6 s, sys: 1.74 s, total: 42.3 s
Wall time: 3min


In [27]:
%%time
# 18c batch: write 2022-01-11 and 2022-01-12 as Zarr via the dask client.
dates = ["2022-01-11", "2022-01-12"]
dwz.update(
    repo=repo,
    start_date=f"{dates[0]} 09:00",
    end_date=f"{dates[1]} 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 41.6 s, sys: 1.53 s, total: 43.1 s
Wall time: 3min 7s


In [32]:
# Trim dask worker memory between Zarr write batches; output is keyed by
# worker address.
helpers.trim_dask_worker_memory(client)

{'tcp://127.0.0.1:33487': 1,
 'tcp://127.0.0.1:35811': 1,
 'tcp://127.0.0.1:41353': 1,
 'tcp://127.0.0.1:43873': 1}

In [29]:
%%time
# 18c batch: write 2022-01-13 and 2022-01-14 as Zarr via the dask client.
dates = ["2022-01-13", "2022-01-14"]
dwz.update(
    repo=repo,
    start_date=f"{dates[0]} 09:00",
    end_date=f"{dates[1]} 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 44.6 s, sys: 1.9 s, total: 46.5 s
Wall time: 3min 37s


In [31]:
%%time
# 18c batch: write 2022-01-15 and 2022-01-16 as Zarr via the dask client.
dates = ["2022-01-15", "2022-01-16"]
dwz.update(
    repo=repo,
    start_date=f"{dates[0]} 09:00",
    end_date=f"{dates[1]} 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 38.8 s, sys: 1.36 s, total: 40.1 s
Wall time: 2min 37s


In [33]:
%%time
# 18c batch: write 2022-01-17 and 2022-01-18 as Zarr via the dask client.
dates = ["2022-01-17", "2022-01-18"]
dwz.update(
    repo=repo,
    start_date=f"{dates[0]} 09:00",
    end_date=f"{dates[1]} 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 37.4 s, sys: 1.41 s, total: 38.8 s
Wall time: 2min 35s


In [38]:
# Trim dask worker memory between Zarr write batches; output is keyed by
# worker address.
helpers.trim_dask_worker_memory(client)

{'tcp://127.0.0.1:33487': 1,
 'tcp://127.0.0.1:35811': 1,
 'tcp://127.0.0.1:41353': 1,
 'tcp://127.0.0.1:43873': 1}

In [35]:
%%time
# 18c batch: write 2022-01-19 and 2022-01-20 as Zarr via the dask client.
dates = ["2022-01-19", "2022-01-20"]
dwz.update(
    repo=repo,
    start_date=f"{dates[0]} 09:00",
    end_date=f"{dates[1]} 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 41.7 s, sys: 1.81 s, total: 43.5 s
Wall time: 3min 11s


In [37]:
%%time
# 18c batch: write 2022-01-21 and 2022-01-22 as Zarr via the dask client.
dates = ["2022-01-21", "2022-01-22"]
dwz.update(
    repo=repo,
    start_date=f"{dates[0]} 09:00",
    end_date=f"{dates[1]} 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 37 s, sys: 1.46 s, total: 38.5 s
Wall time: 2min 29s


In [39]:
%%time
# 18c batch: write 2022-01-23 and 2022-01-24 as Zarr via the dask client.
dates = ["2022-01-23", "2022-01-24"]
dwz.update(
    repo=repo,
    start_date=f"{dates[0]} 09:00",
    end_date=f"{dates[1]} 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 42.1 s, sys: 1.64 s, total: 43.7 s
Wall time: 3min 10s


In [40]:
# Trim dask worker memory before the final 18c batch; output is keyed by
# worker address.
helpers.trim_dask_worker_memory(client)

{'tcp://127.0.0.1:33487': 1,
 'tcp://127.0.0.1:35811': 1,
 'tcp://127.0.0.1:41353': 1,
 'tcp://127.0.0.1:43873': 1}

In [43]:
%%time
# Final 18c batch: write 2022-01-25 and 2022-01-26 as Zarr, completing the
# range the encoding report flags for `shuffle`-after-`zlib`.
dates = ["2022-01-25", "2022-01-26"]
dwz.update(
    repo=repo,
    start_date=f"{dates[0]} 09:00",
    end_date=f"{dates[1]} 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 46.3 s, sys: 3.32 s, total: 49.6 s
Wall time: 3min 17s


In [8]:
%%time
# Validate the whole 18c range in one pass. The dates are spelled out here
# because `dates` only holds the last two-day batch at this point.
session = repo.readonly_session(branch="main")
store = session.store
helpers.validate_data(store, dates=["2022-01-01", "2022-01-26"], fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 281.16362468848644
Opening original files...
Computing original files result
Result from original files: 281.16362468848644
CPU times: user 25.7 s, sys: 3.68 s, total: 29.4 s
Wall time: 3min 40s


## 18d. Continue with 2022 until 2022-11-08

In [9]:
%%time
dates = ['2022-01-27', '2022-11-08']
mur_sst_files_2022 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2022_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2022]

CPU times: user 1.72 ms, sys: 0 ns, total: 1.72 ms
Wall time: 1.71 ms


In [10]:
%%time
# Build a virtual dataset from the DMR++ files listed above, append it to the
# store along `time` in a writable session on `main`, and commit. `dates`
# (set in the listing cell) is used only for the commit message. The string
# displayed below the cell is the value returned by `session.commit`.
virtual_ds_2022 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2022_dmrpps)
session = repo.writable_session("main")
store = session.store
virtual_ds_2022.virtualize.to_icechunk(store, append_dim='time')
session.commit(f"Wrote {dates[0]} to {dates[1]} to store.")

CPU times: user 25.6 s, sys: 1.93 s, total: 27.6 s
Wall time: 41 s


'6M3SE8BDWJBHY3EY3PC0'

In [11]:
%%time
# Validate the virtual-reference range just committed
# (`dates` is 2022-01-27..2022-11-08) from a fresh read-only session.
session = repo.readonly_session(branch="main")
store = session.store
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 283.97987703333376
Opening original files...
Computing original files result
Result from original files: 283.97987703333376
CPU times: user 48.8 s, sys: 4.37 s, total: 53.1 s
Wall time: 8min 48s


## 18e. Add 2022-11-09 as Zarr

In [12]:
%%time
# Single-day Zarr write: start and end are the same date. Per the encoding
# report, 2022-11-09 uses `zlib` level 7 instead of the standard level 6.
dates = ["2022-11-09", "2022-11-09"]
dwz.update(
    repo=repo,
    start_date=f"{dates[0]} 09:00",
    end_date=f"{dates[1]} 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 39.7 s, sys: 2.64 s, total: 42.4 s
Wall time: 2min 48s


In [13]:
%%time
# Validate the single day written as Zarr (`dates` is 2022-11-09 for both
# endpoints) against the original file.
session = repo.readonly_session(branch="main")
store = session.store
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 282.9384851511879
Opening original files...
Computing original files result
Result from original files: 282.9384851511879
CPU times: user 17.2 s, sys: 1.73 s, total: 18.9 s
Wall time: 2min 36s


## 18f. Finish 2022 as virtual refs

In [14]:
%%time
dates = ['2022-11-10', '2022-12-31']
mur_sst_files_2022_2 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2022_2_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2022_2]

CPU times: user 506 μs, sys: 44 μs, total: 550 μs
Wall time: 544 μs


In [15]:
# Sanity check: expect 52 entries (21 days of November + 31 of December).
len(mur_sst_files_2022_2_dmrpps)

52

In [16]:
%%time
# Build the virtual dataset for 2022-11-10..2022-12-31, append it along `time`
# in a writable session on `main`, and commit. `dates` comes from the listing
# cell above and is used only in the commit message.
virtual_ds_2022_2 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2022_2_dmrpps)
session = repo.writable_session("main")
store = session.store
virtual_ds_2022_2.virtualize.to_icechunk(store, append_dim='time')
session.commit(f"Wrote {dates[0]} to {dates[1]} to store.")

CPU times: user 22.8 s, sys: 721 ms, total: 23.5 s
Wall time: 26.3 s


'XW9W74PEG6NJEMSYS6N0'

In [18]:
%%time
# Validate the final 2022 virtual-reference range
# (`dates` is 2022-11-10..2022-12-31) from a fresh read-only session.
session = repo.readonly_session(branch="main")
store = session.store
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 281.8355329477488
Opening original files...
Computing original files result
Result from original files: 281.8355329477488
CPU times: user 20.1 s, sys: 2.25 s, total: 22.4 s
Wall time: 3min 26s


In [19]:
# Trim dask worker memory before starting the next section; output is keyed
# by worker address.
helpers.trim_dask_worker_memory(client)

{'tcp://127.0.0.1:40257': 1,
 'tcp://127.0.0.1:45297': 1,
 'tcp://127.0.0.1:46035': 1,
 'tcp://127.0.0.1:46345': 1}

# 19. 2023

**Chunking report:**

We will similarly break up 2023 into some virtual chunks and some zarr chunks. This is because of chunking changes. See https://forum.earthdata.nasa.gov/viewtopic.php?t=5909.

Since the new chunk shape appears to be the default from 2023-09-04 onward, we will build this store only up to 2023-09-03 and then start a new store. So there will be one store covering 2002-06-01 to 2023-09-03 and another starting on 2023-09-04 that continues to the present day.

## 19a. First set of virtual data (2023-01-01 to 2023-02-23)

In [8]:
%%time
dates = ['2023-01-01', '2023-02-23']
mur_sst_files_2023_1 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2023_1_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2023_1]


CPU times: user 640 μs, sys: 0 ns, total: 640 μs
Wall time: 635 μs


In [9]:
# Sanity check: expect 54 entries (31 days of January + 23 of February).
len(mur_sst_files_2023_1_dmrpps)

54

In [10]:
# Combine the DMR++ files for 2023-01-01..2023-02-23 into one virtual dataset;
# it is written to the store in the next cell.
virtual_ds_2023_1 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2023_1_dmrpps)

In [11]:
# Append the virtual dataset along `time` in a writable session on `main` and
# commit. `dates` (from the listing cell) is used only in the commit message;
# the string displayed below is the value returned by `session.commit`.
session = repo.writable_session("main")
store = session.store
virtual_ds_2023_1.virtualize.to_icechunk(store, append_dim='time')
session.commit(f"Wrote {dates[0]} to {dates[1]} to store.")

'DW0RD39VBNERRQHFYDFG'

In [12]:
%%time
# Validate the range just committed (`dates` is 2023-01-01..2023-02-23) from
# a fresh read-only session.
session = repo.readonly_session(branch="main")
store = session.store
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 281.4492837672986
Opening original files...
Computing original files result
Result from original files: 281.4492837672986
CPU times: user 14.2 s, sys: 2.4 s, total: 16.6 s
Wall time: 3min 1s


## 19b. Append 2023-02-24 to 2023-02-28 as zarr

In [13]:
%%time
# First 19b batch: write 2023-02-24 and 2023-02-25 as Zarr via the dask
# client (this range is handled as Zarr because of the chunking changes
# described in the section 19 chunking report).
dates = ["2023-02-24", "2023-02-25"]
dwz.update(
    repo=repo,
    start_date=f"{dates[0]} 09:00",
    end_date=f"{dates[1]} 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 29.1 s, sys: 2.1 s, total: 31.2 s
Wall time: 1min 43s


In [14]:
# Trim dask worker memory between 19b write batches; output is keyed by
# worker address.
helpers.trim_dask_worker_memory(client)

{'tcp://127.0.0.1:37685': 1,
 'tcp://127.0.0.1:40693': 1,
 'tcp://127.0.0.1:40893': 1,
 'tcp://127.0.0.1:42951': 1}

In [15]:
%%time
# 19b batch: write 2023-02-26 and 2023-02-27 as Zarr via the dask client.
dates = ["2023-02-26", "2023-02-27"]
dwz.update(
    repo=repo,
    start_date=f"{dates[0]} 09:00",
    end_date=f"{dates[1]} 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 28.8 s, sys: 1.28 s, total: 30.1 s
Wall time: 1min 47s


In [16]:
# Trim dask worker memory before the last 19b batch; output is keyed by
# worker address.
helpers.trim_dask_worker_memory(client)

{'tcp://127.0.0.1:37685': 1,
 'tcp://127.0.0.1:40693': 1,
 'tcp://127.0.0.1:40893': 1,
 'tcp://127.0.0.1:42951': 1}

In [17]:
%%time
# Last 19b batch: a single day (start and end are both 2023-02-28).
dates = ["2023-02-28", "2023-02-28"]
dwz.update(
    repo=repo,
    start_date=f"{dates[0]} 09:00",
    end_date=f"{dates[1]} 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 26.1 s, sys: 1.88 s, total: 28 s
Wall time: 1min 8s


In [18]:
%%time
# Validate all of 19b in one pass. The dates are spelled out because `dates`
# only holds the final single-day batch at this point.
session = repo.readonly_session(branch="main")
store = session.store
helpers.validate_data(store, dates=["2023-02-24", "2023-02-28"], fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 280.4638916306695
Opening original files...
Computing original files result
Result from original files: 280.4638916306695
CPU times: user 12 s, sys: 1.75 s, total: 13.8 s
Wall time: 2min


## 19c. Write 2023-03-01 to 2023-04-21 as virtual refs

In [19]:
%%time
dates = ['2023-03-01', '2023-04-21']
mur_sst_files_2023_2 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2023_2_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2023_2]


CPU times: user 2.84 ms, sys: 0 ns, total: 2.84 ms
Wall time: 2.84 ms


In [20]:
# Sanity check: expect 52 entries (31 days of March + 21 of April).
len(mur_sst_files_2023_2_dmrpps)

52

In [21]:
# Combine the DMR++ files for 2023-03-01..2023-04-21 into one virtual dataset;
# it is written to the store in the next cell.
virtual_ds_2023_2 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2023_2_dmrpps)

In [22]:
# Append the virtual dataset along `time` in a writable session on `main` and
# commit. `dates` (from the listing cell) is used only in the commit message.
session = repo.writable_session("main")
store = session.store
virtual_ds_2023_2.virtualize.to_icechunk(store, append_dim='time')
session.commit(f"Wrote {dates[0]} to {dates[1]} to store.")

'ASMS9Q7E87WRW445CNFG'

In [23]:
%%time
# Validate the range just committed (`dates` is 2023-03-01..2023-04-21) from
# a fresh read-only session.
session = repo.readonly_session(branch="main")
store = session.store
helpers.validate_data(store, dates=dates, fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 280.85216801482807
Opening original files...
Computing original files result
Result from original files: 280.85216801482807
CPU times: user 13.6 s, sys: 1.83 s, total: 15.4 s
Wall time: 2min 37s


## 19d. Write 2023-04-22 as Zarr

In [24]:
%%time
# Single-day Zarr write for 2023-04-22 (start and end are the same date).
# This section has no validation cell of its own; the validation at the end
# of 19e starts at 2023-04-22 and so covers this day.
dates = ["2023-04-22", "2023-04-22"]
dwz.update(
    repo=repo,
    start_date=f"{dates[0]} 09:00",
    end_date=f"{dates[1]} 09:00",
    fs=fs,
    client=client
)

Opening files
Files opened
Mapping write tasks to dask client


  warn(
  warn(
  warn(
  warn(


Starting distributed commit
Distributed commit done
CPU times: user 26.2 s, sys: 605 ms, total: 26.8 s
Wall time: 1min 7s


## 19e. Write 2023-04-23 to 2023-09-03 as virtual refs

In [25]:
%%time
dates = ['2023-04-23', '2023-09-03']
mur_sst_files_2023_3 = helpers.list_mur_sst_files(start_date=dates[0], end_date=dates[1])
mur_sst_files_2023_3_dmrpps = [f + '.dmrpp' for f in mur_sst_files_2023_3]

CPU times: user 811 μs, sys: 69 μs, total: 880 μs
Wall time: 865 μs


In [26]:
# Sanity check: expect 134 entries, one per day from 2023-04-23 to 2023-09-03.
len(mur_sst_files_2023_3_dmrpps)

134

In [27]:
# Combine the DMR++ files for 2023-04-23..2023-09-03 into one virtual dataset;
# it is written to the store in the next cell.
virtual_ds_2023_3 = helpers.create_virtual_ds(dmrpps=mur_sst_files_2023_3_dmrpps)

In [28]:
# Append the virtual dataset along `time` in a writable session on `main` and
# commit. `dates` (from the listing cell) is used only in the commit message.
session = repo.writable_session("main")
store = session.store
virtual_ds_2023_3.virtualize.to_icechunk(store, append_dim='time')
session.commit(f"Wrote {dates[0]} to {dates[1]} to store.")

'4MVEPTQ04N3N6GSBNE3G'

In [29]:
%%time
# Validate 19d and 19e together: the range deliberately starts at 2023-04-22
# (the single day written as Zarr in 19d) rather than at `dates[0]`
# (2023-04-23), since 19d has no validation cell of its own.
session = repo.readonly_session(branch="main")
store = session.store
helpers.validate_data(store, dates=['2023-04-22', '2023-09-03'], fs=fs)

Open icechunk store...
Computing icechunk store result...
Icechunk store result: 285.40660600951924
Opening original files...
Computing original files result
Result from original files: 285.40660600951924
CPU times: user 21.9 s, sys: 2.02 s, total: 24 s
Wall time: 3min 43s
