In [1]:
import earthaccess
import s3fs
import xarray as xr

# Collection to convert and the S3 bucket that holds the resulting Zarr store.
collection_short_name = 'MUR-JPL-L4-GLOB-v4.1'
bucket = 'veda-data-store-staging'
# The store is named after the collection, with a ".zarr" suffix.
store_name = collection_short_name + '.zarr'
# Per-dimension chunk sizes passed to `.chunk()` in later cells.
target_chunks = dict(time=1, lat=1800, lon=3600)

In [2]:
# Authenticate with NASA Earthdata Login. In the recorded run neither the
# EARTHDATA_* environment variables nor a .netrc file were available, so the
# call fell back to an interactive username/password prompt.
earthaccess.login()

EARTHDATA_USERNAME and EARTHDATA_PASSWORD are not set in the current environment, try setting them or use a different strategy (netrc, interactive)
No .netrc found in /home/jovyan


Enter your Earthdata Login username:  aimeeb
Enter your Earthdata password:  ········


You're now authenticated with NASA Earthdata Login
Using token with expiration date: 12/19/2023
Using user provided credentials for EDL


<earthaccess.auth.Auth at 0x7efcbc808580>

In [3]:
# Request S3 credentials for PODAAC. The 'accessKeyId', 'secretAccessKey' and
# 'sessionToken' entries are read later when the S3 filesystem is created.
aws_creds = earthaccess.get_s3_credentials('PODAAC')

In [4]:
# Search the collection for cloud-hosted granules; `count` limits how many
# granules are returned (the recorded run reports 7810 matches in total).
granule_query = {
    'short_name': collection_short_name,
    'cloud_hosted': True,
    'count': 12,
}
results = earthaccess.search_data(**granule_query)

Granules found: 7810


In [5]:
def _first_direct_link(granule):
    """Return the first direct-access (s3://) link of a search result."""
    return granule.data_links(access='direct')[0]


# One direct-access link per granule, in the order the search returned them.
data_links = [_first_direct_link(granule) for granule in results]
data_links

['s3://podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/20020601090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc',
 's3://podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/20020602090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc',
 's3://podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/20020603090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc',
 's3://podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/20020604090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc',
 's3://podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/20020605090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc',
 's3://podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/20020606090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc',
 's3://podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/20020607090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc',
 's3://podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/20020608090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc',
 's3://podaac-ops-cumulus-protected/MUR-

In [6]:
# Authenticated (non-anonymous) S3 filesystem for reading the source granules,
# built from the credentials held in `aws_creds`.
s3_credentials = {
    'key': aws_creds['accessKeyId'],
    'secret': aws_creds['secretAccessKey'],
    'token': aws_creds['sessionToken'],
}
s3_fs = s3fs.S3FileSystem(anon=False, **s3_credentials)

## Timing without Dask

In [27]:
%%time
store = s3fs.S3Map(root=f"{bucket}/{store_name}", s3=s3fs.S3FileSystem())
fileset = [s3_fs.open(file) for file in data_links]
data = xr.open_mfdataset(fileset, combine='by_coords')
data_chunked = data.chunk(target_chunks)
# data_chunked.to_zarr(store, mode='w')

CPU times: user 1.09 s, sys: 3.08 ms, total: 1.09 s
Wall time: 1.86 s


## Append data

In [7]:
import s3fs
import zarr

# Mapping onto the Zarr store that already exists in the bucket.
existing_dataset_store = s3fs.S3Map(
    root=f"s3://{bucket}/{store_name}",
    s3=s3fs.S3FileSystem(),
)

# Open the store through xarray's zarr engine and display its summary.
existing_dataset = xr.open_dataset(
    existing_dataset_store,
    engine='zarr',
)
existing_dataset

In [17]:
# Pick one granule by position (index 11 of `data_links`) and open it from S3.
# NOTE(review): index 11 is presumably the last of the 12 granules requested
# by the search above — confirm if the search `count` changes.
file_to_append = data_links[11]
f = s3_fs.open(file_to_append, mode='rb')
ds_to_append = xr.open_dataset(f)
# Re-chunk with the same `target_chunks` used for the multi-file dataset.
ds_to_append_chunked = ds_to_append.chunk(target_chunks)

In [18]:
# First entry of the new granule's `time` coordinate; displayed for inspection.
new_time = ds_to_append_chunked['time'][0]
new_time

In [19]:
# Raw value behind `new_time`, used for comparison and assignment in later cells.
new_time_value = new_time.values

In [20]:
import numpy as np

# Work on a private copy of the stored timestamps. `.values` can hand back a
# view of the coordinate's underlying index data, and a later cell edits
# `time_values` in place; without the copy that edit could silently alter
# `existing_dataset`'s own time index (or fail if the data is read-only).
time_values = existing_dataset['time'].values.copy()

# Positions whose timestamp equals the last stored timestamp. More than one
# hit means the final timestamp occurs more than once in the store.
indices = np.where(time_values == time_values[-1])[0]

In [22]:
# If the last stored timestamp occurs more than once, overwrite its second
# occurrence with the timestamp of the granule being appended. When there is
# no duplicate, `time_values` is left unchanged.
if len(indices) > 1:
    time_values[indices[1]] = new_time_value

In [23]:
# Display the timestamps after the duplicate check above.
time_values

array(['2002-06-01T09:00:00.000000000', '2002-06-02T09:00:00.000000000',
       '2002-06-03T09:00:00.000000000', '2002-06-04T09:00:00.000000000',
       '2002-06-05T09:00:00.000000000', '2002-06-06T09:00:00.000000000',
       '2002-06-07T09:00:00.000000000', '2002-06-08T09:00:00.000000000',
       '2002-06-09T09:00:00.000000000', '2002-06-10T09:00:00.000000000',
       '2002-06-11T09:00:00.000000000', '2002-06-12T09:00:00.000000000'],
      dtype='datetime64[ns]')

In [27]:
# Show the new timestamp in its string form.
str(new_time_value)

'2002-06-12T09:00:00.000000000'

In [None]:
%%time
# Replace the old time coordinate with the modified one
existing_dataset = existing_dataset.assign_coords(time=time_values)

for var in ds_to_append_chunked.data_vars.keys():
    existing_dataset[var].loc[{'time': str(new_time_value)}] = ds_to_append_chunked[var][0]

# Write the modified dataset back to the Zarr store
existing_dataset.to_zarr(existing_dataset_store, mode='w')