# Accessing and storing data on (EPFL) S3

* As a filesystem with S3Fs and fsspec.
* As storage for n-d arrays with zarr and xarray.

## S3Fs

* Pythonic filesystem interface to S3
* <https://s3fs.readthedocs.io/en/latest/index.html>

In [1]:
import os

import s3fs

In [2]:
# Credentials are read from the environment instead of being hard-coded in
# the notebook: a secret committed to a notebook must be considered leaked
# and rotated. A missing variable fails fast with a KeyError.
fs = s3fs.S3FileSystem(
    key=os.environ['AWS_ACCESS_KEY_ID'],
    secret=os.environ['AWS_SECRET_ACCESS_KEY'],
    client_kwargs={
       'endpoint_url': 'https://s3.epfl.ch'
    }
)

# List the bucket content, then read a small text object from it.
print(fs.ls('10380-c19e273816a6aca044c096f3a6d4d322'))
with fs.open('10380-c19e273816a6aca044c096f3a6d4d322/hello.txt', 'r') as f:
    print(f.read())

['10380-c19e273816a6aca044c096f3a6d4d322/hello.txt', '10380-c19e273816a6aca044c096f3a6d4d322/zarr-demo-store']
Hello from EPFL S3.



In [3]:
# With credentials passed to botocore directly.
# The values come from the environment rather than from string literals so
# that no secret is stored in the notebook.
fs = s3fs.S3FileSystem(client_kwargs={
    'endpoint_url': 'https://s3.epfl.ch',
    'aws_access_key_id': os.environ['AWS_ACCESS_KEY_ID'],
    'aws_secret_access_key': os.environ['AWS_SECRET_ACCESS_KEY'],
})

# List the bucket content, then read a small text object from it.
print(fs.ls('10380-c19e273816a6aca044c096f3a6d4d322'))
with fs.open('10380-c19e273816a6aca044c096f3a6d4d322/hello.txt', 'r') as f:
    print(f.read())

['10380-c19e273816a6aca044c096f3a6d4d322/hello.txt', '10380-c19e273816a6aca044c096f3a6d4d322/zarr-demo-store']
Hello from EPFL S3.



In [4]:
# Botocore reads the following environment variables to get credentials,
# so they do not need to be passed explicitly.
# That's better than storing them in code (or in a notebook's outputs).
#os.environ['AWS_ACCESS_KEY_ID']
#os.environ['AWS_SECRET_ACCESS_KEY']

## fsspec

* generic remote filesystem interface (s3fs is built on it, and fsspec in turn uses s3fs to handle `s3://` URLs)
* <https://filesystem-spec.readthedocs.io/en/latest/usage.html>

In [5]:
import fsspec

In [6]:
# Instantiate the S3 filesystem through fsspec's generic entry point.
# Credentials are taken from the environment so no secret is stored in the
# notebook.
fs = fsspec.filesystem(
    's3',
    client_kwargs={
       'endpoint_url': 'https://s3.epfl.ch',
       'aws_access_key_id': os.environ['AWS_ACCESS_KEY_ID'],
       'aws_secret_access_key': os.environ['AWS_SECRET_ACCESS_KEY'],
    }
)

# List the bucket content, then read a small text object from it.
print(fs.ls('10380-c19e273816a6aca044c096f3a6d4d322'))
with fs.open('10380-c19e273816a6aca044c096f3a6d4d322/hello.txt', 'r') as f:
    print(f.read())

['10380-c19e273816a6aca044c096f3a6d4d322/hello.txt', '10380-c19e273816a6aca044c096f3a6d4d322/zarr-demo-store']
Hello from EPFL S3.



In [7]:
# Open a single object by URL. Credentials are taken from the environment so
# no secret is stored in the notebook.
of = fsspec.open(
    's3://10380-c19e273816a6aca044c096f3a6d4d322/hello.txt',
    client_kwargs={
       'endpoint_url': 'https://s3.epfl.ch',
       'aws_access_key_id': os.environ['AWS_ACCESS_KEY_ID'],
       'aws_secret_access_key': os.environ['AWS_SECRET_ACCESS_KEY'],
    }
)

# No mode is given to fsspec.open, and the content is printed as bytes.
with of as f:
    print(f.read())

b'Hello from EPFL S3.\n'


## zarr

* data format that is more distributed / cloud friendly than netCDF/HDF5 because chunks are separate files
* can be saved to filesystem (disk) or object store (cloud, database)
* <https://zarr.readthedocs.io/en/stable/tutorial.html#distributed-cloud-storage>

In [8]:
import zarr
import s3fs
import numpy as np

In [9]:
# Open the S3 store with S3Fs.
# Credentials are taken from the environment so no secret is stored in the
# notebook.
fs = s3fs.S3FileSystem(client_kwargs={
    'endpoint_url': 'https://s3.epfl.ch',
    'aws_access_key_id': os.environ['AWS_ACCESS_KEY_ID'],
    'aws_secret_access_key': os.environ['AWS_SECRET_ACCESS_KEY'],
})
# Mapping view of the `demo.zarr` prefix, usable by zarr as a store.
store = s3fs.S3Map('10380-c19e273816a6aca044c096f3a6d4d322/demo.zarr', fs)

# Optional cache.
#store = zarr.LRUStoreCache(store, max_size=2**28)

# Open the root group.
root = zarr.group(store)

In [10]:
# Attach a metadata attribute to the root group, then list all attributes back.
root.attrs['readme'] = 'Demo zarr store on EPFL S3'
root_attrs = list(root.attrs.items())
print(root_attrs)

[('readme', 'Demo zarr store on EPFL S3')]


In [11]:
# Build a 10000 x 10000 int32 array (10e7 == 1e8 elements) and wrap it in a
# zarr array split into 1000 x 1000 chunks.
a = np.arange(10e7, dtype='i4')
a = a.reshape((10000, 10000))
z = zarr.array(a, chunks=(1000, 1000))
print(z.info)

# Copy the array into the S3-backed hierarchy under foo/bar.
root['foo/bar'] = z

# A zarr array is a directory of chunks (with a JSON metadata file).
bar_files = fs.ls('10380-c19e273816a6aca044c096f3a6d4d322/demo.zarr/foo/bar')
print('{} files\n'.format(len(bar_files)))

# Summary and tree view of the group hierarchy.
print(root.info)
print(root.tree())

# Fetch the stored array back from the hierarchy and display its info.
z = root['foo/bar']
z.info

Type               : zarr.core.Array
Data type          : int32
Shape              : (10000, 10000)
Chunk shape        : (1000, 1000)
Order              : C
Read-only          : False
Compressor         : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store type         : builtins.dict
No. bytes          : 400000000 (381.5M)
No. bytes stored   : 4684636 (4.5M)
Storage ratio      : 85.4
Chunks initialized : 100/100

101 files

Name        : /
Type        : zarr.hierarchy.Group
Read-only   : False
Store type  : fsspec.mapping.FSMap
No. members : 1
No. arrays  : 0
No. groups  : 1
Groups      : foo

/
 └── foo
     └── bar (10000, 10000) int32


0,1
Name,/foo/bar
Type,zarr.core.Array
Data type,int32
Shape,"(10000, 10000)"
Chunk shape,"(1000, 1000)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,fsspec.mapping.FSMap
No. bytes,400000000 (381.5M)


In [12]:
# Open the group with fsspec.
# Credentials are taken from the environment so no secret is stored in the
# notebook.
g = zarr.open_group(
    's3://10380-c19e273816a6aca044c096f3a6d4d322/demo.zarr',
    storage_options={'client_kwargs': {
       'endpoint_url': 'https://s3.epfl.ch',
       'aws_access_key_id': os.environ['AWS_ACCESS_KEY_ID'],
       'aws_secret_access_key': os.environ['AWS_SECRET_ACCESS_KEY'],
    }}
)
g['foo/bar'].info

0,1
Name,/foo/bar
Type,zarr.core.Array
Data type,int32
Shape,"(10000, 10000)"
Chunk shape,"(1000, 1000)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.FSStore
No. bytes,400000000 (381.5M)


In [13]:
# Open the group with fsspec and a cache.
# With a chained URL, the options for the S3 layer are nested under the 's3'
# key. Credentials are taken from the environment so no secret is stored in
# the notebook.
g = zarr.open_group(
    'simplecache::s3://10380-c19e273816a6aca044c096f3a6d4d322/demo.zarr',
    storage_options={'s3': {'client_kwargs': {
       'endpoint_url': 'https://s3.epfl.ch',
       'aws_access_key_id': os.environ['AWS_ACCESS_KEY_ID'],
       'aws_secret_access_key': os.environ['AWS_SECRET_ACCESS_KEY'],
    }}}
)
g['foo/bar'].info

0,1
Name,/foo/bar
Type,zarr.core.Array
Data type,int32
Shape,"(10000, 10000)"
Chunk shape,"(1000, 1000)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.FSStore
No. bytes,400000000 (381.5M)


In [14]:
# Read the whole array through the group opened above.
# Subsequent access is faster if opened with cache.
cached_bar = g['foo/bar']
cached_bar[:]

array([[       0,        1,        2, ...,     9997,     9998,     9999],
       [   10000,    10001,    10002, ...,    19997,    19998,    19999],
       [   20000,    20001,    20002, ...,    29997,    29998,    29999],
       ...,
       [99970000, 99970001, 99970002, ..., 99979997, 99979998, 99979999],
       [99980000, 99980001, 99980002, ..., 99989997, 99989998, 99989999],
       [99990000, 99990001, 99990002, ..., 99999997, 99999998, 99999999]],
      dtype=int32)

In [15]:
# Clean up: recursively remove the demo zarr store from the bucket.
zarr_store_path = '10380-c19e273816a6aca044c096f3a6d4d322/demo.zarr'
fs.rm(zarr_store_path, recursive=True)

## xarray

* n-dimensional labeled array
* supports multiple storage backends, including zarr (but also netCDF or GRIB)
* <https://xarray.pydata.org/en/stable/io.html#zarr>

In [16]:
import xarray as xr
import pandas as pd

In [17]:
# Open the S3 store with S3Fs.
# Credentials are taken from the environment so no secret is stored in the
# notebook.
fs = s3fs.S3FileSystem(client_kwargs={
    'endpoint_url': 'https://s3.epfl.ch',
    'aws_access_key_id': os.environ['AWS_ACCESS_KEY_ID'],
    'aws_secret_access_key': os.environ['AWS_SECRET_ACCESS_KEY'],
})
# Mapping view of the `demo.zarr` prefix, used below as the xarray/zarr store.
store = s3fs.S3Map('10380-c19e273816a6aca044c096f3a6d4d322/demo.zarr', fs)

In [18]:
# Small demo dataset: one 4 x 5 variable "foo" on dimensions (x, y), with an
# extra string coordinate "z" along x.
# The random values come from a seeded generator so that re-running the
# notebook produces the same data.
ds = xr.Dataset(
    {"foo": (("x", "y"), np.random.default_rng(seed=0).random((4, 5)))},
    coords={
        "x": [10, 20, 30, 40],
        "y": pd.date_range("2000-01-01", periods=5),
        "z": ("x", list("abcd")),
    },
)

In [19]:
# Write the dataset to the S3-backed store, overwriting any existing content
# (mode='w') and writing consolidated metadata.
ds.to_zarr(
    store,
    consolidated=True,
    mode='w',
)

<xarray.backends.zarr.ZarrStore at 0x7f9889f2ed00>

In [20]:
# Re-open the dataset from the store using the consolidated metadata, then
# display the "foo" variable.
ds2 = xr.open_zarr(
    store,
    consolidated=True,
)
ds2['foo']

In [21]:
# Show what is at the top level of the store, then clean up by removing it.
demo_store_path = '10380-c19e273816a6aca044c096f3a6d4d322/demo.zarr'
top_level_entries = fs.ls(demo_store_path)  # doesn't show directories
print(top_level_entries)
fs.rm(demo_store_path, recursive=True)

['10380-c19e273816a6aca044c096f3a6d4d322/demo.zarr/.zattrs', '10380-c19e273816a6aca044c096f3a6d4d322/demo.zarr/.zgroup']
