# Accessing and storing data on (EPFL) S3

* As a filesystem with S3Fs and fsspec.
* As storage for n-d arrays with zarr and xarray.

## S3Fs

* Pythonic filesystem interface to S3
* <https://s3fs.readthedocs.io/en/latest/index.html>

In [None]:
import os

import s3fs

In [None]:
# Connect to the EPFL S3 endpoint with explicit key/secret arguments.
# The credentials are read from the environment rather than hard-coded:
# secrets written into a notebook leak to everyone who can read the file.
# Raises KeyError if either environment variable is unset.
fs = s3fs.S3FileSystem(
    key=os.environ['AWS_ACCESS_KEY_ID'],
    secret=os.environ['AWS_SECRET_ACCESS_KEY'],
    client_kwargs={
        'endpoint_url': 'https://s3.epfl.ch',
    },
)

# List the bucket, then read a small text object from it.
print(fs.ls('10380-c19e273816a6aca044c096f3a6d4d322'))
with fs.open('10380-c19e273816a6aca044c096f3a6d4d322/hello.txt', 'r') as f:
    print(f.read())

In [None]:
# With credentials passed to botocore directly (through client_kwargs).
# The values come from the environment so that no secret is stored in the
# notebook; a KeyError is raised if either variable is unset.
fs = s3fs.S3FileSystem(client_kwargs={
    'endpoint_url': 'https://s3.epfl.ch',
    'aws_access_key_id': os.environ['AWS_ACCESS_KEY_ID'],
    'aws_secret_access_key': os.environ['AWS_SECRET_ACCESS_KEY'],
})

# List the bucket, then read a small text object from it.
print(fs.ls('10380-c19e273816a6aca044c096f3a6d4d322'))
with fs.open('10380-c19e273816a6aca044c096f3a6d4d322/hello.txt', 'r') as f:
    print(f.read())

In [None]:
# When no credentials are passed explicitly, botocore reads them from the
# following environment variables. That is safer than storing them in code.
# os.environ['AWS_ACCESS_KEY_ID']
# os.environ['AWS_SECRET_ACCESS_KEY']

## fsspec

* generic remote filesystem interface (uses and used by s3fs)
* <https://filesystem-spec.readthedocs.io/en/latest/usage.html>

In [None]:
import fsspec

In [None]:
# Same S3 filesystem, obtained through the generic fsspec entry point.
# Credentials are taken from the environment instead of being hard-coded,
# so no secret ends up in the notebook (KeyError if a variable is unset).
fs = fsspec.filesystem(
    's3',
    client_kwargs={
        'endpoint_url': 'https://s3.epfl.ch',
        'aws_access_key_id': os.environ['AWS_ACCESS_KEY_ID'],
        'aws_secret_access_key': os.environ['AWS_SECRET_ACCESS_KEY'],
    },
)

# List the bucket, then read a small text object from it.
print(fs.ls('10380-c19e273816a6aca044c096f3a6d4d322'))
with fs.open('10380-c19e273816a6aca044c096f3a6d4d322/hello.txt', 'r') as f:
    print(f.read())

In [None]:
# Open a single object by URL. The returned object is used as a context
# manager below to get the actual file-like handle.
# Credentials are taken from the environment instead of being hard-coded,
# so no secret ends up in the notebook (KeyError if a variable is unset).
of = fsspec.open(
    's3://10380-c19e273816a6aca044c096f3a6d4d322/hello.txt',
    client_kwargs={
        'endpoint_url': 'https://s3.epfl.ch',
        'aws_access_key_id': os.environ['AWS_ACCESS_KEY_ID'],
        'aws_secret_access_key': os.environ['AWS_SECRET_ACCESS_KEY'],
    },
)

with of as f:
    print(f.read())

## zarr

* data format that is more distributed / cloud friendly than netCDF/HDF5 because chunks are separate files
* can be saved to filesystem (disk) or object store (cloud, database)
* <https://zarr.readthedocs.io/en/stable/tutorial.html#distributed-cloud-storage>

In [None]:
import zarr
import s3fs
import numpy as np

In [None]:
# Open the S3 store with S3Fs.
# Credentials are taken from the environment instead of being hard-coded,
# so no secret ends up in the notebook (KeyError if a variable is unset).
fs = s3fs.S3FileSystem(client_kwargs={
    'endpoint_url': 'https://s3.epfl.ch',
    'aws_access_key_id': os.environ['AWS_ACCESS_KEY_ID'],
    'aws_secret_access_key': os.environ['AWS_SECRET_ACCESS_KEY'],
})

# Expose the demo.zarr prefix of the bucket as a key-value mapping for zarr.
store = s3fs.S3Map('10380-c19e273816a6aca044c096f3a6d4d322/demo.zarr', fs)

# Optional cache.
#store = zarr.LRUStoreCache(store, max_size=2**28)

# Open the root group.
root = zarr.group(store)

In [None]:
# Write and read attributes.
# Set a user attribute on the root group, then print every attribute of the
# group as a list of (key, value) pairs.
root.attrs['readme'] = 'Demo zarr store on EPFL S3'
print(list(root.attrs.items()))

In [None]:
# Build a 10000 x 10000 int32 array holding 0 .. 10**8 - 1, and wrap it in an
# in-memory zarr array split into 1000 x 1000 chunks.
a = np.arange(10000 * 10000, dtype='i4').reshape(10000, 10000)
z = zarr.array(a, chunks=(1000, 1000))
print(z.info)

# Copy the array into the group under the 'foo/bar' path.
root['foo/bar'] = z

# On the store, a zarr array is a directory of chunks (with a JSON metadata
# file): count the entries listed under the array's prefix.
print('{} files\n'.format(
    len(fs.ls('10380-c19e273816a6aca044c096f3a6d4d322/demo.zarr/foo/bar'))
))

# Summarize the hierarchy of groups and arrays.
print(root.info)
print(root.tree())

# Fetch the array back from the group and display its info.
z = root['foo/bar']
z.info

In [None]:
# Open the group with fsspec: zarr resolves the s3:// URL itself and forwards
# storage_options to the filesystem.
# Credentials are taken from the environment instead of being hard-coded,
# so no secret ends up in the notebook (KeyError if a variable is unset).
g = zarr.open_group(
    's3://10380-c19e273816a6aca044c096f3a6d4d322/demo.zarr',
    storage_options={'client_kwargs': {
        'endpoint_url': 'https://s3.epfl.ch',
        'aws_access_key_id': os.environ['AWS_ACCESS_KEY_ID'],
        'aws_secret_access_key': os.environ['AWS_SECRET_ACCESS_KEY'],
    }},
)
g['foo/bar'].info

In [None]:
# Open the group with fsspec and a cache: the 'simplecache::' prefix chains a
# caching layer in front of S3, and the options for the S3 layer are nested
# under the 's3' key.
# Credentials are taken from the environment instead of being hard-coded,
# so no secret ends up in the notebook (KeyError if a variable is unset).
g = zarr.open_group(
    'simplecache::s3://10380-c19e273816a6aca044c096f3a6d4d322/demo.zarr',
    storage_options={'s3': {'client_kwargs': {
        'endpoint_url': 'https://s3.epfl.ch',
        'aws_access_key_id': os.environ['AWS_ACCESS_KEY_ID'],
        'aws_secret_access_key': os.environ['AWS_SECRET_ACCESS_KEY'],
    }}},
)
g['foo/bar'].info

In [None]:
# Read the whole 'foo/bar' array from the group `g` (the [:] selects everything).
# Subsequent access is faster if opened with cache.
g['foo/bar'][:]

In [None]:
# Delete store: recursively remove everything under the demo.zarr prefix.
fs.rm('10380-c19e273816a6aca044c096f3a6d4d322/demo.zarr', recursive=True)

## xarray

* n-dimensional labeled array
* supports multiple storage backends, including zarr (but also netCDF or GRIB)
* <https://xarray.pydata.org/en/stable/io.html#zarr>

In [None]:
import xarray as xr
import pandas as pd

In [None]:
# Open the S3 store with S3Fs.
# Credentials are taken from the environment instead of being hard-coded,
# so no secret ends up in the notebook (KeyError if a variable is unset).
fs = s3fs.S3FileSystem(client_kwargs={
    'endpoint_url': 'https://s3.epfl.ch',
    'aws_access_key_id': os.environ['AWS_ACCESS_KEY_ID'],
    'aws_secret_access_key': os.environ['AWS_SECRET_ACCESS_KEY'],
})

# Expose the demo.zarr prefix of the bucket as a key-value mapping for xarray.
store = s3fs.S3Map('10380-c19e273816a6aca044c096f3a6d4d322/demo.zarr', fs)

In [None]:
# Small demo dataset: one 4 x 5 variable "foo" of random values on the
# dimensions ("x", "y"), with integer labels along x, five dates starting at
# 2000-01-01 along y, and an extra letter coordinate "z" attached to x.
ds = xr.Dataset(
    data_vars={
        "foo": (("x", "y"), np.random.rand(4, 5)),
    },
    coords={
        "x": [10, 20, 30, 40],
        "y": pd.date_range(start="2000-01-01", periods=5),
        "z": ("x", list("abcd")),
    },
)

In [None]:
# Write the dataset to the zarr store on S3. Per the xarray documentation,
# mode='w' overwrites an existing store and consolidated=True also writes
# consolidated metadata.
ds.to_zarr(store, mode='w', consolidated=True)

In [None]:
# Re-open the dataset from the same store, asking xarray to use the
# consolidated metadata, and display its "foo" variable.
ds2 = xr.open_zarr(store, consolidated=True)
ds2['foo']

In [None]:
# List the entries under the demo.zarr prefix, then clean up by recursively
# deleting the whole store.
print(fs.ls('10380-c19e273816a6aca044c096f3a6d4d322/demo.zarr'))  # doesn't show directories
fs.rm('10380-c19e273816a6aca044c096f3a6d4d322/demo.zarr', recursive=True)