# Object Storage

DIAS instances store both the original Sentinel data (as downloaded from the ESA Copernicus hub) and the processed CARD products on object storage. The CREODIAS, MUNDI and SOBLOO instances use S3 storage, which is similar to the storage solution used by Amazon Web Services (AWS). ONDA uses ENS. In all cases, the required product needs to be discovered and transferred from object storage to the disks attached to the VMs in order to process the data.

S3 store access requires access credentials that are typically created at DIAS account setup. The credentials, together with the S3 host and bucket details, need to be provided to the access protocol that is used to access the data, for instance, inside a Python script that transfers the data to disk.


In [1]:
# import required libraries for this Notebook
import os
import boto3
import botocore
import rasterio
from ipywidgets import widgets, HBox, Layout


In [2]:
# S3 connection settings: fill these in before running the cells below.
ACCESS_KEY = ''
SECRET_KEY = ''
S3HOST = ''
BUCKET = ''
SERVICE_PROVIDER = ""

# Connect to the s3 store; the session is shared by all helpers in this notebook.
session = boto3.Session(
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
)

In [3]:
# list_buckets
def get_buckets(print_list=False):
    """Retrieve a list of existing buckets in the object storage.

    The manually configured ``BUCKET`` (when set) is placed first so that a
    bucket missing from the storage listing can still be selected.

    Args:
        print_list: When true, print every bucket entry returned by the
            object storage.

    Returns:
        list: Bucket names without duplicates. When the object storage cannot
        be queried, the error is printed and only the configured ``BUCKET``
        (or an empty list when it is unset) is returned, so callers always
        receive a list.
    """
    # Skip an unset BUCKET so the result never contains an empty name.
    bucket_names = [BUCKET] if BUCKET else []

    try:
        s3 = session.client('s3', endpoint_url=S3HOST)
        buckets = s3.list_buckets()['Buckets']
    except Exception as err:
        print(f"Could not retrieve list of the buckets: {err}")
        return bucket_names

    if print_list:
        for entry in buckets:
            print(entry)

    for entry in buckets:
        # The configured bucket may also be part of the listing.
        if entry['Name'] not in bucket_names:
            bucket_names.append(entry['Name'])
    return bucket_names


get_buckets()

['daas-perf',
 'daas-prod-css',
 'dias-ops-xchange',
 'dias-perf-srtmv41',
 'sps-input']

**Note: Boto3 cannot retrieve public buckets. If the bucket is public, set the bucket name manually in the 'Buckets' widget or set the 'BUCKET' variable.**


In [4]:
# Set search parameters

# Bucket selector; `or []` keeps the widget constructible when the bucket
# lookup fails and returns nothing.
bucket_list = widgets.Combobox(
    value=None,
    placeholder='select a bucket',
    options=get_buckets() or [],
    description='Buckets:',
    ensure_option=True,
)
# DIAS provider selector (empty entry = nothing selected yet).
dias = widgets.Dropdown(
    options=['', 'CREODIAS', 'MUNDI', 'SOBLOO', 'ONDA', 'WEKEO', 'EOSC'],
    description='DIAS Provider: ', value='')
# Restrict the file listing to files with an image extension.
only_images = widgets.Checkbox(value=True, description='Show only images')
# Also try to download an accompanying .hdr file next to the image.
check_for_hdr = widgets.Checkbox(value=False, description='Check for header file')
# Key prefix (path inside the bucket) used to filter the listing.
prefix = widgets.Text(placeholder='Prefix (Path)', description='Prefix:', layout=Layout(width='50%'))

options = widgets.VBox([widgets.HBox([dias, bucket_list]), prefix, widgets.HBox([only_images, check_for_hdr])])
options

# To filter the results use a prefix path (e.g. 'Sentinel-2/MSI/L1C/2018/12/08/', 'S2B/ ').
# SOBLOO example:
# das-add-data/Breizh-S2/tiles/30/U/WU/S2A_MSIL1C_20190316T112111_N0207_R037_T30UWU_20190316T132306.SAFE

VBox(children=(HBox(children=(Dropdown(description='DIAS Provider: ', index=3, options=('', 'CREODIAS', 'MUNDI…

In [5]:
# Get a list of files in the selected bucket and location
def get_file_list(prefix='', print_list=False):
    """List the files of the selected bucket that are stored below ``prefix``.

    The bucket is taken from the 'Buckets' widget. When the 'Show only
    images' checkbox is ticked, only files with a known image extension are
    returned.

    Args:
        prefix: Key prefix (path) used to filter the listing.
        print_list: When true, print every raw object entry returned by the
            object storage.

    Returns:
        list: File names relative to ``prefix`` (the prefix is stripped so the
        full key can be rebuilt as ``prefix + name``). An empty list is
        returned when nothing matches or when the listing fails; in the
        latter case the error is printed.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif',
                      '.tif', '.img', '.jp2')
    try:
        s3 = session.client('s3', endpoint_url=S3HOST)
        # A single list_objects_v2 call returns at most one page of keys;
        # the paginator walks through all of them.
        paginator = s3.get_paginator('list_objects_v2')
        bucket_files = []
        for page in paginator.paginate(Bucket=bucket_list.value, Prefix=prefix):
            # 'Contents' is absent when no object matches the prefix.
            bucket_files.extend(page.get('Contents', []))
    except Exception as err:
        print(
            f"Could not retrieve list of the files from the selected buckets: {err}")
        return []

    if print_list:
        for entry in bucket_files:
            print(entry)

    # Every returned key starts with the prefix, so only that leading
    # occurrence is removed.
    names = [entry['Key'].replace(prefix, '', 1) for entry in bucket_files]
    if only_images.value is True:
        names = [name for name in names
                 if name.lower().endswith(image_suffixes)]
    return names


# File selector for the chosen bucket/prefix; `or []` keeps the widget
# constructible when the listing fails and returns nothing.
file_list = widgets.Dropdown(
    options=get_file_list(prefix.value) or [],
    description='Bucket files:',
    layout=Layout(width='50%'),
)
file_list

Dropdown(description='Bucket files:', layout=Layout(width='50%'), options=('T30UWU_20170601T110651_B01.jp2', '…

**Note:** For the image to be recognized as a 'geo' image, it may, depending on the file type, need an extra header file that contains the geographic information of the image. Header files usually have an .hdr extension.

If the header file is needed, tick the "Check for header file" checkbox to try to download it together with the image.

In [8]:
# Sample script for testing a download from the s3 object storage.

# Make sure the local target folder exists before anything is written to it.
os.makedirs("temp", exist_ok=True)

# Base name (without extension) for the local copy, e.g. 'sample_image'.
file_name = "sample_image_from_object_storage"

# Object key of the selected image: search prefix followed by the chosen file.
s3image = f"{prefix.value}{file_list.value}"

# Local path the image is saved to.
localimg = f"temp/{file_name}.img"

print("-- File to be downloaded from object storage:\n", s3image, '\n')
# The doubled line break reproduces the trailing empty line of the output.
print("-- The file will be stored in:\n", localimg, end="\n\n")

-- File to be downloaded from object storage:
 das-add-data/Breizh-S2/tiles/30/U/WU/S2A_MSIL1C_20170601T110651_N0205_R137_T30UWU_20170601T111225.SAFE/GRANULE/L1C_T30UWU_A010144_20170601T111225/IMG_DATA/T30UWU_20170601T110651_B03.jp2 

-- The file will be stored in:
 temp/sample_image_from_object_storage.img



In [9]:
# Download image and image information file
def get_file(s3file, localfile, bucket_name, progress_bar=False, to_memory=False, status=False):
    """Download a file from the s3 storage.

    Args:
        s3file: Key of the object to download.
        localfile: Local path the object is written to.
        bucket_name: Name of the bucket that holds the object.
        progress_bar: Not used by this function; kept so existing calls
            keep working.
        to_memory: Not used by this function; kept so existing calls keep
            working.
        status: Not used by this function; kept so existing calls keep
            working.

    Returns:
        int: 0 when the file was downloaded, 1 when the object storage
        answered with a 404 error code.

    Raises:
        botocore.exceptions.ClientError: For any client error other than 404.
    """
    s3 = session.resource('s3', endpoint_url=S3HOST)

    try:
        s3.Bucket(bucket_name).download_file(s3file, localfile)
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print("The object does not exist.")
            return 1
        # Anything other than a missing object is unexpected here.
        raise

    print("File downloaded as: ", localfile)
    return 0

# Download the selected image from the bucket chosen in the 'Buckets' widget.
get_file(s3image, localimg, bucket_list.value)

# Get the selected image header file (Comment if not applicable).
if check_for_hdr.value is True:
    try:
        # The header is expected next to the image: same key, '.hdr' extension.
        s3header = os.path.splitext(s3image)[0] + ".hdr"
        localhdr = f"temp/{file_name}.hdr"
        # Same bucket as the image download above.
        get_file(s3header, localhdr, bucket_list.value)
    except Exception as err:
        # Best effort: a missing header must not stop the notebook, but the
        # reason is shown so that real errors are not hidden.
        print(f"No header file found. ({err})")


File downloaded as:  temp/sample_image_from_object_storage.img


**Run the cell below to check whether the image was downloaded correctly and is recognized as a geo image file.**

In [10]:
# Open the downloaded raster and print its basic properties.
with rasterio.open(localimg) as src:
    # Raster size: width and height on one line.
    print(f"{src.width} {src.height}")
    # CRS, transform, band count and band indexes, one per line.
    print(src.crs, src.transform, src.count, src.indexes, sep="\n")

10980 10980
EPSG:32630
| 10.00, 0.00, 499980.00|
| 0.00,-10.00, 5400000.00|
| 0.00, 0.00, 1.00|
1
(1,)
