# Download files from AWS Registry of Open Data

This notebook lets you download files from a dataset stored in Amazon S3, given the dataset's Amazon Resource Name (ARN).

See https://registry.opendata.aws/

Downloads are performed with the `boto3` package. See also:

https://boto3.amazonaws.com/v1/documentation/api/latest/index.html

https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-examples.html


P.S. Feel free to adjust the code to your needs.

In [None]:
! pip3 install boto3


In [None]:
import os
import boto3


In [None]:
from botocore import UNSIGNED
from botocore.client import Config


In [None]:
def print_all_objects_name(s3_resource, bucket_name, name_word = ""):
    """Print the key of every object in a bucket, one per line.

    Args:
        s3_resource: object exposing ``Bucket(name)`` (e.g. a boto3 S3 resource).
        bucket_name: name of the bucket to list.
        name_word: if not empty, only keys containing this substring are printed.
    """
    bucket = s3_resource.Bucket(bucket_name)

    for summary in bucket.objects.all():
        key = summary.key
        # Skip keys that do not match the (optional) substring filter
        if name_word != "" and name_word not in key:
            continue
        print(key)
        

In [None]:
def print_all_objects_name_in_folder(s3_resource, bucket_name, prefix, name_word = ""):
    """Print the keys of the objects under a key prefix ("sub-folder") of a bucket.

    Args:
        s3_resource: object exposing ``Bucket(name)`` (e.g. a boto3 S3 resource).
        bucket_name: name of the bucket to list.
        prefix: forwarded as ``Prefix`` to the bucket's object filter.
        name_word: if not empty, only keys containing this substring are printed.
    """
    bucket = s3_resource.Bucket(bucket_name)

    # Lazily walk the listing so keys are printed as they are received
    keys_under_prefix = (entry.key for entry in bucket.objects.filter(Prefix=prefix))
    for key in keys_under_prefix:
        keep = name_word == "" or name_word in key
        if keep:
            print(key)
            

In [None]:
def download_all_objects_in_folder(s3_resource, bucket_name, prefix, name_word = ""):
    """Download the objects under a key prefix ("sub-folder") into the current directory.

    Each object is saved under its base name only (the part of the key after the
    last '/'), so objects that share a base name in different sub-folders map to
    the same local file name.

    Args:
        s3_resource: object exposing ``Bucket(name)`` (e.g. a boto3 S3 resource).
        bucket_name: name of the bucket to download from.
        prefix: forwarded as ``Prefix`` to the bucket's object filter.
        name_word: if not empty, only objects whose base name contains this
            substring are downloaded.
    """
    # select bucket and objects
    my_bucket = s3_resource.Bucket(bucket_name)

    for obj in my_bucket.objects.filter(Prefix=prefix):
        _, filename = os.path.split(obj.key)

        # Keys ending in '/' (folder placeholders) have an empty base name:
        # there is no local file name to write to, so skip them.
        if not filename:
            continue

        if name_word == "" or name_word in filename:
            my_bucket.download_file(obj.key, filename)
        

In [None]:
def download_all_objects_in_folder_with_complete_name(s3_resource, bucket_name, prefix, name_word = ""):
    """Download the objects under a key prefix, keeping the key path in the local name.

    The local file name is the object's key with every '/' replaced by '|'
    (e.g. ``a/b/c.csv`` -> ``a|b|c.csv``), so objects with the same base name in
    different sub-folders do not collide. Files are written to the current
    directory.

    Args:
        s3_resource: object exposing ``Bucket(name)`` (e.g. a boto3 S3 resource).
        bucket_name: name of the bucket to download from.
        prefix: forwarded as ``Prefix`` to the bucket's object filter.
        name_word: if not empty, only objects whose base name contains this
            substring are downloaded.
    """
    # NOTE(review): '|' is not a legal file-name character on Windows.
    # select bucket and objects
    my_bucket = s3_resource.Bucket(bucket_name)

    for obj in my_bucket.objects.filter(Prefix=prefix):
        path, filename = os.path.split(obj.key)

        # Keys ending in '/' (folder placeholders) carry no file name; skip them
        # instead of creating a local file called "path|".
        if not filename:
            continue

        if name_word != "" and name_word not in filename:
            continue

        if path:
            complete_filename = path.replace('/', '|') + '|' + filename
        else:
            # Object at the bucket root: no path part, so no leading separator
            complete_filename = filename

        my_bucket.download_file(obj.key, complete_filename)
        

In [None]:
def download_all_objects(s3_resource, bucket_name, name_word = ""):
    """Download every object of a bucket below the current directory.

    The object's key is used as the local relative path, so the key's folder
    structure is reproduced locally; missing parent directories are created.

    Args:
        s3_resource: object exposing ``Bucket(name)`` (e.g. a boto3 S3 resource).
        bucket_name: name of the bucket to download from.
        name_word: if not empty, only objects whose full key contains this
            substring are downloaded.
    """
    # NOTE(review): keys are used verbatim as relative paths; a key containing
    # '..' segments would resolve outside the working directory.
    # select bucket
    my_bucket = s3_resource.Bucket(bucket_name)

    for s3_object in my_bucket.objects.all():
        filename = s3_object.key

        # Keys ending in '/' are folder placeholders, not files
        if filename.endswith('/'):
            continue

        if name_word == "" or name_word in filename:
            # The download target's directory must exist before writing to it
            local_dir = os.path.dirname(filename)
            if local_dir:
                os.makedirs(local_dir, exist_ok=True)
            my_bucket.download_file(s3_object.key, filename)
        

In [None]:
# Initiate the S3 resource used by all helper functions.
# signature_version=UNSIGNED tells botocore not to sign requests, so no AWS
# credentials are required; this only works for buckets that allow public access.

s3_resource = boto3.resource('s3', config=Config(signature_version=UNSIGNED))


In [None]:
# Name of the S3 bucket of interest; passed as `bucket_name` to the helper functions.
# '...' is a placeholder: replace it with the real bucket name
# (presumably the bucket part of the dataset's ARN; confirm on its registry page).

BUCKET_NAME = '...' 


In [None]:
# Substring that object names must contain, if filtering is of interest
# e.g. csv, json, parquet
# Passed as `name_word` to the helper functions; an empty string '' disables the
# filter. '...' is a placeholder and, left as is, matches only names containing "...".

WORD_IN_FILENAME = '...' 


In [None]:
# print_all_objects_name(s3_resource, BUCKET_NAME)

# print_all_objects_name(s3_resource, BUCKET_NAME, WORD_IN_FILENAME)


In [None]:
# Key prefix ("sub-folder") inside the bucket, if it is of interest.
# Passed as `prefix` to the *_in_folder helpers, which forward it to
# objects.filter(Prefix=...). '..' is a placeholder: replace it with a real prefix.

PATH_NAME = '..'


In [None]:
# print_all_objects_name_in_folder(s3_resource, BUCKET_NAME, PATH_NAME)

# print_all_objects_name_in_folder(s3_resource, BUCKET_NAME, PATH_NAME, WORD_IN_FILENAME)


In [None]:
# Show the current working directory; the download helpers write files relative to it
! pwd

In [None]:
# List the working directory before downloading
! ls -la

In [None]:
# download_all_objects_in_folder(s3_resource, BUCKET_NAME, PATH_NAME)

# download_all_objects_in_folder(s3_resource, BUCKET_NAME, PATH_NAME, WORD_IN_FILENAME)

# download_all_objects_in_folder_with_complete_name(s3_resource, BUCKET_NAME, PATH_NAME, WORD_IN_FILENAME)

In [None]:
# List the working directory again to check which files were downloaded
! ls -la

Some notes