# Base S3 Bucket preparation and training data download to Jupyter Notebook Pod

#### This Notebook sets the scene for the base bucket and then downloads available x-ray images for training to the pod for the execution of the ML training notebook.

#### Note! The actual upload of train/test/validation files is outside the scope of this notebook. Please use the AWS CLI to upload the dataset to the bucket.

#### Important: This demo uses Rados GW. The AWS-specific pieces below (`create_bucket_aws`, `s3_aws`, `download_dir_aws`) are provided only as guidance if one wishes to adapt the demo to use the pure AWS S3 service instead.

In [None]:
import boto3
import botocore
import json
import os
import tqdm

In [None]:
# Connection settings for the S3-compatible endpoint used throughout this notebook.
#
# Every value can be overridden through an environment variable so that real
# keys do not have to be stored in the notebook:
#   S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY, S3_REGION_NAME, S3_ENDPOINT_URL
# The literals below are kept only as backward-compatible fallbacks.
# NOTE(review): the fallback key/secret are plain-text credentials - rotate
# them and prefer the environment variables.
aws_access_key_id = os.environ.get('S3_ACCESS_KEY_ID', '3XWIR321K1ERLV4FXK5D')
aws_secret_access_key = os.environ.get('S3_SECRET_ACCESS_KEY', 'Fg9c9MRoKzZ41sad7xGurblqmYe0XaKtrD2ZL0ve')
region_name = os.environ.get('S3_REGION_NAME', 'default')  # default region for the profile e.g., us-east-2

# To reduce external traffic one can use the internal cluster service endpoint - it should look something like below and can be obtained from the openshift-storage namespace
# endpoint_url = 'http://rook-ceph-rgw-ocs-storagecluster-cephobjectstore.openshift-storage.svc.cluster.local'

# The external rados GW endpoint can be obtained from the networking-routes section of the administrator view having openshift-storage selected as project.
endpoint_url = os.environ.get(
    'S3_ENDPOINT_URL',
    'https://rgw-openshift-storage.apps.cluster-lv628.lv628.sandbox1664.opentlc.com/',
)

In [None]:
def create_bucket_aws(bucket_name):
    """
    Create a bucket on the plain AWS S3 service and return the service response.

    Uses the notebook-level `s3_aws` client (the one built without a custom
    endpoint) and the notebook-level `region_name`.

    params:
    - bucket_name: name of the bucket to create
    """
    # AWS rejects an explicit LocationConstraint of 'us-east-1'; for that
    # region the bucket has to be created without a CreateBucketConfiguration.
    if region_name == 'us-east-1':
        return s3_aws.create_bucket(Bucket=bucket_name)

    location = {'LocationConstraint': region_name}
    return s3_aws.create_bucket(Bucket=bucket_name, CreateBucketConfiguration=location)

In [None]:
def create_bucket_rados(bucket_name):
    """
    Create `bucket_name` through the notebook-level `s3` client and hand back
    whatever that client returns.
    """
    return s3.create_bucket(Bucket=bucket_name)

In [None]:
# Client for the plain AWS S3 service (no custom endpoint). The demo itself
# talks to Rados GW; this client exists only as guidance for adapting the
# demo to AWS S3.
s3_aws = boto3.client(
    's3',
    region_name=region_name,
    aws_secret_access_key=aws_secret_access_key,
    aws_access_key_id=aws_access_key_id,
)

In [None]:
# S3-compatible client pointed at the Rados GW endpoint configured in
# `endpoint_url`; this is the client the rest of the notebook uses.
s3 = boto3.client(
    's3',
    config=botocore.client.Config(signature_version='s3'),
    region_name='default',
    aws_secret_access_key=aws_secret_access_key,
    aws_access_key_id=aws_access_key_id,
    endpoint_url=endpoint_url,
)

### Create required buckets and set permissions. 
Optionally, you can change the name of the bucket, but make sure to use the new name everywhere the old one appears (this file included).

#### Note: run the below cell only once!

bucket_base_name: the bucket used by the image uploader utility to drop new x-ray images. This is the base bucket used as input by the ML prediction service.<br>
bucket_base_name + '-train-test-valid': the bucket used for training a new model, if so desired. The project has an example trained model ready for use.<br>
bucket_base_name + '-datasource': the bucket from which the image uploader utility takes new images to simulate an upload.<br>
bucket_base_name + '-processed': the bucket where the ML prediction service will move evaluated x-ray images.

In [None]:
# Name of the base bucket; the other bucket names in this notebook are built
# by appending a suffix to it.
bucket_base_name = 'ml-pneumonia'

In [None]:
# Create the base bucket first, then one bucket per suffix, in this order.
for bucket_suffix in ('', '-train-test-valid', '-datasource', '-processed'):
    create_bucket_rados(bucket_base_name + bucket_suffix)

In [None]:
# Fetch the bucket listing once and pull out just the names.
response = s3.list_buckets()
bucket_names = [entry['Name'] for entry in response['Buckets']]

# Print a header followed by one indented name per line.
print('Existing buckets:')
for name in bucket_names:
    print(f'  {name}')

In [None]:
# Attach the same policy (allow s3:GetObject for any principal on every
# object) to each bucket returned by list_buckets.
# NOTE(review): this covers every bucket the listing returns, not only the
# ones created in this notebook - confirm that is intended.
for bucket_entry in s3.list_buckets()['Buckets']:
    target_bucket = bucket_entry['Name']
    read_statement = {
        "Sid": "AddPerm",
        "Effect": "Allow",
        "Principal": "*",
        "Action": ["s3:GetObject"],
        "Resource": ["arn:aws:s3:::{0}/*".format(target_bucket)],
    }
    # put_bucket_policy is given the policy as a JSON string.
    policy_document = json.dumps({
        "Version": "2012-10-17",
        "Statement": [read_statement],
    })
    s3.put_bucket_policy(Bucket=target_bucket, Policy=policy_document)

In [None]:
# Print the name of every bucket currently listed, one per line.
current_buckets = s3.list_buckets()['Buckets']
for entry in current_buckets:
    print(entry['Name'])

### Section for downloading files to the Jupyter Pod. Use it only if you want to retrain the ML algorithm.

In [None]:
def download_dir_rados(aws_access_key_id, aws_secret_access_key, region_name,  bucket, s3_prefix = '', local_base = ''):
    """
    Download the objects of a bucket served by the Rados GW endpoint into a
    local folder, recreating each object key as a relative path.

    The connection goes to the notebook-level `endpoint_url`.

    params:
    - aws_access_key_id: The aws_access_key_id
    - aws_secret_access_key: The aws_secret_access_key
    - region_name: The region where the bucket was created
    - bucket: s3 bucket with target contents
    - s3_prefix: only keys starting with this prefix are downloaded;
      '' (the default) downloads the whole bucket
    - local_base: local path to folder in which to place files
    """

    s3_resource = boto3.resource(service_name= 's3',
                                 endpoint_url = endpoint_url,
                                 aws_access_key_id = aws_access_key_id,
                                 aws_secret_access_key = aws_secret_access_key,
                                 region_name = region_name,
                                 config=botocore.client.Config(signature_version = 's3')
                                )

    ml_ds_bucket = s3_resource.Bucket(bucket)
    if s3_prefix:
        bucket_objects = ml_ds_bucket.objects.filter(Prefix=s3_prefix)
    else:
        bucket_objects = ml_ds_bucket.objects.all()

    # Collect the keys up front so the progress bar knows the total.
    files = [item.key for item in bucket_objects]

    print('Downloading files...')
    for file in tqdm.tqdm(files):
        dest_pathname = os.path.join(local_base, file)

        # A key ending in '/' is a folder placeholder: there is nothing to
        # download, only a directory to create.
        if file.endswith('/'):
            os.makedirs(dest_pathname, exist_ok=True)
            continue

        # dirname is '' for a key without a folder part when local_base is
        # empty; os.makedirs('') would raise, so skip it in that case.
        dest_dir = os.path.dirname(dest_pathname)
        if dest_dir:
            os.makedirs(dest_dir, exist_ok=True)
        ml_ds_bucket.download_file(file, dest_pathname)

    print('Done!')

In [None]:
def download_dir_aws(aws_access_key_id, aws_secret_access_key, region_name,  bucket, s3_prefix = '', local_base = ''):
    """
    Download the objects of a bucket on the plain AWS S3 service (no custom
    endpoint) into a local folder, recreating each object key as a relative
    path.

    params:
    - aws_access_key_id: The aws_access_key_id
    - aws_secret_access_key: The aws_secret_access_key
    - region_name: The region where the bucket was created
    - bucket: s3 bucket with target contents
    - s3_prefix: only keys starting with this prefix are downloaded;
      '' (the default) downloads the whole bucket
    - local_base: local path to folder in which to place files
    """

    s3_resource = boto3.resource('s3',
                             aws_access_key_id = aws_access_key_id,
                             aws_secret_access_key = aws_secret_access_key,
                             region_name = region_name)

    ml_ds_bucket = s3_resource.Bucket(bucket)
    if s3_prefix:
        bucket_objects = ml_ds_bucket.objects.filter(Prefix=s3_prefix)
    else:
        bucket_objects = ml_ds_bucket.objects.all()

    # Collect the keys up front so the progress bar knows the total.
    files = [item.key for item in bucket_objects]

    print('Downloading files...')
    for file in tqdm.tqdm(files):
        dest_pathname = os.path.join(local_base, file)

        # A key ending in '/' is a folder placeholder: there is nothing to
        # download, only a directory to create.
        if file.endswith('/'):
            os.makedirs(dest_pathname, exist_ok=True)
            continue

        # dirname is '' for a key without a folder part when local_base is
        # empty; os.makedirs('') would raise, so skip it in that case.
        dest_dir = os.path.dirname(dest_pathname)
        if dest_dir:
            os.makedirs(dest_dir, exist_ok=True)
        ml_ds_bucket.download_file(file, dest_pathname)

    print('Done!')

#### The cell below downloads the contents of the S3 bucket into the (new) dataset folder on this pod.

In [None]:
# Download the '-train-test-valid' bucket into the local 'dataset' folder.
# The bucket name is derived from bucket_base_name so it follows a rename.
download_dir_rados(aws_access_key_id = aws_access_key_id,
              aws_secret_access_key = aws_secret_access_key,
              region_name = region_name,
              bucket = bucket_base_name + '-train-test-valid',
              s3_prefix = '',
              local_base = 'dataset')

Final check to ensure the number of files matches the one from the bucket.

In [None]:
# Count regular files only, so the number is comparable to the bucket's object count.
!find dataset -type f | wc -l