In [1]:
import os
import boto3
import math

from botocore.exceptions import NoCredentialsError

S3_BUCKET = "ml-competition-data"
COMPETITION_S3_PATH = "google-research-contrails/"
TRAIN_DATA_S3_PATH = "google-research-contrails/train/"
TRAIN_DATA_DISK_PATH = "../data/train"

s3 = boto3.client("s3")


def load_train_data(bucket, s3_path, local_path, percent=1):
    # TODO: Load the train_metadata.json file from root competition data s3 path
    #       Change to use s3 paginator to look through all training data in s3 bucket
    try:
        all_subdirs = []
        result = s3.list_objects(Bucket=bucket, Delimiter="/", Prefix=s3_path)
        for sub_dir in result.get("CommonPrefixes"):
            all_subdirs.append(sub_dir["Prefix"])

        num_to_load = math.ceil(len(all_subdirs) * percent / 100)
        print(f"Loading {num_to_load} training data points froms s3")

        for subdir in all_subdirs[:num_to_load]:
            for obj in s3.list_objects(Bucket=bucket, Prefix=subdir)["Contents"]:
                # Strip s3_path from filenames
                local_filename = os.path.join(local_path, obj["Key"]).replace(
                    s3_path, ""
                )
                print(local_filename)
                if not os.path.exists(os.path.dirname(local_filename)):
                    os.makedirs(os.path.dirname(local_filename))
                s3.download_file(bucket, obj["Key"], local_filename)

    except NoCredentialsError:
        print("No AWS credentials found")


if not os.listdir(TRAIN_DATA_DISK_PATH):
    print("Loading batch of training data...")
    load_train_data(S3_BUCKET, TRAIN_DATA_S3_PATH, TRAIN_DATA_DISK_PATH)
else:
    print("A batch of training data already exists")

Loading batch of training data...
Loading 10 training data points froms s3
../data/train/1000216489776414077/band_08.npy
../data/train/1000216489776414077/band_09.npy
../data/train/1000216489776414077/band_10.npy
../data/train/1000216489776414077/band_11.npy
../data/train/1000216489776414077/band_12.npy
../data/train/1000216489776414077/band_13.npy
../data/train/1000216489776414077/band_14.npy
../data/train/1000216489776414077/band_15.npy
../data/train/1000216489776414077/band_16.npy
../data/train/1000216489776414077/human_individual_masks.npy
../data/train/1000216489776414077/human_pixel_masks.npy
../data/train/1000603527582775543/band_08.npy
../data/train/1000603527582775543/band_09.npy
../data/train/1000603527582775543/band_10.npy
../data/train/1000603527582775543/band_11.npy
../data/train/1000603527582775543/band_12.npy
../data/train/1000603527582775543/band_13.npy
../data/train/1000603527582775543/band_14.npy
../data/train/1000603527582775543/band_15.npy
../data/train/100060352758