# Comparing Accessing S3 Bucket Directly vs. Mounting it as a Paperspace Gradient Data Source

In [1]:
import os
from pathlib import Path
import random
from time import time
import warnings

import pandas as pd
import rasterio
from rasterio.io import MemoryFile
import s3fs

In [2]:
warnings.filterwarnings("ignore", category=rasterio.errors.NotGeoreferencedWarning)
os.environ['AWS_NO_SIGN_REQUEST'] = 'YES'

## Accessing S3 as a mounted data source

In [3]:
DATASET_DIR = Path("/datasets/biomassters")
features_metadata_csv = DATASET_DIR / "features_metadata.csv"
train_agbm_csv = DATASET_DIR / "train_agbm_metadata.csv"
train_images_dir = DATASET_DIR / "train_features"
train_agbm_dir = DATASET_DIR / "train_agbm"
test_images_dir = DATASET_DIR / "test_features"

In [4]:
# List files in training dir
start = time()
train_images = os.listdir(train_images_dir)
end = time()
print(f"Total images in training set: {len(train_images)}")
print(f"Time taken: {end - start}")

Total images in training set: 189078
Time taken: 23.981640338897705


In [5]:
random_images = [train_images[r] for r in random.choices(range(len(train_images)), k=10)]

In [6]:
# Avg. read time for a random selection of images
total_time = 0.0
for img in random_images:
    img = train_images_dir / img
    start = time()
    with rasterio.open(img) as f:
        img_data = f.read()
        end = time()
        read_time = end - start
        total_time += read_time
        print(f"Image: {img},  Shape: {img_data.shape}")
        print(f"Read time: {read_time}")
print(f"Total read time for {len(random_images)} images at random = {total_time}")
print(f"Avg. time to read an image = {total_time / len(random_images)}")

Image: /datasets/biomassters/train_features/cd0d6ae2_S1_09.tif,  Shape: (4, 256, 256)
Read time: 1.4312479496002197
Image: /datasets/biomassters/train_features/280c8135_S2_06.tif,  Shape: (11, 256, 256)
Read time: 1.3975193500518799
Image: /datasets/biomassters/train_features/9c84f5a2_S1_00.tif,  Shape: (4, 256, 256)
Read time: 1.3927698135375977
Image: /datasets/biomassters/train_features/e02bc68f_S2_05.tif,  Shape: (11, 256, 256)
Read time: 1.735131025314331
Image: /datasets/biomassters/train_features/982bf942_S2_00.tif,  Shape: (11, 256, 256)
Read time: 1.5970118045806885
Image: /datasets/biomassters/train_features/08ecfd3c_S2_06.tif,  Shape: (11, 256, 256)
Read time: 1.4642102718353271
Image: /datasets/biomassters/train_features/5d1ae03b_S1_08.tif,  Shape: (4, 256, 256)
Read time: 1.377194881439209
Image: /datasets/biomassters/train_features/23e96184_S1_05.tif,  Shape: (4, 256, 256)
Read time: 1.8024892807006836
Image: /datasets/biomassters/train_features/ee4dbe3f_S1_01.tif,  Shape

In [14]:
# Reading sample image directly from the disk
total_time = 0.0
images = os.listdir("../data/sample_images/")
for img in images:
    img = os.path.join("../data/sample_images", img)
    start = time()
    with rasterio.open(img) as f:
        img_data = f.read()
        end = time()
        read_time = end - start
        total_time += read_time
        print(f"Image: {img},  Shape: {img_data.shape}")
        print(f"Read time: {read_time}")
print(f"Total read time for {len(random_images)} images at random = {total_time}")
print(f"Avg. time to read an image = {total_time / len(random_images)}")

Image: ../data/sample_images/001b0634_agbm.tif,  Shape: (1, 256, 256)
Read time: 0.0018391609191894531
Image: ../data/sample_images/001b0634_S2_00.tif,  Shape: (11, 256, 256)
Read time: 0.0052890777587890625
Image: ../data/sample_images/001b0634_S1_00.tif,  Shape: (4, 256, 256)
Read time: 0.002705097198486328
Total read time for 10 images at random = 0.009833335876464844
Avg. time to read an image = 0.0009833335876464843


## Access using S3FS Package

In [7]:
# Setup S3 URLs and folder locations within the S3 bucket
S3_URL = "s3://drivendata-competition-biomassters-public-us"
train_features_s3 = S3_URL + "/train_features/"
train_agbm_s3 = S3_URL + "/train_agbm/"
test_features_s3 = S3_URL + "/test_features/"

In [8]:
storage_options = {'anon': True}
fs = s3fs.S3FileSystem(**storage_options)

In [10]:
# List files with s3fs
start = time()
train_images_s3 = fs.ls(train_features_s3)
end = time()
print(f"Total images in training set: {len(train_images_s3)}")
print(f"Time taken: {end - start}")

Total images in training set: 189078
Time taken: 45.755274057388306


In [11]:
random_images_s3 = [train_images_s3[r] for r in random.choices(range(len(train_images_s3)), k=10)]

In [12]:
random_images_s3[0]

'drivendata-competition-biomassters-public-us/train_features/9357f679_S1_07.tif'

In [13]:
# Avg. read time for a random selection of images
total_read_time = 0.0
total_toarray_time = 0.0
for img in random_images_s3:
    # Read the raw bytes stream
    start = time()
    with fs.open(img) as f:
        raw_data = f.read()
        end = time()
        read_time = end - start
        total_read_time += read_time
        
        # Save bytes to an arry
        start = time()
        with MemoryFile(raw_data) as memfile:
            with memfile.open() as dataset:
                data_array = dataset.read()
                end = time()
                total_toarray_time += (end - start)
        print(f"Image: {img},  Shape: {data_array.shape}")
        print(f"Read time: {read_time}, To Array time: {end - start}")
print(f"Total read time for {len(random_images)} images at random = {total_read_time}")
print(f"Avg. time to read an image = {total_read_time / len(random_images)}")
print(f"Total ToArray time for {len(random_images)} images at random = {total_toarray_time}")
print(f"Avg. time to convert an image to array = {total_toarray_time / len(random_images)}")

Image: drivendata-competition-biomassters-public-us/train_features/9357f679_S1_07.tif,  Shape: (4, 256, 256)
Read time: 0.2608182430267334, To Array time: 0.0029358863830566406
Image: drivendata-competition-biomassters-public-us/train_features/4f95fe22_S1_05.tif,  Shape: (4, 256, 256)
Read time: 0.1986379623413086, To Array time: 0.003112316131591797
Image: drivendata-competition-biomassters-public-us/train_features/65533cf8_S1_01.tif,  Shape: (4, 256, 256)
Read time: 0.2255103588104248, To Array time: 0.002804994583129883
Image: drivendata-competition-biomassters-public-us/train_features/ff374b38_S1_04.tif,  Shape: (4, 256, 256)
Read time: 0.17055678367614746, To Array time: 0.0024361610412597656
Image: drivendata-competition-biomassters-public-us/train_features/c5fbf994_S1_10.tif,  Shape: (4, 256, 256)
Read time: 0.22671818733215332, To Array time: 0.002810955047607422
Image: drivendata-competition-biomassters-public-us/train_features/afd2b531_S2_01.tif,  Shape: (11, 256, 256)
Read t

In [11]:
start = time()
with fs.open(feat_filename) as f:
    raw_data = f.read()
end = time()
print(f"S3FS read time = {end - start}")

S3FS read time = 0.24098920822143555
