# Experiment: Mounting an S3 Bucket as a Paperspace Gradient Data Source

In [1]:
!pip install --upgrade rasterio

Collecting rasterio
  Downloading rasterio-1.3.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting affine
  Downloading affine-2.3.1-py2.py3-none-any.whl (16 kB)
Collecting click-plugins
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting snuggs>=1.4.1
  Downloading snuggs-1.4.7-py3-none-any.whl (5.4 kB)
Installing collected packages: affine, snuggs, cligj, click-plugins, rasterio
Successfully installed affine-2.3.1 click-plugins-1.1.1 cligj-0.7.2 rasterio-1.3.4 snuggs-1.4.7
[0m

In [20]:
import os
from pathlib import Path
import random
from time import perf_counter, time
import warnings

import pandas as pd
import rasterio

In [21]:
# Ignore rasterio's NotGeoreferencedWarning for the rest of the session so it
# does not appear in the cell outputs below.
warnings.filterwarnings(
    action="ignore",
    category=rasterio.errors.NotGeoreferencedWarning,
)

In [22]:
# Root directory of the dataset; every path below is derived from it.
DATASET_DIR = Path("/datasets/biomassters")

# Metadata tables (CSV files directly under the dataset root).
features_metadata_csv = DATASET_DIR.joinpath("features_metadata.csv")
train_agbm_csv = DATASET_DIR.joinpath("train_agbm_metadata.csv")

# Data folders for the training and test splits.
train_images_dir = DATASET_DIR.joinpath("train_features")
train_agbm_dir = DATASET_DIR.joinpath("train_agbm")
test_images_dir = DATASET_DIR.joinpath("test_features")

In [23]:
# Time a full listing of the training-image directory.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; unlike time(), it is not affected by wall-clock adjustments.
start = perf_counter()
train_images = os.listdir(train_images_dir)
end = perf_counter()
print(f"Total images in training set: {len(train_images)}")
print(f"Time taken: {end - start}")

Total images in training set: 189078
Time taken: 17.568349838256836


In [24]:
# Pick the files to benchmark WITHOUT replacement: random.choices() can return
# the same file more than once. Sampling from the sorted listing with a
# dedicated, seeded generator makes the selection reproducible across runs
# (for unchanged directory contents) without touching the global random state.
SAMPLE_SIZE = 10
SAMPLE_SEED = 42
random_images = random.Random(SAMPLE_SEED).sample(
    sorted(train_images),
    k=min(SAMPLE_SIZE, len(train_images)),  # never request more files than exist
)

In [25]:
# Display the file names that were drawn from the training-image listing.
random_images

['0003d2eb_S2_07.tif',
 'e82fd76b_S1_11.tif',
 '37f6302f_S2_01.tif',
 '811f36cd_S2_06.tif',
 '778e2a1f_S1_07.tif',
 'b356e923_S1_02.tif',
 'd36d3fcf_S2_08.tif',
 'a9617118_S1_05.tif',
 'aae569ac_S2_02.tif',
 '0a5fa091_S1_11.tif']

In [26]:
from PIL import Image

In [27]:
# Measure how long it takes to read each sampled image.
# Only the f.read() call is timed; opening the file is excluded from the figure.
total_time = 0.0
for img_name in random_images:
    # Keep the file name and its full path in separate variables instead of
    # rebinding the loop variable.
    img = train_images_dir / img_name
    with rasterio.open(img) as f:
        # perf_counter() is the monotonic, high-resolution clock meant for
        # timing short intervals such as these reads.
        start = perf_counter()
        img_data = f.read()
        end = perf_counter()
    # The dataset is closed before reporting; printing needs only the array.
    read_time = end - start
    total_time += read_time
    print(f"Image: {img},  Shape: {img_data.shape}")
    print(f"Read time: {read_time}")
print(f"Total read time for {len(random_images)} images at random = {total_time}")
if random_images:
    # Guard against dividing by zero when no images were sampled.
    print(f"Avg. time to read an image = {total_time / len(random_images)}")

Image: /datasets/biomassters/train_features/0003d2eb_S2_07.tif,  Shape: (11, 256, 256)
Read time: 0.08789443969726562
Image: /datasets/biomassters/train_features/e82fd76b_S1_11.tif,  Shape: (4, 256, 256)
Read time: 0.04330015182495117
Image: /datasets/biomassters/train_features/37f6302f_S2_01.tif,  Shape: (11, 256, 256)
Read time: 0.09347772598266602
Image: /datasets/biomassters/train_features/811f36cd_S2_06.tif,  Shape: (11, 256, 256)
Read time: 0.05720233917236328
Image: /datasets/biomassters/train_features/778e2a1f_S1_07.tif,  Shape: (4, 256, 256)
Read time: 0.047155141830444336
Image: /datasets/biomassters/train_features/b356e923_S1_02.tif,  Shape: (4, 256, 256)
Read time: 0.0463411808013916
Image: /datasets/biomassters/train_features/d36d3fcf_S2_08.tif,  Shape: (11, 256, 256)
Read time: 0.06080126762390137
Image: /datasets/biomassters/train_features/a9617118_S1_05.tif,  Shape: (4, 256, 256)
Read time: 0.05564999580383301
Image: /datasets/biomassters/train_features/aae569ac_S2_02.t