# Experiment: Mounting an S3 Bucket as a Paperspace Gradient Data Source

In [1]:
!pip install --upgrade rasterio

Collecting rasterio
  Downloading rasterio-1.3.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting affine
  Downloading affine-2.3.1-py2.py3-none-any.whl (16 kB)
Collecting click-plugins
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting snuggs>=1.4.1
  Downloading snuggs-1.4.7-py3-none-any.whl (5.4 kB)
Installing collected packages: affine, snuggs, cligj, click-plugins, rasterio
Successfully installed affine-2.3.1 click-plugins-1.1.1 cligj-0.7.2 rasterio-1.3.4 snuggs-1.4.7
[0m

In [1]:
import os
from pathlib import Path
import random
from time import perf_counter
from time import time
import warnings

import pandas as pd
import rasterio

In [2]:
# Install an "ignore" filter for warnings of category
# rasterio.errors.NotGeoreferencedWarning.
warnings.filterwarnings(
    action="ignore",
    category=rasterio.errors.NotGeoreferencedWarning,
)

In [3]:
# Root directory of the dataset; every other path below is derived from it.
DATASET_DIR = Path("/datasets/biomassters")

# Metadata CSV files located directly under the dataset root.
features_metadata_csv = DATASET_DIR.joinpath("features_metadata.csv")
train_agbm_csv = DATASET_DIR.joinpath("train_agbm_metadata.csv")

# Sub-directories of the dataset root.
train_images_dir = DATASET_DIR.joinpath("train_features")
train_agbm_dir = DATASET_DIR.joinpath("train_agbm")
test_images_dir = DATASET_DIR.joinpath("test_features")

In [4]:
# Time how long it takes to list every entry of the training-features directory.
# perf_counter() is used instead of time(): it is a monotonic, high-resolution clock
# meant for measuring elapsed intervals, whereas time() follows the wall clock and can
# jump if the system clock is adjusted while the listing is in progress.
start = perf_counter()
train_images = os.listdir(train_images_dir)
end = perf_counter()
print(f"Total images in training set: {len(train_images)}")
print(f"Time taken: {end - start}")

Total images in training set: 189078
Time taken: 19.380016326904297


In [5]:
# Pick up to 10 distinct file names from the directory listing.
# random.sample draws without replacement, so the same file cannot be selected twice
# (random.choices could return duplicates, and a repeated read of the same file would
# distort the read-time measurements). k is capped at the population size so a listing
# with fewer than 10 entries does not raise.
random_images = random.sample(train_images, k=min(10, len(train_images)))

In [6]:
# Display the randomly selected file names as the cell output.
random_images

['c5cc3c34_S2_06.tif',
 'ff4774a4_S2_00.tif',
 '5f7ec7c5_S2_06.tif',
 '0384b13c_S2_09.tif',
 'c80d39a2_S2_09.tif',
 'e44bc1a1_S2_01.tif',
 '5b711a4a_S2_09.tif',
 '004ef4ec_S1_04.tif',
 'a072b45b_S1_10.tif',
 '82cf7e7e_S1_06.tif']

In [26]:
from PIL import Image

In [8]:
# Measure how long it takes to open and fully read each of the selected images,
# then report the total and the average read time.
# perf_counter() is used instead of time(): it is a monotonic, high-resolution clock
# meant for measuring elapsed intervals, whereas time() follows the wall clock.
total_time = 0.0
for img_name in random_images:
    # Keep the file name and the full path in separate variables instead of
    # rebinding the loop variable from a str to a Path.
    img = train_images_dir / img_name
    start = perf_counter()
    with rasterio.open(img) as f:
        img_data = f.read()
        # Stop the clock before the dataset is closed: the measurement covers
        # opening the file and reading its contents.
        end = perf_counter()
    read_time = end - start
    total_time += read_time
    print(f"Image: {img},  Shape: {img_data.shape}")
    print(f"Read time: {read_time}")
print(f"Total read time for {len(random_images)} images at random = {total_time}")
print(f"Avg. time to read an image = {total_time / len(random_images)}")

Image: /datasets/biomassters/train_features/c5cc3c34_S2_06.tif,  Shape: (11, 256, 256)
Read time: 18.601293563842773
Image: /datasets/biomassters/train_features/ff4774a4_S2_00.tif,  Shape: (11, 256, 256)
Read time: 19.18782091140747
Image: /datasets/biomassters/train_features/5f7ec7c5_S2_06.tif,  Shape: (11, 256, 256)
Read time: 20.16826319694519
Image: /datasets/biomassters/train_features/0384b13c_S2_09.tif,  Shape: (11, 256, 256)
Read time: 17.85543203353882
Image: /datasets/biomassters/train_features/c80d39a2_S2_09.tif,  Shape: (11, 256, 256)
Read time: 17.9394748210907
Image: /datasets/biomassters/train_features/e44bc1a1_S2_01.tif,  Shape: (11, 256, 256)
Read time: 16.922703742980957
Image: /datasets/biomassters/train_features/5b711a4a_S2_09.tif,  Shape: (11, 256, 256)
Read time: 18.45120120048523
Image: /datasets/biomassters/train_features/004ef4ec_S1_04.tif,  Shape: (4, 256, 256)
Read time: 19.69817328453064
Image: /datasets/biomassters/train_features/a072b45b_S1_10.tif,  Shape: 