# Exploring TorchData for Streaming Data from an AWS S3 Bucket

This notebook explores using the [TorchData](https://pytorch.org/data/beta/index.html) package for setting up data pipelines and for use with cloud storage, which in this case is an Amazon S3 bucket.

**Monday 5th December 2022**

I realized that the Zipper datapipe wasn't working, probably because of the filtering on the input features datapipe, which could potentially cause issues with yielding.
A simpler solution might be to use a single input-feature datapipe: list the files, filter them, and map a function that returns both the input image data and the corresponding output image data, along with the chip ID, as a dict.

## Setup and Metadata

In [18]:
# Imports

from pathlib import Path
import time
import warnings

import numpy as np
import rasterio
import torch
from torch import Tensor
import torchdata.datapipes.iter as pipes
from torchdata.datapipes.iter import IterableWrapper

In [19]:
from torch.utils.data import DataLoader

In [20]:
# Ignore rasterio's NotGeoreferencedWarning globally so it does not show up
# in cell output every time a raster is opened.
warnings.filterwarnings("ignore", category=rasterio.errors.NotGeoreferencedWarning)

In [21]:
# Dataset locations.
# The public S3 bucket is mounted as a data source for the current
# notebook instance, so plain filesystem paths are used from here on.
S3_DATA_SOURCE = Path("/datasets/biomassters")
train_features_dir = S3_DATA_SOURCE.joinpath("train_features/")
train_agbm_dir = S3_DATA_SOURCE.joinpath("train_agbm/")
test_features_dir = S3_DATA_SOURCE.joinpath("test_features/")

In [22]:
# Sanity check: show the two training directories, one per line
print(train_features_dir, train_agbm_dir, sep="\n")

/datasets/biomassters/train_features
/datasets/biomassters/train_agbm


## Utilities

In [23]:
# Test code for using Path object to parse out information
def parse_filename(filename):
    """Break a file path down into its name components.

    Args:
        filename: Path of the file, as a ``str`` or ``Path``.

    Returns:
        dict with four keys, in this order: ``"Name"`` (final path
        component), ``"Stem"`` (name without its last suffix), ``"Suffix"``
        (last extension, including the dot) and ``"Chip ID"`` (the part of
        the stem that precedes the first underscore).
    """
    path = Path(filename)
    stem = path.stem
    # Everything before the first "_" (the whole stem when there is none).
    chip_id, _, _ = stem.partition("_")

    return {
        "Name": path.name,
        "Stem": stem,
        "Suffix": path.suffix,
        "Chip ID": chip_id,
    }

In [24]:
# Check parse_filename on one training-feature path: the chip ID should be
# the part of the file name before the first underscore.
filename = "/datasets/biomassters/train_features/0003d2eb_S1_00.tif"
parsed_filename = parse_filename(filename)
print(parsed_filename)

{'Name': '0003d2eb_S1_00.tif', 'Stem': '0003d2eb_S1_00', 'Suffix': '.tif', 'Chip ID': '0003d2eb'}


In [25]:
# Function to filter filenames based on pre-determined satellite and month
def filter_img(filename, satellite='S2', month='07'):
    """Tell whether a file is the image for the chosen satellite and month.

    The expected file name is ``<chip_id>_<satellite>_<month>.tif``, where
    the chip ID is the part of the file's stem before the first underscore.

    Args:
        filename: Path of the file, as a ``str`` or ``Path``.
        satellite: Satellite tag expected in the file name.
        month: Month tag expected in the file name.

    Returns:
        bool: ``True`` when the file's name equals the expected name.
    """
    path = Path(filename)
    chip_id = path.stem.partition("_")[0]

    expected_name = "{}_{}_{}.tif".format(chip_id, satellite, month)
    return expected_name == path.name

In [26]:
# Check filter_img: with the default satellite and month, the S1_00 file is
# rejected and the S2_07 file is accepted.
for filename in (
    Path("/datasets/biomassters/train_features/0003d2eb_S1_00.tif"),
    Path("/datasets/biomassters/train_features/0003d2eb_S2_07.tif"),
):
    print(filter_img(filename))

False
True


In [27]:
def load_raster(filename: str) -> tuple[str, Tensor]:
    """Read a raster file and return it as a tensor, together with its path.

    Args:
        filename: Path of the raster file; passed straight to
            ``rasterio.open``.

    Returns:
        ``(filename, tensor)``: the path exactly as it was passed in, and the
        data returned by the dataset's ``read()`` converted with
        ``torch.from_numpy``.
    """
    with rasterio.open(filename) as f:
        array = f.read()

    # uint16 data is widened to int32 before conversion — presumably because
    # torch.from_numpy does not accept uint16 in the torch version used here.
    if array.dtype == np.uint16:
        array = array.astype(np.int32)
    return filename, torch.from_numpy(array)

In [28]:
# Check load_raster: read one feature file and time the read
filename = "/datasets/biomassters/train_features/0003d2eb_S2_07.tif"
start = time.time()
file_url, tensor_data = load_raster(filename)
end = time.time()
# Adjacent string literals are joined as-is, so the first fragment needs a
# trailing space; without it the shape and "Read Time" run together.
print(f"File: {file_url} Data Shape: {tensor_data.shape} "
      f"Read Time: {end - start}")

File: /datasets/biomassters/train_features/0003d2eb_S2_07.tif Data Shape: torch.Size([11, 256, 256])Read Time: 19.369688510894775


## Datapipes

In [29]:
# List the files under the training-features directory and keep only those
# accepted by filter_img (called with its default satellite and month).
features_dp = (
    IterableWrapper([train_features_dir])
    .list_files()
    .filter(filter_fn=filter_img)
)

In [30]:
# Apply load_raster to every remaining file, so the pipe now yields
# (filename, tensor) pairs. This rebinds features_dp, so re-running the cell
# stacks another map on top of the previous one.
features_dp = features_dp.map(load_raster)

In [45]:
# Time how long it takes to pull the first item out of features_dp
start = time.time()
feat_batch = next(iter(features_dp))
end = time.time()
print(f"Total time: {end - start}")

Total time: 18.809818506240845


In [31]:
# List the files under the train_agbm directory
agbm_dp = IterableWrapper([train_agbm_dir]).list_files()

In [32]:
# Apply load_raster to every listed file, so the pipe now yields
# (filename, tensor) pairs. This rebinds agbm_dp, so re-running the cell
# stacks another map on top of the previous one.
agbm_dp = agbm_dp.map(load_raster)

In [35]:
# Time how long it takes to pull the first item out of agbm_dp
start = time.time()
batch = next(iter(agbm_dp))
end = time.time()
print(f"Total time: {end - start}")

Total time: 2.0122971534729004


In [44]:
# Unpack the (path, tensor) pair pulled from agbm_dp and show the path and
# the tensor's shape, one per line
agbm_url, agbm_data = batch
print(agbm_url, agbm_data.shape, sep="\n")

/datasets/biomassters/train_agbm/0003d2eb_agbm.tif
torch.Size([1, 256, 256])


In [33]:
# Pair the feature pipe with the AGBM pipe item by item, then wrap each pair
# in a batch of size one.
# NOTE(review): zip matches items by position only; nothing in this cell
# checks that the two files of a pair share a chip ID — confirm the pairing.
input_dp = features_dp.zip(agbm_dp).batch(1)

In [34]:
# Time how long it takes to pull the first item out of the zipped pipe
start = time.time()
first_set = next(iter(input_dp))
end = time.time()
print(f"Total time: {end - start}")

Exception ignored in: <generator object ZipperIterDataPipe.__iter__ at 0x7fdcc2486270>
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/torch/utils/data/datapipes/iter/combining.py", line 546, in __iter__
    unused += list(iterator)
  File "/usr/local/lib/python3.9/dist-packages/torch/utils/data/datapipes/_hook_iterator.py", line 185, in wrap_generator
    response = gen.send(request)
  File "/usr/local/lib/python3.9/dist-packages/torch/utils/data/datapipes/iter/callable.py", line 123, in __iter__
    yield self._apply_fn(data)
  File "/usr/local/lib/python3.9/dist-packages/torch/utils/data/datapipes/iter/callable.py", line 88, in _apply_fn
    return self.fn(data)
  File "/tmp/ipykernel_73/3582365464.py", line 2, in load_raster
  File "/usr/local/lib/python3.9/dist-packages/rasterio/env.py", line 444, in wrapper
    return f(*args, **kwds)
  File "/usr/local/lib/python3.9/dist-packages/rasterio/__init__.py", line 304, in open
    dataset = DatasetRead

Total time: 115.42922520637512


In [42]:
# features_dp = features_dp.sharding_filter()
# features_dp = features_dp.open_files_by_fsspec(mode="rb")

# Note: Here also, using S3 specific function results in an error
# TypeError: s3_read(): incompatible function arguments. The following argument types are supported:
#    1. (self: torchdata._torchdata.S3Handler, arg0: str) -> bytes

# Invoked with: <torchdata._torchdata.S3Handler object at 0x7fb30498f030>, ('s3://drivendata-competition-biomassters-public-us/train_features/0003d2eb_S1_00.tif', StreamWrapper<<File-like object S3FileSystem, drivendata-competition-biomassters-public-us/train_features/0003d2eb_S1_00.tif>>)
# This exception is thrown by __iter__ of S3FileLoaderIterDataPipe(source_datapipe=ShardingFilterIterDataPipe)

# features_dp = features_dp.load_files_by_s3()

### Using `rasterio` instead of `PIL`

Using `PIL` for reading and displaying the images doesn't work well, as its support for the TIFF format is rather limited.

Instead, I will use the `rasterio` library for reading the TIFF data. [Rasterio](https://rasterio.readthedocs.io/en/latest/index.html) is a package built specifically for geospatial data.

In [43]:
from rasterio import MemoryFile

In [44]:
def read_to_array(data):
    """Decode an in-memory raster file into an array of all its bands.

    Args:
        data: ``(url, file_obj)`` pair. ``file_obj`` must provide a ``read()``
            method that returns the raw bytes of the raster file.

    Returns:
        ``(url, array)``: the url unchanged, and the array returned by the
        dataset's ``read()`` for the decoded file.
    """
    url, file_obj = data
    raw_bytes = file_obj.read()

    # Silence NotGeoreferencedWarning with a scoped filter. A warning is not
    # raised as an exception by default, so catching it with `except` does
    # not suppress it. If the warning were ever raised as an error, an
    # `except ...: pass` would return the undecoded bytes instead of the array.
    with warnings.catch_warnings():
        warnings.simplefilter(
            "ignore", category=rasterio.errors.NotGeoreferencedWarning
        )
        with MemoryFile(raw_bytes) as memfile, memfile.open() as dataset:
            # read() without indexes returns every band.
            bands = dataset.read()

    return url, bands

In [45]:
# feat_it = next(iter(features_dp))
# feat_url, feat_data = read_to_array(feat_it)
# print(feat_url)
# print(feat_data.shape)

In [49]:
# agbm_dp = agbm_dp.sharding_filter()
# Open each item of agbm_dp in binary mode through fsspec.
# NOTE(review): if the cells are run top to bottom, agbm_dp already has
# load_raster mapped over it and yields (filename, tensor) pairs rather than
# paths — confirm this is applied to the pipe it was meant for.
agbm_dp = agbm_dp.open_files_by_fsspec(mode="rb")

In [50]:
# agbm_it = next(iter(agbm_dp))
# agbm_url, agbm_data = read_to_array(agbm_it)
# print(agbm_url)
# print(agbm_data.shape)

In [46]:
# features_dp = features_dp.map(read_to_array)
# agbm_dp = agbm_dp.map(read_to_array)

In [23]:
# Wrap the zipped pipe in a DataLoader that collates 5 items per batch and
# uses two worker processes.
# NOTE(review): the sharding_filter calls are commented out in this notebook,
# so each worker may iterate the whole pipe and yield duplicates — verify.
dl = DataLoader(dataset=input_dp, batch_size=5, num_workers=2)

In [24]:
# THIS STEP TAKES A REALLY LONG TIME!!! DOESN'T SEEM RIGHT...
# first_batch = next(iter(dl))

In [48]:
import s3fs

In [49]:
# Anonymous access; the bucket is public
fs = s3fs.S3FileSystem(anon=True)
# S3_URL is not defined in any cell shown in this notebook, so a fresh kernel
# fails here with NameError. The bucket name below is the one that appears in
# this cell's listing output — TODO confirm it matches the original value.
S3_URL = "s3://drivendata-competition-biomassters-public-us"
fs.ls(S3_URL)

['drivendata-competition-biomassters-public-us/features_metadata.csv',
 'drivendata-competition-biomassters-public-us/test_features',
 'drivendata-competition-biomassters-public-us/train_agbm',
 'drivendata-competition-biomassters-public-us/train_agbm_metadata.csv',
 'drivendata-competition-biomassters-public-us/train_features']

In [50]:
# fs.   <- unfinished expression; as code it is a SyntaxError, so it is kept
#          only as a comment until the intended call is filled in

In [27]:
# train_features_s3 is not defined in any cell shown in this notebook, so a
# fresh kernel fails here with NameError. The prefix below is the one that
# appears in the listed keys — TODO confirm it matches the original value.
train_features_s3 = "s3://drivendata-competition-biomassters-public-us/train_features"
feat_images = fs.ls(train_features_s3)

In [28]:
# Number of entries returned by the listing
len(feat_images)

189078

In [29]:
# Show the first 100 listed entries
feat_images[:100]

['drivendata-competition-biomassters-public-us/train_features/0003d2eb_S1_00.tif',
 'drivendata-competition-biomassters-public-us/train_features/0003d2eb_S1_01.tif',
 'drivendata-competition-biomassters-public-us/train_features/0003d2eb_S1_02.tif',
 'drivendata-competition-biomassters-public-us/train_features/0003d2eb_S1_03.tif',
 'drivendata-competition-biomassters-public-us/train_features/0003d2eb_S1_04.tif',
 'drivendata-competition-biomassters-public-us/train_features/0003d2eb_S1_05.tif',
 'drivendata-competition-biomassters-public-us/train_features/0003d2eb_S1_06.tif',
 'drivendata-competition-biomassters-public-us/train_features/0003d2eb_S1_07.tif',
 'drivendata-competition-biomassters-public-us/train_features/0003d2eb_S1_08.tif',
 'drivendata-competition-biomassters-public-us/train_features/0003d2eb_S1_09.tif',
 'drivendata-competition-biomassters-public-us/train_features/0003d2eb_S1_10.tif',
 'drivendata-competition-biomassters-public-us/train_features/0003d2eb_S1_11.tif',
 'dr