# Experiments with `fsspec`

In [3]:
import os
from pathlib import Path
from time import perf_counter, time

import fsspec
import s3fs

In [6]:
import rasterio

## Reading with `s3fs`

In [7]:
# Bucket root plus the three folder prefixes used in this notebook.
# Every prefix keeps its trailing slash.
S3_URL = "s3://drivendata-competition-biomassters-public-us"
train_features_s3 = f"{S3_URL}/train_features/"
train_agbm_s3 = f"{S3_URL}/train_agbm/"
test_features_s3 = f"{S3_URL}/test_features/"

In [8]:
# Options forwarded to s3fs; anon=True asks for an anonymous connection
# (no AWS credentials are supplied).
storage_options = dict(anon=True)
fs = s3fs.S3FileSystem(**storage_options)

In [9]:
# Sample files for chip 0003d2eb: one S1 and one S2 feature file (suffix 00)
# from train_features, plus the chip's file from train_agbm.
s1feat_filename, s2feat_filename, agbm_filename = (
    "s3://drivendata-competition-biomassters-public-us/train_features/0003d2eb_S1_00.tif",
    "s3://drivendata-competition-biomassters-public-us/train_features/0003d2eb_S2_00.tif",
    "s3://drivendata-competition-biomassters-public-us/train_agbm/0003d2eb_agbm.tif",
)

In [11]:
# Time a full download of the S1 feature file's bytes through s3fs.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time() follows the wall clock and can jump if the system clock is
# adjusted while the cell runs.
start = perf_counter()
with fs.open(s1feat_filename) as f:
    raw_data = f.read()
end = perf_counter()
print(f"S3FS read time = {end - start}")

S3FS read time = 0.2901594638824463


In [12]:
# Time a full download of the S2 feature file's bytes through s3fs.
# perf_counter() is used because it is monotonic and high-resolution, whereas
# time() tracks the wall clock and is not meant for interval measurement.
start = perf_counter()
with fs.open(s2feat_filename) as f:
    s2raw_data = f.read()
end = perf_counter()
print(f"S3FS read time = {end - start}")

S3FS read time = 0.09549403190612793


In [13]:
# Time a single listing of the train_agbm folder and report how many entries
# it returned. perf_counter() gives a monotonic interval measurement, unlike
# the wall-clock time().
start = perf_counter()
train_files = fs.ls("s3://drivendata-competition-biomassters-public-us/train_agbm")
end = perf_counter()
print(f"total files: {len(train_files)}")
print(f"time: {end - start}")

total files: 8689
time: 2.0048084259033203


In [14]:
# Time a full read of the first path in train_files through s3fs.
# perf_counter() is monotonic and high-resolution, so the elapsed value is not
# affected by wall-clock adjustments the way time() would be.
start = perf_counter()
with fs.open(train_files[0]) as f:
    agbm_data = f.read()
end = perf_counter()
print(f"s3 read time: {end - start}")

s3 read time: 0.1815347671508789


In [15]:
from rasterio.io import MemoryFile

# Decode the in-memory S2 bytes with rasterio: wrap them in a MemoryFile, open
# it as a dataset, and read the dataset into an array. Both context managers
# are released when the block exits.
with MemoryFile(s2raw_data) as memfile, memfile.open() as dataset:
    data_array = dataset.read()

  return DatasetReader(mempath, driver=driver, sharing=sharing, **kwargs)


In [16]:
# Display the shape of the array decoded from the S2 file bytes.
# NOTE(review): presumably (bands, rows, cols) per rasterio's read() — confirm.
data_array.shape

(11, 256, 256)

## Reading with `fsspec`

In [19]:
# anon=True asks for an anonymous connection (no AWS credentials are supplied).
storage_options = {'anon': True}

# The timed span covers opening the remote object, downloading its bytes, and
# decoding them with rasterio. This is more work than the s3fs cells time,
# since those stop the clock right after the byte download.
# perf_counter() is used because it is monotonic and high-resolution.
start = perf_counter()

# fsspec.open() returns an OpenFile. Using it directly as the context manager
# keeps the OpenFile referenced for the whole life of the file-like object and
# closes it deterministically; chaining `.open()` on it discards that
# reference, which fsspec's documentation warns against.
with fsspec.open(s1feat_filename, **storage_options) as f:
    s1raw = f.read()

# The bytes are fully in memory here, so the remote handle is already closed
# before decoding starts.
with MemoryFile(s1raw) as memfile, memfile.open() as dataset:
    data_array = dataset.read()
    # Stop the clock right after decoding, before the contexts are torn down.
    end = perf_counter()

In [20]:
# Report the elapsed time of the fsspec cell and the shape of the decoded
# array, one item per line.
print(
    f"read time with fsspec: {end - start}",
    f"data shape: {data_array.shape}",
    sep="\n",
)

read time with fsspec: 0.20982980728149414
data shape: (4, 256, 256)
