# Experiments with `fsspec`

In [3]:
import os
from pathlib import Path
from time import perf_counter
from time import time

import fsspec
import s3fs

In [7]:
import rasterio

In [4]:
# Root of the S3 bucket and the three top-level data folders inside it.
# Each folder URL keeps its trailing slash so a file name can be appended directly.
S3_URL = "s3://drivendata-competition-biomassters-public-us"
train_features_s3 = f"{S3_URL}/train_features/"
train_agbm_s3 = f"{S3_URL}/train_agbm/"
test_features_s3 = f"{S3_URL}/test_features/"

In [14]:
# Connection options reused by both the s3fs and the fsspec calls in this notebook.
# `anon=True` requests an anonymous connection, so no AWS credentials are needed.
storage_options = dict(anon=True)
fs = s3fs.S3FileSystem(**storage_options)

In [27]:
# Sample files for one chip (id 0003d2eb): its S1 and S2 feature rasters and its
# agbm raster. The paths are built from the folder constants defined next to
# S3_URL so the bucket name lives in one place.
# NOTE(review): the original comment here read "Check filter_img"; nothing named
# filter_img is visible in this notebook — presumably a helper defined elsewhere.
feat_filename = f"{train_features_s3}0003d2eb_S1_00.tif"
s2feat_filename = f"{train_features_s3}0003d2eb_S2_00.tif"
agbm_filename = f"{train_agbm_s3}0003d2eb_agbm.tif"

In [11]:
# Time a full read of the S1 feature file through the s3fs filesystem.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals. It also avoids the bare name `time`, which a later cell rebinds to
# the module with `import time` (re-running this cell afterwards would fail).
start = perf_counter()
with fs.open(feat_filename) as f:
    raw_data = f.read()
end = perf_counter()
print(f"S3FS read time = {end - start}")

S3FS read time = 0.24098920822143555


In [28]:
# Time a full read of the S2 feature file through the s3fs filesystem; the bytes
# are kept in `s2raw_data` for the in-memory decode further down.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals. It also avoids the bare name `time`, which a later cell rebinds to
# the module with `import time` (re-running this cell afterwards would fail).
start = perf_counter()
with fs.open(s2feat_filename) as f:
    s2raw_data = f.read()
end = perf_counter()
print(f"S3FS read time = {end - start}")

S3FS read time = 0.26908254623413086


In [16]:
# Time how long it takes to list every entry under the train_agbm folder.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals. It also avoids the bare name `time`, which a later cell rebinds to
# the module with `import time` (re-running this cell afterwards would fail).
start = perf_counter()
train_files = fs.ls("s3://drivendata-competition-biomassters-public-us/train_agbm")
end = perf_counter()
print(f"total files: {len(train_files)}")
print(f"time: {end - start}")

total files: 8689
time: 1.7435684204101562


In [18]:
# Time a full read of the first entry returned by the train_agbm listing.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals. It also avoids the bare name `time`, which a later cell rebinds to
# the module with `import time` (re-running this cell afterwards would fail).
start = perf_counter()
with fs.open(train_files[0]) as f:
    agbm_data = f.read()
end = perf_counter()
print(f"s3 read time: {end - start}")

s3 read time: 0.14272499084472656


In [29]:
from rasterio.io import MemoryFile

# Decode the S2 file's bytes without touching disk: wrap them in a rasterio
# MemoryFile, open that as a dataset, and read the dataset into an array.
with MemoryFile(s2raw_data) as memfile, memfile.open() as dataset:
    data_array = dataset.read()

In [30]:
# Inspect the dimensions of the array decoded from the S2 file.
data_array.shape

(11, 256, 256)

In [17]:
# Resolve the bucket URL into a (filesystem, path) pair through fsspec, passing
# the same connection options used for s3fs above.
# Note: this rebinds `fs`, replacing the s3fs.S3FileSystem instance created earlier;
# the cells below use this fsspec-resolved filesystem.
fs, path = fsspec.core.url_to_fs(S3_URL, **storage_options)

In [18]:
# Check that the resolved bucket path is reported as a directory.
fs.isdir(path)

True

In [19]:
# List the entries directly under the resolved bucket path.
files = fs.ls(path)

In [21]:
# Display the listing obtained from fs.ls(path).
files

['drivendata-competition-biomassters-public-us/features_metadata.csv',
 'drivendata-competition-biomassters-public-us/test_features',
 'drivendata-competition-biomassters-public-us/train_agbm',
 'drivendata-competition-biomassters-public-us/train_agbm_metadata.csv',
 'drivendata-competition-biomassters-public-us/train_features']

In [22]:
import time

In [23]:
# Scratch timing: record a wall-clock start timestamp. This relies on the
# `import time` in the preceding cell, which rebinds `time` to the module
# (earlier cells use `time` as the function from `from time import time`).
s1 = time.time()

In [24]:
# Seconds elapsed since `s1` was recorded in the previous cell.
time.time() - s1

7.624447822570801