# Test 1: Different chunk size, no spatial chunking

In [16]:
%load_ext autoreload
%autoreload
import pandas as pd
import hvplot
import morecantile
import os
pd.options.plotting.backend = 'holoviews'
import sys
sys.path.append('..')
from profiler.main import Timer
import zarr_reader
import zarr_helpers
from titiler_xarray.titiler.xarray.reader import ZarrReader

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Define tiles to test

In [18]:
# Tile matrix set used to convert a point into tile indices.
tms = morecantile.tms.get("WebMercatorQuad")

# Zoom levels 0-11. For reference: zoom 10 is roughly where large roads become
# visible and zoom 15 is building level (15 is beyond the range tested here).
zooms = range(12)

# One (x, y, z) triple per zoom level, each looked up from the same fixed
# point (57, 28) passed to tms.tile.
tiles_per_zoom = (tms.tile(57, 28, level) for level in zooms)
xyz_tiles = [(t.x, t.y, t.z) for t in tiles_per_zoom]

xyz_tiles

[(0, 0, 0),
 (1, 0, 1),
 (2, 1, 2),
 (5, 3, 3),
 (10, 6, 4),
 (21, 13, 5),
 (42, 26, 6),
 (84, 53, 7),
 (168, 107, 8),
 (337, 214, 9),
 (674, 428, 10),
 (1348, 857, 11)]

## Define data location

Local for now.

In [10]:
# Directory holding the locally generated test stores
fake_data_dir = 'fake_data_no_chunks'

# List all entries in the directory. os.listdir returns them in arbitrary
# order, so sort to make every run process the stores in the same sequence.
items = sorted(os.listdir(fake_data_dir))

# Specs for every store that could be opened, keyed by the store's directory name
datastore_specs = {}
variable = 'data'

for item in items:
    item_path = os.path.join(fake_data_dir, item)
    # Zarr stores are directories; skip stray files.
    if not os.path.isdir(item_path):
        continue
    try:
        ds = zarr_helpers.open_dataset(item_path)
        # Key by the directory entry itself rather than splitting the path on
        # '/', which depends on the platform's path separator. Later cells
        # look the specs up by this same entry name.
        dataset_name = item
        ds_specs = zarr_helpers.get_dataset_specs(dataset_name, item_path, variable, ds)
        datastore_specs[dataset_name] = ds_specs
    except Exception as e:
        # Best effort: report entries that cannot be read and carry on with the rest.
        print(f"Could not open {item} as a Zarr store. Error: {e}")

## Inspect the dataset specs

In [11]:
# One row per store, one column per spec field.
df = pd.DataFrame.from_dict(datastore_specs, orient='index')

# Spec fields left out of the overview table below.
hidden_columns = [
    'collection_name',
    'variable',
    'source',
    'dtype',
    'number_coord_chunks',
    'compression',
]
df.drop(columns=hidden_columns).sort_values('chunk_size_mb')

Unnamed: 0,shape,lat_resolution,lon_resolution,chunk_size_mb,chunks
store_lat_512x_lon_1024.zarr,"{'time': 1, 'lat': 512, 'lon': 1024}",0.35225,0.351906,4.0,"{'time': 1, 'lat': 512, 'lon': 1024}"
store_lat_724x_lon_1448.zarr,"{'time': 1, 'lat': 724, 'lon': 1448}",0.248963,0.248791,7.998291,"{'time': 1, 'lat': 724, 'lon': 1448}"
store_lat_1024x_lon_2048.zarr,"{'time': 1, 'lat': 1024, 'lon': 2048}",0.175953,0.175867,16.0,"{'time': 1, 'lat': 1024, 'lon': 2048}"
store_lat_1448x_lon_2896.zarr,"{'time': 1, 'lat': 1448, 'lon': 2896}",0.124395,0.124352,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}"
store_lat_2048x_lon_4096.zarr,"{'time': 1, 'lat': 2048, 'lon': 4096}",0.087934,0.087912,64.0,"{'time': 1, 'lat': 2048, 'lon': 4096}"
store_lat_2896x_lon_5793.zarr,"{'time': 1, 'lat': 2896, 'lon': 5793}",0.062176,0.062155,127.994751,"{'time': 1, 'lat': 2896, 'lon': 5793}"
store_lat_4096x_lon_8192.zarr,"{'time': 1, 'lat': 4096, 'lon': 8192}",0.043956,0.043951,256.0,"{'time': 1, 'lat': 4096, 'lon': 8192}"


In [5]:
## Define a function for timing tile generation

In [12]:
def time_tile_generation(tileset: tuple, source: str, variable: str, tilesize: int = 256) -> float:
    """Time how long it takes to open a store and render a single tile.

    The timed region covers both opening ``source`` with ``ZarrReader`` and
    the ``tile`` call, so the result includes reader set-up cost.

    Args:
        tileset: ``(x, y, z)`` tile indices.
        source: Path or URL of the store, passed to ``ZarrReader``.
        variable: Name of the variable to render.
        tilesize: Edge length of the rendered tile, forwarded to ``tile``.
            Defaults to 256, the value previously hard-coded.

    Returns:
        ``Timer.elapsed`` multiplied by 1000 and rounded to two decimals
        (milliseconds, assuming ``Timer.elapsed`` is in seconds - TODO confirm).
    """
    x, y, z = tileset
    with Timer() as t:
        with ZarrReader(source, variable=variable, reference=False) as src_dst:
            # Only the duration matters; the rendered tile is discarded.
            src_dst.tile(x, y, z, tilesize=tilesize)
    return round(t.elapsed * 1000, 2)

In [13]:
# Time every test tile against each store that was successfully registered.
# Iterating datastore_specs (rather than the raw directory listing) avoids a
# KeyError for entries that are not directories or that failed to open.
for dataset_name, specs in datastore_specs.items():
    store_path = os.path.join(fake_data_dir, dataset_name)
    # Map zoom level -> elapsed time returned by time_tile_generation.
    specs['tile_times'] = {
        xyz_tile[2]: time_tile_generation(xyz_tile, store_path, variable)
        for xyz_tile in xyz_tiles
    }

In [14]:
# Rebuild the frame with only the chunk size and the per-zoom timings,
# ordered from the smallest chunk to the largest.
timing_columns = ['chunk_size_mb', 'tile_times']
df = pd.DataFrame.from_dict(datastore_specs, orient='index', columns=timing_columns)
df = df.sort_values('chunk_size_mb')

df

Unnamed: 0,chunk_size_mb,tile_times
store_lat_512x_lon_1024.zarr,4.0,"{0: 38.3, 1: 33.15, 2: 34.45, 3: 29.95, 4: 32...."
store_lat_724x_lon_1448.zarr,7.998291,"{0: 59.05, 1: 53.94, 2: 39.02, 3: 36.12, 4: 34..."
store_lat_1024x_lon_2048.zarr,16.0,"{0: 68.21, 1: 47.52, 2: 46.48, 3: 43.55, 4: 45..."
store_lat_1448x_lon_2896.zarr,31.993164,"{0: 115.13, 1: 70.04, 2: 60.84, 3: 55.33, 4: 5..."
store_lat_2048x_lon_4096.zarr,64.0,"{0: 180.0, 1: 93.77, 2: 75.08, 3: 85.06, 4: 69..."
store_lat_2896x_lon_5793.zarr,127.994751,"{0: 293.86, 1: 168.31, 2: 120.91, 3: 106.56, 4..."
store_lat_4096x_lon_8192.zarr,256.0,"{0: 742.39, 1: 346.62, 2: 305.56, 3: 257.71, 4..."


# Time as function of chunk size

We see that tile generation time increases with chunk size, and that the lowest zoom levels have the highest latency.

Question: Why is there so much variation in tile time at the larger chunk sizes? Is it only re-projecting a subset of the chunk at higher zoom levels?

In [15]:
# Plot tile time as a function of chunk size, with one series per zoom level.

# Expand the per-store {zoom: time} dicts into one column per zoom level.
for tile in xyz_tiles:
    zoom = tile[2]
    df[zoom] = df['tile_times'].apply(lambda x: x[zoom])

# Take the series labels from the zoom levels that were actually timed instead
# of assuming they are exactly 0..N-1, so the plot keeps working if the tested
# zoom range changes. The labels are kept as strings, as before; presumably the
# plotting backend addresses the columns by their string names - verify if the
# backend changes.
zooms = [str(tile[2]) for tile in xyz_tiles]
df.plot.scatter(x='chunk_size_mb', y=zooms, value_label='Time in ms', group_label='zoom')