# Test 2: Different number of chunks and same chunk size

In [41]:
%load_ext autoreload
%autoreload
import pandas as pd
import hvplot
import morecantile
import numpy as np
import os
pd.options.plotting.backend = 'holoviews'
import random
import sys
sys.path.append('..')
from profiler.main import Timer
import zarr_reader
import zarr_helpers
from titiler_xarray.titiler.xarray.reader import ZarrReader

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Define tiles to test

In [42]:
# Tile matrix set used to translate lon/lat points into tile indices
tms = morecantile.tms.get("WebMercatorQuad")
# Zoom levels 0 through 11.
# For reference: zoom 10 is the level at which you can see large roads, 15 is buildings
zooms = range(12)

def generate_random_tile(z):
    """Pick a random integer lon/lat point and return its tile as ``(x, y, z)``.

    The latitude is drawn from [-85, 85] and the longitude from [-175, 175]
    (both inclusive); the tile is looked up through the module-level ``tms``
    at zoom level ``z``.
    """
    # Latitude is drawn first, then longitude, so the random stream is consumed
    # in a fixed order.
    lat = random.randint(-85, 85)
    lon = random.randint(-175, 175)
    picked = tms.tile(lon, lat, z)
    return (picked.x, picked.y, picked.z)

## Define data location

Local for now.

In [44]:
# Fake data directory
fake_data_dir = 'fake_data_with_chunks'

# List all items in the directory (sorted so stores are processed in a stable order)
items = sorted(os.listdir(fake_data_dir))

# Initiate a dictionary to store dataset information, keyed by store name
datastore_specs = {}
variable = 'data'
# Loop through each item and collect its specs if it's a Zarr store
for item in items:
    item_path = os.path.join(fake_data_dir, item)
    # Zarr stores are directories, so anything else is skipped
    if not os.path.isdir(item_path):
        continue
    try:
        # get the dataset specs
        ds = zarr_helpers.open_dataset(item_path)
        # basename instead of split('/') so the name is correct on any OS
        dataset_name = os.path.basename(item_path)
        ds_specs = zarr_helpers.get_dataset_specs(dataset_name, item_path, variable, ds)
        # product of all dimension sizes / product of all chunk dimension sizes
        # NOTE(review): this is a ratio of element counts, not a count of stored
        # chunks; when the shape is not an exact multiple of the chunk shape the
        # two can differ -- confirm this is the intended metric.
        number_of_chunks = round(np.prod(list(ds_specs['shape'].values())) / np.prod(list(ds_specs['chunks'].values())))
        ds_specs['number_of_chunks'] = number_of_chunks
        # Register the store only once its specs are complete, so a failure above
        # never leaves a half-populated entry behind
        datastore_specs[dataset_name] = ds_specs
    except Exception as e:
        # Print an error message if unable to open the Zarr store
        print(f"Could not open {item} as a Zarr store. Error: {e}")

## Inspect the dataset specs

In [45]:
# One row per store; hide the bookkeeping columns and order the rows by chunk size
df = pd.DataFrame.from_dict(datastore_specs, orient='index')
df.drop(
    ['collection_name', 'variable', 'source', 'dtype', 'number_coord_chunks', 'compression'],
    axis='columns',
).sort_values(by='chunk_size_mb')

Unnamed: 0,shape,lat_resolution,lon_resolution,chunk_size_mb,chunks,number_of_chunks
store_lat_1448x_lon_2896.zarr,"{'time': 1, 'lat': 1448, 'lon': 2896}",0.124395,0.124352,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",1
store_lat_5792x_lon_11585.zarr,"{'time': 1, 'lat': 5792, 'lon': 11585}",0.031083,0.031077,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",16
store_lat_4096x_lon_8192.zarr,"{'time': 1, 'lat': 4096, 'lon': 8192}",0.043956,0.043951,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",8
store_lat_2896x_lon_5793.zarr,"{'time': 1, 'lat': 2896, 'lon': 5793}",0.062176,0.062155,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",4
store_lat_2048x_lon_4096.zarr,"{'time': 1, 'lat': 2048, 'lon': 4096}",0.087934,0.087912,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",2


## Define a function for timing tile generation

In [46]:
def time_tile_generation(zoom: int, source: str, variable: str) -> float:
    """Time the generation of one randomly chosen 256x256 tile.

    A random tile at ``zoom`` is picked with ``generate_random_tile``. The timed
    region covers both opening ``source`` with ``ZarrReader`` and rendering the
    tile, so reader start-up cost is part of the measurement.

    Args:
        zoom: Zoom level of the tile to generate.
        source: Path or URL of the Zarr store, passed straight to ``ZarrReader``.
        variable: Name of the variable to read from the store.

    Returns:
        ``Timer.elapsed`` multiplied by 1000 and rounded to two decimals
        (milliseconds, assuming ``Timer.elapsed`` is in seconds -- TODO confirm).
    """
    x, y, z = generate_random_tile(zoom)
    with Timer() as t:
        with ZarrReader(
            source,
            variable=variable,
            reference=False
        ) as src_dst:
            # Only the elapsed time matters; the rendered tile is discarded.
            src_dst.tile(
                x,
                y,
                z,
                tilesize=256,
            )
    return round(t.elapsed * 1000, 2)

In [47]:
%%time
niters = 10
for item in items:
    datastore_specs[item]['all tile times'] = {}
    datastore_specs[item]['mean tile time'] = {}    
    for zoom in zooms:
        tests_key = f'{zoom} tests'
        datastore_specs[item]['all tile times'][tests_key] = []
        test_results = datastore_specs[item]['all tile times'][tests_key]
        for i in range(niters):
            tile_time = time_tile_generation(zoom, f'{fake_data_dir}/{item}', variable)
            test_results.append(tile_time)
        datastore_specs[item]['mean tile time'][zoom] = np.mean(test_results)

CPU times: user 1min 33s, sys: 21.3 s, total: 1min 54s
Wall time: 1min 3s


In [48]:
# Rebuild the frame with only the columns needed for the timing analysis,
# ordered by chunk size
df = pd.DataFrame.from_dict(
    datastore_specs,
    orient='index',
    columns=['chunk_size_mb', 'mean tile time', 'number_of_chunks'],
)
df = df.sort_values(by='chunk_size_mb')

df

Unnamed: 0,chunk_size_mb,mean tile time,number_of_chunks
store_lat_1448x_lon_2896.zarr,31.993164,"{0: 79.355, 1: 57.962, 2: 53.854, 3: 50.73, 4:...",1
store_lat_5792x_lon_11585.zarr,31.993164,"{0: 1471.502, 1: 302.49, 2: 168.05400000000003...",16
store_lat_4096x_lon_8192.zarr,31.993164,"{0: 718.789, 1: 188.92800000000003, 2: 110.320...",8
store_lat_2896x_lon_5793.zarr,31.993164,"{0: 244.523, 1: 99.80900000000001, 2: 66.649, ...",4
store_lat_2048x_lon_4096.zarr,31.993164,"{0: 153.298, 1: 78.181, 2: 77.5, 3: 58.2109999...",2


## Time as a function of number of chunks

Fewer chunks means faster load times. With 16 spatial chunks of roughly 32 MB each, the mean tile time at zoom 0 rises to about 1470 ms, compared with about 80 ms for a single chunk.

In [49]:
# Plot time as a function of the number of chunks, with one series per zoom level

# Expand the per-zoom dict of mean times into one column per zoom level
for zoom in zooms:
    df[zoom] = df['mean tile time'].apply(lambda x: x[zoom])

# The plot call is given the zoom columns as string labels. They are kept under
# their own name so that `zooms` stays the integer range: overwriting it made
# this cell and the timing cell fail when re-run.
zoom_columns = [str(zoom) for zoom in zooms]
df.plot.scatter(x='number_of_chunks', y=zoom_columns, value_label='Time in ms', group_label='zoom')

## Time as a function of zoom

At higher zooms, number of chunks doesn't matter (when chunks are the same size). At low zooms, more chunks means longer time to tile, which makes intuitive sense as more chunks need to be fetched to generate the tile.

In [50]:
# Reshape to long format: the per-zoom columns become (zoom, value) pairs,
# giving one row per store and zoom level
df2 = (
    df
    .drop(columns=['mean tile time'])
    .melt(id_vars=['chunk_size_mb', 'number_of_chunks'], var_name='zoom')
)
df2.plot.scatter(x='zoom', y='value', by='number_of_chunks')

Once we determine what is the threshold for time, we should be able to determine if we need to chunk and / or pyramid.

In this scenario, if our threshold is 100 ms, the chunk size should be 32 MB. If the resolution of our data is such that we have more than 2 chunks of that size, we should pyramid the data for zoom levels 0, 1 and 2.