# Running fake data tests

In [31]:
%load_ext autoreload
%autoreload
import pandas as pd
import hvplot
import numpy as np
import os
pd.options.plotting.backend = 'holoviews'
import s3fs
import sys; sys.path.append('..')
import eodc_hub_role
import zarr_reader
import zarr_helpers

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Setup 2: Setup data access

Fetch data from the fake data directory.

In [5]:
# Credentials come from the project's role helper.
# NOTE(review): presumably temporary AWS session credentials — confirm in eodc_hub_role.
credentials = eodc_hub_role.fetch_and_set_credentials()

# Bucket and top-level prefix that hold the fake test datasets.
bucket = 'nasa-eodc-data-store'
fake_data_dir = 'fake_data'

# Authenticated S3 filesystem handle used by the directory-listing cells below.
s3_credentials = dict(
    key=credentials['AccessKeyId'],
    secret=credentials['SecretAccessKey'],
    token=credentials['SessionToken'],
)
s3_fs = s3fs.S3FileSystem(anon=False, **s3_credentials)

## Test 1: Data with single chunk, varied chunk size

Hypothesis: Larger chunk sizes mean slower tile times, at all zoom levels.

### Step 1: Generate dataset specs

In [22]:
# Prefix inside the bucket that holds the single-chunk stores.
data_path = 'fake_data/single_chunk'

# List the entries under that prefix and hand them to the spec helper.
directories = s3_fs.ls('/'.join((bucket, data_path)))
datastore_specs = zarr_helpers.get_dataset_specs_from_directory(directories)

### Step 2: Inspect the dataset specs

In [23]:
# Spec columns left out of the summary table. Defined in this cell so it runs on a
# fresh kernel (the name was previously only assigned in a later cell).
drop_columns = ['collection_name', 'variable', 'source', 'dtype', 'number_coord_chunks', 'compression']

# One row per store, ordered by chunk size (the variable under test in this section).
df = pd.DataFrame.from_dict(datastore_specs, orient='index')
df.drop(columns=drop_columns).sort_values('chunk_size_mb')

Unnamed: 0,shape,lat_resolution,lon_resolution,chunk_size_mb,chunks,number_of_chunks
store_lat1448_lon2896.zarr,"{'time': 1, 'lat': 1448, 'lon': 2896}",0.124395,0.124352,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",1
store_lat2048_lon4096.zarr,"{'time': 1, 'lat': 2048, 'lon': 4096}",0.087934,0.087912,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",2
store_lat2896_lon5792.zarr,"{'time': 1, 'lat': 2896, 'lon': 5792}",0.062176,0.062165,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",4
store_lat4096_lon8192.zarr,"{'time': 1, 'lat': 4096, 'lon': 8192}",0.043956,0.043951,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",8
store_lat5793_lon11586.zarr,"{'time': 1, 'lat': 5793, 'lon': 11586}",0.031077,0.031075,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",16


### Step 3: Run the tests

In [None]:
%%time
zooms = range(12)
test_results = zarr_helpers.run_tests(datastore_specs, zooms)

#### (Optional) Inspect the results

In [19]:
# Build the results table from the output of the test run (test_results), not from
# the input specs, so the 'mean tile time' column reflects the timings just collected.
result_columns = ['chunk_size_mb', 'mean tile time', 'number_of_chunks']
df = pd.DataFrame.from_dict(
    test_results,
    orient='index',
    columns=result_columns
).sort_values('chunk_size_mb')

df

Unnamed: 0,chunk_size_mb,mean tile time,number_of_chunks
store_lat1448_lon2896.zarr,31.993164,"{0: 2926.53, 1: 2562.335, 2: 2507.56, 3: 2554....",1
store_lat2048_lon4096.zarr,31.993164,"{0: 4870.98, 1: 3016.965, 2: 5398.635, 3: 1219...",2
store_lat2896_lon5792.zarr,31.993164,"{0: 7474.389999999999, 1: 4231.54, 2: 3708.015...",4
store_lat4096_lon8192.zarr,31.993164,"{0: 16449.635000000002, 1: 7389.46, 2: 4422.99...",8
store_lat5793_lon11586.zarr,31.993164,"{0: 27202.21, 1: 8153.5599999999995, 2: 8263.1...",16


### Step 4: Plot results

#### Time as function of chunk size

We see that as chunk size increases, so does time, with lower zoom levels seeing the most latency.

Question: Why is there so much variation in tile time at the larger chunk sizes? Is it only re-projecting a subset of the chunk at higher zoom levels?

In [15]:
# Plot tile time as a function of chunk size, with one series per zoom level.

# 'mean tile time' holds a dict keyed by integer zoom level; expand it into one
# column per zoom. `zooms` is the iterable of zoom levels from the test-run cell.
for zoom in zooms:
    df[zoom] = df['mean tile time'].apply(lambda times: times[zoom])

# String labels for the per-zoom series. `zooms` itself is left untouched so this
# cell can be re-run.
# NOTE(review): columns are keyed by int while y uses string labels, mirroring the
# multi-chunk plot cell — confirm the plotting backend matches them.
zoom_labels = [str(zoom) for zoom in zooms]
df.plot.scatter(x='chunk_size_mb', y=zoom_labels, value_label='Time in ms', group_label='zoom')

## Test 2: Data with multiple chunks

Hypothesis: More chunks, when chunk size is constant, results in slower tile times. Performance is much worse at lower zoom levels (zoom 0, 1, 2) because more chunks must be loaded to generate the tile.

### Step 1: Generate dataset specs

In [22]:
# Prefix inside the bucket that holds the multi-chunk stores.
data_path = 'fake_data/with_chunks'

# List the entries under that prefix and hand them to the spec helper.
directories = s3_fs.ls('/'.join((bucket, data_path)))
datastore_specs = zarr_helpers.get_dataset_specs_from_directory(directories)

### Step 2: Inspect the dataset specs

In [23]:
# Spec columns left out of the summary table.
drop_columns = ['collection_name', 'variable', 'source', 'dtype', 'number_coord_chunks', 'compression']

# One row per store, ordered by how many chunks each store contains.
df = pd.DataFrame.from_dict(datastore_specs, orient='index')
df.drop(columns=drop_columns).sort_values(by='number_of_chunks')

Unnamed: 0,shape,lat_resolution,lon_resolution,chunk_size_mb,chunks,number_of_chunks
store_lat1448_lon2896.zarr,"{'time': 1, 'lat': 1448, 'lon': 2896}",0.124395,0.124352,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",1
store_lat2048_lon4096.zarr,"{'time': 1, 'lat': 2048, 'lon': 4096}",0.087934,0.087912,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",2
store_lat2896_lon5792.zarr,"{'time': 1, 'lat': 2896, 'lon': 5792}",0.062176,0.062165,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",4
store_lat4096_lon8192.zarr,"{'time': 1, 'lat': 4096, 'lon': 8192}",0.043956,0.043951,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",8
store_lat5793_lon11586.zarr,"{'time': 1, 'lat': 5793, 'lon': 11586}",0.031077,0.031075,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",16


### Step 3: Run the tests

In [34]:
%%time
niters = 2
zooms = range(12) # Zoom 10 is the level at which you can see large roads, 15 is buildings
test_results = zarr_helpers.run_tests(datastore_specs, zooms)

CPU times: user 1min 18s, sys: 25.2 s, total: 1min 44s
Wall time: 9min 9s


#### (Optional) Inspect the results

In [19]:
# Results table for the multi-chunk test. chunk_size_mb is the same for every store
# in this test, so order the rows by number_of_chunks (the variable under test),
# matching the spec table and the plot in this section.
result_columns = ['chunk_size_mb', 'mean tile time', 'number_of_chunks']
df = pd.DataFrame.from_dict(
    test_results,
    orient='index',
    columns=result_columns
).sort_values('number_of_chunks')

df

Unnamed: 0,chunk_size_mb,mean tile time,number_of_chunks
store_lat1448_lon2896.zarr,31.993164,"{0: 2926.53, 1: 2562.335, 2: 2507.56, 3: 2554....",1
store_lat2048_lon4096.zarr,31.993164,"{0: 4870.98, 1: 3016.965, 2: 5398.635, 3: 1219...",2
store_lat2896_lon5792.zarr,31.993164,"{0: 7474.389999999999, 1: 4231.54, 2: 3708.015...",4
store_lat4096_lon8192.zarr,31.993164,"{0: 16449.635000000002, 1: 7389.46, 2: 4422.99...",8
store_lat5793_lon11586.zarr,31.993164,"{0: 27202.21, 1: 8153.5599999999995, 2: 8263.1...",16


### Step 4: Plot the results

#### Time as a function of number of chunks

Fewer chunks means faster load times. Once you have 16 spatial chunks at 8MB, you see the time in ms at zoom 0 go to 339ms.

In [49]:
# Plot tile time as a function of the number of chunks, with one series per zoom level.

# 'mean tile time' holds a dict keyed by integer zoom level; expand it into one
# column per zoom.
for zoom in zooms:
    df[zoom] = df['mean tile time'].apply(lambda times: times[zoom])

# Keep `zooms` as the original iterable of ints and use a separate list for the
# string labels; rebinding `zooms` to strings made this cell fail when re-run,
# because the per-zoom dicts are keyed by int.
zoom_labels = list(map(str, range(len(zooms))))
df.plot.scatter(x='number_of_chunks', y=zoom_labels, value_label='Time in ms', group_label='zoom')

#### Time as a function of zoom

At higher zooms, number of chunks doesn't matter (when chunks are the same size). At low zooms, more chunks means longer time to tile, which makes intuitive sense as more chunks need to be fetched to generate the tile.

In [50]:
# Reshape to long form: the remaining (per-zoom) columns become rows, with the
# column label in 'zoom' and the cell contents in 'value'.
per_zoom_wide = df.drop(columns=['mean tile time'])
df2 = per_zoom_wide.melt(
    id_vars=['chunk_size_mb', 'number_of_chunks'],
    var_name='zoom',
)
df2.plot.scatter(x='zoom', y='value', by='number_of_chunks')

Once we determine what is the threshold for time, we should be able to determine if we need to chunk and / or pyramid.

In this scenario, if our threshold is 100 ms, the chunk size should be 32 MB. If the resolution of our data is such that we have more than 2 chunks of that size, we should pyramid the data for zoom levels 0, 1 and 2.