# Running fake data tests

In [1]:
# Load the autoreload extension and trigger a reload of already-imported modules.
%load_ext autoreload
%autoreload
import pandas as pd
import hvplot
import json
import numpy as np
import os
import statsmodels.api as sm
# Route DataFrame.plot calls through the holoviews plotting backend.
pd.options.plotting.backend = 'holoviews'
import s3fs
# Put the parent directory on the import path; the three local modules below
# are presumably located there -- TODO confirm.
import sys; sys.path.append('..');
import eodc_hub_role
import zarr_reader
import zarr_helpers

In [None]:
#%%capture
#!pip install pydantic==1.10.9 rio-tiler==4.1.11 loguru 
#!pip install pydantic==1.10.9

## Setup 2: Setup data access

Fetch data from the fake data directory.

In [2]:
# Ask the eodc_hub_role helper for credentials; the result is indexed below by
# 'AccessKeyId', 'SecretAccessKey' and 'SessionToken'.
credentials = eodc_hub_role.fetch_and_set_credentials()

# Bucket and top-level prefix holding the fake datasets.
bucket = 'nasa-eodc-data-store'
fake_data_dir = 'fake_data'

# Authenticated (non-anonymous) S3 filesystem handle used for all listings.
s3_fs = s3fs.S3FileSystem(
    anon=False,
    key=credentials['AccessKeyId'],
    secret=credentials['SecretAccessKey'],
    token=credentials['SessionToken'],
)

## Test 1: Data with single chunk, varied chunk size

Hypothesis: Larger chunk sizes mean slower tile times, at all zoom levels.

### Step 1: Generate dataset specs

In [None]:
# List the stores under both fake-data prefixes (single-chunk stores first,
# then the multi-chunk ones) and build the dataset specs from the combined list.
directories = []
for data_path in ('fake_data/single_chunk', 'fake_data/with_chunks'):
    directories.extend(s3_fs.ls(f'{bucket}/{data_path}'))
datastore_specs = zarr_helpers.get_dataset_specs_from_directory(directories)

### Step 2: Inspect the dataset specs

In [None]:
# One row per key of `datastore_specs`.
df = pd.DataFrame.from_dict(datastore_specs, orient='index')

# Columns hidden from the overview so the chunk layout is easier to compare.
drop_columns = [
    'collection_name',
    'variable',
    'source',
    'dtype',
    'number_coord_chunks',
    'compression',
]

# Display the remaining columns ordered by chunk count, then by chunk size.
df.drop(drop_columns, axis=1).sort_values(by=['number_of_chunks', 'chunk_size_mb'])

### Step 3: Run the tests

In [5]:
# Zoom levels under test: 0 through 11 inclusive.
zooms = range(0, 12)

In [None]:
%%time
# Time the whole cell. Run the tile tests for every dataset spec across all
# zoom levels, overriding the iteration count with niters=10 (the original
# author notes that the default is only two iterations).
test_results = zarr_helpers.run_tests(datastore_specs, zooms, niters=10)

In [2]:
# To refresh the cached results after a new test run, uncomment the lines
# below. np.float64 values are converted to plain floats so json can encode them.
#
# def json_serializable(z):
#     if type(z) == np.float64:
#         return float(z)
#
# with open('test_results.json', 'w') as f:
#     json.dump(test_results, f, default=json_serializable)

# Load the cached results. The context manager guarantees the file handle is
# closed as soon as the JSON has been parsed.
with open('test_results.json', 'r') as f:
    test_results = json.load(f)

#### (Optional) Inspect the results

In [3]:
# Fields kept from each test result; 'all tile times' can also be listed here.
summary_columns = ['chunk_size_mb', 'mean tile time', 'number_of_chunks']

# One row per key of `test_results`, ordered by chunk size.
df = pd.DataFrame.from_dict(test_results, orient='index', columns=summary_columns)
df = df.sort_values(by='chunk_size_mb')
df

Unnamed: 0,chunk_size_mb,mean tile time,number_of_chunks
single_chunk/store_lat512_lon1024.zarr,4.0,"{'0': 191.524, '1': 144.714, '2': 141.473, '3'...",1
single_chunk/store_lat724_lon1448.zarr,7.998291,"{'0': 215.281, '1': 167.673, '2': 178.08499999...",1
single_chunk/store_lat1024_lon2048.zarr,16.0,"{'0': 290.255, '1': 257.723, '2': 270.296, '3'...",1
single_chunk/store_lat1448_lon2896.zarr,31.993164,"{'0': 441.639, '1': 414.13499999999993, '2': 3...",1
with_chunks/store_lat1448_lon2896.zarr,31.993164,"{'0': 494.775, '1': 408.248, '2': 422.66900000...",1
with_chunks/store_lat2048_lon4096.zarr,31.993164,"{'0': 533.036, '1': 420.467, '2': 302.993, '3'...",2
with_chunks/store_lat2896_lon5792.zarr,31.993164,"{'0': 758.4399999999999, '1': 453.621999999999...",4
with_chunks/store_lat4096_lon8192.zarr,31.993164,"{'0': 1227.561, '1': 510.713, '2': 405.757, '3...",8
with_chunks/store_lat5793_lon11586.zarr,31.993164,"{'0': 2493.807, '1': 723.915, '2': 507.5659999...",16
single_chunk/store_lat2048_lon4096.zarr,64.0,"{'0': 910.0799999999999, '1': 753.287, '2': 74...",1


### Step 4: Plot results

#### Time as function of chunk size

We see that as chunk size increases, so does time, with lower zoom levels seeing the most latency.

Question: Why is there so much variation in tile time at the larger chunk sizes? Is it only re-projecting a subset of the chunk at higher zoom levels?

In [6]:
# want to plot time as a function of chunksize, with multiple lines for each zoom
zooms_as_str = list(map(str, zooms))
for zoom in zooms:
    df[zoom] = df['mean tile time'].apply(lambda x: x[str(zoom)])

In [7]:
# Display the results frame, including the per-zoom columns.
df

Unnamed: 0,chunk_size_mb,mean tile time,number_of_chunks,0,1,2,3,4,5,6,7,8,9,10,11
single_chunk/store_lat512_lon1024.zarr,4.0,"{'0': 191.524, '1': 144.714, '2': 141.473, '3'...",1,191.524,144.714,141.473,133.139,148.317,138.334,138.144,136.732,145.535,149.723,144.834,159.163
single_chunk/store_lat724_lon1448.zarr,7.998291,"{'0': 215.281, '1': 167.673, '2': 178.08499999...",1,215.281,167.673,178.085,187.109,194.751,181.003,190.828,169.757,180.015,200.999,194.751,171.07
single_chunk/store_lat1024_lon2048.zarr,16.0,"{'0': 290.255, '1': 257.723, '2': 270.296, '3'...",1,290.255,257.723,270.296,263.358,281.194,272.016,294.778,283.838,256.292,298.188,250.796,345.487
single_chunk/store_lat1448_lon2896.zarr,31.993164,"{'0': 441.639, '1': 414.13499999999993, '2': 3...",1,441.639,414.135,397.107,402.457,402.822,393.86,405.924,411.027,404.855,393.224,394.332,406.127
with_chunks/store_lat1448_lon2896.zarr,31.993164,"{'0': 494.775, '1': 408.248, '2': 422.66900000...",1,494.775,408.248,422.669,406.002,399.753,404.1,403.2,401.06,404.556,402.224,407.369,402.059
with_chunks/store_lat2048_lon4096.zarr,31.993164,"{'0': 533.036, '1': 420.467, '2': 302.993, '3'...",2,533.036,420.467,302.993,308.292,321.281,275.775,316.881,330.839,335.281,324.514,354.245,349.817
with_chunks/store_lat2896_lon5792.zarr,31.993164,"{'0': 758.4399999999999, '1': 453.621999999999...",4,758.44,453.622,416.912,417.549,418.513,409.615,412.941,430.221,406.547,416.768,412.356,403.562
with_chunks/store_lat4096_lon8192.zarr,31.993164,"{'0': 1227.561, '1': 510.713, '2': 405.757, '3...",8,1227.561,510.713,405.757,389.628,381.954,363.669,379.325,379.753,379.793,372.423,384.726,371.212
with_chunks/store_lat5793_lon11586.zarr,31.993164,"{'0': 2493.807, '1': 723.915, '2': 507.5659999...",16,2493.807,723.915,507.566,421.024,413.175,413.206,405.155,446.403,429.853,404.926,412.599,410.937
single_chunk/store_lat2048_lon4096.zarr,64.0,"{'0': 910.0799999999999, '1': 753.287, '2': 74...",1,910.08,753.287,744.769,740.93,738.999,748.405,743.32,738.035,743.005,740.86,762.834,757.947


In [8]:
# Select the rows whose index label contains 'single_chunk' (case-insensitive).
index_labels = df.index.to_series()
mask = index_labels.str.contains('single_chunk', case=False)
single_chunk_filtered_df = df.loc[mask]

# Scatter of tile time against chunk size, one series per zoom level.
single_chunk_filtered_df.plot.scatter(
    x='chunk_size_mb',
    y=zooms_as_str,
    value_label='Time in ms',
    group_label='zoom',
)

#### Time as a function of number of chunks

Fewer chunks means faster load times. With 16 spatial chunks of roughly 32 MB each, the mean tile time at zoom 0 climbs to about 2,494 ms.

In [9]:
# Select the rows whose index label contains 'with_chunks' (case-insensitive).
index_labels = df.index.to_series()
mask = index_labels.str.contains('with_chunks', case=False)
multiple_chunks_filtered_df = df.loc[mask]

# Scatter of tile time against the number of chunks, one series per zoom level.
multiple_chunks_filtered_df.plot.scatter(
    x='number_of_chunks',
    y=zooms_as_str,
    value_label='Time in ms',
    group_label='zoom',
)

#### Time as a function of zoom

At higher zooms, number of chunks doesn't matter (when chunks are the same size). At low zooms, more chunks means longer time to tile, which makes intuitive sense as more chunks need to be fetched to generate the tile.

For a given chunk size, performance is roughly constant across zoom levels. As the number of chunks increases, however, performance becomes significantly worse at lower zoom levels than at higher ones.

In [23]:
# Reshape to long form: one row per (dataset, zoom) pair, with the tile time
# in the 'value' column and the zoom level in the 'zoom' column.
single_chunk_filtered_df_melted = (
    single_chunk_filtered_df
    .drop('mean tile time', axis=1)
    .melt(id_vars=['chunk_size_mb', 'number_of_chunks'], var_name='zoom')
)

# Tile time against zoom, grouped by chunk size.
single_chunk_filtered_df_melted.plot.scatter(
    x='zoom',
    y='value',
    by='chunk_size_mb',
)

In [11]:
# Reshape to long form: one row per (dataset, zoom) pair, with the tile time
# in the 'value' column and the zoom level in the 'zoom' column.
multiple_chunks_filtered_df_melted = (
    multiple_chunks_filtered_df
    .drop('mean tile time', axis=1)
    .melt(id_vars=['chunk_size_mb', 'number_of_chunks'], var_name='zoom')
)

# Tile time against zoom, grouped by number of chunks.
multiple_chunks_filtered_df_melted.plot.scatter(
    x='zoom',
    y='value',
    by='number_of_chunks',
)

In [8]:
# Long-form version of the full results frame: the dict column is dropped and
# every per-zoom column becomes a (zoom, value) row.
df_melted = (
    df
    .drop('mean tile time', axis=1)
    .melt(id_vars=['chunk_size_mb', 'number_of_chunks'], var_name='zoom')
)
df_melted

Unnamed: 0,chunk_size_mb,number_of_chunks,zoom,value
0,4.000000,1,0,191.524
1,7.998291,1,0,215.281
2,16.000000,1,0,290.255
3,31.993164,1,0,441.639
4,31.993164,1,0,494.775
...,...,...,...,...
139,31.993164,8,11,371.212
140,31.993164,16,11,410.937
141,64.000000,1,11,757.947
142,127.972656,1,11,1386.373


## Model the relationship...

between time to tile (dependent variable) and independent variables (zoom, number of chunks and size of chunks).

In [9]:
# First, add an encoding for each zoom level
df_encoded = df_melted.copy()
for zoom in zooms:
    df_encoded[f'zoom {zoom}'] = np.where(df_melted['zoom'] == zoom, 1, 0)
    
#df_encoded['zoom'] = df_encoded['zoom'].astype(int)

In [27]:
# Display the long-form frame together with its 'zoom <n>' indicator columns.
df_encoded

Unnamed: 0,chunk_size_mb,number_of_chunks,zoom,value,zoom 0,zoom 1,zoom 2,zoom 3,zoom 4,zoom 5,zoom 6,zoom 7,zoom 8,zoom 9,zoom 10,zoom 11
0,4.000000,1,0,191.524,1,0,0,0,0,0,0,0,0,0,0,0
1,7.998291,1,0,215.281,1,0,0,0,0,0,0,0,0,0,0,0
2,16.000000,1,0,290.255,1,0,0,0,0,0,0,0,0,0,0,0
3,31.993164,1,0,441.639,1,0,0,0,0,0,0,0,0,0,0,0
4,31.993164,1,0,494.775,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,31.993164,8,11,371.212,0,0,0,0,0,0,0,0,0,0,0,1
140,31.993164,16,11,410.937,0,0,0,0,0,0,0,0,0,0,0,1
141,64.000000,1,11,757.947,0,0,0,0,0,0,0,0,0,0,0,1
142,127.972656,1,11,1386.373,0,0,0,0,0,0,0,0,0,0,0,1


Once we determine what is the threshold for time, we should be able to determine if we need to chunk and / or pyramid.

In this scenario, if our threshold is 100ms, the chunk size should be 32mb. If the resolution of our data is such that we have more than 2 chunks of that size, we should pyramid the data for zoom levels 0, 1 and 2.

#### What is the relationship between number and size of chunks across zooms?

In [10]:
# Regressors: the two chunking descriptors followed by one indicator per zoom.
ind_variables = ['number_of_chunks', 'chunk_size_mb'] + [f'zoom {zoom}' for zoom in zooms]
print(ind_variables)

['number_of_chunks', 'chunk_size_mb', 'zoom 0', 'zoom 1', 'zoom 2', 'zoom 3', 'zoom 4', 'zoom 5', 'zoom 6', 'zoom 7', 'zoom 8', 'zoom 9', 'zoom 10', 'zoom 11']


In [11]:
# Create the design matrix
X = df_encoded[ind_variables]
y = df_encoded['value']

# Add constant
X_with_const = sm.add_constant(X)

# Create and fit the model
model = sm.OLS(y, X_with_const).fit()

# Display summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  value   R-squared:                       0.952
Model:                            OLS   Adj. R-squared:                  0.947
Method:                 Least Squares   F-statistic:                     197.6
Date:                Sat, 26 Aug 2023   Prob (F-statistic):           7.91e-79
Time:                        16:17:59   Log-Likelihood:                -933.80
No. Observations:                 144   AIC:                             1896.
Df Residuals:                     130   BIC:                             1937.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const               63.5164     20.090  

In [16]:
# Indicator values for the prediction row: zoom 0 is switched on, every other
# zoom level is switched off.
zooms_to_predict = {f'zoom {zoom}': [1 if zoom == 0 else 0] for zoom in zooms}

# Single-row input: intercept, one chunk of 2 MB, then the zoom indicators.
# Keys follow the same order as the regressors used to fit the model.
values_to_predict = {
    'const': [1],
    'number_of_chunks': [1],
    'chunk_size_mb': [2],
    **zooms_to_predict,
}
print(values_to_predict)

exog_row = pd.DataFrame(values_to_predict)
model.predict(exog_row)

{'const': [1], 'number_of_chunks': [1], 'chunk_size_mb': [2], 'zoom 0': [1], 'zoom 1': [0], 'zoom 2': [0], 'zoom 3': [0], 'zoom 4': [0], 'zoom 5': [0], 'zoom 6': [0], 'zoom 7': [0], 'zoom 8': [0], 'zoom 9': [0], 'zoom 10': [0], 'zoom 11': [0]}


0    461.718623
dtype: float64

In [14]:
import pickle

# Persist the fitted OLS results to disk so they can be reloaded later
# without refitting.
with open('tile_model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)