# Running fake data tests

In [2]:
%load_ext autoreload
%autoreload
import pandas as pd
import hvplot
import numpy as np
import os
pd.options.plotting.backend = 'holoviews'
import s3fs
import sys; sys.path.append('..');
import eodc_hub_role
import zarr_reader
import zarr_helpers

In [None]:
# Optional: uncomment the lines below to install the pinned dependencies into the environment.
#%%capture
#!pip install pydantic==1.10.9 rio-tiler==4.1.11 loguru 
#!pip install pydantic==1.10.9

## Setup 2: Set up data access

Fetch data from the fake data directory.

In [3]:
# Bucket and top-level prefix that hold the generated test stores.
bucket = 'nasa-eodc-data-store'
fake_data_dir = 'fake_data'

# Fetch temporary credentials and hand them to an authenticated S3 filesystem.
credentials = eodc_hub_role.fetch_and_set_credentials()
s3_auth = {
    'key': credentials['AccessKeyId'],
    'secret': credentials['SecretAccessKey'],
    'token': credentials['SessionToken'],
}
s3_fs = s3fs.S3FileSystem(anon=False, **s3_auth)

## Test 1: Data with single chunk, varied chunk size

Hypothesis: Larger chunk sizes mean slower tile times, at all zoom levels.

### Step 1: Generate dataset specs

In [4]:
# Location of the single-chunk stores inside the bucket
data_path = 'fake_data/single_chunk'

# List every entry under that prefix, then derive the specs for the listed stores.
listing_prefix = f'{bucket}/{data_path}'
directories = s3_fs.ls(listing_prefix)
datastore_specs = zarr_helpers.get_dataset_specs_from_directory(directories)

### Step 2: Inspect the dataset specs

In [5]:
# Columns hidden from the overview table (also reused when inspecting later tests).
drop_columns = [
    'collection_name',
    'variable',
    'source',
    'dtype',
    'number_coord_chunks',
    'compression',
]

# One row per store, keyed by the spec dictionary's keys.
df = pd.DataFrame.from_dict(datastore_specs, orient='index')
specs_overview = df.drop(columns=drop_columns)
specs_overview.sort_values(by='chunk_size_mb')

Unnamed: 0,shape,lat_resolution,lon_resolution,chunk_size_mb,chunks,number_of_chunks
store_lat512_lon1024.zarr,"{'time': 1, 'lat': 512, 'lon': 1024}",0.35225,0.351906,4.0,"{'time': 1, 'lat': 512, 'lon': 1024}",1
store_lat724_lon1448.zarr,"{'time': 1, 'lat': 724, 'lon': 1448}",0.248963,0.248791,7.998291,"{'time': 1, 'lat': 724, 'lon': 1448}",1
store_lat1024_lon2048.zarr,"{'time': 1, 'lat': 1024, 'lon': 2048}",0.175953,0.175867,16.0,"{'time': 1, 'lat': 1024, 'lon': 2048}",1
store_lat1448_lon2896.zarr,"{'time': 1, 'lat': 1448, 'lon': 2896}",0.124395,0.124352,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",1
store_lat2048_lon4096.zarr,"{'time': 1, 'lat': 2048, 'lon': 4096}",0.087934,0.087912,64.0,"{'time': 1, 'lat': 2048, 'lon': 4096}",1
store_lat2896_lon5792.zarr,"{'time': 1, 'lat': 2896, 'lon': 5792}",0.062176,0.062165,127.972656,"{'time': 1, 'lat': 2896, 'lon': 5792}",1
store_lat4096_lon8192.zarr,"{'time': 1, 'lat': 4096, 'lon': 8192}",0.043956,0.043951,256.0,"{'time': 1, 'lat': 4096, 'lon': 8192}",1


### Step 3: Run the tests

In [6]:
%%time
zooms = range(12)
# by default tests are only run twice
test_results = zarr_helpers.run_tests(datastore_specs, zooms, niters=10)

CPU times: user 3min 33s, sys: 1min 30s, total: 5min 3s
Wall time: 11min 56s


#### (Optional) Inspect the results

In [7]:
# Fields to keep from each test result; 'all tile times' is optional and can
# be added alongside the mean times, as done here.
result_columns = ['chunk_size_mb', 'mean tile time', 'number_of_chunks', 'all tile times']

results_unsorted = pd.DataFrame.from_dict(
    test_results,
    orient='index',
    columns=result_columns,
)
df = results_unsorted.sort_values(by='chunk_size_mb')

df['all tile times']

store_lat512_lon1024.zarr     {'0 tests': [411.33, 189.48, 209.19, 225.09, 1...
store_lat724_lon1448.zarr     {'0 tests': [433.33, 552.31, 196.37, 174.95, 1...
store_lat1024_lon2048.zarr    {'0 tests': [551.09, 414.42, 264.5, 260.96, 25...
store_lat1448_lon2896.zarr    {'0 tests': [863.04, 499.58, 419.27, 417.07, 4...
store_lat2048_lon4096.zarr    {'0 tests': [1671.36, 907.43, 927.72, 828.05, ...
store_lat2896_lon5792.zarr    {'0 tests': [2796.42, 1997.0, 1514.12, 1545.79...
store_lat4096_lon8192.zarr    {'0 tests': [6326.43, 3364.04, 3169.24, 3005.5...
Name: all tile times, dtype: object

### Step 4: Plot results

#### Time as function of chunk size

We see that as chunk size increases, so does time, with lower zoom levels seeing the most latency.

Question: Why is there so much variation in tile time at the larger chunk sizes? Is it only re-projecting a subset of the chunk at higher zoom levels?

In [10]:
# Plot time against chunk size, drawing a separate series for every zoom level.

# Pull each zoom's mean tile time out of the per-store dict into its own column.
for zoom in zooms:
    df[zoom] = [mean_times[zoom] for mean_times in df['mean tile time']]

# The y columns are referenced by their string names.
zooms_as_str = [str(zoom) for zoom in zooms]
df.plot.scatter(
    x='chunk_size_mb',
    y=zooms_as_str,
    value_label='Time in ms',
    group_label='zoom',
)

## Test 2: Data with multiple chunks

Hypothesis: With chunk size held constant, more chunks result in slower tile times. Performance is much worse at lower zoom levels (zoom 0, 1, 2) because more chunks must be loaded to generate the tile.

### Step 1: Generate dataset specs

In [11]:
# Location of the multi-chunk stores inside the bucket
data_path = 'fake_data/with_chunks'

# List every entry under that prefix, then derive the specs for the listed stores.
listing_prefix = f'{bucket}/{data_path}'
directories = s3_fs.ls(listing_prefix)
datastore_specs = zarr_helpers.get_dataset_specs_from_directory(directories)

### Step 2: Inspect the dataset specs

In [12]:
# One row per store; hide the same columns as in the Test 1 overview and
# order the rows by how many chunks each store has.
df = pd.DataFrame.from_dict(datastore_specs, orient='index')
specs_overview = df.drop(columns=drop_columns)
specs_overview.sort_values(by='number_of_chunks')

Unnamed: 0,shape,lat_resolution,lon_resolution,chunk_size_mb,chunks,number_of_chunks
store_lat1448_lon2896.zarr,"{'time': 1, 'lat': 1448, 'lon': 2896}",0.124395,0.124352,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",1
store_lat2048_lon4096.zarr,"{'time': 1, 'lat': 2048, 'lon': 4096}",0.087934,0.087912,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",2
store_lat2896_lon5792.zarr,"{'time': 1, 'lat': 2896, 'lon': 5792}",0.062176,0.062165,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",4
store_lat4096_lon8192.zarr,"{'time': 1, 'lat': 4096, 'lon': 8192}",0.043956,0.043951,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",8
store_lat5793_lon11586.zarr,"{'time': 1, 'lat': 5793, 'lon': 11586}",0.031077,0.031075,31.993164,"{'time': 1, 'lat': 1448, 'lon': 2896}",16


### Step 3: Run the tests

In [17]:
%%time
zooms = range(12) # Zoom 10 is the level at which you can see large roads, 15 is buildings
test_results = zarr_helpers.run_tests(datastore_specs, zooms, niters=10)

CPU times: user 2min 2s, sys: 28.2 s, total: 2min 30s
Wall time: 4min 29s


#### (Optional) Inspect the results

In [18]:
# Build the Test 2 results frame. Chunk size is the same for every store in
# this test, so sorting on it alone leaves the row order to chance; use
# number_of_chunks as the secondary key to get a deterministic, meaningful order.
df = pd.DataFrame.from_dict(
    test_results,
    orient='index',
    columns=['chunk_size_mb', 'mean tile time', 'number_of_chunks']
).sort_values(['chunk_size_mb', 'number_of_chunks'])

df

Unnamed: 0,chunk_size_mb,mean tile time,number_of_chunks
store_lat1448_lon2896.zarr,31.993164,"{0: 439.03999999999996, 1: 413.895, 2: 405.812...",1
store_lat2048_lon4096.zarr,31.993164,"{0: 493.116, 1: 411.13199999999995, 2: 367.609...",2
store_lat2896_lon5792.zarr,31.993164,"{0: 575.02, 1: 442.4889999999999, 2: 420.86500...",4
store_lat4096_lon8192.zarr,31.993164,"{0: 1055.986, 1: 499.014, 2: 418.601, 3: 386.3...",8
store_lat5793_lon11586.zarr,31.993164,"{0: 2532.163, 1: 739.38, 2: 566.3979999999999,...",16


### Step 4: Plot the results

#### Time as a function of number of chunks

Fewer chunks means faster load times. With 16 spatial chunks of about 32 MB each, the mean tile time at zoom 0 rises to roughly 2,532 ms, compared with about 439 ms for a single chunk.

In [19]:
# Plot time as a function of the number of chunks, with one series per zoom level.

# Pull each zoom's mean tile time out of the per-store dict into its own column.
for zoom in zooms:
    df[zoom] = df['mean tile time'].apply(lambda x: x[zoom])

# Keep `zooms` as the original integer range: rebinding it to strings would make
# this cell fail on a re-run, because the 'mean tile time' dicts are keyed by int.
zooms_as_str = list(map(str, zooms))
df.plot.scatter(x='number_of_chunks', y=zooms_as_str, value_label='Time in ms', group_label='zoom')

#### Time as a function of zoom

At higher zooms, number of chunks doesn't matter (when chunks are the same size). At low zooms, more chunks means longer time to tile, which makes intuitive sense as more chunks need to be fetched to generate the tile.

In [20]:
# Reshape to long form: every non-id column becomes a ('zoom', 'value') row,
# keeping chunk size and chunk count as identifiers.
per_zoom_times = df.drop(columns=['mean tile time'])
df2 = per_zoom_times.melt(
    id_vars=['chunk_size_mb', 'number_of_chunks'],
    var_name='zoom',
)
df2.plot.scatter(x='zoom', y='value', by='number_of_chunks')

In [22]:
# Display the Test 2 results frame, which now also has one mean-tile-time column per zoom level.
df

Unnamed: 0,chunk_size_mb,mean tile time,number_of_chunks,0,1,2,3,4,5,6,7,8,9,10,11
store_lat1448_lon2896.zarr,31.993164,"{0: 439.03999999999996, 1: 413.895, 2: 405.812...",1,439.04,413.895,405.812,404.49,405.879,401.889,404.176,404.142,403.964,400.557,401.96,407.395
store_lat2048_lon4096.zarr,31.993164,"{0: 493.116, 1: 411.13199999999995, 2: 367.609...",2,493.116,411.132,367.609,301.604,267.022,285.493,291.262,284.372,297.276,326.77,252.357,340.214
store_lat2896_lon5792.zarr,31.993164,"{0: 575.02, 1: 442.4889999999999, 2: 420.86500...",4,575.02,442.489,420.865,410.959,412.406,408.564,398.361,414.552,423.302,404.827,407.266,411.218
store_lat4096_lon8192.zarr,31.993164,"{0: 1055.986, 1: 499.014, 2: 418.601, 3: 386.3...",8,1055.986,499.014,418.601,386.388,382.221,420.967,376.277,380.314,407.856,354.392,374.149,392.048
store_lat5793_lon11586.zarr,31.993164,"{0: 2532.163, 1: 739.38, 2: 566.3979999999999,...",16,2532.163,739.38,566.398,420.011,404.796,401.815,417.737,420.377,421.637,426.957,409.846,398.728


Once we determine the acceptable threshold for tile time, we should be able to determine whether we need to chunk and/or pyramid the data.

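In this scenario, if our threshold is 100 ms, the chunk size should be 32 MB. If the resolution of our data is such that we have more than 2 chunks of that size, we should pyramid the data for zoom levels 0, 1 and 2. (Note: the mean tile times measured above for 32 MB chunks are around 400 ms even with a single chunk, so a 100 ms threshold is not met by any of these results; a threshold of roughly 500 ms would be consistent with these conclusions. The threshold value should be confirmed.)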

#### What is the relationship between number of chunks and time to tile at zoom 0 and at zoom 11?

In [24]:
import pandas as pd
import statsmodels.api as sm

# Predictor: number of chunks, plus a constant column so the fit has an intercept.
X = sm.add_constant(df[['number_of_chunks']])

# Response: the zoom-0 column of the results frame.
y = df[0]

# Ordinary least squares of y on X.
model = sm.OLS(endog=y, exog=X)
results = model.fit()

# Print the regression summary table.
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.965
Model:                            OLS   Adj. R-squared:                  0.953
Method:                 Least Squares   F-statistic:                     81.61
Date:                Fri, 25 Aug 2023   Prob (F-statistic):            0.00286
Time:                        21:07:11   Log-Likelihood:                -32.091
No. Observations:                   5   AIC:                             68.18
Df Residuals:                       3   BIC:                             67.40
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const              140.0303    129.608  

  warn("omni_normtest is not valid with less than 8 observations; %i "


In [26]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Predictor (2-D array of chunk counts) and response (zoom-0 column).
X = df[['number_of_chunks']].values
y = df[0].values

# Fit a linear regression; fit() returns the fitted estimator itself.
model = LinearRegression().fit(X, y)

# Report the fitted parameters.
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

# Example prediction for a store with 20 chunks; replace with your own new data.
new_data = np.array([[20]])
prediction = model.predict(new_data)
print("Prediction:", prediction)


Intercept: 140.03029166666693
Coefficients: [141.77979167]
Prediction: [2975.626125]
