# Generate fake data with a single chunk

In this notebook, we generate fake data of various resolution with a single chunk in order to determine the relationship between chunk size and time to tile.

## Step 1. Import necessary libraries

In [1]:
import xarray as xr
import numpy as np
import os
import s3fs
import sys; sys.path.append('..')
import eodc_hub_role
import zarr_helpers

## Step 2: Setup data storage

Store data in the fake data directory in a "with chunks".

In [14]:
credentials = eodc_hub_role.fetch_and_set_credentials()
bucket = 'nasa-eodc-data-store'
fake_data_dir = 'fake_data/with_chunks'
s3_fs = s3fs.S3FileSystem(
    key=credentials['AccessKeyId'],
    secret=credentials['SecretAccessKey'],
    token=credentials['SessionToken'], 
    anon=False
)

## Step 3: Define starting starting conditions

The following are set as variables so tests can be modified easily for different starting conditions. For example, we might want to test a different target size.

In [14]:
# Define dimensions
time_steps = 1
y = 512
x = 1024
multiple = 2 # how much do you want the dataset to grow by each iteration
n_multiples = 7

In [15]:
for n_multiple in range(n_multiples):
    if n_multiple == 0:
        size = y * x        
    else:
        size = y * x * multiple

    x = round(np.sqrt(2 * size))
    y = int(x/2)
    data = np.random.random(size=(time_steps, y, x))

    # Create Xarray datasets with dimensions and coordinates
    ds = xr.Dataset({
        'data': (['time', 'lat', 'lon'], data),
    }, coords={
        'time': np.arange(time_steps),
        'lat': np.linspace(-90, 90, y),
        'lon': np.linspace(-180, 180, x)
    })

    # Save Xarray datasets as Zarr stores
    chunks={'time': 1, 'lat': y, 'lon': x}
    ds = ds.chunk(chunks)    
    ds.to_zarr(f'{fake_data_dir}/store_lat_{y}x_lon_{x}.zarr', mode='w')

In [16]:
# List all items in the directory
items = os.listdir(fake_data_dir)

# Loop through each item and open it with xarray if it's a Zarr store
for item in items:
    item_path = os.path.join(fake_data_dir, item)
    # Check if the item is a directory (Zarr stores are directories)
    if os.path.isdir(item_path):
        try:
            # Attempt to open the Zarr store using xarray
            ds = xr.open_zarr(item_path)
            print(f"Chunk size")
            print(zarr_helpers.get_chunk_size(ds['data']))
            print('-' * 80)  # Print a separator line
        except Exception as e:
            # Print an error message if unable to open the Zarr store
            print(f"Could not open {item} as a Zarr store. Error: {e}")


Chunk size
((1, 724, 1448), dtype('float64'), 7.998291015625)
--------------------------------------------------------------------------------
Chunk size
((1, 1448, 2896), dtype('float64'), 31.9931640625)
--------------------------------------------------------------------------------
Chunk size
((1, 1024, 2048), dtype('float64'), 16.0)
--------------------------------------------------------------------------------
Chunk size
((1, 512, 1024), dtype('float64'), 4.0)
--------------------------------------------------------------------------------
Chunk size
((1, 4096, 8192), dtype('float64'), 256.0)
--------------------------------------------------------------------------------
Chunk size
((1, 2896, 5793), dtype('float64'), 127.9947509765625)
--------------------------------------------------------------------------------
Could not open .ipynb_checkpoints as a Zarr store. Error: No such file or directory: '/Users/aimeebarciauskas/github/developmentseed/tile-benchmarking/profiling/fake_