# Xarray Tile Testing

In [3]:
%%capture
!pip install -r ../requirements.txt

In [4]:
import boto3
import pandas as pd
import json
from xarray_tile_test import XarrayTileTest
# Add the parent directory to sys.path before importing `helpers`
# (presumably that is where the package lives -- confirm against the repo layout).
import sys; sys.path.append('..')
import helpers.eodc_hub_role as eodc_hub_role
import warnings
# NOTE: this silences *all* warnings for the rest of the session,
# including deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

In [5]:
# Fetch credentials once; they are handed to store_results() in later cells.
credentials = eodc_hub_role.fetch_and_set_credentials()

# Run one test

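As an example, run a single test against the CMIP6 kerchunk reference dataset: inspect its array specs, run one batch at zoom 0, and store the results.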

In [3]:
# Configure a single test. These are the same values as the 'cmip6-kerchunk'
# entry of ../01-generate-datasets/all-datasets.json that is previewed later on.
xarray_tile_test = XarrayTileTest(
    dataset_id='cmip6-kerchunk',
    dataset_url='s3://nasa-eodc-data-store/test-data/cmip6-kerchunk/combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk.json',
    variable='tas',
    extra_args={'reference': True},
)

In [None]:
# Display the test's array_specs attribute (a bare last expression, so the notebook renders it).
xarray_tile_test.array_specs

In [None]:
# Run one batch with {'zoom': 0}; presumably this requests tiles at zoom level 0
# -- see XarrayTileTest.run_batch for the exact semantics.
xarray_tile_test.run_batch({'zoom': 0})

In [None]:
# Store the results of the run above, using the credentials fetched earlier.
xarray_tile_test.store_results(credentials)

## Run many tests

In [6]:
# TODO: `zooms` is defined but not used yet -- the batch loop in the next cell
# still runs zoom 0 only (presumably the intent is to iterate over these levels).
zooms = range(12)
# Read the dataset catalogue inside a context manager so the file handle is
# closed deterministically instead of being left for the garbage collector.
with open('../01-generate-datasets/all-datasets.json') as datasets_file:
    target_datasets = json.load(datasets_file)
# Preview the first two (dataset_id, spec) pairs.
list(target_datasets.items())[0:2]

[('cmip6-kerchunk',
  {'dataset_url': 's3://nasa-eodc-data-store/test-data/cmip6-kerchunk/combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk.json',
   'variable': 'tas',
   'extra_args': {'reference': True}}),
 ('power_901_monthly_meteorology_utc.zarr',
  {'dataset_url': 's3://power-analysis-ready-datastore/power_901_monthly_meteorology_utc.zarr',
   'variable': 'TS'})]

In [8]:
%%time
# Collect whatever store_results() returns for each dataset; the list is later
# passed to load_all_into_dataframe() to read the stored results back.
result_files = []
for dataset_id, dataset in target_datasets.items():
    # Each catalogue entry supplies the remaining constructor keyword arguments
    # (e.g. dataset_url, variable and, for some entries, extra_args).
    xarray_tile_test = XarrayTileTest(
        dataset_id=dataset_id,
        **dataset
    )
    # Only zoom 0 is exercised here, with batch_size=2.
    xarray_tile_test.run_batch({'zoom': 0}, batch_size=2)
    result_file = xarray_tile_test.store_results(credentials)
    result_files.append(result_file)

Wrote instance data to s3://nasa-eodc-data-store/test-results/20230901201139_XarrayTileTest_cmip6-kerchunk.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230901201148_XarrayTileTest_power_901_monthly_meteorology_utc.zarr.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230901201150_XarrayTileTest_cmip6-pds_GISS-E2-1-G_historical_tas.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230901201153_XarrayTileTest_aws-noaa-oisst-feedstock_reference.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230901201155_XarrayTileTest_600_1440_29_CMIP6_daily_GISS-E2-1-G_tas.zarr.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230901201156_XarrayTileTest_600_1440_1_CMIP6_daily_GISS-E2-1-G_tas.zarr.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20230901201200_XarrayTileTest_365_262_262_CMIP6_daily_GISS-E2-1-G_tas.zarr.json
Wrote instance data to s3://nasa-eodc-data-store/test-results

## Read results

In [9]:
def load_all_into_dataframe(s3files: list[str]) -> pd.DataFrame:
    """Load stored test-result JSON files into one DataFrame indexed by dataset_id.

    Each file is read with ``pd.read_json(..., orient='index')`` and transposed;
    the per-file frames are then concatenated in input order.

    Args:
        s3files: Locations of the result files. Anything ``pd.read_json``
            accepts works (local paths as well as ``s3://`` URLs).

    Returns:
        The concatenated frame with its ``dataset_id`` column set as the index.

    Raises:
        ValueError: If ``s3files`` yields no entries.
        KeyError: If the loaded data has no ``dataset_id`` column.
    """
    # NOTE(review): no credentials are passed to pandas explicitly, so remote
    # reads rely on whatever the underlying filesystem layer discovers on its
    # own (presumably the environment prepared by fetch_and_set_credentials()
    # -- confirm). The boto3 session/client that used to be created here was
    # never used and has been dropped.
    frames = [pd.read_json(s3url, orient='index').T for s3url in s3files]
    if not frames:
        # pd.concat([]) would raise a ValueError too, but with an opaque message.
        raise ValueError("load_all_into_dataframe() needs at least one result file")

    # set_index returns a new frame; no in-place mutation needed.
    return pd.concat(frames).set_index('dataset_id')


In [10]:
# Combine every result file collected by the batch loop into one frame and preview it.
df = load_all_into_dataframe(result_files)
df.head()

Unnamed: 0_level_0,test_name,dataset_url,niters,variable,extra_args,timings,reference,lat_extent,lon_extent,number_coordinate_chunks,total_array_size,chunks,shape_dict,dtype,chunk_size_mb,compression,number_of_spatial_chunks
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
cmip6-kerchunk,XarrayTileTest,s3://nasa-eodc-data-store/test-data/cmip6-kerc...,1,tas,{'reference': True},"[[315.54, [0, 0, 0]], [130.19, [0, 0, 0]]]",True,"[-59, 89]","[-179, 179]",3,2406.005859,"{'time': 1, 'lat': 600, 'lon': 1440}","{'time': 730, 'lat': 600, 'lon': 1440}",float32,3.295898,Zlib(level=5),1.0
power_901_monthly_meteorology_utc.zarr,XarrayTileTest,s3://power-analysis-ready-datastore/power_901_...,1,TS,{},"[[5349.97, [0, 0, 0]], [2819.9, [0, 0, 0]]]",False,"[-90, 90]","[-180, 179]",43,780.521484,"{'time': 492, 'lat': 25, 'lon': 25}","{'time': 492, 'lat': 361, 'lon': 576}",float64,2.346039,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, ...",332.6976
cmip6-pds_GISS-E2-1-G_historical_tas,XarrayTileTest,s3://cmip6-pds/CMIP6/CMIP/NASA-GISS/GISS-E2-1-...,1,tas,{},"[[737.2, [0, 0, 0]], [547.17, [0, 0, 0]]]",False,"[-89, 89]","[-178, 178]",6,97.888184,"{'time': 600, 'lat': 90, 'lon': 144}","{'time': 1980, 'lat': 90, 'lon': 144}",float32,29.663086,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, ...",1.0
aws-noaa-oisst-feedstock_reference,XarrayTileTest,https://ncsa.osn.xsede.org/Pangeo/pangeo-forge...,1,sst,{'reference': True},"[[1049.3, [0, 0, 0]], [501.42, [0, 0, 0]]]",True,"[-89, 89]","[-179, 179]",4,29750.097656,"{'time': 1, 'zlev': 1, 'lat': 720, 'lon': 1440}","{'time': 15044, 'zlev': 1, 'lat': 720, 'lon': ...",int16,1.977539,Zlib(level=4),1.0
600_1440_29_CMIP6_daily_GISS-E2-1-G_tas.zarr,XarrayTileTest,s3://nasa-eodc-data-store/test-data/cmip6-zarr...,1,tas,{},"[[949.63, [0, 0, 0]], [610.59, [0, 0, 0]]]",False,"[-59, 89]","[-179, 179]",3,2406.005859,"{'time': 29, 'lat': 600, 'lon': 1440}","{'time': 730, 'lat': 600, 'lon': 1440}",float32,95.581055,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, ...",1.0


In [24]:
# Reshape for plotting: give every timing measurement its own row, then split
# each [time, tile] pair into separate `time` and `tile` columns.
df_expanded = df.explode('timings')
df_expanded[['time', 'tile']] = pd.DataFrame(
    list(df_expanded['timings']),
    index=df_expanded.index,
)
# Element 2 of each tile triple is taken as the zoom level.
df_expanded['zoom'] = df_expanded['tile'].map(lambda tile: tile[2])
df_expanded.head()

Unnamed: 0_level_0,test_name,dataset_url,niters,variable,extra_args,timings,reference,lat_extent,lon_extent,number_coordinate_chunks,total_array_size,chunks,shape_dict,dtype,chunk_size_mb,compression,number_of_spatial_chunks,time,tile,zoom
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
cmip6-kerchunk,XarrayTileTest,s3://nasa-eodc-data-store/test-data/cmip6-kerc...,1,tas,{'reference': True},"[315.54, [0, 0, 0]]",True,"[-59, 89]","[-179, 179]",3,2406.005859,"{'time': 1, 'lat': 600, 'lon': 1440}","{'time': 730, 'lat': 600, 'lon': 1440}",float32,3.295898,Zlib(level=5),1.0,315.54,"[0, 0, 0]",0
cmip6-kerchunk,XarrayTileTest,s3://nasa-eodc-data-store/test-data/cmip6-kerc...,1,tas,{'reference': True},"[130.19, [0, 0, 0]]",True,"[-59, 89]","[-179, 179]",3,2406.005859,"{'time': 1, 'lat': 600, 'lon': 1440}","{'time': 730, 'lat': 600, 'lon': 1440}",float32,3.295898,Zlib(level=5),1.0,130.19,"[0, 0, 0]",0
power_901_monthly_meteorology_utc.zarr,XarrayTileTest,s3://power-analysis-ready-datastore/power_901_...,1,TS,{},"[5349.97, [0, 0, 0]]",False,"[-90, 90]","[-180, 179]",43,780.521484,"{'time': 492, 'lat': 25, 'lon': 25}","{'time': 492, 'lat': 361, 'lon': 576}",float64,2.346039,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, ...",332.6976,5349.97,"[0, 0, 0]",0
power_901_monthly_meteorology_utc.zarr,XarrayTileTest,s3://power-analysis-ready-datastore/power_901_...,1,TS,{},"[2819.9, [0, 0, 0]]",False,"[-90, 90]","[-180, 179]",43,780.521484,"{'time': 492, 'lat': 25, 'lon': 25}","{'time': 492, 'lat': 361, 'lon': 576}",float64,2.346039,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, ...",332.6976,2819.9,"[0, 0, 0]",0
cmip6-pds_GISS-E2-1-G_historical_tas,XarrayTileTest,s3://cmip6-pds/CMIP6/CMIP/NASA-GISS/GISS-E2-1-...,1,tas,{},"[737.2, [0, 0, 0]]",False,"[-89, 89]","[-178, 178]",6,97.888184,"{'time': 600, 'lat': 90, 'lon': 144}","{'time': 1980, 'lat': 90, 'lon': 144}",float32,29.663086,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, ...",1.0,737.2,"[0, 0, 0]",0


In [22]:
# Route pandas' `.plot` API through the 'holoviews' backend (provided by hvplot)
# so the scatter calls below accept options such as width/height.
import hvplot
pd.options.plotting.backend = 'holoviews'

In [26]:
# One point per timing measurement: measured time against the dataset's chunk size (MB).
df_expanded.plot.scatter(x='chunk_size_mb', y='time', width=1000, height=500)

In [31]:
# One point per timing measurement: measured time against the number of spatial chunks.
df_expanded.plot.scatter(x='number_of_spatial_chunks', y='time', width=1000, height=500)