# Comparing dev and feature

This notebook compares results between dev and feature titiler deployments. Running end-to-end benchmarks is documented in [https://github.com/developmentseed/tile-benchmarking/tree/main/03-e2e/README.md](https://github.com/developmentseed/tile-benchmarking/tree/main/03-e2e/README.md).

This notebook compares titiler-xarray's dev branch at [commit 9ac1686612d](https://github.com/developmentseed/titiler-xarray/commit/9ac1686612d706e0f078a418818b16544efb11c0) with a feature deployment that adds [diskcache](https://github.com/developmentseed/titiler-xarray/commit/283bc839e081d8cfe8bc730cf3fc0f2d344e3ec4).

In [1]:
# Import libraries
import os
import pandas as pd
import hvplot.pandas  # imported for its side effect: enables the `.hvplot` accessor used on DataFrames below
import holoviews as hv
pd.options.plotting.backend = 'holoviews'
import warnings
# Silence all warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')
import sys
# Make the parent directory importable (presumably where the `helpers` package lives).
sys.path.append('..')
from helpers import dataframe
# You will need to set credentials to access nasa-eodc-data-store
# import eodc_hub_role
# credentials = eodc_hub_role.fetch_and_set_credentials()

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [10]:
# Remove any previously downloaded results so stale files are not mixed into this comparison
!rm -rf downloaded_dev_results/
!rm -rf downloaded_feature_results/

In [11]:
%%capture
# Download the dev and feature benchmark results from S3 into local directories.
# Requires credentials with access to the nasa-eodc-data-store bucket.
!aws s3 cp --recursive s3://nasa-eodc-data-store/tile-benchmarking-results/2023-10-20_18-22-23/ downloaded_dev_results/
!aws s3 cp --recursive s3://nasa-eodc-data-store/tile-benchmarking-results/2023-10-20_18-13-40/ downloaded_feature_results/

Parse and merge results into a single dataframe.

In [2]:
# For each deployment, read every "*_urls_stats.csv" file from its download
# directory and store:
#   results[env]['all']                 - all rows from all files, tagged with
#                                         their source 'file' and 'dataset'
#   results[env][f'Aggregated {env}']   - only the rows whose 'Name' is 'Aggregated'
results = { 'feature': {}, 'dev': {} }
suffix = "_urls_stats.csv"  # file-name suffix of the per-dataset stats files
for env in results.keys():
    # Directory the results for this deployment were downloaded into
    directory_path = f"downloaded_{env}_results/"

    # List all files in the directory
    all_files = os.listdir(directory_path)

    # Keep only the stats files. os.listdir returns entries in arbitrary order,
    # so sort to make the concatenated row order identical on every run.
    files_with_suffix = sorted(f"{directory_path}{f}" for f in all_files if f.endswith(suffix))
    if not files_with_suffix:
        # Fail with an actionable message instead of pandas' "No objects to concatenate".
        raise ValueError(
            f"No '*{suffix}' files found in {directory_path}; download the benchmark results first."
        )

    dfs = []
    for file in files_with_suffix:
        df = pd.read_csv(file)
        df['file'] = file
        # The dataset name is the file name with the stats suffix stripped.
        df['dataset'] = os.path.basename(file)[:-len(suffix)]
        dfs.append(df)

    merged_df = pd.concat(dfs)
    results[env]['all'] = merged_df
    # The "Aggregated" results represent aggregations across tile endpoints.
    results[env][f'Aggregated {env}'] = merged_df[merged_df['Name'] == 'Aggregated']

In [3]:
# Load the dataset specifications from zarr_info.csv; they are joined to the
# benchmark results on `collection_name` in a later cell.
dataset_specs_all = dataframe.csv_to_pandas('zarr_info.csv')

In [12]:
# Take the "Aggregated" rows (one per dataset) for each deployment.
dev_df = results['dev']['Aggregated dev']
feature_df = results['feature']['Aggregated feature']

# Put dev and feature side by side, one row per dataset; columns present in
# both frames are disambiguated with a ' Dev' / ' Feature' suffix.
merged_df = pd.merge(dev_df, feature_df, on='dataset', suffixes=(' Dev', ' Feature'))

# Failure rate as a percentage of all requests, for each deployment.
for label in ('Dev', 'Feature'):
    merged_df[f'Failure Rate {label}'] = (
        merged_df[f'Failure Count {label}'] / merged_df[f'Request Count {label}'] * 100
    )

# Keep only the columns being compared, fastest dev response first.
summary_columns = [
    'Average Response Time Dev',
    'Failure Rate Dev',
    'Average Response Time Feature',
    'Failure Rate Feature',
    'dataset',
]
summary_df = merged_df[summary_columns].sort_values('Average Response Time Dev')

# Attach the dataset specifications to each summary row.
merged_specs = summary_df.merge(dataset_specs_all, left_on='dataset', right_on='collection_name')

In [15]:
# Display the per-dataset summary joined with the dataset specifications.
merged_specs

Unnamed: 0,Average Response Time Dev,Failure Rate Dev,Average Response Time Feature,Failure Rate Feature,dataset,collection_name,source,chunks,shape_dict,dtype,chunk_size_mb,compression,number_of_spatial_chunks,number_coordinate_chunks
0,289.621658,0.0,231.326141,0.0,single_chunk_store_lat512_lon1024.zarr,single_chunk_store_lat512_lon1024.zarr,s3://nasa-eodc-data-store/test-data/fake-data/...,"{'y': 1, 'x': 512}","{'y': 512, 'x': 1024}",float64,4.0,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, ...",1024.0,2.0
1,319.338632,0.0,207.03856,0.0,600_1440_1_CMIP6_daily_GISS-E2-1-G_tas.zarr,600_1440_1_CMIP6_daily_GISS-E2-1-G_tas.zarr,s3://nasa-eodc-data-store/test-data/cmip6-zarr...,"{'y': 1, 'x': 600}","{'y': 600, 'x': 1440}",float32,3.295898,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, ...",1440.0,1.0
2,332.697947,0.0,244.945579,0.0,cmip6-kerchunk,cmip6-kerchunk,s3://nasa-eodc-data-store/test-data/cmip6-kerc...,"{'y': 1, 'x': 600}","{'y': 600, 'x': 1440}",float32,3.295898,Zlib(level=5),1440.0,1.0
3,334.265461,0.0,275.572622,0.0,single_chunk_store_lat724_lon1448.zarr,single_chunk_store_lat724_lon1448.zarr,s3://nasa-eodc-data-store/test-data/fake-data/...,"{'y': 1, 'x': 724}","{'y': 724, 'x': 1448}",float64,7.998291,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, ...",1448.0,2.0
4,404.687215,0.0,331.215422,0.0,single_chunk_store_lat1024_lon2048.zarr,single_chunk_store_lat1024_lon2048.zarr,s3://nasa-eodc-data-store/test-data/fake-data/...,"{'y': 1, 'x': 1024}","{'y': 1024, 'x': 2048}",float64,16.0,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, ...",2048.0,2.0
5,492.20542,0.0,418.354628,0.0,with_chunks_store_lat2048_lon4096.zarr,with_chunks_store_lat2048_lon4096.zarr,s3://nasa-eodc-data-store/test-data/fake-data/...,"{'y': 1, 'x': 1448}","{'y': 2048, 'x': 4096}",float64,31.993164,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, ...",5793.237569,2.0
6,519.183636,0.0,307.48783,0.0,power_901_monthly_meteorology_utc.zarr,power_901_monthly_meteorology_utc.zarr,s3://power-analysis-ready-datastore/power_901_...,"{'y': 504, 'x': 25}","{'y': 361, 'x': 576}",float64,2.403259,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, ...",16.502857,2.0
7,519.816283,0.0,218.855533,0.0,pr_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.nc,pr_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.nc,https://nex-gddp-cmip6.s3-us-west-2.amazonaws....,"{'y': 'N', 'x': '/'}","{'y': 600, 'x': 1440}",float32,,,,0.0
8,558.456119,0.0,359.963568,0.0,aws-noaa-oisst-feedstock_reference,aws-noaa-oisst-feedstock_reference,https://ncsa.osn.xsede.org/Pangeo/pangeo-forge...,"{'zlev': 1, 'y': 1, 'x': 720}","{'zlev': 1, 'y': 720, 'x': 1440}",int16,1.977539,Zlib(level=4),1440.0,2.0
9,567.081987,0.0,492.673099,0.0,with_chunks_store_lat1448_lon2896.zarr,with_chunks_store_lat1448_lon2896.zarr,s3://nasa-eodc-data-store/test-data/fake-data/...,"{'y': 1, 'x': 1448}","{'y': 1448, 'x': 2896}",float64,31.993164,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, ...",2896.0,2.0


NOTE: We don't have chunk information for the prod Giovanni cache dataset because it is protected (it can be added later).

In [16]:
# Order rows by chunk size so each line is drawn left to right along the x axis.
specs_by_chunk_size = merged_specs.sort_values('chunk_size_mb')

# Axis settings shared by both curves.
shared_opts = dict(x='chunk_size_mb', xlim=(0, 150), ylim=(0, 3000))

# Average response time of the dev deployment.
dev_line = specs_by_chunk_size.hvplot.line(
    y='Average Response Time Dev', label='Dev', color='cyan', **shared_opts
)

# Average response time of the feature deployment (semi-transparent).
feature_line = specs_by_chunk_size.hvplot.line(
    y='Average Response Time Feature', label='Feature', color='magenta', alpha=0.4, **shared_opts
)

# Combine the two line plots and display the result.
combined_plot = dev_line * feature_line
combined_plot