# Create a linear regression model

## Explanation

In this notebook we use all of the sample datasets to fit a linear regression model that estimates the time it takes to generate a tile. This lets us explore the optimal chunk size for a given performance tolerance (i.e. an acceptable tile-generation time in milliseconds).

## Setup

In [1]:
# External modules
import hvplot.pandas
import holoviews as hv
import json
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import statsmodels.api as sm

# Local modules
import sys; sys.path.append('..')
import helpers.eodc_hub_role as eodc_hub_role
import helpers.dataframe as dataframe_helpers
from xarray_tile_test import XarrayTileTest

In [2]:
# Fetch credentials via the eodc_hub_role helper; they are passed to the
# result-storing and result-loading helpers in later cells.
credentials = eodc_hub_role.fetch_and_set_credentials()

Load the catalog of test datasets from `01-generate-datasets/all-datasets.json`, skipping pyramid datasets and any dataset served over HTTPS for now.

In [3]:
# Batch size used for every (dataset, zoom) setting in the test loop
iterations = 20
# Zoom levels to test: 0 through 5
zooms = range(6)
# Read the dataset catalog; the context manager closes the file handle
# as soon as the JSON has been parsed.
with open('../01-generate-datasets/all-datasets.json') as datasets_file:
    all_zarr_datasets = json.load(datasets_file)
# TODO: add pyramid into tests + modeling
# Keep (dataset_id, dataset) pairs whose id does not mention 'pyramid'
filter_pyramids = [
    item for item in all_zarr_datasets.items()
    if 'pyramid' not in item[0]
]
# Also, skip HTTPS for now
filter_https = [
    item for item in filter_pyramids
    if 'https' not in item[1]['dataset_url']
]

## Run Tests

In [5]:
results = []

for zarr_dataset_id, zarr_dataset in filter_https:
    # Build one tile test per dataset from its catalog entry
    zarr_tile_test = XarrayTileTest(dataset_id=zarr_dataset_id, **zarr_dataset)

    # Run one batch of size `iterations` at every zoom level
    for zoom in zooms:
        zarr_tile_test.run_batch({'zoom': zoom}, batch_size=iterations)

    # Store this dataset's results and keep the returned value so the
    # results can be loaded into a dataframe in a later cell
    results.append(zarr_tile_test.store_results(credentials))

Wrote instance data to s3://nasa-eodc-data-store/test-results/20231006200047_XarrayTileTest_cmip6-kerchunk.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20231006200137_XarrayTileTest_600_1440_29_CMIP6_daily_GISS-E2-1-G_tas.zarr.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20231006200159_XarrayTileTest_600_1440_1_CMIP6_daily_GISS-E2-1-G_tas.zarr.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20231006200317_XarrayTileTest_365_262_262_CMIP6_daily_GISS-E2-1-G_tas.zarr.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20231006200450_XarrayTileTest_power_901_monthly_meteorology_utc.zarr.json
Wrote instance data to s3://nasa-eodc-data-store/test-results/20231006200542_XarrayTileTest_cmip6-pds_GISS-E2-1-G_historical_tas.json
skipping this tile (8, 3, 3) for s3://nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat1024_lon2048.zarr due to error: Transformed bounds crossed the antimeridian. Please transfor

## Create a model

In [6]:
# Load the stored results of every test run above into a single dataframe.
all_df = dataframe_helpers.load_all_into_dataframe(credentials, results)
# Expand the timings; the cells below read `time`, `zoom` and `dataset_id`
# columns from the result (presumably one row per timing -- TODO confirm
# against helpers/dataframe.py).
expanded_df = dataframe_helpers.expand_timings(all_df)

In [7]:
# Explore results for a specific dataset: mean of `time` at each zoom level.
# The dataset id and the per-dataset filter do not change between zoom
# levels, so they are computed once before the loop. The loop iterates the
# shared `zooms` setting so it stays in sync with the zoom levels tested.
dataset_id = 'power_901_monthly_meteorology_utc.zarr'
dataset_results = expanded_df[expanded_df['dataset_id'] == dataset_id]
for zoom in zooms:
    result = dataset_results[dataset_results['zoom'] == zoom]
    print(f"Mean for dataset id {dataset_id} at zoom {zoom}: {np.mean(result.time)}")

Mean for dataset id power_901_monthly_meteorology_utc.zarr at zoom 0: 2298.2950000000005
Mean for dataset id power_901_monthly_meteorology_utc.zarr at zoom 1: 821.702
Mean for dataset id power_901_monthly_meteorology_utc.zarr at zoom 2: 487.8185
Mean for dataset id power_901_monthly_meteorology_utc.zarr at zoom 3: 385.19849999999997
Mean for dataset id power_901_monthly_meteorology_utc.zarr at zoom 4: 336.55649999999997
Mean for dataset id power_901_monthly_meteorology_utc.zarr at zoom 5: 323.64750000000004


In [8]:
# Reshape to long form: keep the three predictor columns as identifiers and
# melt `time` into the generic `value` column. With a single value column the
# `variable` label column carries no information, so it is dropped.
df_melted = (
    expanded_df
    .melt(
        id_vars=['chunk_size_mb', 'number_of_spatial_chunks', 'zoom'],
        value_vars=['time'],
    )
    .drop(columns=['variable'])
)

In [9]:
# Work on a copy so `df_melted` is left untouched.
df_encoded = df_melted.copy()
df_encoded['zoom'] = df_encoded['zoom'].astype(int)
# Add one 0/1 indicator column per zoom level ('zoom 0' ... 'zoom 5').
# Compare against the int-cast column on `df_encoded` rather than the raw
# `df_melted['zoom']`, so the indicators are correct even if the raw column
# is not stored as a number.
for zoom in zooms:
    df_encoded[f'zoom {zoom}'] = np.where(df_encoded['zoom'] == zoom, 1, 0)
 

In [10]:
# Preview the encoded frame: the original columns plus one indicator column
# per zoom level.
df_encoded


Unnamed: 0,chunk_size_mb,number_of_spatial_chunks,zoom,value,zoom 0,zoom 1,zoom 2,zoom 3,zoom 4,zoom 5
0,3.295898,1.0,0,203.33,1,0,0,0,0,0
1,3.295898,1.0,0,218.70,1,0,0,0,0,0
2,3.295898,1.0,0,351.83,1,0,0,0,0,0
3,3.295898,1.0,0,169.36,1,0,0,0,0,0
4,3.295898,1.0,0,258.45,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
2155,31.993164,16.005525,5,410.88,0,0,0,0,0,1
2156,31.993164,16.005525,5,440.88,0,0,0,0,0,1
2157,31.993164,16.005525,5,420.27,0,0,0,0,0,1
2158,31.993164,16.005525,5,419.04,0,0,0,0,0,1


In [11]:
# Independent variables for the regression: the two chunk descriptors
# followed by one indicator column name per zoom level.
ind_variables = [
    'number_of_spatial_chunks',
    'chunk_size_mb',
    *(f'zoom {zoom}' for zoom in zooms),
]
print(ind_variables)

['number_of_spatial_chunks', 'chunk_size_mb', 'zoom 0', 'zoom 1', 'zoom 2', 'zoom 3', 'zoom 4', 'zoom 5']


In [13]:
# Create the design matrix from the shared `ind_variables` list defined in
# the previous cell (same columns as before, without repeating the list).
X = df_encoded[ind_variables]
y = df_encoded['value']

# Add the intercept column (`const`) to the regressors.
X_with_const = sm.add_constant(X)
# NOTE(review): each row has exactly one zoom indicator set to 1 (assuming
# every zoom value is in `zooms`), so the six indicators sum to the constant
# column and the design matrix is rank deficient -- the recorded summary
# reports "Df Model: 7" for 8 regressors. The individual const/zoom
# coefficients are therefore not uniquely determined. Dropping one zoom level
# as the baseline would fix this, but it changes the inputs of the saved
# model, so it is left as-is here.
# Create and fit the model
model = sm.OLS(y, X_with_const.astype(float)).fit()

# Display summary
print(model.summary())
# Persist the fitted results next to the notebook.
model.save("app-code-model.pickle")

                            OLS Regression Results                            
Dep. Variable:                  value   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.751
Method:                 Least Squares   F-statistic:                     929.1
Date:                Fri, 06 Oct 2023   Prob (F-statistic):               0.00
Time:                        20:25:29   Log-Likelihood:                -15654.
No. Observations:                2160   AIC:                         3.132e+04
Df Residuals:                    2152   BIC:                         3.137e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   