# Create parquet files for analysis

## Imports and config

In [1]:
import tqdm
import xarray as xr
import pathlib
import dask
import dask.distributed
import coiled
from global_snowmelt_runoff_onset.config import Config
from global_snowmelt_runoff_onset.analysis import create_and_save_analysis_parquet

  "cipher": algorithms.TripleDES,
  "class": algorithms.TripleDES,


In [2]:
# Load the project configuration from the global config file (relative to this notebook).
config = Config('../config/global_config.txt')

Configuration loaded:
resolution = 0.00072000072000072
spatial_chunk_dim = 2048
bbox_left = -179.999
bbox_right = 179.999
bbox_top = 81.099
bbox_bottom = -59.999
wy_start = 2015
wy_end = 2024
low_backscatter_threshold = 0.001
min_monthly_acquisitions = 2
max_allowed_days_gap_per_orbit = 30
min_years_for_median_std = 3
valid_tiles_geojson_path = ../processing/valid_tiles.geojson
tile_results_path = ../processing/tile_results.csv
global_runoff_zarr_store_azure_path = snowmelt/snowmelt_runoff_onset/global.zarr
seasonal_snow_mask_zarr_store_azure_path = snowmelt/snow_mask_v2/global_modis_snow_mask.zarr


In [3]:
# Open the global runoff-onset Zarr store via its consolidated metadata and
# decode every coordinate-like variable as a coordinate.
global_ds = xr.open_zarr(
    config.global_runoff_store,
    consolidated=True,
    decode_coords='all',
)

## View tiles

In [4]:
# Explore the valid tiles interactively, using the 'success' column to drive the display.
config.valid_tiles_gdf.explore(column='success')

## Create parquets

In [None]:
# Start the Coiled cluster that runs the per-tile parquet jobs, then attach a
# dask client to it for the cells below.
cluster = coiled.Cluster(
    workspace="uwtacolab",  # alternative workspace: "azure"
    n_workers=30,
    worker_cpu=8,
    worker_memory="64 GB",
    scheduler_memory="64 GB",
    spot_policy="spot",
    idle_timeout="10 minutes",
    environ={"GDAL_DISABLE_READDIR_ON_OPEN": "EMPTY_DIR"},
)
client = cluster.get_client()

In [5]:
# Tiles returned for which='processed'; these are the candidates for parquet export.
tiles = config.get_list_of_tiles(which='processed')

In [6]:
# Number of candidate tiles.
len(tiles)

2269

In [7]:
# Base names of everything currently listed under the tiles output folder,
# followed by how many there are.
existing_filenames = [
    pathlib.Path(blob_path).name
    for blob_path in config.azure_blob_fs.ls('snowmelt/analysis/tiles/')
]
len(existing_filenames)

2225

In [None]:
# List the parquet files already written so finished tiles are not resubmitted.
# Only a missing output folder means "nothing written yet"; any other failure
# (credentials, network, Ctrl-C) must surface instead of silently resubmitting
# every tile to the cluster.
try:
    existing_filenames = [pathlib.Path(path).name for path in config.azure_blob_fs.ls('snowmelt/analysis/tiles/')]
except FileNotFoundError:
    existing_filenames = []

# Build each tile's output name exactly once and keep only the tiles whose
# file is not present yet. A set makes the membership test O(1).
written = set(existing_filenames)
candidates = [(tile, f'tile_{tile.row:03d}_{tile.col:03d}.parquet') for tile in tiles]
pending = [(tile, filename) for tile, filename in candidates if filename not in written]
tiles = [tile for tile, _ in pending]
filenames = [filename for _, filename in pending]

# Submit one task per remaining tile; retries=3 allows dask to rerun a failed task.
futures = []

for tile, filename in tqdm.tqdm(pending, total=len(pending)):
    future = client.submit(create_and_save_analysis_parquet, tile, filename, config.azure_blob_fs, global_ds, config.ee_credentials, retries=3)
    futures.append(future)

In [None]:
# Print one line per tile as its future finishes.
# result[0] is printed as the tile, result[1] is the success flag, and on
# failure result[2] / result[3] are printed as the error and traceback.
# NOTE(review): as_completed defaults to raise_errors=True, so a future that
# raised would re-raise here and end the loop — confirm that is intended.
for future, result in dask.distributed.as_completed(futures, with_results=True):
    succeeded = result[1]
    if succeeded == True:
        print(f"Successfully processed tile {result[0]}")
    elif succeeded == False:
        print(f"Failed for tile {result[0]} with error: {result[2]} and traceback: {result[3]}")

In [None]:
# Reload the config and tile list, submit every tile whose parquet is still
# missing, and report each outcome — all in one cell.
config = Config('../config/global_config.txt')
tiles = config.get_list_of_tiles(which='processed')

# Only a missing output folder means "nothing written yet"; any other failure
# (credentials, network, Ctrl-C) must surface instead of silently resubmitting
# every tile to the cluster.
try:
    existing_filenames = [pathlib.Path(path).name for path in config.azure_blob_fs.ls('snowmelt/analysis/tiles/')]
except FileNotFoundError:
    existing_filenames = []

# Build each tile's output name exactly once and keep only the tiles whose
# file is not present yet. A set makes the membership test O(1).
written = set(existing_filenames)
candidates = [(tile, f'tile_{tile.row:03d}_{tile.col:03d}.parquet') for tile in tiles]
pending = [(tile, filename) for tile, filename in candidates if filename not in written]
tiles = [tile for tile, _ in pending]
filenames = [filename for _, filename in pending]

# Submit one task per remaining tile; retries=3 allows dask to rerun a failed task.
futures = []

for tile, filename in tqdm.tqdm(pending, total=len(pending)):
    future = client.submit(create_and_save_analysis_parquet, tile, filename, config.azure_blob_fs, global_ds, config.ee_credentials, retries=3)
    futures.append(future)

# Report outcomes in completion order. result[0] is printed as the tile,
# result[1] is the success flag, result[2] / result[3] the error and traceback.
for future, result in dask.distributed.as_completed(futures, with_results=True):
    if result[1] == True:
        print(f"Successfully processed tile {result[0]}")
    if result[1] == False:
        print(f"Failed for tile {result[0]} with error: {result[2]} and traceback: {result[3]}")

In [None]:
# Reload the config and tile list, submit every tile whose parquet is still
# missing, and report each outcome — all in one cell.
config = Config('../config/global_config.txt')
tiles = config.get_list_of_tiles(which='processed')

# Only a missing output folder means "nothing written yet"; any other failure
# (credentials, network, Ctrl-C) must surface instead of silently resubmitting
# every tile to the cluster.
try:
    existing_filenames = [pathlib.Path(path).name for path in config.azure_blob_fs.ls('snowmelt/analysis/tiles/')]
except FileNotFoundError:
    existing_filenames = []

# Build each tile's output name exactly once and keep only the tiles whose
# file is not present yet. A set makes the membership test O(1).
written = set(existing_filenames)
candidates = [(tile, f'tile_{tile.row:03d}_{tile.col:03d}.parquet') for tile in tiles]
pending = [(tile, filename) for tile, filename in candidates if filename not in written]
tiles = [tile for tile, _ in pending]
filenames = [filename for _, filename in pending]

# Submit one task per remaining tile; retries=3 allows dask to rerun a failed task.
futures = []

for tile, filename in tqdm.tqdm(pending, total=len(pending)):
    future = client.submit(create_and_save_analysis_parquet, tile, filename, config.azure_blob_fs, global_ds, config.ee_credentials, retries=3)
    futures.append(future)

# Report outcomes in completion order. result[0] is printed as the tile,
# result[1] is the success flag, result[2] / result[3] the error and traceback.
for future, result in dask.distributed.as_completed(futures, with_results=True):
    if result[1] == True:
        print(f"Successfully processed tile {result[0]}")
    if result[1] == False:
        print(f"Failed for tile {result[0]} with error: {result[2]} and traceback: {result[3]}")

In [None]:
# List everything under the tiles output folder and display the full listing.
parquet_files = config.azure_blob_fs.ls('snowmelt/analysis/tiles/')
parquet_files

In [None]:
# Number of entries in the fresh listing of the tiles output folder.
len(parquet_files)

In [None]:
# Number of file names captured by the most recent existing-files listing.
len(existing_filenames)

## Code graveyard

In [None]:
# tile = config.get_tile(14,27)
# tile.geobox.explore(tiles='EsriWorldImagery')

# tile_utm_ds = create_utm_datacube(tile, global_ds)
# tile_utm_ds


# # Get the number of variables
# n_vars = len(tile_utm_ds.data_vars)

# # Calculate the number of rows and columns for the subplots
# n_cols = 4  # You can adjust this number as needed
# n_rows = (n_vars + n_cols - 1) // n_cols

# # Create the figure and subplots
# f,axs = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4*n_rows))
# axs = axs.flatten()  # Flatten the axes array for easy indexing

# # Plot each variable
# for i, (var_name, da) in enumerate(tile_utm_ds.data_vars.items()):
#     da.plot(ax=axs[i], cmap='viridis')
#     axs[i].set_title(var_name)

# # Remove any unused subplots
# for j in range(i+1, len(axs)):
#     f.delaxes(axs[j])

# f.tight_layout()

# tile_utm_df = dataset_to_dataframe(tile,tile_utm_ds)
# tile_utm_df

In [None]:
#view_tile(Tile(13,126)) very dense norway
#view_tile(Tile(1,133)) svalbard
#view_tile(Tile(88,72)) SA
#Tile(88,72).get_geobox().boundingbox
#test_ds = global_ds.rio.clip_box(-75,-51,-72,-48,crs='EPSG:4326') area surrounding SA tile, other tiles should be adjacent
#f,ax=plt.subplots(2,1,figsize=(10,10))
#test_ds['runoff_onset_median'].plot.imshow(ax=ax[0],vmin=0,vmax=365)
#test_ds['runoff_onset_std'].plot.imshow(ax=ax[1],cmap='Reds')
#test_ds['runoff_onset'].plot.imshow(col='water_year',col_wrap=3,vmin=0,vmax=365)


# def view_tile(tile: Tile):


#     test_ds = global_ds.rio.clip_box(*tile.get_geobox().boundingbox,crs='EPSG:4326')

#     f,ax=plt.subplots(2,1,figsize=(10,10))
#     test_ds['runoff_onset_median'].plot.imshow(ax=ax[0],vmin=0,vmax=365)

#     test_ds['runoff_onset_std'].plot.imshow(ax=ax[1],cmap='Reds')

#     test_ds['runoff_onset'].plot.imshow(col='water_year',col_wrap=3,vmin=0,vmax=365)

In [None]:
# tile_ds = global_ds.rio.clip_box(*tile.get_geobox().boundingbox,crs='EPSG:4326').compute()
# tile_ds

# tile_ds = add_coordinate_arrays(tile_ds)
# tile_ds
# utm_crs = tile_ds.rio.estimate_utm_crs()
# tile_utm_ds = tile_ds.rio.reproject(utm_crs,resolution=80,resampling=rasterio.enums.Resampling.bilinear)
# tile_utm_ds

# tile_utm_ds['runoff_onset'].plot.imshow(col='water_year',col_wrap=3,robust=True)

# f,axs = plt.subplots(1,2,figsize=(10,5))

# tile_utm_ds['runoff_onset_median'].plot.imshow(ax=axs[0],robust=True)
# tile_utm_ds['runoff_onset_std'].plot.imshow(ax=axs[1],robust=True,cmap='Reds')

# for ax in axs:
#     ax.set_aspect('equal')

# f,axs=plt.subplots(1,2,figsize=(10,5))
# tile_utm_ds['original_lat'].plot.imshow(ax=axs[0])
# tile_utm_ds['original_lon'].plot.imshow(ax=axs[1])
# axs[0].set_title('original_lat')
# axs[1].set_title('original_lon')

# for ax in axs:
#     ax.set_aspect('equal')

# tile_utm_ds = convert_water_year_dim_to_var(tile_utm_ds)
# tile_utm_ds
# tile_utm_ds = add_topography(tile,tile_utm_ds)
# tile_utm_ds
# tile_utm_ds = add_snow_class(tile,tile_utm_ds)
# tile_utm_ds
# tile_utm_ds = add_esa_worldcover(tile,tile_utm_ds)
# tile_utm_ds
# tile_utm_ds = add_forest_cover(tile,tile_utm_ds)
# tile_utm_ds
# tile_utm_df = dataset_to_dataframe(tile_utm_ds,utm_crs,water_years)
# tile_utm_df
# tile_results_df = pd.read_csv(f'results/tile_{tile.row:03d}_{tile.col:03d}.csv')


# var_list = ['runoff_onset_median','runoff_onset_std','aspect','slope','tpi','snow_classification','esa_worldcover','forest_cover_fraction']
# tile_utm_ds.hvplot.image(z=var_list,tiles="EsriImagery",crs=tile_utm_ds.rio.crs,width=500,height=500) # hover_cols=var_list


# num_vars = len(tile_utm_ds.data_vars)

# # Calculate the number of rows and columns for the subplots
# num_cols = 3  # You can adjust this
# num_rows = (num_vars + num_cols - 1) // num_cols

# # Create a figure with subplots
# fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 5*num_rows))
# axes = axes.flatten()  # Flatten the axes array for easy indexing

# # Loop through each variable and plot
# for i, (var_name, da) in enumerate(tile_utm_ds.data_vars.items()):
#     ax = axes[i]
#     da.plot(ax=ax)
#     ax.set_title(var_name)

# # Remove any unused subplots
# for j in range(i+1, len(axes)):
#     fig.delaxes(axes[j])

# # f,axs=plt.subplots(2,2,figsize=(10,10),sharex=True,sharey=True)


# # tile_utm_ds['dem'].plot.imshow(ax=axs[0,0])
# # tile_utm_ds['aspect'].plot.imshow(ax=axs[0,1],cmap='twilight')
# # tile_utm_ds['slope'].plot.imshow(ax=axs[1,0],cmap='Reds')
# # tile_utm_ds['tpi'].plot.imshow(ax=axs[1,1],cmap='Purples')

# # titles = ['DEM','Aspect','Slope','TPI']

# # for ax,title in zip(axs.flatten(),titles):
# #     ax.set_aspect('equal')
# #     ax.set_title(title)

# # f.tight_layout()
# # 
# # #hvplot.explorer(tile_utm_ds, x='x', y='y')
# # 
# # #hvplot.help("image")

# tile_utm_df.to_csv(f'results/tile_{tile.row:03d}_{tile.col:03d}.csv',index=False)