# Using dask to scale the analysis of data on the cloud

## Explore data with xarray

In [None]:
import xarray as xr

In [None]:
# Open a CMIP6 sea-surface-height (zos) dataset stored as Zarr in cloud object storage.
# chunks={} makes xarray back the variables with dask arrays (using the store's
# preferred chunking), so at this point the data itself is not loaded.
store = 'https://ncsa.osn.xsede.org/Pangeo/pangeo-forge/cmip6-feedstock/CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.Omon.zos.gn.v20190429.zarr'
ds = xr.open_dataset(store, engine='zarr', chunks={})
# Display the dataset summary (dimensions, coordinates, variables, attributes).
ds

In [None]:
# Inspect a single variable of the dataset
ds.zos

In [None]:
# Select a single date, then draw the field as a map
zos_one_date = ds.zos.sel(time="2014-01-16")
zos_one_date.plot()

In [None]:
# Select a single grid point, then draw its time series
zos_one_point = ds.zos.isel(i=100, j=100)
zos_one_point.plot()

An important aspect of xarray is that it supports lazily evaluated operations.

This means that a computation is not performed right away: it is only planned, and it actually runs when we need the result, for instance to plot or print it.

In [None]:
%%time
# Lazy evaluation: this only describes the time mean, it does not compute it,
# which is why the cell returns almost instantly.
ds['zos'].mean(dim='time')

In [None]:
%%time
# Plotting needs the actual values, so this is where the time mean is really
# computed -- and that takes some time.
ds['zos'].mean(dim='time').plot()

In [None]:
%%time
# A more complex operation : seasonal mean
ssh_seasonal = ds['zos'].groupby("time.season").mean()
ssh_seasonal.plot(col="season", col_wrap=2)

## Let's use xarray with dask

In [None]:
import dask

Click on the Dask icon in the left sidebar, then on the +NEW button.

Wait for the cluster to launch, then drag and drop the blue box into a cell below, and execute that cell.

Select some dashboard metrics to follow (Progress, Task Stream, CPU, and Cluster Memory), and rearrange the lab windows so that they stay visible.

Let's open a new dataset

In [None]:
# A satellite sea-surface-height product, reachable from anywhere through the Pangeo catalog
from intake import open_catalog

ocean_catalog_url = "https://raw.githubusercontent.com/pangeo-data/pangeo-datastore/master/intake-catalogs/ocean.yaml"
cat = open_catalog(ocean_catalog_url)
ds = cat["sea_surface_height"].to_dask()
ds

In [None]:
# Size of a single variable, in gigabytes (number of bytes / 1e9)
ds['sla'].nbytes / 1e9

In [None]:
# We only have about 15 GB of memory available, yet we can still work with
# this variable: here we reduce it to its mean over latitude and longitude.
xymean = ds['sla'].mean(dim=('latitude', 'longitude'))

In [None]:
%%time
# Nothing is computed until we ask for the result: .load() triggers the
# computation and keeps the resulting values in xymean.
xymean.load()

In [None]:
%%time
# Running it a second time is even faster: the values are already held
# in xymean after the first .load(), so there is nothing left to compute.
xymean.load()

In [None]:
# xymean was already loaded into memory above, which is why this plot is fast.
import matplotlib.pyplot as plt

# Use an explicit figure/axes instead of pyplot's implicit "current axes":
# both curves are then guaranteed to land on this figure, whatever was
# plotted in earlier cells and whichever matplotlib backend is active.
fig, ax = plt.subplots()

# Raw global mean, plus a centered 365-step rolling mean to smooth it
xymean.plot(ax=ax, label='full data')
xymean.rolling(time=365, center=True).mean().plot(ax=ax, label='rolling annual mean')

# Set labels after plotting so they override the ones xarray puts on the axes
ax.set_ylabel('Sea Level Anomaly [m]')
ax.set_title('Global Mean Sea Level')
ax.legend()
ax.grid()

In [None]:
# Display the variable to see how it is split into chunks
ds['sla']

In [None]:
# Another dataset, as an example of a different chunking strategy
from intake import open_catalog

nemo_catalog_url = "https://raw.githubusercontent.com/pangeo-data/pangeo-datastore/master/intake-catalogs/ocean/MEOM-NEMO.yaml"
cat = open_catalog(nemo_catalog_url)
ds = cat["eNATL60_BLBT02_SSU"].to_dask()
ds['sozocrtx']