In [None]:
# HIDDEN CELL
import sys, os

import numpy as np

# Make argopy importable in development mode.
# On Read the Docs, the parent directory is added to the import path; anywhere
# else, a local checkout is used and its version, location, git branch and
# options are printed.
on_rtd = os.environ.get('READTHEDOCS') == 'True'
if on_rtd:
    sys.path.insert(0, os.path.abspath('..'))
else:
    sys.path.insert(0, "/Users/gmaze/git/github/euroargodev/argopy")
    import git
    import argopy
    from argopy.options import OPTIONS
    print(
        "argopy:", argopy.__version__,
        "\nsrc:", argopy.__file__,
        "\nbranch:", git.Repo(search_parent_directories=True).active_branch.name,
        "\noptions:", OPTIONS,
    )

import xarray as xr
# Render xarray objects as plain text (display_style="html" is the alternative);
# the trailing semicolon hides the cell output.
xr.set_options(display_style="text");

In [None]:
import argopy
from argopy import DataFetcher as ArgoDataFetcher

# Performance

## Cache

### Caching data

- with **argopy** global options:

```python
argopy.set_options(cachedir='mycache_folder')
```

- in a temporary context:

```python
with argopy.set_options(cachedir='mycache_folder'):
    ds = ArgoDataFetcher(cache=True).profile(6902746, 34).to_xarray()
```

- when instantiating the data fetcher:

```python
ds = ArgoDataFetcher(cache=True, cachedir='mycache_folder').profile(6902746, 34).to_xarray()
```

Specifying a cache directory at the fetcher level will ensure that data fetched with this fetcher are stored in that folder.

### Clearing the cache

Cached data have an expiration time of 1 day.

If you want to manually clear your cache folder, and/or make sure your data are newly fetched, you can do it at the fetcher level with the ``clear_cache`` method.

Start to fetch data and store them in cache:

```python
fetcher = ArgoDataFetcher(cache=True, cachedir='mycache_folder').profile(6902746, 34)
fetcher.to_xarray();
```

Fetched data are in the local cache folder:

```python
os.listdir('mycache_folder')
```

where we see one hash entry for the newly fetched data and the cache registry file ``cache``.

We can then fetch something else:

```python
fetcher2 = ArgoDataFetcher(cache=True, cachedir='mycache_folder').profile(1901393, 1)
fetcher2.to_xarray();
```

All fetched data are now cached in 'mycache_folder':

```python
os.listdir('mycache_folder')
```

Note the new hash file from the ``fetcher2`` data.

We can safely clear the cache from the first fetcher data:

```python
fetcher.clear_cache()
```

```python
os.listdir('mycache_folder')
```

By using the fetcher level clear cache, you make sure that only data fetched with it are removed, while other fetched data (with other fetchers for instance) will stay in place.

If you want to clear the entire cache folder, whatever the fetcher used, do it at the package level with:

```python
argopy.clear_cache()
```

```python
os.listdir('mycache_folder')

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-13-6726e674f21f> in <module>
----> 1 os.listdir('mycache_folder')

FileNotFoundError: [Errno 2] No such file or directory: 'mycache_folder'
```

## Parallel fetching

Sometimes you may find that your request takes a long time to fetch, or simply does not even succeed. You can then try to let argopy chunk your request into smaller pieces and fetch them in parallel for you. This is done with the argument ``parallel`` of the data fetcher and can be tuned using the options ``chunks`` and ``chunksize``.

This goes by default like this:

In [None]:
# Define the box to load (large enough to trigger chunking).
# NOTE(review): values are presumably ordered as
# [lon_min, lon_max, lat_min, lat_max, dpt_min, dpt_max, date_min, date_max]
# -- confirm against the argopy documentation.
box = [-60, -30, 40.0, 60.0, 0.0, 100.0, "2007-01-01", "2007-04-01"]

# Instantiate a parallel fetcher on the 'erddap' data source for this box
# (no data are requested here, only the fetcher is set up):
loader_par = ArgoDataFetcher(src='erddap', parallel=True).region(box)

You can also use the option ``progress`` to display a progress bar during fetching:

In [None]:
# Same parallel fetcher on the same box, with a progress bar shown while fetching:
loader_par = ArgoDataFetcher(src='erddap', parallel=True, progress=True).region(box)
# Last expression of the cell: display the fetcher representation
loader_par

Then, you can fetch data as usual:

In [None]:
%%time
# Fetch the data with the parallel fetcher defined above; the %%time cell
# magic reports how long the whole cell takes to run.
ds = loader_par.to_xarray()

**chunks**

To check how many chunks your request has been split into, you can look at the ``uri`` property of the fetcher, which gives the list of paths to the data:

In [None]:
# List of paths toward the data: one entry per chunk of the request
loader_par.uri

To control chunking, you can use the ``chunks`` option, which specifies the number of chunks along each *direction*, i.e. ``lon``, ``lat``, ``dpt`` and ``time`` for a *box* fetching and ``wmo`` for a *float* or *profile* fetching.

In [None]:
# Create a large box.
# A small random offset (between 0 and 0.099) is added to the 4th box value,
# presumably so that the request is not served from a server-side cache.
box = [-60, -10, 40.0, 60.0 + np.random.randint(0,100,1)[0]/1000, 0.0, 500.0, "2007", "2010"]

# Init parallel fetcher, requesting 5 chunks along the 'lon' direction
# (no 'chunks' value is given for the other directions):
loader_par = ArgoDataFetcher(src='erddap', parallel=True, 
                             chunks={'lon': 5}).region(box)
# Number of chunks (one uri entry per chunk):
len(loader_par.uri)

This creates 130 chunks, with 5 along the longitudinal direction, as requested. When the ``chunks`` option is not specified for a given *direction*, it relies on auto-chunking using pre-defined chunk maximum sizes. In the case above, this explains why we have 130 and not only 5 chunks.
To chunk the request along a single direction, set all the others to ``1``:

In [None]:
# Same data source and box as the previous chunking example, but every
# direction other than 'lon' is forced to a single chunk, so that the request
# is only split along the longitude.
loader_par = ArgoDataFetcher(src='erddap', parallel=True,
                             chunks={'lon': 5, 'lat': 1, 'dpt': 1, 'time': 1}).region(box)
# Number of chunks (one uri entry per chunk):
len(loader_par.uri)

### Comparison of performances

To compare performance with or without the parallel option, we need to make sure data are not cached on the server side.
To do this, we use a very small random perturbation on the box definition.

In [None]:
%%time
# Baseline timing: sequential (parallel=False) fetch from the 'argovis' source.
# The random offset (between 0 and 0.099) added to the 4th box value makes the
# request unique, so that data are not served from a server-side cache.
box = [-60, -10, 40.0, 60.0 + np.random.randint(0,100,1)[0]/1000, 0.0, 500.0, "2007", "2010"]
print(box)
ds = ArgoDataFetcher(src='argovis', parallel=False).region(box).to_xarray()

In [None]:
%%time
box = [-60, 0, 40.0, 60.0 + np.random.randint(0,100,1)[0]/1000, 0.0, 500.0, "2007","2010"]
print(box)
ds = ArgoDataFetcher(src='argovis', parallel=True, progress=True).region(box).to_xarray()

**results**: This simple comparison shows that parallel requests are useful to handle very large regions of data but that they add a significant overhead for regular-size requests.