In [None]:
import sys
sys.path.append('../python')
from get_vs import locate_vs
from misc import *

import pandas as pd
from pandas import DataFrame
import numpy as np
import os
from tqdm import tqdm
import xarray as xr
import gcsfs
from dask_kubernetes import KubeCluster
from dask.distributed import Client

%matplotlib inline

In [None]:
# Execution environment switch: True when running in the Pangeo binder,
# False when running on a laptop. Later cells use it to choose between the
# 'pangeo-data' GCS bucket and local paths, and to decide whether a Dask
# Kubernetes cluster is started.
is_pangeo_data = True

The coordinates of the virtual stations in [Hydroweb](http://hydroweb.theia-land.fr) don't match the rivers in [HydroSHEDS](http://www.hydrosheds.org). To find the corresponding coordinates in HydroSHEDS, we search around the original position for the pixel with the largest accumulated flow, provided that this flow exceeds a minimum threshold. If no such pixel is found, we widen the search until we find one (but not too far away: beyond that limit we just drop the virtual station). `new_lat`/`new_lon` are the coordinates of this pixel, if found.

In [None]:
# Reuse the pickled result of locate_vs when it is already on disk; otherwise
# run locate_vs on the Hydroweb station list and cache its output as a pickle.
if os.path.exists('../data/amazonas/amazonas.pkl'):
    df = pd.read_pickle('../data/amazonas/amazonas.pkl')
else:
    df = locate_vs('../data/amazonas/amazonas.txt', pix_nb=20, acc_min=1_000_000)
    df.to_pickle('../data/amazonas/amazonas.pkl')

In [None]:
# Keep only the rows where both new_lat and new_lon are present, as a 2-column
# array of (new_lat, new_lon) values.
sub_latlon = df.dropna(subset=['new_lat', 'new_lon'])[['new_lat', 'new_lon']].values
print(f'Out of {len(df)} virtual stations in Hydroweb, {len(sub_latlon)} could be found in HydroSHEDS.')

The following coordinates are duplicated because several virtual stations fall inside the same pixel.

In [None]:
# Coordinate pairs shared by more than one virtual station, one tuple per
# pixel (presumably in (lat, lon) order, matching new_lat/new_lon -- confirm).
rm_latlon = [
    (-4.928333333333334, -62.733333333333334),
    (-3.8666666666666667, -61.6775),
]

In [None]:
# Relocated coordinates only; rows lacking new_lat or new_lon are dropped.
df_ll = df.loc[:, ['new_lat', 'new_lon']].dropna()
# keep=False marks every row of a duplicated group, not just the repeats.
shares_pixel = df_ll.duplicated(keep=False)
duplicated = df_ll.loc[shares_pixel]
duplicated

In [None]:
# rm_latlon must list each duplicated coordinate pair exactly once. Counting
# the distinct pairs (instead of len(duplicated) / 2) gives the same result
# when every shared pixel holds two stations, and stays correct when a pixel
# is shared by three or more.
n_dup_pixels = len(duplicated.drop_duplicates())
assert n_dup_pixels == len(rm_latlon), (
    f'{n_dup_pixels} duplicated coordinate pairs found, '
    f'but rm_latlon lists {len(rm_latlon)}'
)

In [None]:
# All the subbasins in the hydrologic partition (including virtual stations).
# Each entry under ws_mask/amazonas is one subbasin; its name is the label.
if is_pangeo_data:
    fs = gcsfs.GCSFileSystem(project='pangeo-data')
    # Strip an optional trailing '/' instead of blindly dropping the last
    # character, so that a listing returned without trailing slashes does not
    # lose the final character of every label.
    labels = [os.path.basename(path.rstrip('/')) for path in fs.ls('pangeo-data/ws_mask/amazonas')]
else:
    labels = os.listdir('ws_mask/amazonas')
len(labels)

In [None]:
# Partition the subbasin labels according to whether a relocated virtual
# station sits on the subbasin outlet (the 'outlet' attribute of its mask).
# NOTE(review): 0.25/1200 is presumably a quarter of a 3 arc-second HydroSHEDS
# pixel (1/1200 degree) -- confirm.
tol = 0.25 / 1200
# Loop-invariant coordinate arrays, hoisted out of the loop.
vs_lat = df_ll.new_lat.values
vs_lon = df_ll.new_lon.values
labels_without_vs = list(labels)
labels_with_vs = []
for label in tqdm(labels):
    if is_pangeo_data:
        ds = xr.open_zarr(gcsfs.GCSMap(f'pangeo-data/ws_mask/amazonas/{label}'))
    else:
        ds = xr.open_zarr(f'ws_mask/amazonas/{label}')
    da = ds['mask']
    olat, olon = da.attrs['outlet']
    # Stations whose relocated position lies strictly within +/- tol of the
    # outlet in both latitude and longitude.
    at_outlet = (
        (olat - tol < vs_lat) & (vs_lat < olat + tol)
        & (olon - tol < vs_lon) & (vs_lon < olon + tol)
    )
    idx = df_ll[at_outlet].index.values
    if len(idx) > 0:
        # Both list updates belong under this test: appending outside of it
        # would make labels_with_vs a plain copy of labels.
        labels_with_vs.append(label)
        labels_without_vs.remove(label)

In [None]:
# Build the label tree from the labels collected in labels_with_vs.
# NOTE(review): get_label_tree is neither defined nor explicitly imported in
# this notebook; presumably it comes from `from misc import *` -- confirm.
label_tree = get_label_tree(labels_with_vs)

In [None]:
# Only when running in the Pangeo binder: start a Dask cluster on Kubernetes
# with 10 workers and connect a distributed client to it. Nothing is started
# when is_pangeo_data is False.
if is_pangeo_data:
    cluster = KubeCluster(n_workers=10)
    client = Client(cluster)

In [None]:
# Load precipitation for the requested period.
# NOTE(review): the two arguments are presumably the start and end timestamps
# of the period; get_precipitation is not defined in this notebook (presumably
# provided by `from misc import *`) -- confirm both.
precipitation = get_precipitation('2014-03-11', '2014-03-12 01:00:00')