In [None]:
import sys
sys.path.append('../python')
from get_vs import *
from misc import *
from models import *
from mcmc import *

import random
import subprocess
import pickle
import pandas as pd
from pandas import DataFrame
import numpy as np
import os
from tqdm import tqdm
import xarray as xr
import gcsfs
from dask_kubernetes import KubeCluster
from dask.distributed import Client

%matplotlib inline

In [None]:
# Execution-environment switch for the whole notebook.
is_pangeo_data = True # True if in Pangeo binder, False if in laptop
# Defined outside the branch below: the MCMC cell draws one starting point per
# worker with `range(n_workers)` regardless of the environment, so leaving it
# inside the `if` raised a NameError on a laptop run.
n_workers = 10
if is_pangeo_data:
    # Start a Dask cluster on Kubernetes and connect a client to it.
    cluster = KubeCluster(n_workers=n_workers)
    client = Client(cluster)
else:
    # NOTE(review): no Dask client is created in this branch, although the MCMC
    # cell calls `client.submit` unconditionally -- confirm how the samplers
    # are meant to run on a laptop.
    cluster = None
# Last expression of the cell: display the cluster (None on a laptop).
cluster

Wait for the cluster to be up and running before executing the next cell.

In [None]:
if is_pangeo_data:
    def set_worker_env():
        """Extend the import path with `<cwd>/python` and import `mcmc`.

        `mcmc` is declared global so that the imported module is bound at
        module level in the process where this function runs.
        """
        global mcmc
        import os
        import sys
        python_dir = f'{os.getcwd()}/python'
        sys.path.append(python_dir)
        import mcmc
    # Execute the setup function through the Dask client (i.e. on the workers).
    client.run(set_worker_env)

The coordinates of the virtual stations in [Hydroweb](http://hydroweb.theia-land.fr) do not exactly match the river network in [HydroSHEDS](http://www.hydrosheds.org). To find the corresponding coordinates in HydroSHEDS, we search around the original position for the pixel with the largest accumulated flow, provided that this flow exceeds a minimum value. If no such pixel is found, we widen the search until we find one (but not too far away; beyond that limit we simply drop the virtual station). `new_lat`/`new_lon` are the coordinates of this pixel, if found.

In [None]:
# The located virtual stations are cached on disk: read the pickle when it
# exists, otherwise run `locate_vs` on the station list and save its result.
vs_pickle_path = '../data/amazonas/amazonas.pkl'
if os.path.exists(vs_pickle_path):
    df = pd.read_pickle(vs_pickle_path)
else:
    df = locate_vs('../data/amazonas/amazonas.txt', pix_nb=20, acc_min=1_000_000)
    df.to_pickle(vs_pickle_path)

In [None]:
# Keep only the stations for which both relocated coordinates are present
# (rows with a missing `new_lat` or `new_lon` are dropped).
located_latlon = df[['new_lat', 'new_lon']].dropna()
sub_latlon = located_latlon.values
print(f'Out of {len(df)} virtual stations in Hydroweb, {len(sub_latlon)} could be found in HydroSHEDS.')

The following coordinates are duplicated because some virtual stations fall inside the same pixel.

In [None]:
# (lat, lon) pairs listed as duplicated coordinates.
# NOTE(review): the name suggests these are meant to be removed, but the list
# is not referenced in the visible cells -- confirm its intended use.
rm_latlon = [
    (-4.928333333333334, -62.733333333333334),
    (-3.8666666666666667, -61.6775),
]

In [None]:
# Relocated coordinates of the stations that could be placed in HydroSHEDS.
df_ll = df[['new_lat', 'new_lon']].dropna()
# Flag every row whose (new_lat, new_lon) pair occurs more than once;
# keep=False marks all occurrences, not only the repeats.
is_repeated = df_ll.duplicated(keep=False)
duplicated = df_ll[is_repeated]
duplicated

In [None]:
# all the subbasins in the hydrologic partition (including virtual stations)
#gcs_get_dir('pangeo-data/gross/ws_mask/amazonas', 'ws_mask/amazonas', fs)
#gcs_w_token = gcsfs.GCSFileSystem(project='pangeo-data', token='browser')
if is_pangeo_data:
    # List the subbasin mask directories straight from the GCS bucket.
    fs = gcsfs.GCSFileSystem(project='pangeo-data')
    # Strip trailing '/' characters instead of blindly dropping the last
    # character: the listing may return entries with or without a trailing
    # slash, and slicing with [:-1] would truncate the label in the latter case.
    labels = [os.path.basename(path.rstrip('/')) for path in fs.ls('pangeo-data/gross/ws_mask/amazonas')]
else:
    # On a laptop the masks are read from the local copy of the directory.
    labels = os.listdir('ws_mask/amazonas')
print('Total number of subbasins:', len(labels))

In [None]:
# Split the subbasin labels into those whose outlet coincides with at least one
# virtual station (label -> list of station names) and those without any.
# The result is cached in a pickle because every mask has to be opened.
label_pickle_path = '../data/amazonas/labels.pkl'
if not os.path.exists(label_pickle_path):
    # Matching tolerance in degrees around the outlet.
    # NOTE(review): presumably a quarter of a 1/1200 degree pixel -- confirm.
    tol = 0.25 / 1200
    # Loop-invariant coordinate arrays of the located stations.
    vs_lat = df_ll.new_lat.values
    vs_lon = df_ll.new_lon.values
    labels_with_vs = {}
    for label in tqdm(labels):
        # NOTE(review): masks are read from the local 'ws_mask/amazonas'
        # directory, which must already hold every label at this point.
        ds = xr.open_zarr(f'ws_mask/amazonas/{label}')
        da = ds['mask']
        olat, olon = da.attrs['outlet']
        at_outlet = (olat - tol < vs_lat) & (vs_lat < olat + tol) & (olon - tol < vs_lon) & (vs_lon < olon + tol)
        # Index *labels* of the matching rows of df_ll (a row subset of df).
        idx = df_ll.index[at_outlet]
        if len(idx) > 0:
            # Label-based lookup: `idx` holds index labels, so `.loc` is the
            # matching accessor (`.iloc` is positional and only agrees when
            # df carries a default 0..n-1 index).
            labels_with_vs[label] = list(df.loc[idx, 'station'].values)
    # Single pass instead of repeated list.remove calls; order is preserved.
    labels_without_vs = [name for name in labels if name not in labels_with_vs]
    with open(label_pickle_path, 'wb') as f:
        pickle.dump((labels_with_vs, labels_without_vs), f)
else:
    # The pickle is a cache written by this notebook; do not point this path
    # at a file from an untrusted source (unpickling can execute code).
    with open(label_pickle_path, 'rb') as f:
        labels_with_vs, labels_without_vs = pickle.load(f)

In [None]:
# Labels of the subbasins that have a virtual station, handed to
# `get_label_tree` (presumably arranging them hierarchically -- see its definition).
vs_labels = list(labels_with_vs.keys())
labels_with_vs_tree = get_label_tree(vs_labels)

In [None]:
# Start and end of the period passed to the data getters.
d0 = '2000-03-01 12:00:00'
d1 = '2018-12-31'
# (lower, upper) bounds of the four sampled parameters.
x_range = ((0.1, 1e4), (-1, 1), (0.1, 1e3), (0.1, 1e2))
# One starting point per worker, each coordinate drawn uniformly within its bounds.
x_start = [[random.uniform(lower, upper) for lower, upper in x_range] for _ in range(n_workers)]
sample_nb = 1_000 # number of samples generated by MCMC
burnin = sample_nb // 10 # number of burnin samples
warmup = 30 * 24 * 2 # one month in 30min steps

# For each subbasin label in labels_with_vs_tree: assemble the basin mask,
# build the forcing/observation frame `peq`, define the log-probability and run
# one MCMC sampler per worker through the Dask client.
# NOTE(review): this cell is work in progress -- see the notes on `sys.exit()`,
# `ws_i` and the disabled branches below.
q_ensemble = {}
#for ws_i in range(ws_nb):
for label in labels_with_vs_tree:
    # get basin's labels
    ws_labels = startswith_label(label, labels)
    # get basin's mask
    os.makedirs('ws_mask/amazonas', exist_ok=True)
    # Fetch from GCS every mask that is not yet present locally.
    # NOTE(review): a new GCSFileSystem is created for each missing mask.
    for l in ws_labels:
        if not os.path.exists(f'ws_mask/amazonas/{l}'):
            fs = gcsfs.GCSFileSystem(project='pangeo-data')
            gcs_get_dir(f'pangeo-data/gross/ws_mask/amazonas/{l}', f'ws_mask/amazonas/{l}', fs)
    da_mask = get_mask('ws_mask/amazonas', ws_labels)
    # Replace the on-disk 'mask' zarr store with the mask of the current basin.
    subprocess.check_call('rm -rf mask'.split())
    da_mask.to_dataset(name='mask').to_zarr('mask')
    # get basin's precipitation and PET, and water level at virtual station
    if is_pangeo_data:
        p = get_precipitation(d0, d1, 'mask')
        e = get_pet(d0, d1, 'mask')
    else:
        # Laptop run: read p and e from a local pickle; `peq` is rebuilt below.
        peq = pd.read_pickle('peq.pkl')
        p = peq.p
        e = peq.e
    # Only the first station attached to this label is used.
    he = get_waterlevel(d0, d1, labels_with_vs[label][0]) # there might be several stations
    # Frame handed to get_lnprob: p, e, observed water level and its error.
    peq = DataFrame()
    peq['p'] = p
    peq['e'] = e
    peq['h_obs'] = he.h
    peq['h_err'] = he.e
    # NOTE(review): hard-coded to True, so the `else` branch below never runs.
    is_source_basin = True
    if is_source_basin:
        area_head = 1
        area_tail = 0
        # prior probability distribution is uniform for head basin
        x0 = x_start
        x_prior = [uniform_density(*r) for r in x_range]
        lnprob_prior = [lnprob_from_density(p, *r) for p, r in zip(x_prior, x_range)]
    else:
        # NOTE(review): unreachable at present; it uses `ws_i`, `d_start` and
        # `d_range`, which are not defined in the visible cells, and `x_head`,
        # which is only assigned at the end of the loop body.
        area_head = ws_i
        area_tail = 1
        x0 = [xy[0][np.argmax(xy[1])] for xy in x_head]
        x0 += [d_start] + x_start
        # prior probability distribution is uniform for tail basin
        lnprob_prior = [lnprob_from_density(p, *r) for p, r in zip(x_head, x_range)]
        lnprob_prior += [lnprob_from_density(uniform_density(*d_range), *d_range)]
        x_tail = [uniform_density(*r) for r in x_range]
        lnprob_prior += [lnprob_from_density(p, *r) for p, r in zip(x_tail, x_range)]
    lnprob = get_lnprob(gr4hh, warmup, peq, lnprob_prior, area_head, area_tail)
    # run MCMC
    # One Sampler per worker (submitted with actor=True), each started from its
    # own entry of x0; the runs are launched first and their results gathered after.
    futures = [client.submit(mcmc.Sampler, x0[i], lnprob, actor=True) for i in range(n_workers)]
    samplers = [future.result() for future in futures]
    futures = [sampler.run(sample_nb, burnin) for sampler in samplers]
    results = [future.result() for future in futures]
    # Merge the per-worker results: samples are stacked row-wise, q_sim is
    # accumulated with `+=`.
    for i, result in enumerate(results):
        if i == 0:
            samples, q_sim = result
        else:
            samples = np.vstack((samples, result[0]))
            q_sim += result[1]
    # NOTE(review): sys.exit() raises SystemExit here, so execution stops after
    # the first label and the remainder of the loop body is never reached.
    sys.exit()
    #sampler = Sampler(x0, lnprob)
    #samples, q_sim = sampler.run(sample_nb, burnin)
    # get simulated streamflow and uncertainty
    q_sim = np.array(q_sim)
    # NOTE(review): `ws_i` is not defined in the visible cells (the loop over
    # ws_i above is commented out); this line would raise NameError if reached.
    q_ensemble[f'f{ws_i}'] = q_sim
    # plot updated streamflow
    #plot_series(ensemble=q_sim[:, -days:], true=df[f'q_true_{ws_i}'].values[-days:], title=f'Streamflow at the outlet of $B_{ws_i}$')
    # NOTE(review): disabled block; it uses `n_kde`, which is not defined in the
    # visible cells, and calls get_lnprob with a different argument list than above.
    if False:#(ws_i > 0) and (ws_i < ws_nb - 1):
        # reduce dual model to single model
        peq = df[['p', 'e']]
        x_prior = [uniform_density(*r) for r in x_range]
        lnprob_prior = [lnprob_from_density(p, *r) for p, r in zip(x_prior, x_range)]
        q_kde = np.empty((2, n_kde, q_sim.shape[1]))
        for i in range(q_kde.shape[2]):
            q_kde[:, :, i] = get_kde(q_sim[:, i], nb=n_kde)
        lnprob = get_lnprob(peq, lnprob_prior, 1, 0, q_kde)
        x0 = x_start
        sampler = mcmc.Sampler(x0, lnprob)
        samples, q_sim = sampler.run(sample_nb, burnin)
    # `get_kde` applied to each of the first four columns of the samples.
    x_head = [get_kde(samples[:, i]) for i in range(4)]