# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pickle as pkl
import xarray

In [3]:
from backend import data_paths
from backend import evaluation_utils
from backend import gauge_groups_utils
from backend import loading_utils
from backend import metrics_utils
from backend import return_period_metrics

In [4]:
# Forwarded as `restart=` to every `return_period_metrics.compute_metrics` call below.
# NOTE(review): presumably True forces metrics to be recomputed rather than
# reusing files already on disk -- confirm against `compute_metrics`.
RESTART = True

# Metadata

## Gauge Groups

In [5]:
# Fetch the complete gauge group and report how many gauges it contains.
gauges = gauge_groups_utils.get_full_gauge_group()
num_gauges = len(gauges)
print(f'There are {num_gauges} gauges.')

There are 5678 gauges.


In [6]:
# Restrict the run to a small subset of gauges (positions 50-99 of the full group).
# The slice is taken from the full group rather than from `gauges` itself so that
# re-running this cell always yields the same subset; slicing the already-sliced
# 50-element list with [50:100] would silently produce an empty list.
# NOTE(review): the recorded outputs in this notebook report 5678 gauges, so this
# subset looks like a debugging aid -- confirm whether it should be kept.
gauges = gauge_groups_utils.get_full_gauge_group()[50:100]

# Google Model Metrics

## Load Data: Google Model Runs

In [7]:
# Load the model runs of every experiment for the selected gauges.
# The result is used below as a mapping from experiment name to a dataset.
google_model_runs = loading_utils.load_all_experimental_model_runs(gauges=gauges)

Working on experiment: kfold_splits


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5678/5678 [02:16<00:00, 41.59it/s]


Working on experiment: continent_splits


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5678/5678 [02:10<00:00, 43.55it/s]


Working on experiment: climate_splits


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5678/5678 [03:17<00:00, 28.72it/s]


Working on experiment: full_run


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5678/5678 [03:18<00:00, 28.57it/s]


In [8]:
# For each experiment, report how many of the requested gauges are absent
# from the loaded model runs.
requested_gauges = set(gauges)
for experiment in google_model_runs:
    loaded_gauges = set(google_model_runs[experiment].gauge_id.values)
    missing_gauges = requested_gauges - loaded_gauges
    print(f'There are {len(missing_gauges)} (out of {len(gauges)}) missing gauges in experiemnt {experiment}.')

There are 0 (out of 5678) missing gauges in experiemnt kfold_splits.
There are 0 (out of 5678) missing gauges in experiemnt continent_splits.
There are 0 (out of 5678) missing gauges in experiemnt climate_splits.
There are 0 (out of 5678) missing gauges in experiemnt full_run.


## Load Data: GRDC Observation Data

In [9]:
# Load GRDC observation data.
grdc_observation_data = loading_utils.load_grdc_data()

# Merge the observation data into each experiment's model-run dataset. Each
# dictionary entry is replaced by its merged version, one experiment at a time.
# Note: `grdc_observation_data` is NOT deleted here; it is used again for the
# GloFAS merge later in this notebook.
for experiment in google_model_runs.keys():
    google_model_runs[experiment] = xarray.merge(
        [google_model_runs[experiment], grdc_observation_data])

## Metrics: 2014 - Present

In [10]:
# Every gauge is evaluated over the same fixed [start, end] date window.
# Each gauge gets its own list object.
google_validation_time_periods = dict(
    (gauge, ['2014-01-01', '2023-01-01']) for gauge in gauges
)

In [11]:
# Inputs: the Google model runs, restricted to the 2014-onward validation window.
ds_dict = google_model_runs
gauge_list = gauges
experiments = data_paths.EXPERIMENTS
evaluation_time_periods = google_validation_time_periods
# NOTE(review): presumably None means "all available lead times" -- confirm
# against `compute_metrics`.
lead_times = None

# Output location; the spot-check cell below reads from this directory.
working_path = data_paths.GOOGLE_2014_RETURN_PERIOD_METRICS_DIR

# The return value is indexed by experiment name in the next cell.
missing_gauges = return_period_metrics.compute_metrics(
    ds_dict=ds_dict,
    experiments=experiments,
    gauge_list=gauge_list,
    sim_variable=metrics_utils.GOOGLE_VARIABLE,
    obs_variable=metrics_utils.OBS_VARIABLE,
    evaluation_time_periods=evaluation_time_periods,
    lead_times=lead_times,
    working_path=working_path,
    restart=RESTART,
)

Working on experiment: kfold_splits ...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5678/5678 [4:34:16<00:00,  2.90s/it]


Working on experiment: continent_splits ...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5678/5678 [4:34:32<00:00,  2.90s/it]


Working on experiment: climate_splits ...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5678/5678 [4:38:21<00:00,  2.94s/it]


Working on experiment: full_run ...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5678/5678 [4:37:12<00:00,  2.93s/it]


In [12]:
# Summarize how many gauges were reported missing for each experiment.
for experiment in experiments:
    num_missing = len(missing_gauges[experiment])
    print(f'Experiment {experiment} has {num_missing} missing gauges.')

Experiment kfold_splits has 0 missing gauges.
Experiment continent_splits has 0 missing gauges.
Experiment climate_splits has 0 missing gauges.
Experiment full_run has 0 missing gauges.


In [13]:
# Spot-check the precision table written for the first gauge.
# The experiment is selected explicitly (the last configured one) instead of
# relying on the `experiment` loop variable leaked by an earlier cell, so this
# cell no longer depends on hidden state.
experiment = list(experiments)[-1]
metrics = metrics_utils.load_metrics_df(
    filepath=working_path / experiment / 'precision' / f'{gauges[0]}.csv')
metrics

Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7
1.01,0.0,,,,,,,,
1.01,1.0,,,,,,,,
1.01,2.0,,,,,,,,
2.0,0.0,,,,,,,,
2.0,1.0,,,,,,,,
2.0,2.0,,,,,,,,
5.0,0.0,,,,,,,,
5.0,1.0,,,,,,,,
5.0,2.0,,,,,,,,
10.0,0.0,,,,,,,,


## Metrics: 1980 - Present

In [14]:
# Inputs: the Google model runs with no explicit evaluation window, and only
# lead time 0.
ds_dict = google_model_runs
gauge_list = gauges
experiments = data_paths.EXPERIMENTS
# NOTE(review): presumably None means "use the full available record" -- confirm
# against `compute_metrics`.
evaluation_time_periods = None
lead_times = [0]

# Output location; the spot-check cell below reads from this directory.
working_path = data_paths.GOOGLE_1980_RETURN_PERIOD_METRICS_DIR

# The return value is indexed by experiment name in the next cell.
missing_gauges = return_period_metrics.compute_metrics(
    ds_dict=ds_dict,
    experiments=experiments,
    gauge_list=gauge_list,
    sim_variable=metrics_utils.GOOGLE_VARIABLE,
    obs_variable=metrics_utils.OBS_VARIABLE,
    evaluation_time_periods=evaluation_time_periods,
    lead_times=lead_times,
    working_path=working_path,
    restart=RESTART,
)

Working on experiment: kfold_splits ...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5678/5678 [37:51<00:00,  2.50it/s]


Working on experiment: continent_splits ...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5678/5678 [38:08<00:00,  2.48it/s]


Working on experiment: climate_splits ...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5678/5678 [37:38<00:00,  2.51it/s]


Working on experiment: full_run ...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5678/5678 [37:02<00:00,  2.55it/s]


In [15]:
# Summarize how many gauges were reported missing for each experiment.
for experiment in experiments:
    num_missing = len(missing_gauges[experiment])
    print(f'Experiment {experiment} has {num_missing} missing gauges.')

Experiment kfold_splits has 0 missing gauges.
Experiment continent_splits has 0 missing gauges.
Experiment climate_splits has 0 missing gauges.
Experiment full_run has 0 missing gauges.


In [16]:
# Spot-check the precision table written for the first gauge.
# The experiment is selected explicitly (the last configured one) instead of
# relying on the `experiment` loop variable leaked by an earlier cell, so this
# cell no longer depends on hidden state.
experiment = list(experiments)[-1]
metrics = metrics_utils.load_metrics_df(
    filepath=working_path / experiment / 'precision' / f'{gauges[0]}.csv')
metrics

Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7
1.01,0.0,0.018868,,,,,,,
1.01,1.0,0.038462,,,,,,,
1.01,2.0,0.134615,,,,,,,
2.0,0.0,0.125,,,,,,,
2.0,1.0,0.375,,,,,,,
2.0,2.0,0.375,,,,,,,
5.0,0.0,0.0,,,,,,,
5.0,1.0,0.0,,,,,,,
5.0,2.0,0.0,,,,,,,
10.0,0.0,,,,,,,,


## Delete Variables to Clear Memory

In [17]:
# Release the Google model runs before loading GloFAS.
# `ds_dict` was bound to the same dictionary by the metrics cells above, so
# deleting only `google_model_runs` would leave the datasets alive until
# `ds_dict` is rebound in the GloFAS metrics cell. Drop that alias as well.
del google_model_runs
ds_dict = None

# GloFAS

## Load Data: GloFAS

In [18]:
# Load the GloFAS model runs for the selected gauges.
glofas_model_runs = loading_utils.load_glofas_model_runs(
    gauges=gauges,
)

In [19]:
# Count the requested gauges that are absent from the GloFAS runs.
# A set difference is used (as in the Google model-run check earlier in this
# notebook) so that duplicate or unrequested gauge IDs in the GloFAS data
# cannot distort the count, which plain length subtraction would allow.
missing_glofas_gauges = set(gauges) - set(glofas_model_runs.gauge_id.values)
missing_gauges = len(missing_glofas_gauges)
print(f'There are {missing_gauges} (out of {len(gauges)}) missing gauges in glofas runs.')

There are 1589 (out of 5678) missing gauges in glofas runs.


In [20]:
# Attach the GRDC observations (lead time 0 only) to the GloFAS runs so that
# simulations and observations live in a single dataset.
# Be patient: this merge is very slow.
glofas_model_runs = xarray.merge([
    glofas_model_runs,
    grdc_observation_data.sel(lead_time=0),
])

## Metrics: 2014 - Present

In [21]:
# Inputs: GloFAS is treated as a single "experiment" keyed by its variable name,
# evaluated over the same validation window as the Google model, at lead time 0.
experiments = [metrics_utils.GLOFAS_VARIABLE]
ds_dict = {metrics_utils.GLOFAS_VARIABLE: glofas_model_runs}
gauge_list = gauges
evaluation_time_periods = google_validation_time_periods
lead_times = [0]

# Output location; the spot-check cell below reads from this directory.
working_path = data_paths.GLOFAS_2014_RETURN_PERIOD_METRICS_DIR

# The return value is indexed by experiment name in the next cell.
missing_gauges = return_period_metrics.compute_metrics(
    ds_dict=ds_dict,
    experiments=experiments,
    gauge_list=gauge_list,
    sim_variable=metrics_utils.GLOFAS_VARIABLE,
    obs_variable=metrics_utils.UNNORMALIZED_OBS_VARIABLE,
    evaluation_time_periods=evaluation_time_periods,
    lead_times=lead_times,
    working_path=working_path,
    restart=RESTART,
)

Working on experiment: glofas_prediction ...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5678/5678 [36:29<00:00,  2.59it/s]


In [22]:
# Summarize how many gauges were reported missing for each experiment.
for experiment in experiments:
    num_missing = len(missing_gauges[experiment])
    print(f'Experiment {experiment} has {num_missing} missing gauges.')

Experiment glofas_prediction has 0 missing gauges.


In [23]:
# Spot-check the precision table written for the first gauge.
# The experiment is selected explicitly (the last configured one) instead of
# relying on the `experiment` loop variable leaked by an earlier cell, so this
# cell no longer depends on hidden state.
experiment = list(experiments)[-1]
metrics = metrics_utils.load_metrics_df(
    filepath=working_path / experiment / 'precision' / f'{gauges[0]}.csv')
metrics

Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7
1.01,0.0,,,,,,,,
1.01,1.0,,,,,,,,
1.01,2.0,,,,,,,,
2.0,0.0,,,,,,,,
2.0,1.0,,,,,,,,
2.0,2.0,,,,,,,,
5.0,0.0,,,,,,,,
5.0,1.0,,,,,,,,
5.0,2.0,,,,,,,,
10.0,0.0,,,,,,,,


## Metrics: 1980 - Present

In [24]:
# Inputs: GloFAS is treated as a single "experiment" keyed by its variable name,
# with no explicit evaluation window, at lead time 0.
experiments = [metrics_utils.GLOFAS_VARIABLE]
ds_dict = {metrics_utils.GLOFAS_VARIABLE: glofas_model_runs}
gauge_list = gauges
# NOTE(review): presumably None means "use the full available record" -- confirm
# against `compute_metrics`.
evaluation_time_periods = None
lead_times = [0]

# Output location; the spot-check cell below reads from this directory.
working_path = data_paths.GLOFAS_1980_RETURN_PERIOD_METRICS_DIR

# The return value is indexed by experiment name in the next cell.
missing_gauges = return_period_metrics.compute_metrics(
    ds_dict=ds_dict,
    experiments=experiments,
    gauge_list=gauge_list,
    sim_variable=metrics_utils.GLOFAS_VARIABLE,
    obs_variable=metrics_utils.UNNORMALIZED_OBS_VARIABLE,
    evaluation_time_periods=evaluation_time_periods,
    lead_times=lead_times,
    working_path=working_path,
    restart=RESTART,
)

Working on experiment: glofas_prediction ...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5678/5678 [35:46<00:00,  2.65it/s]


In [25]:
# Summarize how many gauges were reported missing for each experiment.
for experiment in experiments:
    num_missing = len(missing_gauges[experiment])
    print(f'Experiment {experiment} has {num_missing} missing gauges.')

Experiment glofas_prediction has 0 missing gauges.


In [26]:
# Spot-check the precision table written for the first gauge.
# The experiment is selected explicitly (the last configured one) instead of
# relying on the `experiment` loop variable leaked by an earlier cell, so this
# cell no longer depends on hidden state.
experiment = list(experiments)[-1]
metrics = metrics_utils.load_metrics_df(
    filepath=working_path / experiment / 'precision' / f'{gauges[0]}.csv')
metrics

Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7
1.01,0.0,0.175,,,,,,,
1.01,1.0,0.375,,,,,,,
1.01,2.0,0.5,,,,,,,
2.0,0.0,0.25,,,,,,,
2.0,1.0,0.25,,,,,,,
2.0,2.0,0.5,,,,,,,
5.0,0.0,,,,,,,,
5.0,1.0,,,,,,,,
5.0,2.0,,,,,,,,
10.0,0.0,,,,,,,,


# Collect Return Period Metrics in Pickle Files

In [None]:
# Maps a dataset label to the directory holding its per-gauge return period
# metrics. These are the same directories the metrics cells above write into.
# NOTE(review): this name was not defined anywhere in the notebook, so this cell
# raised NameError on a fresh kernel. The labels below are an assumption: they
# become part of the pickle file names, so confirm them against whatever reads
# those files. GloFAS labels must contain 'glofas' for the branch further down.
_DATASET_RETURN_PERIOD_METRICS_PATH = {
    'google_2014': data_paths.GOOGLE_2014_RETURN_PERIOD_METRICS_DIR,
    'google_1980': data_paths.GOOGLE_1980_RETURN_PERIOD_METRICS_DIR,
    'glofas_2014': data_paths.GLOFAS_2014_RETURN_PERIOD_METRICS_DIR,
    'glofas_1980': data_paths.GLOFAS_1980_RETURN_PERIOD_METRICS_DIR,
}

# Per-dataset precision / recall collections, filled in by the loop below.
precisions_by_lead_time = {}
recalls_by_lead_time = {}

# NOTE(review): these two dictionaries are never populated in this cell; they
# are kept in case later cells rely on the names.
precisions_by_return_period = {}
recalls_by_return_period = {}

loading_utils.create_remote_folder_if_necessary(data_paths.CONCATENATED_RETURN_PERIOD_DICTS_DIR)

for dataset, data_path in _DATASET_RETURN_PERIOD_METRICS_PATH.items():

    print(f'Working on {dataset} ...')

    # One pickle file per dataset, named after the dataset label.
    file_path = data_paths.CONCATENATED_RETURN_PERIOD_DICTS_DIR / f'{dataset}_return_period_dicts.pkl'

    # GloFAS metrics are stored under a single "experiment" named after the
    # GloFAS variable; the Google model has one entry per experiment.
    if 'glofas' in dataset:
        experiments = [metrics_utils.GLOFAS_VARIABLE]
    else:
        experiments = data_paths.EXPERIMENTS

    precisions_by_lead_time[dataset] = evaluation_utils.load_return_period_metrics(
        base_path=data_path,
        experiments=experiments,
        gauges=gauges,
        metric='precision'
    )
    recalls_by_lead_time[dataset] = evaluation_utils.load_return_period_metrics(
        base_path=data_path,
        experiments=experiments,
        gauges=gauges,
        metric='recall'
    )

    # Stored as a two-element list: [precisions, recalls].
    with open(file_path, 'wb') as f:
        pkl.dump(
            [
                precisions_by_lead_time[dataset],
                recalls_by_lead_time[dataset],
            ], f
        )

    print(f'Finished with {dataset}. \n')