# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pickle as pkl
import xarray

In [3]:
from backend import data_paths
from backend import evaluation_utils
from backend import gauge_groups_utils
from backend import loading_utils
from backend import metrics_utils
from backend import return_period_metrics

In [4]:
RESTART = True

# Load Data

## Experiments

In [5]:
GOOGLE_EXPERIMENTS = ['kfold_splits']
GLOFAS_EXPERIMENTS = [metrics_utils.GLOFAS_VARIABLE]

## Model Data

### GloFAS Data

In [6]:
all_gauges = gauge_groups_utils.get_full_gauge_group()
# all_gauges = all_gauges[50:100]
print(f'There are {len(all_gauges)} gauges.')

There are 5678 gauges.


In [7]:
glofas_model_runs = loading_utils.load_glofas_model_runs(gauges=all_gauges)

In [9]:
glofas_gauges = set(glofas_model_runs.gauge_id.values)

### Google Data

In [10]:
google_model_runs = loading_utils.load_all_experimental_model_runs(
    gauges=glofas_gauges,
    experiments=GOOGLE_EXPERIMENTS
)

Working on experiment: kfold_splits


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4089/4089 [01:24<00:00, 48.60it/s]


In [12]:
google_gauges = set(google_model_runs[GOOGLE_EXPERIMENTS[0]].gauge_id.values)

### GRDC Data

In [13]:
grdc_observation_data = loading_utils.load_grdc_data()

## Overlapping Gauge Groups

In [14]:
gauges = list(glofas_gauges.intersection(google_gauges))
print(f'There are {len(gauges)} gauges that exist for both models.')

There are 4089 gauges that exist for both models.


# Time Periods

In [15]:
google_validation_time_periods = {
    gauge: ['2014-01-01', '2023-01-01'] for gauge in gauges
}

# Google Model Metrics

In [16]:
# Add observation data to model run xarrays, and delete redundant varaible to save memory.
for experiment in google_model_runs.keys():
    google_model_runs[experiment] = xarray.merge(
        [google_model_runs[experiment], grdc_observation_data])

## Metrics: 2014 - Present

In [None]:
working_path = data_paths.GOOGLE_2014_RETURN_PERIOD_METRICS_DIR
experiments = GOOGLE_EXPERIMENTS
gauge_list = gauges
ds_dict = google_model_runs
evaluation_time_periods = google_validation_time_periods
lead_times = None

missing_gauges = return_period_metrics.compute_metrics(
    restart=RESTART,
    working_path=working_path,
    experiments=experiments,
    gauge_list=gauge_list,
    sim_variable=metrics_utils.GOOGLE_VARIABLE,
    obs_variable=metrics_utils.OBS_VARIABLE,
    ds_dict=ds_dict,
    evaluation_time_periods=evaluation_time_periods,
    lead_times=lead_times
)

In [None]:
for experiment in experiments:
    print(f'Experiment {experiment} has {len(missing_gauges[experiment])} missing gauges.')

In [None]:
metrics = metrics_utils.load_metrics_df(
    filepath=working_path / experiment / 'precision' / f'{gauges[0]}.csv')
metrics

## Metrics: 1980 - Present

In [17]:
working_path = data_paths.GOOGLE_1980_RETURN_PERIOD_METRICS_DIR
experiments = GOOGLE_EXPERIMENTS
gauge_list = gauges
ds_dict = google_model_runs
evaluation_time_periods = None
lead_times = [0]

missing_gauges = return_period_metrics.compute_metrics(
    restart=RESTART,
    working_path=working_path,
    experiments=experiments,
    gauge_list=gauge_list,
    sim_variable=metrics_utils.GOOGLE_VARIABLE,
    obs_variable=metrics_utils.OBS_VARIABLE,
    ds_dict=ds_dict,
    evaluation_time_periods=evaluation_time_periods,
    lead_times=lead_times
)

Working on experiment: kfold_splits ...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4089/4089 [26:26<00:00,  2.58it/s]


In [18]:
for experiment in experiments:
    print(f'Experiment {experiment} has {len(missing_gauges[experiment])} missing gauges.')

Experiment kfold_splits has 0 missing gauges.


In [19]:
metrics = metrics_utils.load_metrics_df(
    filepath=working_path / experiment / 'precision' / f'{gauges[0]}.csv')
metrics

Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7
1.01,0.0,0.202532,,,,,,,
1.01,1.0,0.493671,,,,,,,
1.01,2.0,0.556962,,,,,,,
2.0,0.0,0.424242,,,,,,,
2.0,1.0,0.545455,,,,,,,
2.0,2.0,0.545455,,,,,,,
5.0,0.0,0.538462,,,,,,,
5.0,1.0,0.615385,,,,,,,
5.0,2.0,0.615385,,,,,,,
10.0,0.0,0.166667,,,,,,,


## Delete Variables to Clear Memory

In [None]:
del google_model_runs

# GloFAS

In [22]:
# Merge everything into one large xarray.
# This xarray merge takes ... forever ...
glofas_model_runs = xarray.merge(
    [glofas_model_runs, grdc_observation_data.sel(lead_time=0)])

## Metrics: 2014 - Present

In [None]:
working_path = data_paths.GLOFAS_2014_RETURN_PERIOD_METRICS_DIR
experiments = GLOFAS_EXPERIMENTS
gauge_list = gauges
ds_dict = {metrics_utils.GLOFAS_VARIABLE: glofas_model_runs}
evaluation_time_periods = google_validation_time_periods
lead_times = [0]

missing_gauges = return_period_metrics.compute_metrics(
    restart=RESTART,
    working_path=working_path,
    experiments=experiments,
    gauge_list=gauge_list,
    sim_variable=metrics_utils.GLOFAS_VARIABLE,
    obs_variable=metrics_utils.UNNORMALIZED_OBS_VARIABLE,
    ds_dict=ds_dict,
    evaluation_time_periods=evaluation_time_periods,
    lead_times=lead_times
)

In [None]:
for experiment in experiments:
    print(f'Experiment {experiment} has {len(missing_gauges[experiment])} missing gauges.')

In [None]:
metrics = metrics_utils.load_metrics_df(
    filepath=working_path / experiment / 'precision' / f'{gauges[0]}.csv')
metrics

## Metrics: 1980 - Present

In [23]:
working_path = data_paths.GLOFAS_1980_RETURN_PERIOD_METRICS_DIR
experiments = GLOFAS_EXPERIMENTS
gauge_list = gauges
ds_dict = {metrics_utils.GLOFAS_VARIABLE: glofas_model_runs}
evaluation_time_periods = None
lead_times = [0]

missing_gauges = return_period_metrics.compute_metrics(
    restart=RESTART,
    working_path=working_path,
    experiments=experiments,
    gauge_list=gauge_list,
    sim_variable=metrics_utils.GLOFAS_VARIABLE,
    obs_variable=metrics_utils.UNNORMALIZED_OBS_VARIABLE,
    ds_dict=ds_dict,
    evaluation_time_periods=evaluation_time_periods,
    lead_times=lead_times
)

Working on experiment: glofas_prediction ...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4089/4089 [25:50<00:00,  2.64it/s]


In [24]:
for experiment in experiments:
    print(f'Experiment {experiment} has {len(missing_gauges[experiment])} missing gauges.')

Experiment glofas_prediction has 0 missing gauges.


In [25]:
metrics = metrics_utils.load_metrics_df(
    filepath=working_path / experiment / 'precision' / f'{gauges[0]}.csv')
metrics

Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7
1.01,0.0,0.165094,,,,,,,
1.01,1.0,0.372642,,,,,,,
1.01,2.0,0.421801,,,,,,,
2.0,0.0,0.242424,,,,,,,
2.0,1.0,0.363636,,,,,,,
2.0,2.0,0.393939,,,,,,,
5.0,0.0,0.272727,,,,,,,
5.0,1.0,0.454545,,,,,,,
5.0,2.0,0.454545,,,,,,,
10.0,0.0,0.0,,,,,,,


# Collect Return Period Metrics in Pickle Files

In [26]:
_DATASET_RETURN_PERIOD_METRICS_PATH = {
    'google_2014': data_paths.GOOGLE_2014_RETURN_PERIOD_METRICS_DIR,
    'google_1980': data_paths.GOOGLE_1980_RETURN_PERIOD_METRICS_DIR,
    'glofas_2014': data_paths.GLOFAS_2014_RETURN_PERIOD_METRICS_DIR,
    'glofas_1980': data_paths.GLOFAS_1980_RETURN_PERIOD_METRICS_DIR,
}

In [27]:
from backend import data_paths

precisions_by_lead_time = {}
recalls_by_lead_time = {}

precisions_by_return_period = {}
recalls_by_return_period = {}

loading_utils.create_remote_folder_if_necessary(data_paths.CONCATENATED_RETURN_PERIOD_DICTS_DIR)

for dataset, data_path in _DATASET_RETURN_PERIOD_METRICS_PATH.items():

    print(f'Working on {dataset} ...')

    file_path = data_paths.CONCATENATED_RETURN_PERIOD_DICTS_DIR / f'{dataset}_return_period_dicts.pkl'

    if 'glofas' in dataset:
        experiments = GLOFAS_EXPERIMENTS
    else:
        experiments = GOOGLE_EXPERIMENTS

    precisions_by_lead_time[dataset] = evaluation_utils.load_return_period_metrics(
        base_path=data_path,
        experiments=experiments,
        gauges=gauges,
        metric='precision'
    )
    recalls_by_lead_time[dataset] = evaluation_utils.load_return_period_metrics(
        base_path=data_path,
        experiments=experiments,
        gauges=gauges,
        metric='recall'
    )

    with open(file_path, 'wb') as f:
        pkl.dump(
            [
                precisions_by_lead_time[dataset],
                recalls_by_lead_time[dataset],
            ], f
        )

    print(f'Finished with {dataset}. \n')

Working on google_2014 ...
Working on experiment kfold_splits ...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4089/4089 [00:01<00:00, 3046.17it/s]


Working on experiment kfold_splits ...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4089/4089 [00:01<00:00, 3073.56it/s]


Finished with google_2014. 

Working on google_1980 ...
Working on experiment kfold_splits ...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4089/4089 [00:18<00:00, 221.14it/s]


Working on experiment kfold_splits ...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4089/4089 [00:17<00:00, 237.82it/s]


Finished with google_1980. 

Working on glofas_2014 ...
Working on experiment glofas_prediction ...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4089/4089 [00:01<00:00, 3085.14it/s]


Working on experiment glofas_prediction ...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4089/4089 [00:01<00:00, 3081.17it/s]


Finished with glofas_2014. 

Working on glofas_1980 ...
Working on experiment glofas_prediction ...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4089/4089 [00:17<00:00, 236.95it/s]


Working on experiment glofas_prediction ...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4089/4089 [00:17<00:00, 237.54it/s]


Finished with glofas_1980. 

