# Imports

In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
import pickle as pkl
import xarray

In [30]:
from backend import data_paths
from backend import evaluation_utils
from backend import gauge_groups_utils
from backend import loading_utils
from backend import metrics_utils
from backend import return_period_metrics

In [8]:
# Forwarded as `restart=` to every `return_period_metrics.compute_metrics` call
# in this notebook.
# NOTE(review): presumably True recomputes metrics from scratch instead of
# reusing files already present in the working path -- confirm.
RESTART = True

# Load Data

## Experiments

In [None]:
# Experiment names for each model. These lists are passed as `experiments=` to
# `compute_metrics` and `load_return_period_metrics`, and each name is also used
# as a sub-directory of the metrics working path when results are read back.
GOOGLE_EXPERIMENTS = ['kfold_splits']
# GloFAS has a single "experiment", named after its simulation variable so that
# it matches the key of the `ds_dict` built in the GloFAS metrics cells.
GLOFAS_EXPERIMENTS = [metrics_utils.GLOFAS_VARIABLE]

## Model Data

### GloFAS Data

In [None]:
# Full gauge group; used to request model runs before the overlap between the
# two models is known.
all_gauges = gauge_groups_utils.get_full_gauge_group()
# Debugging aid: uncomment to work on a small slice of the gauges.
# all_gauges = all_gauges[50:100]
print(f'There are {len(all_gauges)} gauges.')

In [None]:
# Load GloFAS runs for the full gauge group. The `gauge_id` coordinate of the
# result is used further down to find the gauges shared with the Google model.
glofas_model_runs = loading_utils.load_glofas_model_runs(gauges=all_gauges)

### Google Data

In [None]:
# Load the Google model runs for every experiment, requesting the full gauge
# group. `gauges` (the GloFAS/Google overlap) cannot be used here: it is only
# computed afterwards, from the result of this very call, so referencing it
# raises NameError on a fresh kernel.
google_model_runs = loading_utils.load_all_experimental_model_runs(
    gauges=all_gauges,
    experiments=GOOGLE_EXPERIMENTS
)

### GRDC Data

In [None]:
# GRDC observation data. It is merged into the model-run datasets further down
# so that simulated and observed values sit in the same xarray object.
grdc_observation_data = loading_utils.load_grdc_data()

## Overlapping Gauge Groups

In [33]:
# Gauges present in both the GloFAS runs and the first Google experiment.
glofas_gauges = set(glofas_model_runs.gauge_id.values)
google_gauges = set(google_model_runs[GOOGLE_EXPERIMENTS[0]].gauge_id.values)
# Sort the intersection: set iteration order is not guaranteed and can change
# between kernel sessions, and later cells index `gauges[0]` to pick a sample
# file. A sorted list keeps the notebook reproducible under Restart & Run All.
gauges = sorted(glofas_gauges & google_gauges)
print(f'There are {len(gauges)} gauges that exist for both models.')

Working on experiment: kfold_splits


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4089/4089 [01:21<00:00, 50.45it/s]


AttributeError: 'set' object has no attribute 'intersect'

# Time Periods

In [35]:
# Every gauge gets its own copy of the same two-date window.
# NOTE(review): presumably [start, end] of the evaluation period -- confirm
# against `compute_metrics`.
google_validation_time_periods = dict(
    (gauge, ['2014-01-01', '2023-01-01']) for gauge in gauges
)

# Google Model Metrics

In [36]:
# Count, per experiment, the requested gauges that are absent from its model runs.
expected_gauges = set(gauges)  # Loop-invariant, so build the set only once.
for experiment in google_model_runs:
    missing_gauges = expected_gauges - set(google_model_runs[experiment].gauge_id.values)
    print(f'There are {len(missing_gauges)} (out of {len(gauges)}) missing gauges in experiment {experiment}.')

There are 0 (out of 4089) missing gauges in experiemnt kfold_splits.


In [38]:
# Merge the GRDC observation data into each experiment's model-run dataset, so
# that simulations and observations live in one xarray object per experiment.
# Only existing keys are reassigned, so iterating over the dict itself is safe.
for experiment in google_model_runs:
    google_model_runs[experiment] = xarray.merge(
        [google_model_runs[experiment], grdc_observation_data]
    )

## Metrics: 2014 - Present

In [None]:
# Return period metrics for the Google experiments, evaluated over the per-gauge
# window defined in `google_validation_time_periods`.
# `working_path` and `experiments` are notebook-level names that the following
# cells reuse to report on and read back the results.
working_path = data_paths.GOOGLE_2014_RETURN_PERIOD_METRICS_DIR
experiments = GOOGLE_EXPERIMENTS
gauge_list = gauges
ds_dict = google_model_runs
evaluation_time_periods = google_validation_time_periods
# NOTE(review): the other three metrics cells pass `[0]`. What `None` selects
# is decided inside `compute_metrics` -- presumably all lead times; confirm.
lead_times = None

# The result is indexed by experiment name in the next cell.
# NOTE(review): presumably per-gauge CSVs are written under
# `working_path / <experiment> / <metric>/`, since that is where later cells
# read them from -- confirm in `compute_metrics`.
missing_gauges = return_period_metrics.compute_metrics(
    restart=RESTART,
    working_path=working_path,
    experiments=experiments,
    gauge_list=gauge_list,
    sim_variable=metrics_utils.GOOGLE_VARIABLE,
    obs_variable=metrics_utils.OBS_VARIABLE,
    ds_dict=ds_dict,
    evaluation_time_periods=evaluation_time_periods,
    lead_times=lead_times
)

In [None]:
# Report how many gauges each experiment is missing.
for experiment in experiments:
    missing_count = len(missing_gauges[experiment])
    print(f'Experiment {experiment} has {missing_count} missing gauges.')

In [None]:
# Spot-check the results: load the precision metrics of the first gauge for the
# last experiment. The experiment is taken from `experiments` explicitly rather
# than from a loop variable left behind by another cell, which would silently
# point at the wrong directory if that cell was skipped or run out of order.
metrics = metrics_utils.load_metrics_df(
    filepath=working_path / experiments[-1] / 'precision' / f'{gauges[0]}.csv')
metrics

## Metrics: 1980 - Present

In [None]:
# Return period metrics for the Google experiments with no explicit evaluation
# window and lead time 0 only.
# `working_path` and `experiments` are notebook-level names that the following
# cells reuse to report on and read back the results.
working_path = data_paths.GOOGLE_1980_RETURN_PERIOD_METRICS_DIR
experiments = GOOGLE_EXPERIMENTS
gauge_list = gauges
ds_dict = google_model_runs
# NOTE(review): going by the section title, `None` presumably means "use the
# whole available record" -- confirm in `compute_metrics`.
evaluation_time_periods = None
lead_times = [0]

# The result is indexed by experiment name in the next cell.
missing_gauges = return_period_metrics.compute_metrics(
    restart=RESTART,
    working_path=working_path,
    experiments=experiments,
    gauge_list=gauge_list,
    sim_variable=metrics_utils.GOOGLE_VARIABLE,
    obs_variable=metrics_utils.OBS_VARIABLE,
    ds_dict=ds_dict,
    evaluation_time_periods=evaluation_time_periods,
    lead_times=lead_times
)

Working on experiment: kfold_splits ...


 23%|█████████████████████████████████████████████████▉                                                                                                                                                                    | 955/4089 [06:18<20:22,  2.56it/s]

In [None]:
# Report how many gauges each experiment is missing.
for experiment in experiments:
    missing_count = len(missing_gauges[experiment])
    print(f'Experiment {experiment} has {missing_count} missing gauges.')

In [None]:
# Spot-check the results: load the precision metrics of the first gauge for the
# last experiment. The experiment is taken from `experiments` explicitly rather
# than from a loop variable left behind by another cell, which would silently
# point at the wrong directory if that cell was skipped or run out of order.
metrics = metrics_utils.load_metrics_df(
    filepath=working_path / experiments[-1] / 'precision' / f'{gauges[0]}.csv')
metrics

## Delete Variables to Clear Memory

In [None]:
# Release the Google model runs. `ds_dict` from the Google metrics cells is an
# alias of the same dict, so it has to be rebound as well -- otherwise `del`
# only removes one name and the datasets stay in memory.
del google_model_runs
ds_dict = None

# GloFAS

## Load Data: GloFAS

In [18]:
# Reload the GloFAS runs, this time restricted to `gauges` (the gauges that
# exist for both models) rather than the full gauge group.
glofas_model_runs = loading_utils.load_glofas_model_runs(gauges=gauges)

In [None]:
# Count missing gauges.
# NOTE(review): here `missing_gauges` is an integer count, whereas the metrics
# cells bind the same name to a per-experiment collection; it is overwritten
# again by the next `compute_metrics` call.
missing_gauges = len(gauges) - len(glofas_model_runs.gauge_id)
print(f'There are {missing_gauges} (out of {len(gauges)}) missing gauges in glofas runs.')

# Unique gauge ids present in the GloFAS runs. This rebinds `glofas_gauges`,
# which was a set earlier in the notebook, to a list.
glofas_gauges = list(set(glofas_model_runs.gauge_id.values))

In [None]:
# Merge everything into one large xarray.
# This xarray merge takes ... forever ...
# Only the `lead_time=0` selection of the observation data is merged in.
# NOTE(review): this rebinds `glofas_model_runs`, so re-running the cell merges
# into the already-merged dataset -- reload first if a clean state is needed.
glofas_model_runs = xarray.merge(
    [glofas_model_runs, grdc_observation_data.sel(lead_time=0)])

## Metrics: 2014 - Present

In [None]:
# Return period metrics for GloFAS, evaluated over the same per-gauge window
# that is used for the Google model (`google_validation_time_periods`).
# `working_path` and `experiments` are notebook-level names that the following
# cells reuse to report on and read back the results.
working_path = data_paths.GLOFAS_2014_RETURN_PERIOD_METRICS_DIR
experiments = GLOFAS_EXPERIMENTS
gauge_list = gauges
# Single-entry mapping whose key is the one name listed in GLOFAS_EXPERIMENTS.
ds_dict = {metrics_utils.GLOFAS_VARIABLE: glofas_model_runs}
evaluation_time_periods = google_validation_time_periods
lead_times = [0]

# Unlike the Google cells (OBS_VARIABLE), GloFAS is compared against
# UNNORMALIZED_OBS_VARIABLE.
# The result is indexed by experiment name in the next cell.
missing_gauges = return_period_metrics.compute_metrics(
    restart=RESTART,
    working_path=working_path,
    experiments=experiments,
    gauge_list=gauge_list,
    sim_variable=metrics_utils.GLOFAS_VARIABLE,
    obs_variable=metrics_utils.UNNORMALIZED_OBS_VARIABLE,
    ds_dict=ds_dict,
    evaluation_time_periods=evaluation_time_periods,
    lead_times=lead_times
)

In [None]:
# Report how many gauges each experiment is missing.
for experiment in experiments:
    missing_count = len(missing_gauges[experiment])
    print(f'Experiment {experiment} has {missing_count} missing gauges.')

In [None]:
# Spot-check the results: load the precision metrics of the first gauge for the
# last experiment. The experiment is taken from `experiments` explicitly rather
# than from a loop variable left behind by another cell, which would silently
# point at the wrong directory if that cell was skipped or run out of order.
metrics = metrics_utils.load_metrics_df(
    filepath=working_path / experiments[-1] / 'precision' / f'{gauges[0]}.csv')
metrics

## Metrics: 1980 - Present

In [None]:
# Return period metrics for GloFAS with no explicit evaluation window and lead
# time 0 only.
# `working_path` and `experiments` are notebook-level names that the following
# cells reuse to report on and read back the results.
working_path = data_paths.GLOFAS_1980_RETURN_PERIOD_METRICS_DIR
experiments = GLOFAS_EXPERIMENTS
gauge_list = gauges
# Single-entry mapping whose key is the one name listed in GLOFAS_EXPERIMENTS.
ds_dict = {metrics_utils.GLOFAS_VARIABLE: glofas_model_runs}
# NOTE(review): going by the section title, `None` presumably means "use the
# whole available record" -- confirm in `compute_metrics`.
evaluation_time_periods = None
lead_times = [0]

# Unlike the Google cells (OBS_VARIABLE), GloFAS is compared against
# UNNORMALIZED_OBS_VARIABLE.
# The result is indexed by experiment name in the next cell.
missing_gauges = return_period_metrics.compute_metrics(
    restart=RESTART,
    working_path=working_path,
    experiments=experiments,
    gauge_list=gauge_list,
    sim_variable=metrics_utils.GLOFAS_VARIABLE,
    obs_variable=metrics_utils.UNNORMALIZED_OBS_VARIABLE,
    ds_dict=ds_dict,
    evaluation_time_periods=evaluation_time_periods,
    lead_times=lead_times
)

In [None]:
# Report how many gauges each experiment is missing.
for experiment in experiments:
    missing_count = len(missing_gauges[experiment])
    print(f'Experiment {experiment} has {missing_count} missing gauges.')

In [None]:
# Spot-check the results: load the precision metrics of the first gauge for the
# last experiment. The experiment is taken from `experiments` explicitly rather
# than from a loop variable left behind by another cell, which would silently
# point at the wrong directory if that cell was skipped or run out of order.
metrics = metrics_utils.load_metrics_df(
    filepath=working_path / experiments[-1] / 'precision' / f'{gauges[0]}.csv')
metrics

# Collect Return Period Metrics in Pickle Files

In [None]:
# Maps a dataset label to the directory holding its return period metrics
# (the same directories the metrics cells use as `working_path`).
# The collection cell keys its outputs on the label and uses the 'glofas'
# substring of the label to choose which experiment list applies.
_DATASET_RETURN_PERIOD_METRICS_PATH = {
    'google_2014': data_paths.GOOGLE_2014_RETURN_PERIOD_METRICS_DIR,
    'google_1980': data_paths.GOOGLE_1980_RETURN_PERIOD_METRICS_DIR,
    'glofas_2014': data_paths.GLOFAS_2014_RETURN_PERIOD_METRICS_DIR,
    'glofas_1980': data_paths.GLOFAS_1980_RETURN_PERIOD_METRICS_DIR,
}

In [None]:
# Collect the per-gauge precision and recall metrics of every dataset and store
# them as one pickle file per dataset.
# `data_paths` comes from the imports cell at the top of the notebook; it is
# not re-imported here so that all dependencies stay declared in one place.

# Loaded metrics, keyed by dataset label.
precisions_by_lead_time = {}
recalls_by_lead_time = {}

# NOTE(review): initialised here but never filled in by this cell.
precisions_by_return_period = {}
recalls_by_return_period = {}

loading_utils.create_remote_folder_if_necessary(data_paths.CONCATENATED_RETURN_PERIOD_DICTS_DIR)

for dataset, data_path in _DATASET_RETURN_PERIOD_METRICS_PATH.items():

    print(f'Working on {dataset} ...')

    file_path = data_paths.CONCATENATED_RETURN_PERIOD_DICTS_DIR / f'{dataset}_return_period_dicts.pkl'

    # The dataset label decides which experiment list applies.
    experiments = GLOFAS_EXPERIMENTS if 'glofas' in dataset else GOOGLE_EXPERIMENTS

    precisions_by_lead_time[dataset] = evaluation_utils.load_return_period_metrics(
        base_path=data_path,
        experiments=experiments,
        gauges=gauges,
        metric='precision'
    )
    recalls_by_lead_time[dataset] = evaluation_utils.load_return_period_metrics(
        base_path=data_path,
        experiments=experiments,
        gauges=gauges,
        metric='recall'
    )

    # The pickle holds a two-element list: [precisions, recalls].
    with open(file_path, 'wb') as f:
        pkl.dump(
            [
                precisions_by_lead_time[dataset],
                recalls_by_lead_time[dataset],
            ], f
        )

    print(f'Finished with {dataset}. \n')