# Imports

In [1]:
import datetime
import os
import shutil

import numpy as np
import pandas as pd
import tqdm
import xarray

In [3]:
from backend import loading_utils
from backend import data_paths
from backend import gauge_groups_utils
from backend import return_period_metrics  
from backend import metrics_utils
from backend import evaluation_utils

In [4]:
# This flag should be set to True if you are loading GRDC daily streamflow data
# from a file that you got from the GRDC directly.
# It is passed as `load_without_grdc` when the Google model runs are loaded and
# selects whether observations come from `loading_utils.load_grdc_data()` (True)
# or are taken from the model-run datasets themselves (False).
USE_RAW_GRDC_DATA = True

# Metadata

## Gauge Groups

### Full Gauge Group

In [5]:
# Start from the full gauge group provided by `gauge_groups_utils`.
gauges = gauge_groups_utils.get_full_gauge_group()
print(f'There are {len(gauges)} gauges.')

There are 5678 gauges.


### GloFAS v3 Benchmarking Gauge Group

In [6]:
# Load the GloFAS v3 metadata table; its index holds the gauge IDs.
glofas_metadata = loading_utils.load_glofas_metadata_file()
glofas_gauges = list(glofas_metadata.index)

# v3 benchmarking gauges = gauges present in both the full group and GloFAS v3.
benchmarking_v3_gauges = list(
    set(gauges).intersection(glofas_gauges)
)
print(f'There are {len(benchmarking_v3_gauges)} v3 benchmarking gauges.')

There are 778 v3 benchmarking gauges.


### GloFAS v4 Benchmarking Gauge Group

In [7]:
# Load the GloFAS v4 metadata table; its index holds the gauge IDs.
glofas_v4_metadata = loading_utils.load_glofas_v4_metadata_file()
glofas_v4_gauges = list(glofas_v4_metadata.index)

# v4 benchmarking gauges = gauges present in both the full group and GloFAS v4.
benchmarking_v4_gauges = list(
    set(gauges).intersection(glofas_v4_gauges)
)
print(f'There are {len(benchmarking_v4_gauges)} v4 benchmarking gauges.')

There are 1144 v4 benchmarking gauges.


In [8]:
# Restrict the analysis to a hard-coded subset of gauge numbers.
# NOTE: this rebinds `gauges`, replacing the full gauge group loaded earlier.
gauges = [
    f'GRDC_{station_number}'
    for station_number in (
        6984500, 6987050, 5708145, 6984800, 6998400, 6337400,
        4150330, 6984700, 1591408, 4121400, 1837410,
    )
]

# Keep only the benchmarking gauges that belong to the subset.
benchmarking_v3_gauges = list(set(benchmarking_v3_gauges) & set(gauges))
benchmarking_v4_gauges = list(set(benchmarking_v4_gauges) & set(gauges))
gauges

['GRDC_6984500',
 'GRDC_6987050',
 'GRDC_5708145',
 'GRDC_6984800',
 'GRDC_6998400',
 'GRDC_6337400',
 'GRDC_4150330',
 'GRDC_6984700',
 'GRDC_1591408',
 'GRDC_4121400',
 'GRDC_1837410']

## Validation Time Periods

### GloFAS v3

In [9]:
# [Validation_Start, Validation_End] for every v3 benchmarking gauge, read
# straight from the GloFAS v3 metadata table.
glofas_v3_validation_time_periods = {
    gauge: [
        glofas_metadata.loc[gauge, column]
        for column in ('Validation_Start', 'Validation_End')
    ]
    for gauge in benchmarking_v3_gauges
}

# Gauges outside the v3 benchmarking group get no validation period (None).
glofas_v3_validation_time_periods.update(
    dict.fromkeys(
        gauge for gauge in gauges if gauge not in benchmarking_v3_gauges
    )
)

In [10]:
# Ensure that all start and end dates are within the Google model window.
# Each gauge is recorded at most once, even when both its start and its end
# date fall outside the window, so the printed number is a count of gauges.
model_window_start = datetime.datetime(1980, 1, 1)
model_window_end = datetime.datetime(2022, 1, 1)
# Validation dates are parsed as 'month/day/2-digit-year hour:minute' strings.
validation_date_format = '%m/%d/%y %H:%M'

out_of_bounds_gauges = []
for gauge in benchmarking_v3_gauges:
  period_start, period_end = (
      datetime.datetime.strptime(date_string, validation_date_format)
      for date_string in glofas_v3_validation_time_periods[gauge]
  )
  if period_start < model_window_start or period_end > model_window_end:
    out_of_bounds_gauges.append(gauge)

print(f'There are {len(out_of_bounds_gauges)} gauges with dates that are out of bounds.')

There are 0 gauges with dates that are out of bounds.


### GloFAS v4

In [11]:
# Metadata table consulted by `_get_v4_val_period`; rows are looked up by gauge.
# NOTE(review): this is a second call to the loader that already produced
# `glofas_v4_metadata`.
v4meta = loading_utils.load_glofas_v4_metadata_file()

def _get_v4_val_period(gauge, metadata=None):
  """Returns the GloFAS v4 validation period for one gauge.

  The period runs from the gauge's 'Start Obs' date to its
  'Calib_start (Split Date)' date. Both values are stripped of surrounding
  whitespace and parsed as 'day/month/year hour:minute' strings.

  Args:
    gauge: Row label of the gauge in the metadata table.
    metadata: Optional table to read from. Defaults to the notebook-level
      `v4meta` table.

  Returns:
    `[start_date, end_date]` as pandas timestamps, or None when the gauge is
    not in the table or either value is not a parseable date string.
  """
  if metadata is None:
    metadata = v4meta
  date_format = '%d/%m/%Y %H:%M'
  try:
    start_date = pd.to_datetime(
        metadata.loc[gauge, 'Start Obs'].strip(), format=date_format)
    end_date = pd.to_datetime(
        metadata.loc[gauge, 'Calib_start (Split Date)'].strip(),
        format=date_format)
  # Only data problems are treated as "no period". Programming errors (for
  # example an unimported module) must surface instead of turning every
  # gauge's period into None.
  except (KeyError, AttributeError, TypeError, ValueError):
    return None
  return [start_date, end_date]

# Validation period for every v4 benchmarking gauge, followed by a None entry
# for each remaining gauge that is not part of the v4 benchmarking group.
glofas_v4_validation_time_periods = {
    **{
        gauge: _get_v4_val_period(gauge)
        for gauge in benchmarking_v4_gauges
    },
    **dict.fromkeys(
        gauge for gauge in gauges if gauge not in benchmarking_v4_gauges
    ),
}

### Google Model

In [12]:
# Every gauge gets the same fixed [start, end] pair of date strings.
google_validation_time_periods = {
    gauge: ['2014-01-01', '2023-01-01'] for gauge in gauges
}

# Google Model Metrics

## Load Data: Google Model Runs

In [13]:
# Load the Google model runs for the selected gauges. The result is indexed by
# experiment name in the cells below. `load_without_grdc` follows the
# USE_RAW_GRDC_DATA flag.
google_model_runs = loading_utils.load_all_experimental_model_runs(
  gauges=gauges,
  load_without_grdc=USE_RAW_GRDC_DATA
)

Working on experiment: kfold_splits


100%|████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 34.80it/s]


Working on experiment: continent_splits


100%|████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 54.79it/s]


Working on experiment: climate_splits


100%|████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 58.23it/s]


Working on experiment: full_run


100%|████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 58.81it/s]


In [14]:
# Count missing gauges per experiment.
for experiment in google_model_runs:
  # Requested gauges minus the number of `gauge_id` entries that were loaded.
  missing_gauges = len(gauges) - len(google_model_runs[experiment].gauge_id)
  print(f'There are {missing_gauges} (out of {len(gauges)}) missing gauges in experiment {experiment}.')

There are 0 (out of 11) missing gauges in experiemnt kfold_splits.
There are 0 (out of 11) missing gauges in experiemnt continent_splits.
There are 0 (out of 11) missing gauges in experiemnt climate_splits.
There are 0 (out of 11) missing gauges in experiemnt full_run.


## Load Data: GRDC Observation Data

In [15]:
if USE_RAW_GRDC_DATA:
  # Observations come from the GRDC loader rather than from the model runs.
  grdc_observation_data = loading_utils.load_grdc_data()
  # Keep the UNNORMALIZED_OBS_VARIABLE series under the OBS_VARIABLE name; it
  # is merged with the GloFAS runs later in the notebook.
  unnormalized_grdc_observation_data = grdc_observation_data[metrics_utils.UNNORMALIZED_OBS_VARIABLE].rename(metrics_utils.OBS_VARIABLE)
  # Attach the OBS_VARIABLE series to every experiment's dataset.
  for experiment in google_model_runs.keys():
    google_model_runs[experiment] = xarray.merge(
        [google_model_runs[experiment], grdc_observation_data[metrics_utils.OBS_VARIABLE]])
  # The loaded table itself is no longer needed once the pieces above exist.
  del grdc_observation_data

else:
  # Observations are taken from the first experiment's dataset and passed
  # through `unnormalize_observation`.
  experiment = list(google_model_runs.keys())[0]
  unnormalized_grdc_observation_data = loading_utils.unnormalize_observation(
      normalized_discharge=google_model_runs[experiment][metrics_utils.OBS_VARIABLE])
  # Rename to OBS_VARIABLE and keep only lead_time 0.
  unnormalized_grdc_observation_data = unnormalized_grdc_observation_data.rename(metrics_utils.OBS_VARIABLE).sel(lead_time=0) 

## Metrics: 2014 - Present

In [None]:
# RESTART = False

# working_path = data_paths.GOOGLE_2014_RETURN_PERIOD_METRICS_DIR
# experiments = data_paths.EXPERIMENTS
# gauge_list = gauges
# ds_dict = google_model_runs
# evaluation_time_periods = google_validation_time_periods
# lead_times = None

# missing_gauges = return_period_metrics.compute_metrics(
#     restart=RESTART,
#     working_path=working_path,
#     experiments=experiments,
#     gauge_list=gauge_list,
#     sim_variable=metrics_utils.GOOGLE_VARIABLE,
#     ds_dict=ds_dict,
#     evaluation_time_periods=evaluation_time_periods,
#     lead_times=lead_times
# )

# for experiment in experiments:
#   print(f'Experiment {experiment} has {len(missing_gauges[experiment])} missing gauges.')

# metrics = metrics_utils.load_metrics_df(
#     filepath=working_path / experiment / 'precision' / f'{gauges[0]}.csv')
# metrics

## Metrics: 1980 - Present

In [None]:
# RESTART = False

# working_path = data_paths.GOOGLE_1980_RETURN_PERIOD_METRICS_DIR
# experiments = data_paths.EXPERIMENTS
# gauge_list = gauges
# ds_dict = google_model_runs
# evaluation_time_periods = None
# lead_times = [0]

# missing_gauges = return_period_metrics.compute_metrics(
#     restart=RESTART,
#     working_path=working_path,
#     experiments=experiments,
#     gauge_list=gauge_list,
#     sim_variable=metrics_utils.GOOGLE_VARIABLE,
#     ds_dict=ds_dict,
#     evaluation_time_periods=evaluation_time_periods,
#     lead_times=lead_times
# )

# for experiment in experiments:
#   print(f'Experiment {experiment} has {len(missing_gauges[experiment])} missing gauges.')

# metrics = metrics_utils.load_metrics_df(
#     filepath=working_path / experiment / 'precision' / f'{gauges[0]}.csv')
# metrics

## Metrics: GloFAS v3 Validation Period

In [None]:
# RESTART = False

# working_path = data_paths.GOOGLE_v3_PERIOD_RETURN_PERIOD_METRICS_DIR
# experiments = data_paths.EXPERIMENTS
# gauge_list = benchmarking_v3_gauges
# ds_dict = google_model_runs
# evaluation_time_periods = glofas_v3_validation_time_periods
# lead_times = [0]

# missing_gauges = return_period_metrics.compute_metrics(
#     restart=RESTART,
#     working_path=working_path,
#     experiments=experiments,
#     gauge_list=gauge_list,
#     sim_variable=metrics_utils.GOOGLE_VARIABLE,
#     ds_dict=ds_dict,
#     evaluation_time_periods=evaluation_time_periods,
#     lead_times=lead_times
# )

# for experiment in experiments:
#   print(f'Experiment {experiment} has {len(missing_gauges[experiment])} missing gauges.')

# metrics = metrics_utils.load_metrics_df(
#     filepath=working_path / experiment / 'precision' / f'{gauges[0]}.csv')
# metrics

## Metrics: GloFAS v4 Validation Period

In [None]:
# RESTART = False

# working_path = data_paths.GOOGLE_v4_PERIOD_RETURN_PERIOD_METRICS_DIR
# experiments = data_paths.EXPERIMENTS
# gauge_list = benchmarking_v4_gauges
# ds_dict = google_model_runs
# evaluation_time_periods = glofas_v4_validation_time_periods
# lead_times = [0]

# missing_gauges = return_period_metrics.compute_metrics(
#     restart=RESTART,
#     working_path=working_path,
#     experiments=experiments,
#     gauge_list=gauge_list,
#     sim_variable=metrics_utils.GOOGLE_VARIABLE,
#     ds_dict=ds_dict,
#     evaluation_time_periods=evaluation_time_periods,
#     lead_times=lead_times
# )

# for experiment in experiments:
#   print(f'Experiment {experiment} has {len(missing_gauges[experiment])} missing gauges.')

# metrics = metrics_utils.load_metrics_df(
#     filepath=working_path / experiment / 'precision' / f'{gauges[0]}.csv')
# metrics

## Delete Variables to Clear Memory

In [16]:
# No active cell below references the Google model runs; drop the name so the
# datasets can be released.
del google_model_runs

# GloFAS v3 Reanalysis

## Load Data: GloFAS v3 All GRDC Gauges

In [17]:
# Load GloFAS v3 runs for the selected gauges, requesting the reanalysis
# product (`reanalysis=True`) with `all_grdc_gauges=True`.
glofas_v3_reanalysis_all_grdc_gauges = loading_utils.load_glofas_model_runs(
    gauges=gauges,
    reanalysis=True,
    all_grdc_gauges=True
)

100%|████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 35.21it/s]


In [18]:
# Count missing gauges.
# This is the number of requested gauges minus the number of `gauge_id`
# entries in the loaded GloFAS data.
missing_gauges = len(gauges) - len(glofas_v3_reanalysis_all_grdc_gauges.gauge_id)
print(f'There are {missing_gauges} (out of {len(gauges)}) missing gauges in glofas runs.')

There are 2 (out of 11) missing gauges in glofas runs.


In [19]:
# Drop unused coordinates that might contain nans.
# `drop_vars` is the explicit replacement for the deprecated `drop` method.
# NOTE: re-running this cell without reloading raises, because the names are
# already gone.
glofas_v3_reanalysis_all_grdc_gauges = glofas_v3_reanalysis_all_grdc_gauges.drop_vars(
    [
        'surface',
        'latitude',
        'longitude',
        'valid_time',
    ]
)

In [20]:
# Combine the GloFAS v3 reanalysis with the unnormalized GRDC observations in
# a single xarray object (the merge is slow), then keep only the nowcast,
# i.e. lead_time 0, which is all the reanalysis needs.
merged_dataset_v3_reanalysis_all_grdc_gauges = xarray.merge(
    [
        glofas_v3_reanalysis_all_grdc_gauges,
        unnormalized_grdc_observation_data,
    ]
).sel(lead_time=[0])

## Metrics: GloFAS v3 Validation Period

In [None]:
# RESTART = True

# working_path = data_paths.GLOFAS_v3_REANALYSIS_v3_PERIOD_RETURN_PERIOD_METRICS_DIR
# experiments = [loading_utils.GLOFAS_REANALYSIS_VARIABLE_NAME]
# gauge_list = benchmarking_v3_gauges
# ds_dict = {'glofas_reanalysis': merged_dataset_v3_reanalysis_all_grdc_gauges}
# evaluation_time_periods = glofas_v3_validation_time_periods
# lead_times = [0]

# missing_gauges = return_period_metrics.compute_metrics(
#     restart=RESTART,
#     working_path=working_path,
#     experiments=experiments,
#     gauge_list=gauge_list,
#     sim_variable=loading_utils.GLOFAS_REANALYSIS_VARIABLE_NAME,
#     ds_dict=ds_dict,
#     evaluation_time_periods=evaluation_time_periods,
#     lead_times=lead_times
# )

# for experiment in experiments:
#   print(f'Experiment {experiment} has {len(missing_gauges[experiment])} missing gauges.')

# metrics = metrics_utils.load_metrics_df(
#     filepath=working_path / experiment / 'precision' / f'{gauges[0]}.csv')
# metrics

## Metrics: 1980 - Present

In [21]:
# Passed as `restart=` to compute_metrics.
# NOTE(review): presumably discards previously computed results - confirm.
RESTART = True

working_path = data_paths.GLOFAS_v3_REANALYSIS_1980_RETURN_PERIOD_METRICS_DIR
experiments = [loading_utils.GLOFAS_REANALYSIS_VARIABLE_NAME]
gauge_list = gauges
ds_dict = {'glofas_reanalysis': merged_dataset_v3_reanalysis_all_grdc_gauges}
# No per-gauge evaluation periods are supplied for this run.
evaluation_time_periods = None
lead_times = [0]

missing_gauges = return_period_metrics.compute_metrics(
    restart=RESTART,
    working_path=working_path,
    experiments=experiments,
    gauge_list=gauge_list,
    sim_variable=loading_utils.GLOFAS_REANALYSIS_VARIABLE_NAME,
    ds_dict=ds_dict,
    evaluation_time_periods=evaluation_time_periods,
    lead_times=lead_times
)

for experiment in experiments:
  print(f'Experiment {experiment} has {len(missing_gauges[experiment])} missing gauges.')

# Preview the precision metrics of the first processed gauge. The path is built
# from `experiments` and `gauge_list` directly, not from a variable left over
# from the loop above or from a different gauge list.
metrics = metrics_utils.load_metrics_df(
    filepath=working_path / experiments[0] / 'precision' / f'{gauge_list[0]}.csv')
metrics

Working on experiment: glofas_reanalysis ...


100%|████████████████████████████████████████████████████████████████████████████████| 11/11 [00:18<00:00,  1.66s/it]

Experiment glofas_reanalysis has 0 missing gauges.





Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7
1.01,0.0,0.058824,,,,,,,
1.01,1.0,0.235294,,,,,,,
1.01,2.0,0.235294,,,,,,,
2.0,0.0,0.5,,,,,,,
2.0,1.0,0.5,,,,,,,
2.0,2.0,0.5,,,,,,,
5.0,0.0,0.0,,,,,,,
5.0,1.0,0.0,,,,,,,
5.0,2.0,0.333333,,,,,,,
10.0,0.0,0.0,,,,,,,


## Metrics: 2014 - Present

In [None]:
# Passed as `restart=` to compute_metrics.
# NOTE(review): presumably discards previously computed results - confirm.
RESTART = True

working_path = data_paths.GLOFAS_v3_REANALYSIS_2014_RETURN_PERIOD_METRICS_DIR
experiments = [loading_utils.GLOFAS_REANALYSIS_VARIABLE_NAME]
gauge_list = gauges
ds_dict = {'glofas_reanalysis': merged_dataset_v3_reanalysis_all_grdc_gauges}
# Evaluate over the same fixed window that is used for the Google model.
evaluation_time_periods = google_validation_time_periods
lead_times = [0]

missing_gauges = return_period_metrics.compute_metrics(
    restart=RESTART,
    working_path=working_path,
    experiments=experiments,
    gauge_list=gauge_list,
    sim_variable=loading_utils.GLOFAS_REANALYSIS_VARIABLE_NAME,
    ds_dict=ds_dict,
    evaluation_time_periods=evaluation_time_periods,
    lead_times=lead_times
)

for experiment in experiments:
  print(f'Experiment {experiment} has {len(missing_gauges[experiment])} missing gauges.')

# Preview the precision metrics of the first processed gauge. The path is built
# from `experiments` and `gauge_list` directly, not from a variable left over
# from the loop above or from a different gauge list.
metrics = metrics_utils.load_metrics_df(
    filepath=working_path / experiments[0] / 'precision' / f'{gauge_list[0]}.csv')
metrics

# GloFAS v4 Reanalysis

## Load Data: GloFAS v4 Benchmarking Gauges

In [None]:
# Load GloFAS v4 runs for the v4 benchmarking gauges only.
# NOTE(review): unlike the v3 call, `reanalysis=True` is not passed here -
# confirm that the loader's default yields the reanalysis product for v4.
glofas_v4_reanalysis_benchmarking_gauges = loading_utils.load_glofas_model_runs(
    gauges=benchmarking_v4_gauges,
    v4=True,
)

In [None]:
# Count missing gauges.
# This is the number of requested v4 benchmarking gauges minus the number of
# `gauge_id` entries in the loaded GloFAS v4 data.
missing_gauges = len(benchmarking_v4_gauges) - len(glofas_v4_reanalysis_benchmarking_gauges.gauge_id)
print(f'There are {missing_gauges} (out of {len(benchmarking_v4_gauges)}) missing gauges in glofas runs.')

In [None]:
# Drop unused coordinates that might contain nans.
# `drop_vars` is the explicit replacement for the deprecated `drop` method.
# NOTE: re-running this cell without reloading raises, because the names are
# already gone.
glofas_v4_reanalysis_benchmarking_gauges = glofas_v4_reanalysis_benchmarking_gauges.drop_vars(
    [
        'latitude',
        'longitude',
    ]
)

In [None]:
# Combine the GloFAS v4 reanalysis with the unnormalized GRDC observations in
# a single xarray object (the merge is slow), then keep only the nowcast,
# i.e. lead_time 0, which is all the reanalysis needs.
merged_dataset_v4_reanalysis_benchmarking_gauges = xarray.merge(
    [
        glofas_v4_reanalysis_benchmarking_gauges,
        unnormalized_grdc_observation_data,
    ]
).sel(lead_time=[0])

## Metrics: GloFAS v4 Benchmarking Period

In [None]:
# Passed as `restart=` to compute_metrics.
# NOTE(review): presumably discards previously computed results - confirm.
RESTART = True

working_path = data_paths.GLOFAS_v4_REANALYSIS_v4_PERIOD_RETURN_PERIOD_METRICS_DIR
experiments = [loading_utils.GLOFAS_REANALYSIS_VARIABLE_NAME]
gauge_list = benchmarking_v4_gauges
# Use the merged v4 dataset built above. The name referenced here before,
# `merged_dataset_v4_reanalysis_all_grdc_gauges`, is never defined.
ds_dict = {'glofas_reanalysis': merged_dataset_v4_reanalysis_benchmarking_gauges}
evaluation_time_periods = glofas_v4_validation_time_periods
lead_times = [0]

missing_gauges = return_period_metrics.compute_metrics(
    restart=RESTART,
    working_path=working_path,
    experiments=experiments,
    gauge_list=gauge_list,
    sim_variable=loading_utils.GLOFAS_REANALYSIS_VARIABLE_NAME,
    ds_dict=ds_dict,
    evaluation_time_periods=evaluation_time_periods,
    lead_times=lead_times
)

for experiment in experiments:
  print(f'Experiment {experiment} has {len(missing_gauges[experiment])} missing gauges.')

# Preview the precision metrics of the first gauge that was actually processed.
# `gauges[0]` is not guaranteed to be a v4 benchmarking gauge, so the preview
# must index `gauge_list`.
metrics = metrics_utils.load_metrics_df(
    filepath=working_path / experiments[0] / 'precision' / f'{gauge_list[0]}.csv')
metrics