# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import xarray
import pathlib
from glob import glob
from tqdm import tqdm

In [28]:
from backend import data_paths
from backend import loading_utils

# Gauges

In [20]:
with open(data_paths.FULL_GAUGE_GROUP_FILE, 'rt') as f:
    lines = f.readlines()
gauges = [gauge.split('\n')[0].split('_')[1] for gauge in lines]
print(f'There are {len(gauges)} in the gauge group.')

There are 5678 in the gauge group.


# Collect all GRDF Download Files

In [21]:
all_grdc_files = glob(str(data_paths.GRDC_DATA_DOWNLOAD_DIRECTORY / '*.nc'))
all_grdc_files

['/home/gsnearing/data/grdc_data/GRDC-Daily-9.nc',
 '/home/gsnearing/data/grdc_data/GRDC-Daily-2.nc',
 '/home/gsnearing/data/grdc_data/GRDC-Daily-6.nc',
 '/home/gsnearing/data/grdc_data/GRDC-Daily-7.nc',
 '/home/gsnearing/data/grdc_data/GRDC-Daily-3.nc',
 '/home/gsnearing/data/grdc_data/GRDC-Daily-8.nc',
 '/home/gsnearing/data/grdc_data/GRDC-Daily-1.nc',
 '/home/gsnearing/data/grdc_data/GRDC-Daily-5.nc',
 '/home/gsnearing/data/grdc_data/GRDC-Daily-4.nc',
 '/home/gsnearing/data/grdc_data/GRDC-Daily.nc']

In [22]:
xrs = []
number_of_gauges = 0
for f in tqdm(all_grdc_files):
    xr = xarray.open_dataset(f)
    xrs.append(xr)
    number_of_gauges += len(xr.id.values)
print(f'There are {number_of_gauges} gauges in the GRDC downloads.')

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.24it/s]


14220

# Concatenate into a Single Xarray

In [26]:
# concatenate
full_xr = xarray.concat(xrs, dim='id')

# drop any duplicate basins
full_xr = full_xr.drop_duplicates(dim='id', keep='first')

# check for missing gauges
gauges_in_xr = [str(gauge).zfill(7) for gauge in full_xr.id.values]
print(f'There are {len(gauges_in_xr)} gauges in the concatenated xarray.')
print(f'There are {len(set(gauges_in_xr))} unique gauges in the concatenated xarray.')

missing_gauges = [gauge for gauge in gauges if gauge not in gauges_in_xr]
len(f'There are {len(missing_gauges)} gauges missing from the xarray.')

# print missing gauges
print('The missing gauges are:', [int(gauge) for gauge in missing_gauges])

# grab all gauges from xarray that are in the gauge group
present_gauges = list(set(gauges) - set(missing_gauges))
present_gauges = [int(gauge) for gauge in present_gauges]
sliced_xr = full_xr.sel({'id': present_gauges})
print(f'There are {len(sliced_xr.id)} gauges in the final xarray.')

# grab 1980 to present
timeslice = sliced_xr.time.values
sliced_xr = sliced_xr.sel({'time': timeslice[63551:]})
print(f'There time period is {sliced_xr.time.values[0]} to {sliced_xr.time.values[-1]}.')

There are 8550 gauges in the concatenated xarray.
There are 8550 unique gauges in the concatenated xarray.
The missing gauges are: [1159303, 1160440, 1160772, 6503290, 6730500, 6836100, 6836190, 6854100]
There are 5670 gauges in the final xarray.
There time period is 1979-12-31T00:00:00.000000000 to 2023-08-30T00:00:00.000000000.


# Save to NetCDF File

In [29]:
loading_utils.create_remote_folder_if_necessary(data_paths.GRDC_DATA_FILE.parent)
sliced_xr.to_netcdf(data_paths.GRDC_DATA_FILE)