In [1]:
from pathlib import Path
from typing import List, Dict, Union

import os
import pandas as pd
import xarray

from neuralhydrology.datasetzoo.basedataset import BaseDataset
from neuralhydrology.utils.config import Config

In [2]:
# Show the kernel's current working directory.
os.getcwd()

'/home/sngrj0hn/GitHub/neuralhydrology/examples/03-Adding-Datasets'

In [3]:
# Set the working directory to the workspace root.
# The guard keeps this cell idempotent: without it, every re-run of the cell
# would climb two more directory levels and break the relative data paths below.
# NOTE(review): assumes the notebook still lives in a folder named
# '03-Adding-Datasets' two levels below the workspace root -- confirm.
if Path.cwd().name == '03-Adding-Datasets':
    os.chdir('../../')

In [4]:
class CamelsDE(BaseDataset):
    """Skeleton of a dataset class for CAMELS-DE, derived from `BaseDataset`.

    The constructor only forwards its arguments to `BaseDataset`. The two
    loading hooks are placeholders that raise `NotImplementedError`.
    """

    def __init__(self,
                 cfg: Config,
                 is_train: bool,
                 period: str,
                 basin: str = None,
                 additional_features: List[Dict[str, pd.DataFrame]] = [],
                 id_to_int: Dict[str, int] = {},
                 scaler: Dict[str, Union[pd.Series, xarray.DataArray]] = {}):
        # Every argument is handed to the parent constructor unchanged.
        super().__init__(cfg=cfg,
                         is_train=is_train,
                         period=period,
                         basin=basin,
                         additional_features=additional_features,
                         id_to_int=id_to_int,
                         scaler=scaler)

    def _load_basin_data(self, basin: str) -> pd.DataFrame:
        """Placeholder for loading the time series data of a single basin."""
        raise NotImplementedError

    def _load_attributes(self) -> pd.DataFrame:
        """Placeholder for loading the catchment attributes."""
        raise NotImplementedError

### Data loading functions

For all datasets, we implemented the actual data loading (e.g., from the txt or csv files) in separate functions outside of the class so that these functions are usable everywhere. This is useful for example when you want to inspect or visualize the discharge of a particular basin or do anything else with the basin data. These functions are implemented within the same file (since they are specific to each data set) and we use those functions from within the class methods.

So let's start by implementing a function that reads a single basin file of time series data for a given basin identifier.

In [5]:
def load_camels_de_timeseries(data_dir: Path, basin: str) -> pd.DataFrame:
    """Load the time series data of a single CAMELS-DE basin.

    Parameters
    ----------
    data_dir : Path
        Root directory of the dataset. It must contain a folder 'timeseries' holding one file per basin, named
        'CAMELS_DE_hydromet_timeseries_DE{id}.csv'.
    basin : str
        Basin identifier. Both the bare form (e.g. '110000') and the form that already carries the 'DE' prefix
        (e.g. 'DE110000', as used in the index of the attribute files) are accepted.

    Returns
    -------
    pd.DataFrame
        DataFrame indexed by the parsed 'date' column of the basin file.

    Raises
    ------
    FileNotFoundError
        If the 'timeseries' folder or the file of the requested basin does not exist.
    """
    timeseries_dir = Path(data_dir) / "timeseries"

    # make sure the per-basin CAMELS-DE time series files are where we expect them.
    if not timeseries_dir.is_dir():
        raise FileNotFoundError(f"No preprocessed data directory found at {timeseries_dir}.")

    # The file name already contains the 'DE' prefix. Strip it from the identifier if the caller passed the full
    # gauge id, otherwise the lookup would target '..._DEDE110000.csv'.
    # NOTE(review): assumes no bare identifier itself starts with 'DE' -- confirm against the dataset.
    basin_id = basin[2:] if basin.startswith("DE") else basin

    basin_file = timeseries_dir / f"CAMELS_DE_hydromet_timeseries_DE{basin_id}.csv"
    if not basin_file.is_file():
        raise FileNotFoundError(f"No time series file found for basin {basin} at {basin_file}.")

    # load the data for the specific basin into a time-indexed dataframe
    return pd.read_csv(basin_file, index_col='date', parse_dates=['date'])

Most of this should be easy to follow. First we check that the data was already preprocessed and if it wasn't, we throw an appropriate error message. Then we proceed to load the data into a pd.DataFrame and we make sure that the index is converted into a datetime format.

Next, we need a function to load the attributes, which are stored in several files whose names end in `_attributes.csv`. We assume that these files exist in the root directory of the dataset (such information is useful to add to the docstring!). The dataframe that this function has to return must be basin-indexed with attributes as columns. Furthermore, we accept an optional argument `basins`, which is a list of strings. This list can specify basins of interest and, if passed, we only return the attributes for said basins.

In [6]:
def load_camels_de_attributes(data_dir: Path, basins: List[str] = []) -> pd.DataFrame:
    """Load the CAMELS-DE catchment attributes into a single basin-indexed DataFrame.

    Parameters
    ----------
    data_dir : Path
        Root directory of the dataset. All files matching '*_attributes.csv' in this folder are read. Each file
        must contain a 'gauge_id' column, which is read as string and used as index.
    basins : List[str], optional
        If passed, only the attributes of these basins are returned (in the given order). The default list is
        never modified by this function.

    Returns
    -------
    pd.DataFrame
        DataFrame indexed by 'gauge_id' with the columns of all attribute files placed side by side.

    Raises
    ------
    FileNotFoundError
        If `data_dir` does not exist or contains no file matching '*_attributes.csv'.
    ValueError
        If any of the requested `basins` is not present in the attribute files.
    """
    attributes_path = Path(data_dir)

    if not attributes_path.exists():
        raise FileNotFoundError(f"Attribute folder not found at {attributes_path}")

    # glob() yields files in a filesystem-dependent order; sorting keeps the column order reproducible.
    attribute_files = sorted(attributes_path.glob('*_attributes.csv'))
    if not attribute_files:
        # Without this check pd.concat would fail with an unrelated "No objects to concatenate" error.
        raise FileNotFoundError(f"No attribute files ('*_attributes.csv') found in {attributes_path}")

    # Read-in attributes into one big dataframe
    frames = [
        pd.read_csv(attribute_file, sep=',', header=0, dtype={'gauge_id': str}).set_index('gauge_id')
        for attribute_file in attribute_files
    ]
    df = pd.concat(frames, axis=1)

    if basins:
        missing = [b for b in basins if b not in df.index]
        if missing:
            raise ValueError(f'Some basins are missing static attributes. Missing basins: {missing}')
        df = df.loc[basins]

    return df

In [7]:
# Load the attributes of all basins; the resulting frame is displayed as the cell output.
load_camels_de_attributes(data_dir=Path('./data/camels_de'))

Unnamed: 0_level_0,p_mean,p_seasonality,frac_snow,high_prec_freq,high_prec_dur,high_prec_timing,low_prec_freq,low_prec_dur,low_prec_timing,aquitard_perc,...,flow_perc_complete,slope_fdc,hfd_mean,Q5,Q95,high_q_freq,high_q_dur,low_q_freq,low_q_dur,zero_q_freq
gauge_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DE110000,2.97,0.01,0.12,15.23,1.19,djf,202.67,3.71,son,67.06,...,98.072,2.08,151.33,0.28,4.48,3.83,2.07,22.70,6.84,0.00
DE110010,2.87,0.05,0.12,13.30,1.19,jja,178.97,3.72,son,64.11,...,98.240,2.59,145.16,0.00,2.89,37.71,6.27,170.48,31.39,0.39
DE110020,2.54,0.19,0.10,15.22,1.18,jja,212.80,3.74,son,32.41,...,100.000,2.21,161.39,0.19,2.41,1.99,2.45,8.39,6.81,0.00
DE110030,2.45,0.25,0.09,15.23,1.17,jja,213.89,3.74,son,25.30,...,100.000,1.72,166.16,0.26,2.13,0.33,1.92,0.91,9.00,0.00
DE110040,2.61,0.40,0.07,17.16,1.18,jja,223.92,3.75,son,22.35,...,100.000,0.86,178.06,0.55,2.19,0.43,1.11,0.04,1.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DEG10580,2.29,0.09,0.09,15.20,1.17,jja,220.52,4.01,son,46.60,...,100.000,2.01,155.33,0.24,2.90,2.68,1.54,12.42,6.59,0.00
DEG10590,2.21,0.06,0.10,14.65,1.17,jja,216.96,4.01,son,54.13,...,100.000,2.07,157.14,0.21,2.52,1.62,2.87,8.35,6.55,0.00
DEG10600,1.62,0.15,0.08,17.31,1.17,jja,245.11,4.01,son,30.80,...,84.524,2.71,178.57,0.03,0.79,5.66,3.60,31.28,15.37,0.00
DEG10610,1.80,0.31,0.09,17.32,1.17,jja,242.87,4.01,son,97.87,...,100.000,2.51,168.25,0.07,1.71,16.96,5.37,55.46,6.49,0.00


In [8]:
class CamelsCL(BaseDataset):
    """Dataset class that plugs the CAMELS-DE loading functions into `BaseDataset`.

    NOTE(review): despite the name `CamelsCL`, both hooks below call the
    `load_camels_de_*` functions, i.e. this class reads CAMELS-DE data. The name
    looks like a leftover from the CAMELS-CL tutorial -- confirm whether it should
    be `CamelsDE`.
    """

    def __init__(self,
                 cfg: Config,
                 is_train: bool,
                 period: str,
                 basin: str = None,
                 additional_features: List[Dict[str, pd.DataFrame]] = [],
                 id_to_int: Dict[str, int] = {},
                 scaler: Dict[str, Union[pd.Series, xarray.DataArray]] = {}):
        # Every argument is handed to the parent constructor unchanged.
        super().__init__(cfg=cfg,
                         is_train=is_train,
                         period=period,
                         basin=basin,
                         additional_features=additional_features,
                         id_to_int=id_to_int,
                         scaler=scaler)

    def _load_basin_data(self, basin: str) -> pd.DataFrame:
        """Return the time series of `basin`, read from the configured data directory."""
        data_dir = self.cfg.data_dir
        return load_camels_de_timeseries(data_dir=data_dir, basin=basin)

    def _load_attributes(self) -> pd.DataFrame:
        """Return the catchment attributes of the basins in `self.basins`."""
        data_dir = self.cfg.data_dir
        return load_camels_de_attributes(data_dir, basins=self.basins)

### Integrating the dataset class into NeuralHydrology

With these few lines of code, you are ready to use a new dataset within the NeuralHydrology framework. The only thing missing is to link the new dataset in the `get_dataset()` function, implemented in `neuralhydrology.datasetzoo.__init__.py`. Again, we removed the doc-string for brevity ([here](https://neuralhydrology.readthedocs.io/en/latest/api/neuralhydrology.datasetzoo.html#neuralhydrology.datasetzoo.get_dataset) you can find the documentation), but the code of this function is as simple as this:

In [8]:
from neuralhydrology.datasetzoo.basedataset import BaseDataset
from neuralhydrology.datasetzoo.camelscl import CamelsCL
from neuralhydrology.datasetzoo.camelsgb import CamelsGB
from neuralhydrology.datasetzoo.camelsus import CamelsUS
from neuralhydrology.datasetzoo.camelsde import CamelsDE
from neuralhydrology.datasetzoo.hourlycamelsus import HourlyCamelsUS
from neuralhydrology.utils.config import Config


def get_dataset(cfg: Config,
                is_train: bool,
                period: str,
                basin: str = None,
                additional_features: list = [],
                id_to_int: dict = {},
                scaler: dict = {}) -> BaseDataset:
    """Instantiate the dataset class selected by `cfg.dataset`.

    All arguments are forwarded unchanged to the constructor of the selected
    class. A `NotImplementedError` is raised if `cfg.dataset` matches none of
    the known dataset names.
    """
    # known dataset names and their classes, checked in this order
    known_datasets = (
        ("camels_us", CamelsUS),
        ("camels_gb", CamelsGB),
        ("hourly_camels_us", HourlyCamelsUS),
        ("camels_cl", CamelsCL),
        ("camels_de", CamelsDE),
    )

    # check config argument and select appropriate data set class
    for dataset_name, Dataset in known_datasets:
        if cfg.dataset == dataset_name:
            break
    else:
        raise NotImplementedError(f"No dataset class implemented for dataset {cfg.dataset}")

    # initialize dataset
    return Dataset(cfg=cfg,
                   is_train=is_train,
                   period=period,
                   basin=basin,
                   additional_features=additional_features,
                   id_to_int=id_to_int,
                   scaler=scaler)

Now, by setting `dataset: camels_de` in the config file, you are able to train a model on the CAMELS-DE data set. 

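The available time series features are listed below (note: these names appear to be carried over from the CAMELS-CL tutorial; check the column names of the CAMELS-DE time series files for the names that actually apply):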
- tmax_cr2met
- precip_mswep
- streamflow_m3s
- tmin_cr2met
- pet_8d_modis
- precip_chirps
- pet_hargreaves
- streamflow_mm
- precip_cr2met
- swe
- tmean_cr2met
- precip_tmpa

For a list of available attributes, look at the `*_attributes.csv` files in the dataset's root directory or make use of the above implemented function to load the attributes into a pd.DataFrame.