# Setting Up

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import xarray as xr
import xesmf as xe
import networkx as nx

import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt

from shapely.geometry import Point
from shapely.geometry import Polygon

import glob
import os
import itertools
import tqdm
import gc
import time
import pickle

from joblib import Parallel, delayed

In [2]:
import configparser

# Global project configuration (INI file). Keys are kept case-sensitive by
# replacing optionxform, so entries such as 'RawData' or 'GPM' keep their spelling.
CONFIG_FILE = '/data/sarth/rootdir/assets/global.ini'
config_parser = configparser.ConfigParser()
config_parser.optionxform = str
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files actually parsed; fail early with a clear message instead of a later
# KeyError on 'PATHS'.
if not config_parser.read(CONFIG_FILE):
    raise FileNotFoundError(f"Could not read configuration file: {CONFIG_FILE}")
# Plain nested dict: {section name: {option: value}}
cfg = {s: dict(config_parser.items(s)) for s in config_parser.sections()}
PATHS = cfg['PATHS']

In [3]:
# Folder (below PATHS['devp_datasets']) that holds this dataset's graph files and features
DIRNAME = '03min_GloFAS_CAMELS-US'
SAVE_PATH = os.path.join(PATHS['devp_datasets'], DIRNAME)
# Grid spacing in degrees, used below to size per-node windows/neighbourhoods
# (presumably 3 arc-minutes = 0.05 degrees, matching DIRNAME - confirm)
resolution = 0.05
def lon_360_180(x):
    """Wrap a longitude given in the 0..360 convention into -180..180."""
    return (x + 180) % 360 - 180


def lon_180_360(x):
    """Wrap a longitude given in the -180..180 convention into 0..360."""
    return x % 360
# Bounding box (degrees; x = longitude, y = latitude) used to crop every gridded dataset below
region_bounds = dict(minx=-130, miny=20, maxx=-65, maxy=50)

# Load Watershed Attributes

In [4]:
# Per-gauge graph attributes, indexed by gauge id (first CSV column)
attributes_csv = os.path.join(SAVE_PATH, 'graph_attributes.csv')
camels_attributes_graph = pd.read_csv(attributes_csv, index_col=0)
# Identifiers are used as zero-padded strings (8 characters for the gauge id,
# 2 for the HUC-02 code) when building folder names further down
camels_attributes_graph.index = camels_attributes_graph.index.astype(str).str.zfill(8)
camels_attributes_graph['huc_02'] = camels_attributes_graph['huc_02'].astype(str).str.zfill(2)
camels_attributes_graph

Unnamed: 0_level_0,huc_02,gauge_lon,gauge_lat,area_geospa_fabric,snapped_lon,snapped_lat,snapped_uparea,snapped_iou,area_percent_difference,num_nodes,num_edges
gauge_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
06452000,10,-99.55649,43.74833,25817.78,-99.525,43.775,26081.820000,0.942874,1.022710,1158.0,1157.0
13340000,17,-116.25750,46.47833,14270.76,-116.275,46.475,14113.031000,0.940266,1.105257,657.0,656.0
06447000,10,-101.52487,43.75250,12869.46,-101.575,43.725,12933.430000,0.929852,0.497066,573.0,572.0
06360500,10,-100.84292,45.25582,12601.47,-100.875,45.275,12741.733000,0.926271,1.113074,584.0,583.0
06354000,10,-100.93444,46.37611,10626.74,-100.925,46.375,10708.444000,0.919406,0.768854,500.0,499.0
...,...,...,...,...,...,...,...,...,...,...,...
14303200,17,-123.54650,45.32428,8.07,-123.525,45.325,21.783020,0.316890,169.925900,1.0,0.0
10336740,16,-119.93546,39.06658,7.94,-119.925,39.075,24.017084,0.186934,202.482150,1.0,0.0
01466500,02,-74.50528,39.88500,6.25,-74.525,39.875,23.746838,0.034301,279.949400,1.0,0.0
01594950,02,-79.39031,39.27669,6.10,-79.425,39.275,23.949966,0.206761,292.622400,1.0,0.0


In [5]:
# Keep only gauges whose 'area_percent_difference' is below 10 ...
camels_graph = camels_attributes_graph.loc[lambda df: df['area_percent_difference'] < 10].copy()
print(camels_graph.shape)
# ... and whose graph has more than one node
camels_graph = camels_graph.loc[lambda df: df['num_nodes'] > 1]
print(camels_graph.shape)
# Number of retained graphs per 'huc_02' (groupby returns the groups in sorted key order)
camels_graph.groupby('huc_02').size()

(404, 11)
(395, 11)


huc_02
01    12
02    39
03    45
04    16
05    26
06    10
07    26
08    10
09     5
10    52
11    26
12    28
13     4
14     5
15    12
16     6
17    52
18    21
dtype: int64

In [6]:
# Summary statistics of the 'area_geospa_fabric' column for the retained gauges
camels_graph['area_geospa_fabric'].describe()

count      395.000000
mean      1158.826506
std       2083.937516
min         43.880000
25%        270.385000
50%        564.960000
75%       1202.835000
max      25817.780000
Name: area_geospa_fabric, dtype: float64

In [7]:
# Drop the unfiltered attribute table; the cells below work from camels_graph
del camels_attributes_graph

# Create Node Features as CSV

In [8]:
# Root folder under which the per-gauge feature CSVs are written by the cells below
os.makedirs(os.path.join(SAVE_PATH, "graph_features"), exist_ok = True)

In [9]:
# 'ldd' variable of the GloFAS_03min file, cropped to the study region.
# The latitude slice runs from maxy to miny, i.e. it assumes a north-to-south latitude axis.
ldd_path = os.path.join(PATHS['gis_ldd'], 'GloFAS_03min', 'ldd.nc')
ldd = xr.open_dataset(ldd_path)['ldd'].sel(
    lat = slice(region_bounds['maxy'], region_bounds['miny']),
    lon = slice(region_bounds['minx'], region_bounds['maxx'])
)

lats, lons = ldd['lat'].values, ldd['lon'].values

# Target grid handed to the regridders below: only the lat/lon axes of the cropped raster
ds_grid = xr.Dataset({'lat': (['lat'], lats), 'lon': (['lon'], lons)})

# Coordinates are rounded to 3 decimals on the target grid (lats/lons themselves stay unrounded)
ds_grid['lat'] = ds_grid['lat'].round(3)
ds_grid['lon'] = ds_grid['lon'].round(3)

## GPM

In [10]:
var_names = ['precipitation']

# Daily calendar 1980-2020 with every 29 February removed
dates = pd.date_range('1980-01-01', '2020-12-31', freq='D')
leap_day_mask = (dates.month == 2) & (dates.day == 29)
dates = dates[~leap_day_mask]
print(f"Number of dates: {len(dates)}")

def process(idx, row, var_name):
    """Write an empty dates-by-nodes CSV template for one gauge and one GPM variable.

    idx is unused; row is a camels_graph row (row.name is the gauge id and
    row['huc_02'] the HUC code); var_name becomes the CSV file name.
    Any existing file with the same name is overwritten.
    """
    huc, gauge_id = row['huc_02'], row.name
    nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
    # makedirs creates the intermediate 'dynamic' folder as well
    out_dir = os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GPM')
    os.makedirs(out_dir, exist_ok = True)
    template = pd.DataFrame(index = dates, columns = nodes_coords.index)
    template.to_csv(os.path.join(out_dir, f"{var_name}.csv"))

for var_name in var_names:
    print(var_name)
    with Parallel(n_jobs = 8, verbose = 0) as parallel:
        gauge_rows = tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph))
        _ = parallel(delayed(process)(idx, row, var_name) for idx, row in gauge_rows)

Number of dates: 14965
precipitation


100%|██████████| 395/395 [00:04<00:00, 88.85it/s] 


In [None]:
var_names = ['precipitation']
# islice(var_names, 0, None, 1) walks the whole list; raising the start index skips leading variables
for var_name in itertools.islice(var_names, 0, None, 1):
    print(var_name)
    ds = xr.open_zarr(os.path.join(PATHS['GPM'], 'GPM_1998_2020.zarr'), consolidated=True)
    ds_var_name = list(ds.data_vars)[0]
    ds = ds[ds_var_name]
    # Drop 29 February so the time axis follows the same no-leap calendar as the CSV templates
    gpm_leap_days = (ds['time.month'] == 2) & (ds['time.day'] == 29)
    ds = ds.sel(time=~gpm_leap_days)
    ds = ds.sel(
        lat = slice(region_bounds['miny'], region_bounds['maxy']),
        lon = slice(region_bounds['minx'], region_bounds['maxx'])
    )

    # Insert the one missing day as an all-NaN time step (it is filled in a later cell),
    # then re-chunk to one full lat/lon field per time step
    missing_date = np.datetime64('1999-09-03')
    new_times = np.sort(np.append(ds.time.values, missing_date))
    ds = (
        ds.reindex(time=new_times)
        .chunk({'time': -1})
        .sortby('time')
        .chunk({'time': 1, 'lat': -1, 'lon': -1})
    )

    # Reuse the stored regridding weights when present, otherwise compute and store them
    weights_path = os.path.join(PATHS['Assets'], 'regridder', 'regridder_gpm_to_glofas_03min.nc')
    if os.path.exists(weights_path):
        regridder = xe.Regridder(ds, ds_grid, 'bilinear', reuse_weights=True, filename = weights_path)
    else:
        regridder = xe.Regridder(ds, ds_grid, 'bilinear', reuse_weights=False)
        regridder.to_netcdf(weights_path)

    ds_regrided = regridder(ds)
    ds.close()

    # Work through the record in 5-year windows (the last one is capped at 2020)
    # so that only one window is loaded at a time
    for start_year in range(1998, 2020+1, 5):
        start_date = f"{start_year}-01-01"
        end_date = f"{min(start_year+4,2020)}-12-31"
        ds_window = ds_regrided.sel(time = slice(start_date, end_date)).copy()
        start_time = time.time()
        ds_window.load()
        end_time = time.time()
        print(start_date, end_date, f"Time: {(end_time - start_time)/60:.2f} mins")

        def process(idx, row):
            """Copy the current window of regridded values into one gauge's per-node CSV.

            Reads ds_window, start_date, end_date and var_name from the enclosing cell.
            """
            huc, gauge_id = row['huc_02'], row.name
            nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
            csv_path = os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GPM', f"{var_name}.csv")
            data = pd.read_csv(csv_path, index_col = 0, parse_dates = True)
            for node_idx, node_row in nodes_coords.iterrows():
                lat, lon = node_row['lat'], node_row['lon']
                # Series of the grid cell nearest to the node, written into the rows of this window only
                node_series = ds_window.sel(lat = lat, lon = lon, method = 'nearest')
                data.loc[start_date:end_date, str(node_idx)] = node_series.values
            data.to_csv(csv_path)

        with Parallel(n_jobs = 8, verbose = 0) as parallel:
            gauge_rows = tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph))
            _ = parallel(delayed(process)(idx, row) for idx, row in gauge_rows)

    # Release the regridded field before moving on to the next variable
    ds_regrided.close()
    del ds_regrided, ds
    gc.collect()

In [28]:
def process(idx, row, var_name = 'precipitation'):
    """Fill the 1999-09-03 gap in one gauge's GPM CSV and write the file back.

    The missing day is replaced, column by column, with the mean of the rows
    from one day before to one day after it. DataFrame.mean skips NaN, so the
    (empty) missing row itself does not contribute and the result is the mean
    of the two neighbouring days.

    Parameters
    ----------
    idx : unused; kept so the function can be called with iterrows() pairs.
    row : camels_graph row; row.name is the gauge id, row['huc_02'] the HUC code.
    var_name : name of the GPM variable, i.e. the CSV file stem. Passed
        explicitly so the function does not depend on a loop variable left
        over from an earlier cell.
    """
    huc, gauge_id = row['huc_02'], row.name
    csv_path = os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GPM', f"{var_name}.csv")
    data = pd.read_csv(csv_path, index_col = 0, parse_dates = True)

    # Centred window of +/- 1 day around the missing date
    missing_date = pd.Timestamp('1999-09-03')
    window = pd.Timedelta(days=1)
    data_window = data.loc[missing_date - window:missing_date + window]
    data.loc[missing_date] = data_window.mean(axis=0)

    data.to_csv(csv_path)

var_names = ['precipitation']
for var_name in var_names:
    with Parallel(n_jobs = 8, verbose = 0) as parallel:
        _ = parallel(delayed(process)(idx, row, var_name) for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)))

100%|██████████| 395/395 [00:34<00:00, 11.30it/s]


In [30]:
var_names = ['precipitation']
for var_name in var_names:
    # Pass 1: collect the gauges whose CSV still contains NaN from 1998-01-01 onwards
    flagged = []
    for idx, row in tqdm.tqdm(camels_graph.iterrows()):
        huc, gauge_id = row['huc_02'], row.name
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        csv_path = os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GPM', f"{var_name}.csv")
        data = pd.read_csv(csv_path, index_col = 0, parse_dates = True).loc['1998-01-01':]
        if data.isnull().values.any():
            flagged.append([huc, gauge_id])
    issues = pd.DataFrame(flagged, columns = ['huc_02', 'gauge_id'])
    print(f"Number of catchments with issues: {issues.shape[0]}")
    print("------")

    # Pass 2: for every flagged gauge, report how many of its nodes contain NaN
    # (this cell only reports; nothing is written back to disk)
    for issue_idx, (huc, gauge_id) in enumerate(issues.values):
        print(issue_idx, huc, gauge_id)
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        csv_path = os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GPM', f"{var_name}.csv")
        data = pd.read_csv(csv_path, index_col = 0, parse_dates = True).loc['1998-01-01':]
        # One flag per node: True when that node's column has at least one NaN
        nodes_coords['isNaN'] = [bool(data[str(node_idx)].isnull().values.any()) for node_idx in nodes_coords.index]
        nodes_coords['nonNaNneighbours'] = 0
        print(f"Number of nodes with NaN values: {nodes_coords['isNaN'].sum()}")
        print("------")
        print("------")

395it [00:20, 18.88it/s]

Number of catchments with issues: 0
------





## ERA5

### Dynamic

In [12]:
# ERA5 variables to process in this run; commented-out entries are skipped
var_names = [
    # '2m_temperature', 
    # 'evaporation', 
    # 'snowfall', 
    # 'surface_net_solar_radiation', 
    # 'surface_net_thermal_radiation', 
    # 'surface_pressure', 
    # 'total_precipitation',
    # '2m_dewpoint_temperature',
    # '10m_u_component_of_wind',
    # '10m_v_component_of_wind',
    # 'forecast_albedo',
    # 'potential_evaporation',
    # 'runoff',
    # 'snow_albedo',
    # 'snow_depth',
    # 'snowmelt',
    # 'sub_surface_runoff',
    # 'surface_runoff',
    # 'total_column_water',
    'volumetric_soil_water_layer_1',
    'volumetric_soil_water_layer_2',
    'volumetric_soil_water_layer_3',
    'volumetric_soil_water_layer_4'
]

# Daily calendar 1980-2020 with every 29 February removed
dates = pd.date_range('1980-01-01', '2020-12-31', freq='D')
leap_day_mask = (dates.month == 2) & (dates.day == 29)
dates = dates[~leap_day_mask]
print(f"Number of dates: {len(dates)}")

def process(idx, row, var_name):
    """Write an empty dates-by-nodes CSV template for one gauge and one ERA5 variable.

    idx is unused; row is a camels_graph row (row.name is the gauge id and
    row['huc_02'] the HUC code); var_name becomes the CSV file name.
    Any existing file with the same name is overwritten.
    """
    huc, gauge_id = row['huc_02'], row.name
    nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
    # makedirs creates the intermediate 'dynamic' folder as well
    out_dir = os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'ERA5')
    os.makedirs(out_dir, exist_ok = True)
    template = pd.DataFrame(index = dates, columns = nodes_coords.index)
    template.to_csv(os.path.join(out_dir, f"{var_name}.csv"))

for var_name in var_names:
    print(var_name)
    with Parallel(n_jobs = 8, verbose = 0) as parallel:
        gauge_rows = tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph))
        _ = parallel(delayed(process)(idx, row, var_name) for idx, row in gauge_rows)

Number of dates: 14965
volumetric_soil_water_layer_1


100%|██████████| 395/395 [00:04<00:00, 89.33it/s] 


volumetric_soil_water_layer_2


100%|██████████| 395/395 [00:03<00:00, 110.12it/s]


volumetric_soil_water_layer_3


100%|██████████| 395/395 [00:03<00:00, 107.39it/s]


volumetric_soil_water_layer_4


100%|██████████| 395/395 [00:03<00:00, 109.64it/s]


In [13]:
# islice(var_names, 0, None, 1) walks the whole list; raising the start index skips leading variables
for var_name in itertools.islice(var_names, 0, None, 1):
    print(var_name)
    # Every *.nc file in the variable's folder, combined along their coordinates
    ds = xr.open_mfdataset(os.path.join(PATHS['RawData'], 'ERA5', var_name, f"*.nc"), combine='by_coords')
    ds_var_name = list(ds.data_vars)[0]
    ds = ds[ds_var_name]
    ds = ds.rename({'longitude': 'lon', 'latitude': 'lat'})
    # Drop 29 February so the time axis follows the same no-leap calendar as the CSV templates
    ds = ds.sel(time=~((ds['time.month'] == 2) & (ds['time.day'] == 29)))
    # Longitudes are wrapped to -180..180 and re-sorted so the regional crop below is contiguous
    ds['lon'] = [lon_360_180(lon) for lon in ds['lon'].values]
    ds = ds.sortby('lon')
    ds = ds.sel(
        lat = slice(region_bounds['maxy'], region_bounds['miny']), 
        lon = slice(region_bounds['minx'], region_bounds['maxx'])
    )
    # Keep the first occurrence of each timestamp (removes duplicated time steps)
    _, index = np.unique(ds['time'], return_index = True)
    ds = ds.isel(time = index)

    # Reuse the stored regridding weights when present, otherwise compute and store them
    weights_path = os.path.join(PATHS['Assets'], 'regridder_era5_to_glofas_03min.nc')
    if os.path.exists(weights_path):
        regridder = xe.Regridder(
            ds, 
            ds_grid, 
            'bilinear', 
            reuse_weights=True, 
            filename = weights_path
        )
    else:
        regridder = xe.Regridder(
            ds, 
            ds_grid, 
            'bilinear', 
            reuse_weights=False
        )
        regridder.to_netcdf(weights_path)
    
    ds_regrided = regridder(ds)
    ds.close()
    start_time = time.time()
    ds_regrided.load()
    end_time = time.time()
    print(f'Time: {((end_time - start_time) / 60):.4f} mins')
    
    def process(idx, row):
        """Copy the regridded series of every node of one gauge into its ERA5 CSV.

        Reads ds_regrided and var_name from the enclosing cell. idx is unused;
        row is a camels_graph row (row.name is the gauge id).
        """
        huc, gauge_id = row['huc_02'], row.name
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        csv_path = os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'ERA5', f"{var_name}.csv")
        data = pd.read_csv(csv_path, index_col = 0, parse_dates = True)
        for node_idx, node_row in nodes_coords.iterrows():
            lat, lon = node_row['lat'], node_row['lon']
            # Full-length series of the grid cell nearest to the node
            ds_window_loc = ds_regrided.sel(lat = lat, lon = lon, method = 'nearest')
            data.loc[:, str(node_idx)] = ds_window_loc.values
        data.to_csv(csv_path)

    with Parallel(n_jobs = 8, verbose = 0) as parallel:
        _ = parallel(delayed(process)(idx, row) for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)))

    # The source was already closed above; what still holds memory here is the
    # fully loaded regridded array, so release it before the next variable
    # (same cleanup as the GPM cell).
    ds_regrided.close()
    del ds_regrided, ds
    gc.collect()

volumetric_soil_water_layer_1
Time: 7.0533 mins


100%|██████████| 395/395 [01:50<00:00,  3.58it/s]


volumetric_soil_water_layer_2
Time: 5.1113 mins


100%|██████████| 395/395 [01:49<00:00,  3.61it/s]


volumetric_soil_water_layer_3
Time: 6.0752 mins


100%|██████████| 395/395 [02:02<00:00,  3.22it/s]


volumetric_soil_water_layer_4
Time: 5.6948 mins


100%|██████████| 395/395 [02:17<00:00,  2.87it/s]


### Static

In [11]:
var_names = [
    'static_soil_type', 
    'static_high_vegetation_cover', 
    'static_low_vegetation_cover', 
    'static_type_of_high_vegetation', 
    'static_type_of_low_vegetation'
    ]
ds_filenames = [
    'soil_type_static.nc',
    'high_vegetation_cover_static.nc',
    'low_vegetation_cover_static.nc',
    'type_of_high_vegetation_static.nc',
    'type_of_low_vegetation_static.nc'
]
# Only the class-code fields are stored as integers. ERA5 documents the
# vegetation-cover fields as fractions in [0, 1]; casting those with int()
# would truncate nearly every value to 0, so they are kept as floats.
categorical_var_names = {
    'static_soil_type',
    'static_type_of_high_vegetation',
    'static_type_of_low_vegetation'
}

for var_name, ds_filename in zip(var_names, ds_filenames):
    print(var_name)
    ds = xr.open_dataset(os.path.join(PATHS['RawData'], 'ERA5', ds_filename))
    ds_var_name = list(ds.data_vars)[0]
    ds = ds[ds_var_name]
    # Static field: keep the first time step and drop the now-scalar time coordinate
    ds = ds.isel(time = 0)
    ds = ds.drop_vars('time')
    ds = ds.rename({'longitude': 'lon', 'latitude': 'lat'})
    # Longitudes are wrapped to -180..180 and re-sorted so the regional crop below is contiguous
    ds['lon'] = [lon_360_180(lon) for lon in ds['lon'].values]
    ds = ds.sortby('lon')
    ds = ds.sel(
        lat = slice(region_bounds['maxy'], region_bounds['miny']), 
        lon = slice(region_bounds['minx'], region_bounds['maxx'])
    )
    def process(idx, row):
        """Write a one-row CSV holding the static ERA5 value of every node of one gauge.

        Each node takes the value of the nearest ERA5 grid cell (no regridding).
        Reads ds and var_name from the enclosing cell. idx is unused; row is a
        camels_graph row (row.name is the gauge id).
        """
        huc, gauge_id = row['huc_02'], row.name
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        data = pd.DataFrame(columns = nodes_coords.index, index = [0])
        for node_idx, node_row in nodes_coords.iterrows():
            lat, lon = node_row['lat'], node_row['lon']
            value = ds.sel(lat = lat, lon = lon, method = 'nearest').values.item()
            data.loc[0, node_idx] = int(value) if var_name in categorical_var_names else float(value)
        out_dir = os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'static', 'ERA5')
        # makedirs creates the intermediate 'static' folder as well
        os.makedirs(out_dir, exist_ok = True)
        data.to_csv(os.path.join(out_dir, f"{var_name}.csv"))

    for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)):
        process(idx, row)

static_soil_type


100%|██████████| 395/395 [00:16<00:00, 23.43it/s] 


static_high_vegetation_cover


100%|██████████| 395/395 [00:16<00:00, 24.46it/s] 


static_low_vegetation_cover


100%|██████████| 395/395 [00:16<00:00, 24.66it/s] 


static_type_of_high_vegetation


100%|██████████| 395/395 [00:16<00:00, 24.15it/s] 


static_type_of_low_vegetation


100%|██████████| 395/395 [00:16<00:00, 24.60it/s] 


## HWSD

In [12]:
var_names = ['S_CLAY', 'S_GRAVEL', 'S_SAND', 'S_SILT', 'T_CLAY', 'T_GRAVEL', 'T_SAND', 'T_SILT']

for var_name in var_names:
    print(var_name)
    ds = xr.open_dataset(os.path.join(PATHS['HWSD'], f'{var_name}.nc4'))
    ds_var_name = list(ds.data_vars)[0]
    # First data variable of the file, cropped to the study region
    # (the latitude slice runs miny -> maxy, i.e. it assumes a south-to-north axis)
    ds = ds[ds_var_name].sel(
        lat = slice(region_bounds['miny'], region_bounds['maxy']),
        lon = slice(region_bounds['minx'], region_bounds['maxx'])
    )
    # Values are divided by 100 (presumably percent -> fraction; confirm against the HWSD files)
    ds = ds / 100
    ds.load()

    def process(idx, row):
        """Write a one-row CSV with the window-mean HWSD value of every node of one gauge.

        For each node, all HWSD cells inside a square of side `resolution`
        centred on the node are averaged with a plain mean.
        Reads ds and var_name from the enclosing cell. idx is unused.
        """
        huc, gauge_id = row['huc_02'], row.name
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        data = pd.DataFrame(columns = nodes_coords.index, index = [0])
        for node_idx, node_row in nodes_coords.iterrows():
            lat, lon = node_row['lat'], node_row['lon']
            node_window = ds.sel(
                lat = slice(lat-resolution/2, lat+resolution/2),
                lon = slice(lon-resolution/2, lon+resolution/2)
            )
            data.loc[0, node_idx] = node_window.values.mean()
        out_dir = os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'static', 'HWSD')
        # makedirs creates the intermediate 'static' folder as well
        os.makedirs(out_dir, exist_ok = True)
        data.to_csv(os.path.join(out_dir, f"{var_name}.csv"))

    for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)):
        process(idx, row)

    # Release the field before loading the next variable
    ds.close()
    del ds
    gc.collect()

S_CLAY


100%|██████████| 395/395 [00:05<00:00, 70.97it/s] 


S_GRAVEL


100%|██████████| 395/395 [00:05<00:00, 71.41it/s] 


S_SAND


100%|██████████| 395/395 [00:05<00:00, 71.37it/s] 


S_SILT


100%|██████████| 395/395 [00:05<00:00, 71.23it/s] 


T_CLAY


100%|██████████| 395/395 [00:05<00:00, 71.11it/s] 


T_GRAVEL


100%|██████████| 395/395 [00:05<00:00, 70.78it/s] 


T_SAND


100%|██████████| 395/395 [00:05<00:00, 68.84it/s] 


T_SILT


100%|██████████| 395/395 [00:05<00:00, 70.81it/s] 


## GLEAM

In [10]:
var_names = ['Ep', 'SMroot', 'SMsurf']

# Daily calendar 1980-2020 with every 29 February removed
dates = pd.date_range('1980-01-01', '2020-12-31', freq='D')
leap_day_mask = (dates.month == 2) & (dates.day == 29)
dates = dates[~leap_day_mask]
print(f"Number of dates: {len(dates)}")

def process(idx, row, var_name):
    """Write an empty dates-by-nodes CSV template for one gauge and one GLEAM variable.

    idx is unused; row is a camels_graph row (row.name is the gauge id and
    row['huc_02'] the HUC code); var_name becomes the CSV file name.
    Any existing file with the same name is overwritten.
    """
    huc, gauge_id = row['huc_02'], row.name
    nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
    # makedirs creates the intermediate 'dynamic' folder as well
    out_dir = os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM')
    os.makedirs(out_dir, exist_ok = True)
    template = pd.DataFrame(index = dates, columns = nodes_coords.index)
    template.to_csv(os.path.join(out_dir, f"{var_name}.csv"))

for var_name in var_names:
    print(var_name)
    with Parallel(n_jobs = 8, verbose = 0) as parallel:
        gauge_rows = tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph))
        _ = parallel(delayed(process)(idx, row, var_name) for idx, row in gauge_rows)

Number of dates: 14965
Ep


100%|██████████| 395/395 [00:04<00:00, 84.44it/s] 


SMroot


100%|██████████| 395/395 [00:03<00:00, 103.03it/s]


SMsurf


100%|██████████| 395/395 [00:03<00:00, 107.87it/s]


In [11]:
# islice(var_names, 0, None, 1) walks the whole list; raising the start index skips leading variables
for var_name in itertools.islice(var_names, 0, None, 1):
    print(var_name)
    # Every *.nc file in the variable's folder, combined along their coordinates
    ds = xr.open_mfdataset(os.path.join(PATHS['GLEAM'], var_name, f"*.nc"), combine='by_coords')
    ds_var_name = list(ds.data_vars)[0]
    ds = ds[ds_var_name]
    # Drop 29 February so the time axis follows the same no-leap calendar as the CSV templates
    ds = ds.sel(time=~((ds['time.month'] == 2) & (ds['time.day'] == 29)))
    ds = ds.sel(
        lat = slice(region_bounds['maxy'], region_bounds['miny']), 
        lon = slice(region_bounds['minx'], region_bounds['maxx'])
    )

    # Reuse the stored regridding weights when present, otherwise compute and store them
    weights_path = os.path.join(PATHS['Assets'], 'regridder_gleam_to_glofas_03min.nc')
    if os.path.exists(weights_path):
        regridder = xe.Regridder(
            ds, 
            ds_grid, 
            'bilinear', 
            reuse_weights=True, 
            filename = weights_path
        )
    else:
        regridder = xe.Regridder(
            ds, 
            ds_grid, 
            'bilinear', 
            reuse_weights=False
        )
        regridder.to_netcdf(weights_path)
    
    ds_regrided = regridder(ds)
    ds.close()
    start_time = time.time()
    ds_regrided.load()
    end_time = time.time()
    print(f'Time: {((end_time - start_time) / 60):.4f} mins')
    
    def process(idx, row):
        """Copy the regridded series of every node of one gauge into its GLEAM CSV.

        Reads ds_regrided and var_name from the enclosing cell. idx is unused;
        row is a camels_graph row (row.name is the gauge id).
        """
        huc, gauge_id = row['huc_02'], row.name
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        csv_path = os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM', f"{var_name}.csv")
        data = pd.read_csv(csv_path, index_col = 0, parse_dates = True)
        for node_idx, node_row in nodes_coords.iterrows():
            lat, lon = node_row['lat'], node_row['lon']
            # Full-length series of the grid cell nearest to the node
            ds_window_loc = ds_regrided.sel(lat = lat, lon = lon, method = 'nearest')
            data.loc[:, str(node_idx)] = ds_window_loc.values
        data.to_csv(csv_path)

    with Parallel(n_jobs = 8, verbose = 0) as parallel:
        _ = parallel(delayed(process)(idx, row) for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)))

    # The source was already closed above; what still holds memory here is the
    # fully loaded regridded array, so release it before the next variable
    # (same cleanup as the GPM cell).
    ds_regrided.close()
    del ds_regrided, ds
    gc.collect()

Ep
Time: 5.1678 mins


100%|██████████| 395/395 [02:11<00:00,  3.01it/s]


SMroot
Time: 2.8857 mins


100%|██████████| 395/395 [02:21<00:00,  2.79it/s]


SMsurf
Time: 2.7060 mins


100%|██████████| 395/395 [03:06<00:00,  2.12it/s]


### Fix NaNs

In [None]:
# For every GLEAM variable: rebuild the regridded field, find the gauges whose
# per-node CSV contains NaN, and fill the affected node series from the cells
# surrounding each node in the regridded field. Nodes that have no usable
# surrounding cell stay NaN and are reported in the printed summary.
var_names = ['Ep', 'SMroot', 'SMsurf']
for var_name in var_names:
    # Same loading / cropping / regridding steps as in the cell that first filled the GLEAM CSVs
    ds = xr.open_mfdataset(os.path.join(PATHS['GLEAM'], var_name, f"*.nc"), combine='by_coords')
    ds_var_name = list(ds.data_vars)[0]
    ds = ds[ds_var_name]
    ds = ds.sel(time=~((ds['time.month'] == 2) & (ds['time.day'] == 29)))
    ds = ds.sel(
        lat = slice(region_bounds['maxy'], region_bounds['miny']), 
        lon = slice(region_bounds['minx'], region_bounds['maxx'])
    )
    if os.path.exists(os.path.join(PATHS['Assets'], 'regridder_gleam_to_glofas_03min.nc')):
        regridder = xe.Regridder(
            ds, 
            ds_grid, 
            'bilinear', 
            reuse_weights=True, 
            filename = os.path.join(PATHS['Assets'], 'regridder_gleam_to_glofas_03min.nc')
        )
    else:
        regridder = xe.Regridder(
            ds, 
            ds_grid, 
            'bilinear', 
            reuse_weights=False
        )
        regridder.to_netcdf(os.path.join(PATHS['Assets'], 'regridder_gleam_to_glofas_03min.nc'))
    ds_regrided = regridder(ds)
    ds.close()
    start_time = time.time()
    # Loaded into memory because it is both sliced and modified in place below
    ds_regrided.load()
    end_time = time.time()
    print(f'{var_name} (Time: {((end_time - start_time) / 60):.4f} mins)')

    # Loop over catchments and find ones with issues
    issues = []
    for idx, row in tqdm.tqdm(camels_graph.iterrows()):
        huc, gauge_id = row['huc_02'], row.name
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        data = pd.read_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM', f"{var_name}.csv"), index_col = 0, parse_dates = True)
        if data.isnull().values.any():
            issues.append([huc, gauge_id])
    issues = pd.DataFrame(issues, columns = ['huc_02', 'gauge_id'])
    print(f"Number of catchments with issues: {issues.shape[0]}")
    print("------")

    # Fix the catchments with issues
    for issue_idx, (huc, gauge_id) in enumerate(issues.values):
        print(issue_idx, huc, gauge_id)
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        data = pd.read_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM', f"{var_name}.csv"), index_col = 0, parse_dates = True)
        nodes_coords['isNaN'] = False
        nodes_coords['nonNaNneighbours'] = 0
        # Loop over nodes and find the nodes with issues
        for node_idx in nodes_coords.index:
            if data[str(node_idx)].isnull().values.any():
                nodes_coords.loc[node_idx, 'isNaN'] = True
                node_lat = float(round(nodes_coords.loc[node_idx, 'lat'], 3))
                node_lon = float(round(nodes_coords.loc[node_idx, 'lon'], 3))
                # Window of +/- 1.5 * resolution around the node; the code further down
                # treats 9 cells as a complete window (presumably a 3x3 block - confirm)
                multiplier = 1.5
                ds_slice = ds_regrided.sel(
                    lat = slice(node_lat+multiplier*resolution, node_lat-multiplier*resolution), 
                    lon = slice(node_lon-multiplier*resolution, node_lon+multiplier*resolution)
                    )
                # Reshape the window to a time-by-cell table, one column per (lat, lon) pair
                slice_df = ds_slice.to_dataframe(name = var_name).reset_index()
                slice_df['lat'] = slice_df['lat'].round(3)
                slice_df['lon'] = slice_df['lon'].round(3)
                slice_df['location'] = list(zip(slice_df['lat'], slice_df['lon']))
                slice_df = slice_df.pivot(index='time', columns='location', values=var_name)
                # A cell counts as non-NaN only if its series has no NaN at any time step
                num_nan_nodes = slice_df.isnull().any(axis=0).sum()
                num_nonnan_nodes = len(slice_df.columns) - num_nan_nodes
                nodes_coords.loc[node_idx, 'nonNaNneighbours'] = num_nonnan_nodes
        # Process the NaN nodes starting with those that have the most complete windows
        nodes_coords_sorted = nodes_coords.sort_values(by = 'nonNaNneighbours', ascending = False)
        nodes_coords_sorted = nodes_coords_sorted[nodes_coords_sorted['isNaN']]
        print(f"Number of nodes with NaN values: {nodes_coords_sorted.shape[0]}")
        
        for node_idx in tqdm.tqdm(nodes_coords_sorted.index):
            node_lat, node_lon = float(round(nodes_coords.loc[node_idx, 'lat'], 3)), float(round(nodes_coords.loc[node_idx, 'lon'], 3))
            multiplier = 1.5
            # The window is rebuilt here (not reused from the counting pass) because
            # ds_regrided may have been updated by nodes handled earlier in this loop
            ds_slice = ds_regrided.sel(
                lat = slice(node_lat+multiplier*resolution, node_lat-multiplier*resolution), 
                lon = slice(node_lon-multiplier*resolution, node_lon+multiplier*resolution)
                )
            slice_df = ds_slice.to_dataframe(name = var_name).reset_index()
            slice_df['lat'] = slice_df['lat'].round(3)
            slice_df['lon'] = slice_df['lon'].round(3)
            slice_df['location'] = list(zip(slice_df['lat'], slice_df['lon']))
            slice_df = slice_df.pivot(index='time', columns='location', values=var_name)
            # Column labels become strings such as "(lat, lon)" so the node's own cell can be looked up by name
            slice_df.columns = list(map(str, slice_df.columns))
            num_nonnan_nodes = len(slice_df.columns) - slice_df.isnull().any(axis=0).sum()
            # print(node_idx, (node_lat, node_lon), num_nonnan_nodes)
            if num_nonnan_nodes == 9:
                # Every cell of the window, including the node's own, is NaN-free:
                # take the node's own series straight from the regridded field
                replacement_values = slice_df.loc[:, f"({node_lat}, {node_lon})"]
                data.loc[:, str(node_idx)] = replacement_values
                nodes_coords_sorted.loc[node_idx, 'isNaN'] = False
            elif num_nonnan_nodes > 0:
                # Otherwise use the NaN-ignoring mean across all window cells at each time step
                replacement_values = np.nanmean(slice_df, axis = 1)
                data.loc[:, str(node_idx)] = replacement_values
                # Write the filled series back into the field so later nodes can draw on it
                ds_regrided.loc[dict(lat = node_lat, lon = node_lon)] = replacement_values
                nodes_coords_sorted.loc[node_idx, 'isNaN'] = False
            # Nodes whose window has no NaN-free cell are left unchanged
        print(f"Number of nodes with NaN values: {nodes_coords_sorted['isNaN'].sum()}")
        print(issue_idx, huc, gauge_id, data.isnull().values.any())
        data.to_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM', f"{var_name}.csv"))
        print("------")

Ep (Time: 3.2873 mins)


395it [01:56,  3.38it/s]


Number of catchments with issues: 24
------
0 10 06191500
Number of nodes with NaN values: 81


100%|██████████| 81/81 [00:05<00:00, 13.72it/s]


Number of nodes with NaN values: 11
0 10 06191500 True
------
1 04 04056500
Number of nodes with NaN values: 6


100%|██████████| 6/6 [00:00<00:00, 13.55it/s]


Number of nodes with NaN values: 0
1 04 04056500 False
------
2 10 06043500
Number of nodes with NaN values: 37


100%|██████████| 37/37 [00:02<00:00, 13.78it/s]


Number of nodes with NaN values: 1
2 10 06043500 True
------
3 04 04045500
Number of nodes with NaN values: 41


100%|██████████| 41/41 [00:02<00:00, 13.79it/s]


Number of nodes with NaN values: 0
3 04 04045500 False
------
4 10 06188000
Number of nodes with NaN values: 1


100%|██████████| 1/1 [00:00<00:00, 13.59it/s]

Number of nodes with NaN values: 0
4 10 06188000 False





------
5 18 11532500
Number of nodes with NaN values: 2


100%|██████████| 2/2 [00:00<00:00, 13.53it/s]

Number of nodes with NaN values: 0
5 18 11532500 False





------
6 03 02196000
Number of nodes with NaN values: 16


100%|██████████| 16/16 [00:01<00:00, 13.48it/s]


Number of nodes with NaN values: 0
6 03 02196000 False
------
7 08 08014500
Number of nodes with NaN values: 7


100%|██████████| 7/7 [00:00<00:00, 13.57it/s]


Number of nodes with NaN values: 0
7 08 08014500 False
------
8 08 08013000
Number of nodes with NaN values: 7


100%|██████████| 7/7 [00:00<00:00, 13.74it/s]


Number of nodes with NaN values: 0
8 08 08013000 False
------
9 17 12040500
Number of nodes with NaN values: 13


100%|██████████| 13/13 [00:00<00:00, 13.74it/s]


Number of nodes with NaN values: 0
9 17 12040500 False
------
10 10 06919500
Number of nodes with NaN values: 39


100%|██████████| 39/39 [00:02<00:00, 13.57it/s]


Number of nodes with NaN values: 2
10 10 06919500 True
------
11 07 05495000
Number of nodes with NaN values: 10


100%|██████████| 10/10 [00:00<00:00, 13.45it/s]


Number of nodes with NaN values: 0
11 07 05495000 False
------
12 17 12411000
Number of nodes with NaN values: 4


100%|██████████| 4/4 [00:00<00:00, 13.31it/s]


Number of nodes with NaN values: 0
12 17 12411000 False
------
13 03 02481510
Number of nodes with NaN values: 4


100%|██████████| 4/4 [00:00<00:00, 13.37it/s]


Number of nodes with NaN values: 0
13 03 02481510 False
------
14 07 05413500
Number of nodes with NaN values: 20


100%|██████████| 20/20 [00:01<00:00, 13.46it/s]


Number of nodes with NaN values: 1
14 07 05413500 True
------
15 17 12041200
Number of nodes with NaN values: 2


100%|██████████| 2/2 [00:00<00:00, 13.29it/s]

Number of nodes with NaN values: 0
15 17 12041200 False





------
16 17 12048000
Number of nodes with NaN values: 9


100%|██████████| 9/9 [00:00<00:00, 13.41it/s]


Number of nodes with NaN values: 0
16 17 12048000 False
------
17 12 08025500
Number of nodes with NaN values: 4


100%|██████████| 4/4 [00:00<00:00, 13.20it/s]


Number of nodes with NaN values: 0
17 12 08025500 False
------
18 07 05414000
Number of nodes with NaN values: 4


100%|██████████| 4/4 [00:00<00:00, 13.31it/s]


Number of nodes with NaN values: 0
18 07 05414000 False
------
19 08 07359610
Number of nodes with NaN values: 2


100%|██████████| 2/2 [00:00<00:00, 13.26it/s]


Number of nodes with NaN values: 0
19 08 07359610 False
------
20 18 11468500
Number of nodes with NaN values: 5


100%|██████████| 5/5 [00:00<00:00, 13.35it/s]


Number of nodes with NaN values: 1
20 18 11468500 True
------
21 03 02481000
Number of nodes with NaN values: 4


100%|██████████| 4/4 [00:00<00:00, 13.29it/s]


Number of nodes with NaN values: 0
21 03 02481000 False
------
22 04 04015330
Number of nodes with NaN values: 9


100%|██████████| 9/9 [00:00<00:00, 13.38it/s]


Number of nodes with NaN values: 0
22 04 04015330 False
------
23 04 04043050
Number of nodes with NaN values: 2


100%|██████████| 2/2 [00:00<00:00, 13.38it/s]


Number of nodes with NaN values: 0
23 04 04043050 False
------
SMroot (Time: 9.0868 mins)


395it [01:24,  4.68it/s]


Number of catchments with issues: 25
------
0 10 06191500
Number of nodes with NaN values: 81


100%|██████████| 81/81 [00:06<00:00, 11.89it/s]


Number of nodes with NaN values: 11
0 10 06191500 True
------
1 04 04056500
Number of nodes with NaN values: 6


100%|██████████| 6/6 [00:00<00:00, 10.23it/s]


Number of nodes with NaN values: 0
1 04 04056500 False
------
2 10 06043500
Number of nodes with NaN values: 37


100%|██████████| 37/37 [00:03<00:00, 11.52it/s]


Number of nodes with NaN values: 1
2 10 06043500 True
------
3 04 04045500
Number of nodes with NaN values: 41


100%|██████████| 41/41 [00:03<00:00, 11.36it/s]


Number of nodes with NaN values: 0
3 04 04045500 False
------
4 10 06188000
Number of nodes with NaN values: 1


100%|██████████| 1/1 [00:00<00:00,  9.53it/s]

Number of nodes with NaN values: 0
4 10 06188000 False





------
5 18 11532500
Number of nodes with NaN values: 2


100%|██████████| 2/2 [00:00<00:00,  9.36it/s]


Number of nodes with NaN values: 0
5 18 11532500 False
------
6 03 02196000
Number of nodes with NaN values: 16


100%|██████████| 16/16 [00:01<00:00, 10.40it/s]


Number of nodes with NaN values: 0
6 03 02196000 False
------
7 08 08014500
Number of nodes with NaN values: 7


100%|██████████| 7/7 [00:00<00:00, 10.67it/s]


Number of nodes with NaN values: 0
7 08 08014500 False
------
8 08 08013000
Number of nodes with NaN values: 7


100%|██████████| 7/7 [00:00<00:00, 10.28it/s]


Number of nodes with NaN values: 0
8 08 08013000 False
------
9 17 12040500
Number of nodes with NaN values: 13


100%|██████████| 13/13 [00:01<00:00,  8.87it/s]


Number of nodes with NaN values: 0
9 17 12040500 False
------
10 10 06919500
Number of nodes with NaN values: 39


100%|██████████| 39/39 [00:03<00:00, 10.11it/s]


Number of nodes with NaN values: 2
10 10 06919500 True
------
11 07 05495000
Number of nodes with NaN values: 10


100%|██████████| 10/10 [00:01<00:00,  9.79it/s]


Number of nodes with NaN values: 0
11 07 05495000 False
------
12 17 12411000
Number of nodes with NaN values: 4


100%|██████████| 4/4 [00:00<00:00, 11.07it/s]


Number of nodes with NaN values: 0
12 17 12411000 False
------
13 03 02481510
Number of nodes with NaN values: 4


100%|██████████| 4/4 [00:00<00:00,  9.83it/s]


Number of nodes with NaN values: 0
13 03 02481510 False
------
14 07 05413500
Number of nodes with NaN values: 20


100%|██████████| 20/20 [00:01<00:00, 11.15it/s]


Number of nodes with NaN values: 1
14 07 05413500 True
------
15 17 12041200
Number of nodes with NaN values: 2


100%|██████████| 2/2 [00:00<00:00,  9.71it/s]


Number of nodes with NaN values: 0
15 17 12041200 False
------
16 17 12048000
Number of nodes with NaN values: 9


100%|██████████| 9/9 [00:01<00:00,  8.68it/s]


Number of nodes with NaN values: 0
16 17 12048000 False
------
17 18 11148900
Number of nodes with NaN values: 4


100%|██████████| 4/4 [00:00<00:00,  9.87it/s]


Number of nodes with NaN values: 0
17 18 11148900 False
------
18 12 08025500
Number of nodes with NaN values: 4


100%|██████████| 4/4 [00:00<00:00, 10.21it/s]


Number of nodes with NaN values: 0
18 12 08025500 False
------
19 07 05414000
Number of nodes with NaN values: 4


100%|██████████| 4/4 [00:00<00:00, 10.30it/s]


Number of nodes with NaN values: 0
19 07 05414000 False
------
20 08 07359610
Number of nodes with NaN values: 2


100%|██████████| 2/2 [00:00<00:00, 11.05it/s]

Number of nodes with NaN values: 0
20 08 07359610 False





------
21 18 11468500
Number of nodes with NaN values: 5


100%|██████████| 5/5 [00:00<00:00, 11.31it/s]


Number of nodes with NaN values: 1
21 18 11468500 True
------
22 03 02481000
Number of nodes with NaN values: 4


100%|██████████| 4/4 [00:00<00:00, 11.56it/s]


Number of nodes with NaN values: 0
22 03 02481000 False
------
23 04 04015330
Number of nodes with NaN values: 9


100%|██████████| 9/9 [00:00<00:00,  9.91it/s]


Number of nodes with NaN values: 0
23 04 04015330 False
------
24 04 04043050
Number of nodes with NaN values: 2


100%|██████████| 2/2 [00:00<00:00, 10.18it/s]

Number of nodes with NaN values: 0
24 04 04043050 False





------
SMsurf (Time: 5.5462 mins)


395it [01:12,  5.42it/s]


Number of catchments with issues: 25
------
0 10 06191500
Number of nodes with NaN values: 81


100%|██████████| 81/81 [00:07<00:00, 10.40it/s]


Number of nodes with NaN values: 11
0 10 06191500 True
------
1 04 04056500
Number of nodes with NaN values: 6


100%|██████████| 6/6 [00:00<00:00, 10.22it/s]


Number of nodes with NaN values: 0
1 04 04056500 False
------
2 10 06043500
Number of nodes with NaN values: 37


100%|██████████| 37/37 [00:04<00:00,  8.14it/s]


Number of nodes with NaN values: 1
2 10 06043500 True
------
3 04 04045500
Number of nodes with NaN values: 41


100%|██████████| 41/41 [00:04<00:00,  9.91it/s]


Number of nodes with NaN values: 0
3 04 04045500 False
------
4 10 06188000
Number of nodes with NaN values: 1


100%|██████████| 1/1 [00:00<00:00, 10.04it/s]

Number of nodes with NaN values: 0
4 10 06188000 False





------
5 18 11532500
Number of nodes with NaN values: 2


100%|██████████| 2/2 [00:00<00:00, 12.22it/s]

Number of nodes with NaN values: 0
5 18 11532500 False





------
6 03 02196000
Number of nodes with NaN values: 16


100%|██████████| 16/16 [00:01<00:00, 12.45it/s]


Number of nodes with NaN values: 0
6 03 02196000 False
------
7 08 08014500
Number of nodes with NaN values: 7


100%|██████████| 7/7 [00:00<00:00, 12.53it/s]


Number of nodes with NaN values: 0
7 08 08014500 False
------
8 08 08013000
Number of nodes with NaN values: 7


100%|██████████| 7/7 [00:00<00:00, 12.44it/s]


Number of nodes with NaN values: 0
8 08 08013000 False
------
9 17 12040500
Number of nodes with NaN values: 13


100%|██████████| 13/13 [00:01<00:00, 12.47it/s]


Number of nodes with NaN values: 0
9 17 12040500 False
------
10 10 06919500
Number of nodes with NaN values: 39


100%|██████████| 39/39 [00:03<00:00, 12.43it/s]


Number of nodes with NaN values: 2
10 10 06919500 True
------
11 07 05495000
Number of nodes with NaN values: 10


100%|██████████| 10/10 [00:00<00:00, 11.80it/s]


Number of nodes with NaN values: 0
11 07 05495000 False
------
12 17 12411000
Number of nodes with NaN values: 4


100%|██████████| 4/4 [00:00<00:00, 12.25it/s]


Number of nodes with NaN values: 0
12 17 12411000 False
------
13 03 02481510
Number of nodes with NaN values: 4


100%|██████████| 4/4 [00:00<00:00, 12.33it/s]


Number of nodes with NaN values: 0
13 03 02481510 False
------
14 07 05413500
Number of nodes with NaN values: 20


100%|██████████| 20/20 [00:01<00:00, 12.16it/s]


Number of nodes with NaN values: 1
14 07 05413500 True
------
15 17 12041200
Number of nodes with NaN values: 2


100%|██████████| 2/2 [00:00<00:00, 12.40it/s]

Number of nodes with NaN values: 0
15 17 12041200 False





------
16 17 12048000
Number of nodes with NaN values: 9


100%|██████████| 9/9 [00:00<00:00, 12.46it/s]


Number of nodes with NaN values: 0
16 17 12048000 False
------
17 18 11148900
Number of nodes with NaN values: 4


100%|██████████| 4/4 [00:00<00:00,  8.02it/s]


Number of nodes with NaN values: 0
17 18 11148900 False
------
18 12 08025500
Number of nodes with NaN values: 4


100%|██████████| 4/4 [00:00<00:00, 10.69it/s]


Number of nodes with NaN values: 0
18 12 08025500 False
------
19 07 05414000
Number of nodes with NaN values: 4


100%|██████████| 4/4 [00:00<00:00, 10.71it/s]


Number of nodes with NaN values: 0
19 07 05414000 False
------
20 08 07359610
Number of nodes with NaN values: 2


100%|██████████| 2/2 [00:00<00:00, 12.22it/s]

Number of nodes with NaN values: 0
20 08 07359610 False





------
21 18 11468500
Number of nodes with NaN values: 5


100%|██████████| 5/5 [00:00<00:00, 12.42it/s]


Number of nodes with NaN values: 1
21 18 11468500 True
------
22 03 02481000
Number of nodes with NaN values: 4


100%|██████████| 4/4 [00:00<00:00, 12.33it/s]


Number of nodes with NaN values: 0
22 03 02481000 False
------
23 04 04015330
Number of nodes with NaN values: 9


100%|██████████| 9/9 [00:00<00:00, 12.32it/s]


Number of nodes with NaN values: 0
23 04 04015330 False
------
24 04 04043050
Number of nodes with NaN values: 2


100%|██████████| 2/2 [00:00<00:00, 12.10it/s]

Number of nodes with NaN values: 0
24 04 04043050 False





------


In [14]:
# Audit pass for the GLEAM features: report which catchments, and how many nodes
# in each, still contain NaNs. This cell only reads the CSVs; nothing is written.
var_names = ['Ep', 'SMroot', 'SMsurf']
for var_name in var_names:
    # Scan every catchment and keep the ones whose feature table has any NaN.
    # Only the feature CSV is needed for this check, so nodes_coords is not read here.
    issues = []
    for idx, row in tqdm.tqdm(camels_graph.iterrows(), total = len(camels_graph)):
        huc, gauge_id = row['huc_02'], row.name
        data = pd.read_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM', f"{var_name}.csv"), index_col = 0, parse_dates = True)
        if data.isnull().values.any():
            issues.append([huc, gauge_id])
    issues = pd.DataFrame(issues, columns = ['huc_02', 'gauge_id'])
    print(f"Number of catchments with issues: {issues.shape[0]}")
    print("------")

    # Report the number of affected nodes for each flagged catchment
    for issue_idx, (huc, gauge_id) in enumerate(issues.values):
        print(issue_idx, huc, gauge_id)
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        data = pd.read_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM', f"{var_name}.csv"), index_col = 0, parse_dates = True)
        # Feature columns are named after the node index (as strings); a node is
        # flagged when its column has at least one missing value.
        node_columns = [str(node_idx) for node_idx in nodes_coords.index]
        nodes_coords['isNaN'] = data[node_columns].isnull().any(axis = 0).values
        print(f"Number of nodes with NaN values: {nodes_coords['isNaN'].sum()}")
        print("------")

395it [00:57,  6.87it/s]


Number of catchments with issues: 5
------
0 10 06191500
Number of nodes with NaN values: 11
------
1 10 06043500
Number of nodes with NaN values: 1
------
2 10 06919500
Number of nodes with NaN values: 2
------
3 07 05413500
Number of nodes with NaN values: 1
------
4 18 11468500
Number of nodes with NaN values: 1
------


395it [01:23,  4.73it/s]


Number of catchments with issues: 5
------
0 10 06191500
Number of nodes with NaN values: 11
------
1 10 06043500
Number of nodes with NaN values: 1
------
2 10 06919500
Number of nodes with NaN values: 2
------
3 07 05413500
Number of nodes with NaN values: 1
------
4 18 11468500
Number of nodes with NaN values: 1
------


395it [00:56,  6.96it/s]


Number of catchments with issues: 5
------
0 10 06191500
Number of nodes with NaN values: 11
------
1 10 06043500
Number of nodes with NaN values: 1
------
2 10 06919500
Number of nodes with NaN values: 2
------
3 07 05413500
Number of nodes with NaN values: 1
------
4 18 11468500
Number of nodes with NaN values: 1
------


In [24]:
# Repair pass for the GLEAM features: every node that still has NaNs is overwritten
# with the mean series of its nearest NaN-free node(s) inside the same catchment,
# and the CSV is rewritten in place.
var_names = ['Ep', 'SMroot', 'SMsurf']
for var_name in var_names:
    # Scan every catchment and keep the ones whose feature table has any NaN.
    # Only the feature CSV is needed for this check, so nodes_coords is not read here.
    issues = []
    for idx, row in tqdm.tqdm(camels_graph.iterrows(), total = len(camels_graph)):
        huc, gauge_id = row['huc_02'], row.name
        data = pd.read_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM', f"{var_name}.csv"), index_col = 0, parse_dates = True)
        if data.isnull().values.any():
            issues.append([huc, gauge_id])
    issues = pd.DataFrame(issues, columns = ['huc_02', 'gauge_id'])
    print(f"Number of catchments with issues: {issues.shape[0]}")
    print("------")

    # Fix the catchments with issues
    for issue_idx, (huc, gauge_id) in enumerate(issues.values):
        print(issue_idx, huc, gauge_id)
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        data = pd.read_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM', f"{var_name}.csv"), index_col = 0, parse_dates = True)
        # Feature columns are named after the node index (as strings); a node is
        # flagged when its column has at least one missing value.
        node_columns = [str(node_idx) for node_idx in nodes_coords.index]
        nodes_coords['isNaN'] = data[node_columns].isnull().any(axis = 0).values
        print(f"Number of nodes with NaN values: {nodes_coords['isNaN'].sum()}")

        # Coordinates rounded to 3 decimals, as in the per-node comparison they replace
        node_lats = nodes_coords['lat'].round(3).astype(float)
        node_lons = nodes_coords['lon'].round(3).astype(float)
        # The list of nodes to repair is fixed up front; the donor set is re-evaluated
        # on every iteration, so a node repaired earlier can serve as a donor later.
        for node_idx in tqdm.tqdm(nodes_coords.index[nodes_coords['isNaN'].values]):
            donor_idx = nodes_coords.index[~nodes_coords['isNaN'].values]
            if len(donor_idx) == 0:
                # No NaN-free node exists in this catchment, so there is nothing to
                # copy from; leave the remaining nodes flagged (reported below).
                break
            # Euclidean distance in degrees from this node to every donor
            distances = np.sqrt(
                (node_lats.loc[node_idx] - node_lats.loc[donor_idx])**2
                + (node_lons.loc[node_idx] - node_lons.loc[donor_idx])**2
            )
            # Replace with mean of nodes having distance equal to the minimum distance
            replacement_nodes = distances.index[(distances == distances.min()).values]
            replacement_nodes = list(map(str, replacement_nodes))
            replacement_values = data.loc[:, replacement_nodes].mean(axis = 1)
            data.loc[:, str(node_idx)] = replacement_values
            nodes_coords.loc[node_idx, 'isNaN'] = False
        print(f"Number of nodes with NaN values: {nodes_coords['isNaN'].sum()}")
        print(issue_idx, huc, gauge_id, data.isnull().values.any())
        data.to_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM', f"{var_name}.csv"))
        print("------")

395it [00:42,  9.35it/s]


Number of catchments with issues: 5
------
0 10 06191500
Number of nodes with NaN values: 11


100%|██████████| 11/11 [00:00<00:00, 22.16it/s]


Number of nodes with NaN values: 0
0 10 06191500 False
------
1 10 06043500
Number of nodes with NaN values: 1


100%|██████████| 1/1 [00:00<00:00, 53.97it/s]

Number of nodes with NaN values: 0
1 10 06043500 False





------
2 10 06919500
Number of nodes with NaN values: 2


100%|██████████| 2/2 [00:00<00:00, 107.98it/s]

Number of nodes with NaN values: 0
2 10 06919500 False





------
3 07 05413500
Number of nodes with NaN values: 1


100%|██████████| 1/1 [00:00<00:00, 111.60it/s]

Number of nodes with NaN values: 0
3 07 05413500 False





------
4 18 11468500
Number of nodes with NaN values: 1


100%|██████████| 1/1 [00:00<00:00, 194.24it/s]

Number of nodes with NaN values: 0
4 18 11468500 False





------


395it [00:53,  7.38it/s]


Number of catchments with issues: 5
------
0 10 06191500
Number of nodes with NaN values: 11


100%|██████████| 11/11 [00:00<00:00, 17.77it/s]


Number of nodes with NaN values: 0
0 10 06191500 False
------
1 10 06043500
Number of nodes with NaN values: 1


100%|██████████| 1/1 [00:00<00:00, 45.55it/s]

Number of nodes with NaN values: 0
1 10 06043500 False





------
2 10 06919500
Number of nodes with NaN values: 2


100%|██████████| 2/2 [00:00<00:00, 86.29it/s]

Number of nodes with NaN values: 0
2 10 06919500 False





------
3 07 05413500
Number of nodes with NaN values: 1


100%|██████████| 1/1 [00:00<00:00, 109.33it/s]

Number of nodes with NaN values: 0
3 07 05413500 False





------
4 18 11468500
Number of nodes with NaN values: 1


100%|██████████| 1/1 [00:00<00:00, 102.25it/s]

Number of nodes with NaN values: 0
4 18 11468500 False





------


395it [00:54,  7.18it/s]


Number of catchments with issues: 5
------
0 10 06191500
Number of nodes with NaN values: 11


100%|██████████| 11/11 [00:00<00:00, 17.54it/s]


Number of nodes with NaN values: 0
0 10 06191500 False
------
1 10 06043500
Number of nodes with NaN values: 1


100%|██████████| 1/1 [00:00<00:00, 37.41it/s]

Number of nodes with NaN values: 0
1 10 06043500 False





------
2 10 06919500
Number of nodes with NaN values: 2


100%|██████████| 2/2 [00:00<00:00, 68.02it/s]

Number of nodes with NaN values: 0
2 10 06919500 False





------
3 07 05413500
Number of nodes with NaN values: 1


100%|██████████| 1/1 [00:00<00:00, 107.69it/s]

Number of nodes with NaN values: 0
3 07 05413500 False





------
4 18 11468500
Number of nodes with NaN values: 1


100%|██████████| 1/1 [00:00<00:00, 152.98it/s]

Number of nodes with NaN values: 0
4 18 11468500 False





------


### GLEAM4

In [None]:
# var_names = ['Ep', 'SMrz', 'SMs']#, 'Eb', 'Ei', 'Es', 'Et', 'Ew', 'S', 'H']
var_names = ['Eb', 'Ei', 'Es', 'Et', 'Ew', 'S', 'H']

# Daily index for 1980-2020 with every 29 February removed
all_days = pd.date_range('1980-01-01', '2020-12-31', freq='D')
is_leap_day = (all_days.month == 2) & (all_days.day == 29)
dates = all_days[~is_leap_day]
print(f"Number of dates: {len(dates)}")

def process(idx, row, var_name):
    """Write an empty dates x nodes placeholder CSV for one catchment and variable.

    idx      -- positional key from ``iterrows`` (not used).
    row      -- catchment record; ``row.name`` is the gauge id and
                ``row['huc_02']`` the HUC-02 code.
    var_name -- GLEAM4 variable name, used as the CSV file stem.
    """
    huc = row['huc_02']
    gauge_id = row.name
    coords_file = os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv')
    nodes_coords = pd.read_csv(coords_file, index_col = 0)
    # makedirs creates the intermediate 'dynamic' directory as well
    target_dir = os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM4')
    os.makedirs(target_dir, exist_ok = True)
    placeholder = pd.DataFrame(index = dates, columns = nodes_coords.index)
    placeholder.to_csv(os.path.join(target_dir, f"{var_name}.csv"))

# One parallel sweep over all catchments per variable
for var_name in var_names:
    print(var_name)
    with Parallel(n_jobs = 8, verbose = 0) as parallel:
        _ = parallel(
            delayed(process)(idx, row, var_name)
            for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph))
        )

In [None]:
# Regrid every GLEAM4 variable onto ds_grid and sample it at each catchment node,
# filling the per-catchment CSVs under graph_features/<huc>/<gauge>/dynamic/GLEAM4.

# The regridding-weights file is the same for every variable, so resolve it once.
weights_path = os.path.join(PATHS['Assets'], 'regridder', 'regridder_gleam4_to_glofas_03min.nc')

# islice(var_names, 0, None, 1) walks the whole list.
# NOTE(review): presumably the start index is raised to resume an interrupted run.
for var_name in itertools.islice(var_names, 0, None, 1):
    print(var_name)
    ds = xr.open_mfdataset(os.path.join(PATHS['GLEAM'], 'GLEAM4.2a', var_name, f"*.nc"), combine='by_coords')
    ds_var_name = list(ds.data_vars)[0]
    ds = ds[ds_var_name]
    # Drop 29 February and crop to the study region (lat slice runs from maxy to miny)
    ds = ds.sel(time=~((ds['time.month'] == 2) & (ds['time.day'] == 29)))
    ds = ds.sel(
        lat = slice(region_bounds['maxy'], region_bounds['miny']),
        lon = slice(region_bounds['minx'], region_bounds['maxx'])
    )

    # Reuse cached bilinear weights when present; otherwise compute and cache them
    if os.path.exists(weights_path):
        regridder = xe.Regridder(
            ds,
            ds_grid,
            'bilinear',
            reuse_weights=True,
            filename = weights_path
        )
    else:
        regridder = xe.Regridder(
            ds,
            ds_grid,
            'bilinear',
            reuse_weights=False
        )
        regridder.to_netcdf(weights_path)

    ds_regrided = regridder(ds)
    start_time = time.time()
    ds_regrided.load()
    end_time = time.time()
    print(f'Time: {((end_time - start_time) / 60):.4f} mins')
    # Release the source files only once the regridded array has been loaded:
    # open_mfdataset is lazy, so the load above may still need to read from them.
    ds.close()
    del ds
    gc.collect()

    def process(idx, row):
        """Sample the regridded field at every node of one catchment and save the CSV.

        Uses ``var_name`` and ``ds_regrided`` from the enclosing loop iteration.
        """
        huc, gauge_id = row['huc_02'], row.name
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        data = pd.read_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM4', f"{var_name}.csv"), index_col = 0, parse_dates = True)
        for node_idx, node_row in nodes_coords.iterrows():
            lat, lon = node_row['lat'], node_row['lon']
            # Nearest grid cell to the node coordinates
            ds_window_loc = ds_regrided.sel(lat = lat, lon = lon, method = 'nearest')
            data.loc[:, str(node_idx)] = ds_window_loc.values
        data.to_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM4', f"{var_name}.csv"))

    with Parallel(n_jobs = 8, verbose = 0) as parallel:
        _ = parallel(delayed(process)(idx, row) for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)))

In [None]:
# Repair pass for the GLEAM4 features using the regridded field: each node whose
# series contains NaNs is refilled from the grid cells in a small window around it,
# and the catchment CSV is rewritten in place.
# var_names = ['Ep', 'SMrz', 'SMs']#, 'Eb', 'Ei', 'Es', 'Et', 'Ew', 'S', 'H']
var_names = ['Eb', 'Ei', 'Es', 'Et', 'Ew', 'S', 'H']
# The regridding-weights file is the same for every variable, so resolve it once.
weights_path = os.path.join(PATHS['Assets'], 'regridder', 'regridder_gleam4_to_glofas_03min.nc')
for var_name in var_names:
    ds = xr.open_mfdataset(os.path.join(PATHS['GLEAM'], 'GLEAM4.2a', var_name, f"*.nc"), combine='by_coords')
    ds_var_name = list(ds.data_vars)[0]
    ds = ds[ds_var_name]
    # Drop 29 February and crop to the study region (lat slice runs from maxy to miny)
    ds = ds.sel(time=~((ds['time.month'] == 2) & (ds['time.day'] == 29)))
    ds = ds.sel(
        lat = slice(region_bounds['maxy'], region_bounds['miny']),
        lon = slice(region_bounds['minx'], region_bounds['maxx'])
    )
    # Reuse cached bilinear weights when present; otherwise compute and cache them
    if os.path.exists(weights_path):
        regridder = xe.Regridder(
            ds,
            ds_grid,
            'bilinear',
            reuse_weights=True,
            filename = weights_path
        )
    else:
        regridder = xe.Regridder(
            ds,
            ds_grid,
            'bilinear',
            reuse_weights=False
        )
        regridder.to_netcdf(weights_path)
    ds_regrided = regridder(ds)
    start_time = time.time()
    ds_regrided.load()
    end_time = time.time()
    # Release the source files only once the regridded array has been loaded:
    # open_mfdataset is lazy, so the load above may still need to read from them.
    ds.close()
    print(f'{var_name} (Time: {((end_time - start_time) / 60):.4f} mins)')

    # Loop over catchments and find ones with issues.
    # Only the feature CSV is needed for this check, so nodes_coords is not read here.
    issues = []
    for idx, row in tqdm.tqdm(camels_graph.iterrows(), total = len(camels_graph)):
        huc, gauge_id = row['huc_02'], row.name
        data = pd.read_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM4', f"{var_name}.csv"), index_col = 0, parse_dates = True)
        if data.isnull().values.any():
            issues.append([huc, gauge_id])
    issues = pd.DataFrame(issues, columns = ['huc_02', 'gauge_id'])
    print(f"Number of catchments with issues: {issues.shape[0]}")
    print("------")

    # Fix the catchments with issues
    for issue_idx, (huc, gauge_id) in enumerate(issues.values):
        print(issue_idx, huc, gauge_id)
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        data = pd.read_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM4', f"{var_name}.csv"), index_col = 0, parse_dates = True)
        nodes_coords['isNaN'] = False
        nodes_coords['nonNaNneighbours'] = 0
        # Loop over nodes and find the nodes with issues; for each one, count how many
        # cells of the surrounding window have a complete (NaN-free) series.
        for node_idx in nodes_coords.index:
            if data[str(node_idx)].isnull().values.any():
                nodes_coords.loc[node_idx, 'isNaN'] = True
                node_lat = float(round(nodes_coords.loc[node_idx, 'lat'], 3))
                node_lon = float(round(nodes_coords.loc[node_idx, 'lon'], 3))
                # Window of +/- 1.5 * resolution around the node.
                # NOTE(review): presumably a 3x3 neighbourhood on the target grid -- confirm.
                multiplier = 1.5
                ds_slice = ds_regrided.sel(
                    lat = slice(node_lat+multiplier*resolution, node_lat-multiplier*resolution),
                    lon = slice(node_lon-multiplier*resolution, node_lon+multiplier*resolution)
                    )
                # Reshape to a time x (lat, lon) table, one column per window cell
                slice_df = ds_slice.to_dataframe(name = var_name).reset_index()
                slice_df['lat'] = slice_df['lat'].round(3)
                slice_df['lon'] = slice_df['lon'].round(3)
                slice_df['location'] = list(zip(slice_df['lat'], slice_df['lon']))
                slice_df = slice_df.pivot(index='time', columns='location', values=var_name)
                num_nan_nodes = slice_df.isnull().any(axis=0).sum()
                num_nonnan_nodes = len(slice_df.columns) - num_nan_nodes
                nodes_coords.loc[node_idx, 'nonNaNneighbours'] = num_nonnan_nodes
        # Process the nodes with the most valid neighbours first. The sort is done on
        # the full frame and then filtered, so the processing order is unchanged.
        nodes_coords_sorted = nodes_coords.sort_values(by = 'nonNaNneighbours', ascending = False)
        # .copy() so the flag updates below act on an independent frame
        nodes_coords_sorted = nodes_coords_sorted[nodes_coords_sorted['isNaN']].copy()
        print(f"Number of nodes with NaN values: {nodes_coords_sorted.shape[0]}")

        for node_idx in tqdm.tqdm(nodes_coords_sorted.index):
            node_lat, node_lon = float(round(nodes_coords.loc[node_idx, 'lat'], 3)), float(round(nodes_coords.loc[node_idx, 'lon'], 3))
            multiplier = 1.5
            ds_slice = ds_regrided.sel(
                lat = slice(node_lat+multiplier*resolution, node_lat-multiplier*resolution),
                lon = slice(node_lon-multiplier*resolution, node_lon+multiplier*resolution)
                )
            slice_df = ds_slice.to_dataframe(name = var_name).reset_index()
            slice_df['lat'] = slice_df['lat'].round(3)
            slice_df['lon'] = slice_df['lon'].round(3)
            slice_df['location'] = list(zip(slice_df['lat'], slice_df['lon']))
            slice_df = slice_df.pivot(index='time', columns='location', values=var_name)
            # Columns become "(lat, lon)" strings so the centre cell can be looked up by name
            slice_df.columns = list(map(str, slice_df.columns))
            num_nonnan_nodes = len(slice_df.columns) - slice_df.isnull().any(axis=0).sum()
            # print(node_idx, (node_lat, node_lon), num_nonnan_nodes)
            if num_nonnan_nodes == 9:
                # All nine window cells are complete, including the node's own cell:
                # take the series of the cell at the node's coordinates directly.
                replacement_values = slice_df.loc[:, f"({node_lat}, {node_lon})"]
                data.loc[:, str(node_idx)] = replacement_values
                nodes_coords_sorted.loc[node_idx, 'isNaN'] = False
            elif num_nonnan_nodes > 0:
                # Some window cells are complete: use the per-timestep NaN-ignoring mean
                # of the window, and write it back into the grid so that nodes handled
                # later can draw on this repaired cell.
                replacement_values = np.nanmean(slice_df, axis = 1)
                data.loc[:, str(node_idx)] = replacement_values
                ds_regrided.loc[dict(lat = node_lat, lon = node_lon)] = replacement_values
                nodes_coords_sorted.loc[node_idx, 'isNaN'] = False
            # No complete window cell at all: the node stays flagged
        print(f"Number of nodes with NaN values: {nodes_coords_sorted['isNaN'].sum()}")
        print(issue_idx, huc, gauge_id, data.isnull().values.any())
        data.to_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM4', f"{var_name}.csv"))
        print("------")

In [None]:
# Audit pass for the GLEAM4 features: report which catchments, and how many nodes
# in each, still contain NaNs. This cell only reads the CSVs; nothing is written.
# var_names = ['Ep', 'SMrz', 'SMs']#, 'Eb', 'Ei', 'Es', 'Et', 'Ew', 'S', 'H']
var_names = ['Eb', 'Ei', 'Es', 'Et', 'Ew', 'S', 'H']
for var_name in var_names:
    # Scan every catchment and keep the ones whose feature table has any NaN.
    # Only the feature CSV is needed for this check, so nodes_coords is not read here.
    issues = []
    for idx, row in tqdm.tqdm(camels_graph.iterrows(), total = len(camels_graph)):
        huc, gauge_id = row['huc_02'], row.name
        data = pd.read_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM4', f"{var_name}.csv"), index_col = 0, parse_dates = True)
        if data.isnull().values.any():
            issues.append([huc, gauge_id])
    issues = pd.DataFrame(issues, columns = ['huc_02', 'gauge_id'])
    print(f"Number of catchments with issues: {issues.shape[0]}")
    print("------")

    # Report the number of affected nodes for each flagged catchment
    for issue_idx, (huc, gauge_id) in enumerate(issues.values):
        print(issue_idx, huc, gauge_id)
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        data = pd.read_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM4', f"{var_name}.csv"), index_col = 0, parse_dates = True)
        # Feature columns are named after the node index (as strings); a node is
        # flagged when its column has at least one missing value.
        node_columns = [str(node_idx) for node_idx in nodes_coords.index]
        nodes_coords['isNaN'] = data[node_columns].isnull().any(axis = 0).values
        print(f"Number of nodes with NaN values: {nodes_coords['isNaN'].sum()}")
        print("------")

In [None]:
# Repair pass for the GLEAM4 features: every node that still has NaNs is overwritten
# with the mean series of its nearest NaN-free node(s) inside the same catchment,
# and the CSV is rewritten in place.
# var_names = ['Ep', 'SMrz', 'SMs']#, 'Eb', 'Ei', 'Es', 'Et', 'Ew', 'S', 'H']
var_names = ['Eb', 'Ei', 'Es', 'Et', 'Ew', 'S', 'H']
for var_name in var_names:
    # Scan every catchment and keep the ones whose feature table has any NaN.
    # Only the feature CSV is needed for this check, so nodes_coords is not read here.
    issues = []
    for idx, row in tqdm.tqdm(camels_graph.iterrows(), total = len(camels_graph)):
        huc, gauge_id = row['huc_02'], row.name
        data = pd.read_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM4', f"{var_name}.csv"), index_col = 0, parse_dates = True)
        if data.isnull().values.any():
            issues.append([huc, gauge_id])
    issues = pd.DataFrame(issues, columns = ['huc_02', 'gauge_id'])
    print(f"Number of catchments with issues: {issues.shape[0]}")
    print("------")

    # Fix the catchments with issues
    for issue_idx, (huc, gauge_id) in enumerate(issues.values):
        print(issue_idx, huc, gauge_id)
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        data = pd.read_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM4', f"{var_name}.csv"), index_col = 0, parse_dates = True)
        # Feature columns are named after the node index (as strings); a node is
        # flagged when its column has at least one missing value.
        node_columns = [str(node_idx) for node_idx in nodes_coords.index]
        nodes_coords['isNaN'] = data[node_columns].isnull().any(axis = 0).values
        print(f"Number of nodes with NaN values: {nodes_coords['isNaN'].sum()}")

        # Coordinates rounded to 3 decimals, as in the per-node comparison they replace
        node_lats = nodes_coords['lat'].round(3).astype(float)
        node_lons = nodes_coords['lon'].round(3).astype(float)
        # The list of nodes to repair is fixed up front; the donor set is re-evaluated
        # on every iteration, so a node repaired earlier can serve as a donor later.
        for node_idx in tqdm.tqdm(nodes_coords.index[nodes_coords['isNaN'].values]):
            donor_idx = nodes_coords.index[~nodes_coords['isNaN'].values]
            if len(donor_idx) == 0:
                # No NaN-free node exists in this catchment, so there is nothing to
                # copy from; leave the remaining nodes flagged (reported below).
                break
            # Euclidean distance in degrees from this node to every donor
            distances = np.sqrt(
                (node_lats.loc[node_idx] - node_lats.loc[donor_idx])**2
                + (node_lons.loc[node_idx] - node_lons.loc[donor_idx])**2
            )
            # Replace with mean of nodes having distance equal to the minimum distance
            replacement_nodes = distances.index[(distances == distances.min()).values]
            replacement_nodes = list(map(str, replacement_nodes))
            replacement_values = data.loc[:, replacement_nodes].mean(axis = 1)
            data.loc[:, str(node_idx)] = replacement_values
            nodes_coords.loc[node_idx, 'isNaN'] = False
        print(f"Number of nodes with NaN values: {nodes_coords['isNaN'].sum()}")
        print(issue_idx, huc, gauge_id, data.isnull().values.any())
        data.to_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM4', f"{var_name}.csv"))
        print("------")

## Solar Insolation

In [13]:
def solar_insolation(lat, lon, start_date, end_date):
    """Daily solar-noon insolation on a horizontal surface for one latitude.

    For every calendar day between ``start_date`` and ``end_date`` (inclusive,
    29 February removed) the value is

        Sc * (1 + 0.033 * cos(360° * n / 365)) * cos(theta_z)

    where ``n`` is the day of year, ``Sc`` = 1361 W/m² and ``theta_z`` is the
    solar zenith angle at solar noon (hour angle = 0). Negative values (sun
    below the horizon at noon) are clipped to 0.

    Parameters
    ----------
    lat : float
        Latitude in degrees.
    lon : float
        Longitude in degrees. Accepted for interface symmetry but unused:
        the hour angle is fixed at solar noon, so the result does not depend
        on longitude.
    start_date, end_date : str or datetime-like
        Bounds of the daily date range (anything ``pd.to_datetime`` accepts).

    Returns
    -------
    pd.Series
        Insolation in kW/m², indexed by date and named
        ``'Solar Insolation (kW/m²)'``.
    """
    solar_constant = 1361  # Solar constant (W/m^2)

    # Daily axis without leap days. Note that day-of-year still reaches 366
    # for dates after 29 February in leap years, exactly as before.
    dates = pd.date_range(start=pd.to_datetime(start_date), end=pd.to_datetime(end_date), freq='D')
    dates = dates[~((dates.month == 2) & (dates.day == 29))]
    day_of_year = dates.dayofyear.to_numpy(dtype=float)

    # Solar declination (degrees) for every day at once.
    declination = 23.45 * np.sin(np.radians((360 / 365) * (day_of_year - 81)))

    # cos(zenith) at solar noon, i.e. with an hour angle of 0 degrees.
    hour_angle = 0
    lat_rad = np.radians(lat)
    decl_rad = np.radians(declination)
    cos_zenith_angle = (np.sin(lat_rad) * np.sin(decl_rad) +
                        np.cos(lat_rad) * np.cos(decl_rad) * np.cos(np.radians(hour_angle)))

    # Insolation formula, then clip so the result is never negative.
    insolation = solar_constant * (1 + 0.033 * np.cos(np.radians(360 * day_of_year / 365))) * cos_zenith_angle
    insolation = np.maximum(insolation, 0)

    insolation_series = pd.Series(insolation, index=dates, name='Solar Insolation (kW/m²)')
    return insolation_series / 1000  # W/m² -> kW/m²

In [14]:
# Common daily axis for every gauge: 1980-2020 with 29 February dropped.
dates = pd.date_range('1980-01-01', '2020-12-31', freq='D')
dates = dates[(dates.month != 2) | (dates.day != 29)]

def process(idx, row):
    """Write the per-node solar insolation table for one gauge.

    `row` is one record of `camels_graph` (its name is the gauge id and it
    carries `huc_02`); `idx` is unused. The output CSV has one row per date in
    `dates` and one column per node listed in the gauge's `nodes_coords.csv`.
    """
    huc, gauge_id = row['huc_02'], row.name
    dataset_root = os.path.join(PATHS['devp_datasets'], DIRNAME)
    nodes_coords = pd.read_csv(
        os.path.join(dataset_root, 'graph_files', huc, gauge_id, 'nodes_coords.csv'),
        index_col = 0
    )

    # One column per node, filled from that node's own coordinates.
    data = pd.DataFrame(columns = nodes_coords.index, index = dates)
    for node_idx, node_row in nodes_coords.iterrows():
        node_series = solar_insolation(node_row['lat'], node_row['lon'], '1980-01-01', '2020-12-31')
        data.loc[:, node_idx] = node_series.values

    features_dir = os.path.join(dataset_root, 'graph_features', huc, gauge_id)
    os.makedirs(features_dir, exist_ok = True)
    data.to_csv(os.path.join(features_dir, 'solar_insolation.csv'))

# Fan the gauges out over 8 joblib workers; tqdm tracks task submission.
with Parallel(n_jobs = 8, verbose = 0) as parallel:
    _ = parallel(
        delayed(process)(idx, row)
        for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph))
    )

100%|██████████| 395/395 [07:07<00:00,  1.08s/it]


## Time Encodings

In [15]:
def sine_time_encoding(start_date, end_date):
    """Build calendar features and their sine encodings for a daily date range.

    The range runs from `start_date` to `end_date` inclusive with 29 February
    removed. The returned DataFrame is indexed by date and holds the raw
    `month`, `weekofyear` (ISO week) and `dayofyear` columns followed by
    `sine_month`, `sine_weekofyear` and `sine_dayofyear`, each computed as
    sin(2*pi*value/period) with periods 12, 52 and 365 respectively.
    """
    def _sine(value, max_val):
        # Map a cyclic calendar value onto one period of a sine wave.
        return np.sin(2 * np.pi * value / max_val)

    # Daily axis without leap days.
    all_days = pd.date_range(start=start_date, end=end_date, freq='D')
    kept_days = all_days[(all_days.month != 2) | (all_days.day != 29)]

    # Raw calendar components.
    df = pd.DataFrame(index=kept_days)
    df['month'] = df.index.month
    df['weekofyear'] = df.index.isocalendar().week
    df['dayofyear'] = df.index.dayofyear

    # Sine-encoded counterparts, appended in the same order as the raw columns.
    for column, period in (('month', 12), ('weekofyear', 52), ('dayofyear', 365)):
        df[f'sine_{column}'] = df[column].apply(_sine, max_val=period)

    return df

In [16]:
# The encoding table is the same for every gauge, so build it once.
df_encoded = sine_time_encoding('1980-01-01', '2020-12-31')

def process(idx, row):
    """Write the shared time-encoding table into one gauge's feature folder.

    `row` is one record of `camels_graph`; `idx` is unused.
    """
    gauge_dir = os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_features', row['huc_02'], row.name)
    os.makedirs(gauge_dir, exist_ok = True)
    df_encoded.to_csv(os.path.join(gauge_dir, 'time_encodings.csv'))

# Gauges are handled one after another in the notebook process.
for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)):
    process(idx, row)

100%|██████████| 395/395 [00:39<00:00, 10.11it/s]


## Daymet

In [11]:
# Daily axis 1980-2020 without 29 February.
dates = pd.date_range('1980-01-01', '2020-12-31', freq='D')
dates = dates[(dates.month != 2) | (dates.day != 29)]
print(f"Number of dates: {len(dates)}")
var_names = ['prcp', 'srad', 'swe', 'tmax', 'tmin', 'vp', 'dayl']

def process(idx, row, var_name):
    """Create an empty dates-by-nodes CSV for one gauge and one Daymet variable.

    The file has one row per entry of `dates` and one column per node in the
    gauge's `nodes_coords.csv`; no values are written here. `idx` is unused.
    """
    huc, gauge_id = row['huc_02'], row.name
    dataset_root = os.path.join(PATHS['devp_datasets'], DIRNAME)
    nodes_coords = pd.read_csv(
        os.path.join(dataset_root, 'graph_files', huc, gauge_id, 'nodes_coords.csv'),
        index_col = 0
    )

    # makedirs creates the intermediate 'dynamic' folder as well.
    daymet_dir = os.path.join(dataset_root, 'graph_features', huc, gauge_id, 'dynamic', 'Daymet')
    os.makedirs(daymet_dir, exist_ok = True)

    placeholder = pd.DataFrame(index = dates, columns = nodes_coords.index)
    placeholder.to_csv(os.path.join(daymet_dir, f"{var_name}.csv"))

for var_name in var_names:
    print(var_name)
    with Parallel(n_jobs = 8, verbose = 0) as parallel:
        _ = parallel(
            delayed(process)(idx, row, var_name)
            for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph))
        )

Number of dates: 14965
prcp


100%|██████████| 395/395 [00:11<00:00, 35.25it/s] 


srad


100%|██████████| 395/395 [00:03<00:00, 102.33it/s]


swe


100%|██████████| 395/395 [00:03<00:00, 111.75it/s]


tmax


100%|██████████| 395/395 [00:04<00:00, 89.38it/s] 


tmin


100%|██████████| 395/395 [00:03<00:00, 107.08it/s]


vp


100%|██████████| 395/395 [00:03<00:00, 104.90it/s]


dayl


100%|██████████| 395/395 [00:03<00:00, 107.02it/s]


In [12]:
# 31 December of every leap year from 1980 to 2020, stamped at 12:00.
# Presumably these are the timesteps absent from the Daymet files - confirm.
missing_dates = np.array(
    [
        f"{day}T12:00:00"
        for day in ('1980-12-31', '1984-12-31', '1988-12-31', '1992-12-31',
                    '1996-12-31', '2000-12-31', '2004-12-31', '2008-12-31',
                    '2012-12-31', '2016-12-31', '2020-12-31')
    ],
    dtype = 'datetime64'
)

# All-NaN (time, lat, lon) array on the `lats` x `lons` grid for those dates.
ds_missing_dates = xr.DataArray(
    np.full((len(missing_dates), len(lats), len(lons)), np.nan),
    coords = [missing_dates, lats, lons],
    dims = ['time', 'lat', 'lon']
)

In [13]:
# For every Daymet variable: open the source files, regrid them onto `ds_grid`,
# append the NaN placeholder timesteps from `ds_missing_dates`, then walk the
# record in 5-year windows and copy each node's series into the per-gauge CSVs.
# `ds_grid` and `ds_missing_dates` are defined outside this cell.
var_names = ['prcp', 'srad', 'swe', 'tmax', 'tmin', 'vp', 'dayl']
# islice(…, 0, None, 1) iterates the full list; changing the start index would
# skip leading variables.
for var_name in itertools.islice(var_names, 0, None, 1):
    print(var_name)
    # Lazily open every NetCDF file found in this variable's folder.
    ds = xr.open_mfdataset(os.path.join(PATHS['Daymet'], var_name, f"*.nc"), combine='by_coords')
    ds = ds[var_name]
    ds = ds.rename({'x': 'lon', 'y': 'lat'})
    # Drop 29 February so the time axis matches the leap-day-free CSV index.
    ds = ds.sel(time=~((ds['time.month'] == 2) & (ds['time.day'] == 29)))
    # Crop to the study region. The lat slice runs maxy -> miny, which
    # presumably means latitude is stored in descending order - TODO confirm.
    ds = ds.sel(
        lat = slice(region_bounds['maxy'], region_bounds['miny']), 
        lon = slice(region_bounds['minx'], region_bounds['maxx'])
    )

    # Reuse the saved bilinear regridding weights when the file exists;
    # otherwise compute them and save them for later runs.
    if os.path.exists(os.path.join(PATHS['Assets'], 'regridder_daymet_to_glofas_03min.nc')):
        regridder = xe.Regridder(
            ds, 
            ds_grid, 
            'bilinear', 
            reuse_weights=True, 
            filename = os.path.join(PATHS['Assets'], 'regridder_daymet_to_glofas_03min.nc')
        )
    else:
        regridder = xe.Regridder(
            ds, 
            ds_grid, 
            'bilinear', 
            reuse_weights=False
        )
        regridder.to_netcdf(os.path.join(PATHS['Assets'], 'regridder_daymet_to_glofas_03min.nc'))
    
    ds_regrided = regridder(ds)
    # ds_regrided['time'] = ds_regrided['time'].dt.floor('D')
    # ds_regrided['time'] = [np.datetime64(str(date).split('T')[0]) for date in ds_regrided['time'].values]

    # Concatenate missing dates
    # (the placeholders are all-NaN, so these timesteps stay NaN in the CSVs
    # written below).
    ds_regrided = xr.concat([ds_regrided, ds_missing_dates], dim = 'time')
    ds_regrided = ds_regrided.sortby('time')
    ds.close()
    # Print length of time
    print(f"timesteps: {len(ds_regrided['time'])}")

    # Process the record in 5-year windows (the last one is capped at 2020);
    # each window is loaded into memory before the per-gauge extraction.
    for start_year in range(1980, 2020+1, 5):
        start_date = f"{start_year}-01-01"
        end_date = f"{min(start_year+4,2020)}-12-31"
        ds_window = ds_regrided.sel(time = slice(start_date, end_date)).copy()
        start_time = time.time()
        ds_window.load()
        end_time = time.time()
        print(start_date, end_date, f"Time: {(end_time - start_time)/60:.2f} mins")
    
        def process(idx, row):
            """Fill one gauge's CSV for `var_name` with the current window's values.

            Reads the gauge's existing CSV, overwrites the rows between
            `start_date` and `end_date` for every node with the nearest grid
            cell's series from `ds_window`, and writes the whole CSV back.
            Relies on `var_name`, `ds_window`, `start_date` and `end_date`
            from the enclosing loops; `idx` is unused.
            """
            huc, gauge_id = row['huc_02'], row.name
            nodes_coords = pd.read_csv(os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
            data = pd.read_csv(os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_features', huc, gauge_id, 'dynamic', 'Daymet', f"{var_name}.csv"), index_col = 0, parse_dates = True)
            for node_idx, node_row in nodes_coords.iterrows():
                lat, lon = node_row['lat'], node_row['lon']
                # Nearest-neighbour lookup on the regridded grid.
                ds_window_loc = ds_window.sel(lat = lat, lon = lon, method = 'nearest')
                # CSV column labels are strings after read_csv, hence str(node_idx).
                data.loc[start_date:end_date, str(node_idx)] = ds_window_loc.values
            data.to_csv(os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_features', huc, gauge_id, 'dynamic', 'Daymet', f"{var_name}.csv"))
            return None
        
        # NOTE(review): every gauge CSV is fully re-read and re-written once per
        # window, i.e. nine times per variable.
        with Parallel(n_jobs = 8, verbose = 0) as parallel:
            _ = parallel(delayed(process)(idx, row) for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)))

        # Release the loaded window before moving on to the next one.
        ds_window.close()
        del ds_window
        gc.collect()

    # Release the per-variable arrays before the next variable.
    ds_regrided.close()
    del ds, ds_regrided
    gc.collect()

prcp
timesteps: 14965
1980-01-01 1984-12-31 Time: 7.07 mins


100%|██████████| 395/395 [00:23<00:00, 17.15it/s] 


1985-01-01 1989-12-31 Time: 7.37 mins


100%|██████████| 395/395 [00:26<00:00, 15.11it/s] 


1990-01-01 1994-12-31 Time: 7.38 mins


100%|██████████| 395/395 [00:28<00:00, 14.08it/s] 


1995-01-01 1999-12-31 Time: 7.27 mins


100%|██████████| 395/395 [00:33<00:00, 11.89it/s]


2000-01-01 2004-12-31 Time: 6.86 mins


100%|██████████| 395/395 [00:33<00:00, 11.80it/s]


2005-01-01 2009-12-31 Time: 7.17 mins


100%|██████████| 395/395 [00:38<00:00, 10.33it/s]


2010-01-01 2014-12-31 Time: 7.16 mins


100%|██████████| 395/395 [00:41<00:00,  9.60it/s]


2015-01-01 2019-12-31 Time: 7.34 mins


100%|██████████| 395/395 [00:42<00:00,  9.23it/s]


2020-01-01 2020-12-31 Time: 1.63 mins


100%|██████████| 395/395 [00:37<00:00, 10.65it/s]


srad
timesteps: 14965
1980-01-01 1984-12-31 Time: 7.54 mins


100%|██████████| 395/395 [00:30<00:00, 13.16it/s]


1985-01-01 1989-12-31 Time: 7.98 mins


100%|██████████| 395/395 [00:39<00:00, 10.05it/s]


1990-01-01 1994-12-31 Time: 7.92 mins


100%|██████████| 395/395 [00:44<00:00,  8.87it/s]


1995-01-01 1999-12-31 Time: 7.79 mins


100%|██████████| 395/395 [00:51<00:00,  7.73it/s]


2000-01-01 2004-12-31 Time: 7.42 mins


100%|██████████| 395/395 [00:58<00:00,  6.71it/s]


2005-01-01 2009-12-31 Time: 7.82 mins


100%|██████████| 395/395 [01:07<00:00,  5.88it/s]


2010-01-01 2014-12-31 Time: 7.89 mins


100%|██████████| 395/395 [01:16<00:00,  5.17it/s]


2015-01-01 2019-12-31 Time: 8.13 mins


100%|██████████| 395/395 [01:23<00:00,  4.72it/s]


2020-01-01 2020-12-31 Time: 1.76 mins


100%|██████████| 395/395 [01:17<00:00,  5.12it/s]


swe
timesteps: 14965
1980-01-01 1984-12-31 Time: 7.12 mins


100%|██████████| 395/395 [00:25<00:00, 15.56it/s]


1985-01-01 1989-12-31 Time: 7.53 mins


100%|██████████| 395/395 [00:26<00:00, 14.83it/s] 


1990-01-01 1994-12-31 Time: 7.42 mins


100%|██████████| 395/395 [00:28<00:00, 13.63it/s]


1995-01-01 1999-12-31 Time: 7.41 mins


100%|██████████| 395/395 [00:33<00:00, 11.73it/s]


2000-01-01 2004-12-31 Time: 7.11 mins


100%|██████████| 395/395 [00:33<00:00, 11.68it/s]


2005-01-01 2009-12-31 Time: 7.29 mins


100%|██████████| 395/395 [00:36<00:00, 10.75it/s]


2010-01-01 2014-12-31 Time: 7.46 mins


100%|██████████| 395/395 [00:39<00:00,  9.91it/s]


2015-01-01 2019-12-31 Time: 7.56 mins


100%|██████████| 395/395 [00:41<00:00,  9.42it/s]


2020-01-01 2020-12-31 Time: 1.65 mins


100%|██████████| 395/395 [00:35<00:00, 11.14it/s]


tmax
timesteps: 14965
1980-01-01 1984-12-31 Time: 7.09 mins


100%|██████████| 395/395 [00:32<00:00, 12.19it/s]


1985-01-01 1989-12-31 Time: 7.63 mins


100%|██████████| 395/395 [00:36<00:00, 10.92it/s]


1990-01-01 1994-12-31 Time: 7.48 mins


100%|██████████| 395/395 [00:46<00:00,  8.54it/s]


1995-01-01 1999-12-31 Time: 7.39 mins


100%|██████████| 395/395 [00:52<00:00,  7.57it/s]


2000-01-01 2004-12-31 Time: 7.04 mins


100%|██████████| 395/395 [01:01<00:00,  6.41it/s]


2005-01-01 2009-12-31 Time: 7.44 mins


100%|██████████| 395/395 [01:08<00:00,  5.77it/s]


2010-01-01 2014-12-31 Time: 7.42 mins


100%|██████████| 395/395 [01:18<00:00,  5.05it/s]


2015-01-01 2019-12-31 Time: 7.84 mins


100%|██████████| 395/395 [01:23<00:00,  4.71it/s]


2020-01-01 2020-12-31 Time: 1.76 mins


100%|██████████| 395/395 [01:19<00:00,  4.96it/s]


tmin
timesteps: 14965
1980-01-01 1984-12-31 Time: 7.37 mins


100%|██████████| 395/395 [00:31<00:00, 12.60it/s]


1985-01-01 1989-12-31 Time: 8.02 mins


100%|██████████| 395/395 [00:36<00:00, 10.73it/s]


1990-01-01 1994-12-31 Time: 7.65 mins


100%|██████████| 395/395 [00:44<00:00,  8.90it/s]


1995-01-01 1999-12-31 Time: 7.37 mins


100%|██████████| 395/395 [00:52<00:00,  7.49it/s]


2000-01-01 2004-12-31 Time: 7.11 mins


100%|██████████| 395/395 [01:00<00:00,  6.56it/s]


2005-01-01 2009-12-31 Time: 7.45 mins


100%|██████████| 395/395 [01:09<00:00,  5.70it/s]


2010-01-01 2014-12-31 Time: 7.62 mins


100%|██████████| 395/395 [01:17<00:00,  5.07it/s]


2015-01-01 2019-12-31 Time: 7.85 mins


100%|██████████| 395/395 [01:25<00:00,  4.64it/s]


2020-01-01 2020-12-31 Time: 1.96 mins


100%|██████████| 395/395 [01:19<00:00,  4.97it/s]


vp
timesteps: 14965
1980-01-01 1984-12-31 Time: 9.54 mins


100%|██████████| 395/395 [00:33<00:00, 11.89it/s]


1985-01-01 1989-12-31 Time: 9.55 mins


100%|██████████| 395/395 [00:37<00:00, 10.58it/s]


1990-01-01 1994-12-31 Time: 8.80 mins


100%|██████████| 395/395 [00:45<00:00,  8.63it/s]


1995-01-01 1999-12-31 Time: 8.49 mins


100%|██████████| 395/395 [00:52<00:00,  7.55it/s]


2000-01-01 2004-12-31 Time: 8.10 mins


100%|██████████| 395/395 [01:00<00:00,  6.48it/s]


2005-01-01 2009-12-31 Time: 8.93 mins


100%|██████████| 395/395 [01:07<00:00,  5.89it/s]


2010-01-01 2014-12-31 Time: 8.86 mins


100%|██████████| 395/395 [01:14<00:00,  5.31it/s]


2015-01-01 2019-12-31 Time: 9.05 mins


100%|██████████| 395/395 [01:25<00:00,  4.61it/s]


2020-01-01 2020-12-31 Time: 1.84 mins


100%|██████████| 395/395 [01:18<00:00,  5.03it/s]


dayl
timesteps: 14965
1980-01-01 1984-12-31 Time: 6.93 mins


100%|██████████| 395/395 [00:31<00:00, 12.68it/s]


1985-01-01 1989-12-31 Time: 7.44 mins


100%|██████████| 395/395 [00:34<00:00, 11.37it/s]


1990-01-01 1994-12-31 Time: 7.50 mins


100%|██████████| 395/395 [00:38<00:00, 10.31it/s]


1995-01-01 1999-12-31 Time: 7.47 mins


100%|██████████| 395/395 [00:45<00:00,  8.72it/s]


2000-01-01 2004-12-31 Time: 7.10 mins


100%|██████████| 395/395 [00:50<00:00,  7.86it/s]


2005-01-01 2009-12-31 Time: 7.45 mins


100%|██████████| 395/395 [00:56<00:00,  6.95it/s]


2010-01-01 2014-12-31 Time: 7.85 mins


100%|██████████| 395/395 [01:02<00:00,  6.30it/s]


2015-01-01 2019-12-31 Time: 7.87 mins


100%|██████████| 395/395 [01:07<00:00,  5.82it/s]


2020-01-01 2020-12-31 Time: 1.70 mins


100%|██████████| 395/395 [01:04<00:00,  6.17it/s]


In [14]:
# Gap-fill every Daymet CSV in place: each NaN is replaced by the mean of the
# non-NaN values in a 15-row window centred on it. A NaN whose whole window is
# NaN stays NaN.
var_names = ['prcp', 'srad', 'swe', 'tmax', 'tmin', 'vp', 'dayl']
for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)):
    huc, gauge_id = row['huc_02'], row.name
    daymet_dir = os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_features', huc, gauge_id, 'dynamic', 'Daymet')
    for var_name in var_names:
        csv_path = os.path.join(daymet_dir, f"{var_name}.csv")
        data = pd.read_csv(csv_path, index_col = 0, parse_dates = True)
        # Fill the NaN values with a window of 15 days centered around the missing value.
        # rolling() works column by column and fillna() aligns on index and
        # columns, so one frame-wise call matches the per-column loop it replaces.
        data = data.fillna(data.rolling(15, min_periods = 1, center = True).mean())
        data.to_csv(csv_path)

395it [59:52,  9.10s/it]


## Terrain Attributes

In [10]:
from shapely.geometry import Polygon
import rioxarray

def coords_to_polygon(lon, lat, resolution):
    """Return the square cell of side `resolution` centred on (lon, lat).

    Corner coordinates are rounded to 3 decimals. Vertices are listed as
    lower-left, upper-left, upper-right, lower-right.
    """
    half_res = resolution / 2
    west, east = round(lon - half_res, 3), round(lon + half_res, 3)
    south, north = round(lat - half_res, 3), round(lat + half_res, 3)
    return Polygon([(west, south), (west, north), (east, north), (east, south)])
def tile_filename_to_coords(filename):
    """Decode the (lon, lat) encoded in a tile file name.

    Expected format: ``n/s{dd}e/w{ddd}_elv.tif`` (for example
    ``n30w120_elv.tif``). ``n``/``e`` are positive, ``s``/``w`` negative.
    Hemisphere letters are matched case-insensitively.

    Parameters
    ----------
    filename : str
        Base name of the tile file (no directory part).

    Returns
    -------
    tuple of int
        ``(lon, lat)`` in whole degrees.

    Raises
    ------
    ValueError
        If a hemisphere letter is not one of n/s/e/w or the degree fields
        are not integers.
    """
    n_s, e_w = filename[0].lower(), filename[3].lower()
    # Previously any unexpected letter (including uppercase 'N'/'E') was
    # silently treated as south/west, which flips the sign of the coordinate.
    if n_s not in ('n', 's') or e_w not in ('e', 'w'):
        raise ValueError(f"Unrecognised tile file name (expected n/s{{dd}}e/w{{ddd}}...): {filename!r}")
    lat, lon = int(filename[1:3]), int(filename[4:7])
    lat = lat if n_s == 'n' else -lat
    lon = lon if e_w == 'e' else -lon
    return (lon, lat)

In [40]:
import itertools
# For each MERIT-Hydro variable, compute per-node summary statistics
# (mean, std, quartiles) of the raster pixels that fall inside each grid cell
# of every gauge's graph, and write them to one CSV per gauge and variable.
var_names = ['elv', 'slope_percentage', 'slope_riserun', 'slope_degrees', 'slope_radians', 'aspect', 'curvature', 'planform_curvature', 'profile_curvature', 'upa', 'wth']
# valid_tiles = ['n30w150', 'n30w120', 'n30w090']

# Collects "var_name-huc_02-gauge_id" strings for every gauge that raised.
issues = []
# NOTE: islice(…, 1, None, 1) starts at index 1, so 'elv' is skipped here.
for var_name in itertools.islice(var_names,1,None,1):
    print(var_name)
    # Index every GeoTIFF tile of this variable and build its footprint:
    # a 5 x 5 degree square anchored at the lower-left corner decoded from
    # the file name by tile_filename_to_coords.
    tiles_paths = sorted(glob.glob(os.path.join(PATHS['MERIT-Hydro'], var_name, '**', '*.tif'), recursive=True))
    # tiles_paths = [tile for tile in tiles_paths if os.path.basename(os.path.dirname(tile)).split('_')[-1] in valid_tiles]
    tiles_filenames = [os.path.basename(tile) for tile in tiles_paths]
    tiles_names = [tile.split('_')[0] for tile in tiles_filenames]
    tiles_lower_left_corner = [tile_filename_to_coords(tile) for tile in tiles_filenames]
    tiles_polygons = [Polygon([(lon, lat), (lon + 5, lat), (lon + 5, lat + 5), (lon, lat + 5)]) for lon, lat in tiles_lower_left_corner]

    def process(idx, row):
        """Compute and save per-node statistics of `var_name` for one gauge.

        Builds the catchment footprint as the union of the gauge's node cells,
        mosaics the tiles that intersect it, and for each node summarises the
        pixels inside its `resolution`-sized cell. Relies on `var_name`,
        `tiles_polygons` and `tiles_paths` from the enclosing loop; `idx` is
        unused. Raises IndexError when no tile intersects the catchment
        (`intersected_tiles[0]`); the caller records such failures.
        """
        huc, gauge_id = row['huc_02'], row.name
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        data = pd.DataFrame(columns = nodes_coords.index, index = ['mean', 'std', '25%', '50%', '75%'])
        # One square polygon per node; their union is the catchment footprint.
        cell_polygons = [coords_to_polygon(row['lon'], row['lat'], resolution) for _, row in nodes_coords.iterrows()]
        catmt_polygon = cell_polygons[0]
        for polygon in cell_polygons[1:]:
            catmt_polygon = catmt_polygon.union(polygon)
        # Keep only the tiles whose footprint touches the catchment.
        intersected_tiles = []
        for tile_polygon, tile_path in zip(tiles_polygons, tiles_paths):
            if tile_polygon.intersects(catmt_polygon):
                intersected_tiles.append(tile_path)
        # Mosaic the selected tiles into a single array.
        ds = rioxarray.open_rasterio(intersected_tiles[0])
        for tile in intersected_tiles[1:]:
            ds = ds.combine_first(rioxarray.open_rasterio(tile))
        ds = ds.sel(band=1)
        # Sort the x and y coordinates to be ascending
        # (the ascending slices used below depend on this ordering).
        ds = ds.sortby('x', ascending=True)
        ds = ds.sortby('y', ascending=True)
        for node_idx, node_row in nodes_coords.iterrows():
            lat, lon = node_row['lat'], node_row['lon']
            # ds_node = ds.rio.clip_box(lon - resolution/2, lat - resolution/2, lon + resolution/2, lat + resolution/2)
            # Pixels inside the node's cell, with nodata pixels masked to NaN
            # so the nan-aware statistics ignore them.
            ds_node = ds.sel(x = slice(lon - resolution/2, lon + resolution/2), y = slice(lat - resolution/2, lat + resolution/2))
            ds_node = ds_node.where(ds_node != ds.rio.nodata)
            ds_node_values = ds_node.values.flatten()
            mean = np.nanmean(ds_node_values)
            std = np.nanstd(ds_node_values)
            q25 = np.nanquantile(ds_node_values, 0.25)
            q50 = np.nanquantile(ds_node_values, 0.50)
            q75 = np.nanquantile(ds_node_values, 0.75)
            data.loc['mean', node_idx] = mean
            data.loc['std', node_idx] = std
            data.loc['25%', node_idx] = q25
            data.loc['50%', node_idx] = q50
            data.loc['75%', node_idx] = q75
        os.makedirs(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'static'), exist_ok = True)
        os.makedirs(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'static', 'MERIT-Hydro'), exist_ok = True)
        data.to_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'static', 'MERIT-Hydro', f"{var_name}.csv"))

        # Free the mosaic before the next gauge (not reached if an exception
        # was raised above).
        ds.close()
        del ds
        gc.collect()

    # Run serially; a failing gauge is logged and recorded rather than
    # aborting the whole loop.
    for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)):
        try:
            process(idx, row)
        except Exception as e:
            issues.append(f"{var_name}-{row['huc_02']}-{row.name}")
            print(f"Error: {var_name}-{row['huc_02']}-{row.name}. {e}")

slope_percentage


100%|██████████| 395/395 [04:08<00:00,  1.59it/s]


slope_riserun


100%|██████████| 395/395 [04:16<00:00,  1.54it/s]


slope_degrees


100%|██████████| 395/395 [04:07<00:00,  1.59it/s]


slope_radians


100%|██████████| 395/395 [04:05<00:00,  1.61it/s]


aspect


100%|██████████| 395/395 [04:08<00:00,  1.59it/s]


curvature


100%|██████████| 395/395 [04:15<00:00,  1.54it/s]


planform_curvature


100%|██████████| 395/395 [04:13<00:00,  1.56it/s]


profile_curvature


100%|██████████| 395/395 [04:41<00:00,  1.41it/s]


upa


100%|██████████| 395/395 [05:00<00:00,  1.32it/s]


wth


100%|██████████| 395/395 [04:04<00:00,  1.62it/s]


In [41]:
# Number of (variable, gauge) combinations recorded in `issues`.
len(issues)

0

In [42]:
# Each entry of `issues` is split on '-' into its three parts, one column each.
issues_df = pd.DataFrame(
    [entry.split('-') for entry in issues],
    columns = ['var_name', 'huc_02', 'gauge_id']
)
issues_df

Unnamed: 0,var_name,huc_02,gauge_id


In [43]:
# Recorded issues for the 'elv' variable only.
issues_df[issues_df['var_name'] == 'elv']

Unnamed: 0,var_name,huc_02,gauge_id


## Spatial Encodings

In [11]:
def process(idx, row):
    """Write per-node spatial encodings for one gauge.

    Longitude is encoded as sin(2*pi*(lon+180)/360) and latitude is rescaled
    linearly from [-60, 90] to [0, 1]. The output CSV has the rows
    `lon_transformed` and `lat_transformed` and one column per node.
    `idx` is unused.
    """
    # lon: -180 to 180; lat: -60 to 90
    def lon_transform(x):
        return np.sin(2 * np.pi * (x+180) / 360)

    def lat_transform(x):
        return (x - (-60))/(90 - (-60))

    huc, gauge_id = row['huc_02'], row.name
    dataset_root = os.path.join(PATHS['devp_datasets'], DIRNAME)
    nodes_coords = pd.read_csv(
        os.path.join(dataset_root, 'graph_files', huc, gauge_id, 'nodes_coords.csv'),
        index_col = 0
    )

    data = pd.DataFrame(columns = nodes_coords.index, index = ['lon_transformed', 'lat_transformed'])
    for node_idx, node_row in nodes_coords.iterrows():
        data.loc['lon_transformed', node_idx] = lon_transform(node_row['lon'])
        data.loc['lat_transformed', node_idx] = lat_transform(node_row['lat'])

    features_dir = os.path.join(dataset_root, 'graph_features', huc, gauge_id)
    os.makedirs(features_dir, exist_ok = True)
    data.to_csv(os.path.join(features_dir, 'spatial_encodings.csv'))

# Fan the gauges out over 8 joblib workers.
with Parallel(n_jobs = 8, verbose = 0) as parallel:
    _ = parallel(
        delayed(process)(idx, row)
        for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph))
    )

100%|██████████| 395/395 [00:01<00:00, 343.82it/s]


## uparea

In [13]:
# Load the upstream-area grid, keep its first data variable, crop it to the
# study region and pull it into memory.
uparea = xr.open_dataset(os.path.join(PATHS['gis_ldd'], 'GloFAS_03min/upstream_area_km2.nc'))
ds_varname = list(uparea.data_vars)[0]
uparea = uparea[ds_varname].sel(
    lat = slice(region_bounds['maxy'], region_bounds['miny']),
    lon = slice(region_bounds['minx'], region_bounds['maxx'])
)
uparea.load()

def process(idx, row):
    """Write the upstream area of every node of one gauge to `uparea.csv`.

    Each node takes the value of the nearest grid cell of `uparea`. The CSV
    has a single row labelled 0 and one column per node. `idx` is unused.
    """
    huc, gauge_id = row['huc_02'], row.name
    dataset_root = os.path.join(PATHS['devp_datasets'], DIRNAME)
    nodes_coords = pd.read_csv(
        os.path.join(dataset_root, 'graph_files', huc, gauge_id, 'nodes_coords.csv'),
        index_col = 0
    )

    data = pd.DataFrame(columns = nodes_coords.index, index = [0])
    for node_idx, node_row in nodes_coords.iterrows():
        nearest_cell = uparea.sel(lat = node_row['lat'], lon = node_row['lon'], method = 'nearest')
        data.loc[0, node_idx] = nearest_cell.values.item()

    features_dir = os.path.join(dataset_root, 'graph_features', huc, gauge_id)
    os.makedirs(features_dir, exist_ok = True)
    data.to_csv(os.path.join(features_dir, 'uparea.csv'))

with Parallel(n_jobs = 8, verbose = 0) as parallel:
    _ = parallel(
        delayed(process)(idx, row)
        for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph))
    )

# Release the in-memory grid.
uparea.close()
del uparea
gc.collect()

100%|██████████| 395/395 [00:02<00:00, 192.88it/s]


49

## USGS

In [21]:
def process(idx, row):
    """Copy one gauge's USGS discharge record and add a depth-equivalent column.

    Reads `<gauge_id>.csv`, names its single data column `Q_ft3s`, and adds
    `Q_mm` = Q_ft3s / 3.28084**3 / (area * 1e6) * 86,400,000, where `area` is
    the gauge's `area_geospa_fabric` (presumably km², given the 1e6 factor -
    confirm). The result is written to `USGS.csv`. `idx` is unused.
    """
    huc, gauge_id = row['huc_02'], row.name
    uparea = row['area_geospa_fabric']

    usgs_data = pd.read_csv(
        os.path.join(PATHS['USGS'], 'CAMELS-US', huc, 'csv', f'{gauge_id}.csv'),
        index_col = 0,
        parse_dates = True
    )
    usgs_data.columns = ['Q_ft3s']

    # Same arithmetic, spelled out in steps: cubic feet -> cubic metres,
    # divide by the area in m², then scale by seconds per day * mm per m.
    discharge_m3s = usgs_data['Q_ft3s'] / (3.28084**3)
    usgs_data['Q_mm'] = (discharge_m3s / (uparea * 1e6)) * (3600*24*1000)

    features_dir = os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_features', huc, gauge_id)
    os.makedirs(features_dir, exist_ok = True)
    usgs_data.to_csv(os.path.join(features_dir, 'USGS.csv'))

with Parallel(n_jobs = 8, verbose = 0) as parallel:
    _ = parallel(
        delayed(process)(idx, row)
        for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph))
    )

100%|██████████| 395/395 [00:03<00:00, 105.21it/s]


  2%|▏         | 8/395 [00:19<00:08, 44.93it/s]

## GloFAS Discharge in mm

In [31]:
def process(idx, row):
    """Convert one gauge's per-node GloFAS discharge to a depth per day.

    For every node listed in `nodes_coords.csv`, the discharge column is
    divided by that node's upstream area (`uparea.csv` value * 1e6) and
    multiplied by 3600*24*1000. The result is written next to the input as
    `discharge_mm.csv`. `idx` is unused.
    """
    huc, gauge_id = row['huc_02'], row.name
    dataset_root = os.path.join(PATHS['devp_datasets'], DIRNAME)
    features_dir = os.path.join(dataset_root, 'graph_features', huc, gauge_id)
    glofas_dir = os.path.join(features_dir, 'dynamic', 'GloFAS')

    glofas_data = pd.read_csv(os.path.join(glofas_dir, 'discharge.csv'), index_col = 0, parse_dates = True)
    nodes_coords = pd.read_csv(os.path.join(dataset_root, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
    uparea = pd.read_csv(os.path.join(features_dir, 'uparea.csv'), index_col = 0)

    glofas_Q_mm = glofas_data.copy()
    for node_idx in nodes_coords.index:
        # CSV column labels are strings after read_csv.
        column = str(node_idx)
        uparea_node = uparea.loc[0, column] * 1e6
        glofas_Q_mm[column] = (glofas_data[column] / uparea_node) * (3600*24*1000)

    glofas_Q_mm.to_csv(os.path.join(glofas_dir, 'discharge_mm.csv'), index = True)

with Parallel(n_jobs = 8, verbose = 0) as parallel:
    _ = parallel(
        delayed(process)(idx, row)
        for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph))
    )

100%|██████████| 395/395 [01:03<00:00,  6.22it/s]


## GloFAS Parameter Maps

In [44]:
# "Catchment_morphology_and_river_network" (14 surface fields)
# - chanbnkf_Global_03min.nc (channel bankfull depth, m);
# - chanflpn_Global_03min.nc (width of the floodplain, m);
# - changrad_Global_03min.nc (channel longitudinal gradient, m/m);
# - chanlength_Global_03min.nc (channel length within a pixel, m);
# - chanman_Global_03min.nc (channel Manning's roughness coefficient, m^(1/3)s^(-1));
# - chans_Global_03min.nc (channel side slope, m/m);
# - chanbw_Global_03min.nc (channel bottom width, m);

# "Land_use" (7 surface fields)
# - fracforest_Global_03min.nc (fraction of forest for each grid-cell, -);
# - fracirrigated_Global_03min.nc (fraction of irrigated crops [except rice] for each grid-cell, -);
# - fracrice_Global_03min.nc (fraction of rice crops for each grid-cell, -);
# - fracsealed_Global_03min.nc (fraction of urban area for each grid-cell, -);
# - fracwater_Global_03min.nc (fraction of inland water for each grid-cell, -);
# - fracother_Global_03min.nc (fraction of other land cover for each grid-cell, -);
Parameter_Maps = os.path.join(PATHS['GloFAS'], 'LISFLOOD_Parameter_Maps')

# (sub-folder of Parameter_Maps, variables to extract from it). Both groups go
# through the same extraction loop, so each dataset is closed and freed in the
# same way; previously the land-use copy of the loop left its dataset open.
parameter_groups = [
    ('Catchments_morphology_and_river_network',
     ['chanbnkf', 'chanflpn', 'changrad', 'chanlength', 'chanman', 'chans', 'chanbw']),
    ('Land_use',
     ['fracforest', 'fracirrigated', 'fracrice', 'fracsealed', 'fracwater', 'fracother']),
]

for group_dir, var_names in parameter_groups:
    for var_name in var_names:
        print(var_name)
        # Load the 'Band1' field of this parameter map, cropped to the study region.
        ds = xr.open_dataset(os.path.join(Parameter_Maps, group_dir, f"{var_name}_Global_03min.nc"))['Band1']
        ds = ds.sel(
            lat = slice(region_bounds['maxy'], region_bounds['miny']),
            lon = slice(region_bounds['minx'], region_bounds['maxx'])
        )
        ds.load()

        def process(idx, row):
            """Write the nearest-cell value of `var_name` for every node of one gauge.

            The CSV has a single row labelled 0 and one column per node.
            Relies on `ds` and `var_name` from the enclosing loop; `idx` is
            unused.
            """
            huc, gauge_id = row['huc_02'], row.name
            nodes_coords = pd.read_csv(os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
            data = pd.DataFrame(columns = nodes_coords.index, index = [0])
            for node_idx, node_row in nodes_coords.iterrows():
                lat, lon = node_row['lat'], node_row['lon']
                ds_window_loc = ds.sel(lat = lat, lon = lon, method = 'nearest')
                data.loc[0, node_idx] = ds_window_loc.values.item()
            out_dir = os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_features', huc, gauge_id, 'static', 'GloFAS')
            os.makedirs(out_dir, exist_ok = True)
            data.to_csv(os.path.join(out_dir, f"{var_name}.csv"))

        with Parallel(n_jobs = 8, verbose = 0) as parallel:
            _ = parallel(delayed(process)(idx, row) for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)))

        # Release the grid before loading the next parameter map.
        ds.close()
        del ds
        gc.collect()

chanbnkf




100%|██████████| 395/395 [00:01<00:00, 224.04it/s]
  4%|▍         | 15/395 [01:55<48:46,  7.70s/it]


chanflpn


100%|██████████| 395/395 [00:01<00:00, 259.28it/s]


changrad


100%|██████████| 395/395 [00:01<00:00, 258.56it/s]


chanlength


100%|██████████| 395/395 [00:01<00:00, 243.67it/s]


chanman


100%|██████████| 395/395 [00:01<00:00, 269.77it/s]


chans


100%|██████████| 395/395 [00:01<00:00, 255.88it/s]


chanbw


100%|██████████| 395/395 [00:01<00:00, 259.63it/s]


fracforest


100%|██████████| 395/395 [00:01<00:00, 277.56it/s]


fracirrigated


100%|██████████| 395/395 [00:01<00:00, 274.07it/s]


fracrice


100%|██████████| 395/395 [00:01<00:00, 273.37it/s]


fracsealed


100%|██████████| 395/395 [00:01<00:00, 279.58it/s]


fracwater


100%|██████████| 395/395 [00:01<00:00, 281.31it/s]


fracother


100%|██████████| 395/395 [00:01<00:00, 242.72it/s]


## Cell Area

In [15]:
Parameter_Maps = os.path.join(PATHS['GloFAS'], 'LISFLOOD_Parameter_Maps')

# Pixel-area map divided by 1e6 (presumably m² -> km², matching the output
# name below - confirm), cropped to the study region and loaded into memory.
ds = (
    xr.open_dataset(os.path.join(Parameter_Maps, 'Main', 'pixarea_Global_03min.nc'))['Band1'] / 1e6
).sel(
    lat = slice(region_bounds['maxy'], region_bounds['miny']),
    lon = slice(region_bounds['minx'], region_bounds['maxx'])
)
ds.load()
var_name = 'cellarea_km2'

def process(idx, row):
    """Write the nearest-cell value of `ds` for every node of one gauge.

    The CSV is named after `var_name`, has a single row labelled 0 and one
    column per node. `idx` is unused.
    """
    huc, gauge_id = row['huc_02'], row.name
    dataset_root = os.path.join(PATHS['devp_datasets'], DIRNAME)
    nodes_coords = pd.read_csv(
        os.path.join(dataset_root, 'graph_files', huc, gauge_id, 'nodes_coords.csv'),
        index_col = 0
    )

    data = pd.DataFrame(columns = nodes_coords.index, index = [0])
    for node_idx, node_row in nodes_coords.iterrows():
        nearest_cell = ds.sel(lat = node_row['lat'], lon = node_row['lon'], method = 'nearest')
        data.loc[0, node_idx] = nearest_cell.values.item()

    out_dir = os.path.join(dataset_root, 'graph_features', huc, gauge_id, 'static', 'GloFAS')
    os.makedirs(out_dir, exist_ok = True)
    data.to_csv(os.path.join(out_dir, f"{var_name}.csv"))

with Parallel(n_jobs = 8, verbose = 0) as parallel:
    _ = parallel(
        delayed(process)(idx, row)
        for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph))
    )

# Release the in-memory grid.
ds.close()
del ds
gc.collect()

100%|██████████| 395/395 [00:02<00:00, 174.82it/s]


29