# Setting Up

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import xarray as xr
import xesmf as xe
import networkx as nx

import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt

from shapely.geometry import Point
from shapely.geometry import Polygon

import glob
import os
import itertools
import tqdm
import gc
import time
import pickle

from joblib import Parallel, delayed

In [2]:
import configparser

# Project-wide INI file; it must provide a [PATHS] section.
GLOBAL_CONFIG_FILE = '/home/sarth/rootdir/assets/global.ini'

config_parser = configparser.ConfigParser()
# Keep option names exactly as written (the default transform lower-cases them,
# which would break mixed-case keys such as PATHS['RawData']).
config_parser.optionxform = str
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail here with a clear message instead of
# with a KeyError on 'PATHS' a few lines later.
if not config_parser.read(GLOBAL_CONFIG_FILE):
    raise FileNotFoundError(f"Could not read configuration file: {GLOBAL_CONFIG_FILE}")
# Flatten the parser into plain nested dicts: {section: {option: value}}.
cfg = {section: dict(config_parser.items(section)) for section in config_parser.sections()}
PATHS = cfg['PATHS']

In [3]:
# Dataset directory name and the folder all derived files are written to.
DIRNAME = '30min_CWatM_CAMELS-US'
SAVE_PATH = os.path.join(PATHS['devp_datasets'], DIRNAME)

# Grid spacing in degrees.
resolution = 0.50


def lon_360_180(x):
    """Convert a longitude from the 0-360 convention to -180-180."""
    return (x + 180) % 360 - 180


def lon_180_360(x):
    """Convert a longitude from the -180-180 convention to 0-360."""
    return x % 360


# Bounding box (degrees) used to crop the gridded datasets.
region_bounds = dict(minx = -130, miny = 20, maxx = -65, maxy = 50)

# Load Watershed Attributes

In [6]:
# Load the per-gauge graph attributes; the first CSV column becomes the index.
attributes_csv = os.path.join(SAVE_PATH, 'graph_attributes.csv')
camels_attributes_graph = pd.read_csv(attributes_csv, index_col=0)
# Store identifiers as zero-padded strings (8 characters for gauge ids,
# 2 for HUC-02 codes) so they can be used directly as folder names.
camels_attributes_graph.index = camels_attributes_graph.index.map(
    lambda gauge: str(gauge).zfill(8)
)
camels_attributes_graph['huc_02'] = camels_attributes_graph['huc_02'].map(
    lambda huc: str(huc).zfill(2)
)
camels_attributes_graph

Unnamed: 0_level_0,huc_02,gauge_lon,gauge_lat,area_geospa_fabric,snapped_lon,snapped_lat,snapped_uparea,snapped_iou,area_percent_difference,num_nodes,num_edges
gauge_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
06452000,10,-99.55649,43.74833,25817.78,-99.75,43.75,26974.6170,0.644380,4.480780,12.0,11.0
13340000,17,-116.25750,46.47833,14270.76,-115.75,46.25,12915.0530,0.666042,9.499894,6.0,5.0
06447000,10,-101.52487,43.75250,12869.46,-101.75,43.75,13514.7300,0.571147,5.013967,6.0,5.0
06360500,10,-100.84292,45.25582,12601.47,-101.25,45.25,13086.8330,0.774024,3.851640,6.0,5.0
06354000,10,-100.93444,46.37611,10626.74,-101.25,46.25,8571.6370,0.628915,19.338984,4.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...
14303200,17,-123.54650,45.32428,8.07,-123.75,45.25,2181.1390,0.003707,26927.744000,1.0,0.0
10336740,16,-119.93546,39.06658,7.94,-119.75,39.25,2395.8298,0.001653,30074.180000,2.0,1.0
01466500,02,-74.50528,39.88500,6.25,-74.25,39.75,2378.9302,0.002212,37962.883000,1.0,0.0
01594950,02,-79.39031,39.27669,6.10,-79.25,39.25,2395.8298,0.002549,39175.900000,1.0,0.0


In [7]:
# Work on a copy so the unfiltered table stays intact.
camels_graph = camels_attributes_graph.copy()
# Keep catchments whose 'area_percent_difference' is below 10.
small_area_error = camels_graph['area_percent_difference'] < 10
camels_graph = camels_graph.loc[small_area_error]
print(camels_graph.shape)
# Keep only graphs with more than one node.
has_multiple_nodes = camels_graph['num_nodes'] > 1
camels_graph = camels_graph.loc[has_multiple_nodes]
print(camels_graph.shape)
# Number of remaining graphs per 'huc_02', ordered by HUC code.
camels_graph.sort_values(by = 'huc_02', ascending = True).groupby('huc_02').size()

(20, 11)
(10, 11)


huc_02
09    1
10    5
11    2
15    1
17    1
dtype: int64

In [8]:
# Summary statistics of the 'area_geospa_fabric' column for the retained catchments.
camels_graph['area_geospa_fabric'].describe()

count       10.000000
mean      9668.537000
std       6872.941039
min       4344.130000
25%       4918.577500
50%       6060.605000
75%      12802.462500
max      25817.780000
Name: area_geospa_fabric, dtype: float64

In [9]:
# Display the filtered catchment table.
camels_graph

Unnamed: 0_level_0,huc_02,gauge_lon,gauge_lat,area_geospa_fabric,snapped_lon,snapped_lat,snapped_uparea,snapped_iou,area_percent_difference,num_nodes,num_edges
gauge_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6452000,10,-99.55649,43.74833,25817.78,-99.75,43.75,26974.617,0.64438,4.48078,12.0,11.0
13340000,17,-116.2575,46.47833,14270.76,-115.75,46.25,12915.053,0.666042,9.499894,6.0,5.0
6447000,10,-101.52487,43.7525,12869.46,-101.75,43.75,13514.73,0.571147,5.013967,6.0,5.0
6360500,10,-100.84292,45.25582,12601.47,-101.25,45.25,13086.833,0.774024,3.85164,6.0,5.0
6191500,10,-110.79438,45.11188,6808.28,-110.75,45.25,6599.823,0.404993,3.061809,3.0,2.0
7068000,11,-90.84762,36.622,5312.93,-91.25,37.25,4923.1274,0.592192,7.336869,2.0,1.0
7226500,11,-103.52579,35.43838,5245.84,-103.75,35.75,5002.1724,0.505891,4.644966,2.0,1.0
9430500,15,-108.53727,33.06118,4809.49,-108.25,33.25,5153.1113,0.474348,7.144647,2.0,1.0
6353000,10,-101.33374,46.09167,4605.23,-102.25,46.25,4285.8184,0.341339,6.935845,2.0,1.0
5131500,9,-93.54933,48.39578,4344.13,-93.25,47.75,4168.617,0.485655,4.040227,2.0,1.0


In [10]:
# Drop the unfiltered table. NOTE: after this, the filtering cell (which copies
# `camels_attributes_graph`) cannot be re-run without reloading the CSV first.
del camels_attributes_graph

# Create Node Features as csv

In [11]:
# Root folder under SAVE_PATH that receives all per-catchment feature files.
os.makedirs(os.path.join(SAVE_PATH, "graph_features"), exist_ok = True)

In [12]:
# Open the 'ldd' variable of the CWatM 30min grid file and crop it to the
# study region. Latitude is sliced from maxy down to miny, which presumably
# matches a north-to-south latitude axis in the file -- verify if the file changes.
ldd_file = os.path.join(PATHS['gis_ldd'], 'CWatM_30min', 'ldd.nc')
ldd = xr.open_dataset(ldd_file)['ldd']
ldd = ldd.sel(
    lat = slice(region_bounds['maxy'], region_bounds['miny']),
    lon = slice(region_bounds['minx'], region_bounds['maxx'])
)

# Cell-centre coordinates of the cropped grid.
lons = ldd['lon'].values
lats = ldd['lat'].values

# Coordinate-only dataset describing the target grid handed to the regridders.
ds_grid = xr.Dataset({
    'lat': (['lat'], lats),
    'lon': (['lon'], lons),
})

# Round both coordinate axes to 3 decimal places.
for axis_name in ('lat', 'lon'):
    ds_grid[axis_name] = ds_grid[axis_name].round(3)

## ERA5

### Dynamic

In [11]:
# ERA5 variables for which a per-catchment time-series table is prepared.
var_names = [
    '2m_temperature',
    'evaporation',
    'snowfall',
    'surface_net_solar_radiation',
    'surface_net_thermal_radiation',
    'surface_pressure',
    'total_precipitation',
    '2m_dewpoint_temperature',
    '10m_u_component_of_wind',
    '10m_v_component_of_wind',
    'forecast_albedo',
    'potential_evaporation',
    'runoff',
    'snow_albedo',
    'snow_depth',
    'snowmelt',
    'sub_surface_runoff',
    'surface_runoff',
    'total_column_water',
    'volumetric_soil_water_layer_1',
    'volumetric_soil_water_layer_2',
    'volumetric_soil_water_layer_3',
    'volumetric_soil_water_layer_4'
]

# Daily calendar for 1980-2020 with every 29 February removed.
dates = pd.date_range('1980-01-01', '2020-12-31', freq='D')
is_leap_day = (dates.month == 2) & (dates.day == 29)
dates = dates[~is_leap_day]
print(f"Number of dates: {len(dates)}")

def process(idx, row, var_name):
    """Write an empty dates x nodes table for one catchment and one ERA5 variable.

    Parameters
    ----------
    idx : unused; the row label as yielded by ``DataFrame.iterrows``.
    row : pandas.Series from ``camels_graph``; ``row['huc_02']`` and
        ``row.name`` (the gauge id) select the catchment folder.
    var_name : str, used as the CSV file name.

    The table has one row per entry of ``dates`` and one column per node listed
    in the catchment's ``nodes_coords.csv``; no values are filled in here.
    """
    huc, gauge_id = row['huc_02'], row.name
    nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
    out_dir = os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'ERA5')
    # makedirs also creates the intermediate 'dynamic' folder.
    os.makedirs(out_dir, exist_ok = True)
    placeholder = pd.DataFrame(index = dates, columns = nodes_coords.index)
    placeholder.to_csv(os.path.join(out_dir, f"{var_name}.csv"))

for var_name in var_names:
    print(var_name)
    with Parallel(n_jobs = 8, verbose = 0) as parallel:
        catchment_rows = tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph))
        _ = parallel(delayed(process)(idx, row, var_name) for idx, row in catchment_rows)

Number of dates: 14965
2m_temperature


100%|██████████| 10/10 [00:00<00:00, 44.73it/s]


evaporation


100%|██████████| 10/10 [00:00<00:00, 2891.43it/s]


snowfall


100%|██████████| 10/10 [00:00<00:00, 3690.54it/s]

surface_net_solar_radiation



100%|██████████| 10/10 [00:00<00:00, 3182.57it/s]


surface_net_thermal_radiation


100%|██████████| 10/10 [00:00<00:00, 3087.68it/s]


surface_pressure


100%|██████████| 10/10 [00:00<00:00, 3884.33it/s]


total_precipitation


100%|██████████| 10/10 [00:00<00:00, 3803.67it/s]


2m_dewpoint_temperature


100%|██████████| 10/10 [00:00<00:00, 3809.19it/s]


10m_u_component_of_wind


100%|██████████| 10/10 [00:00<00:00, 3597.48it/s]


10m_v_component_of_wind


100%|██████████| 10/10 [00:00<00:00, 3750.61it/s]


forecast_albedo


100%|██████████| 10/10 [00:00<00:00, 2941.10it/s]


potential_evaporation


100%|██████████| 10/10 [00:00<00:00, 3428.68it/s]


runoff


100%|██████████| 10/10 [00:00<00:00, 3456.08it/s]


snow_albedo


100%|██████████| 10/10 [00:00<00:00, 3613.29it/s]


snow_depth


100%|██████████| 10/10 [00:00<00:00, 3604.90it/s]


snowmelt


100%|██████████| 10/10 [00:00<00:00, 3191.53it/s]


sub_surface_runoff


100%|██████████| 10/10 [00:00<00:00, 3303.12it/s]


surface_runoff


100%|██████████| 10/10 [00:00<00:00, 2703.91it/s]


total_column_water


100%|██████████| 10/10 [00:00<00:00, 3526.11it/s]


volumetric_soil_water_layer_1


100%|██████████| 10/10 [00:00<00:00, 3310.94it/s]


volumetric_soil_water_layer_2


100%|██████████| 10/10 [00:00<00:00, 3808.85it/s]


volumetric_soil_water_layer_3


100%|██████████| 10/10 [00:00<00:00, 3122.16it/s]


volumetric_soil_water_layer_4


100%|██████████| 10/10 [00:00<00:00, 3766.10it/s]


In [12]:
# One cached weights file is used for every ERA5 variable; this assumes all
# ERA5 variables share the same source grid.
era5_weights_path = os.path.join(PATHS['Assets'], 'regridder_era5_to_cwatm_30min.nc')

# islice(var_names, 0, None, 1) currently iterates over every variable.
for var_name in itertools.islice(var_names, 0, None, 1):
    print(var_name)
    ds = xr.open_mfdataset(os.path.join(PATHS['RawData'], 'ERA5', var_name, "*.nc"), combine='by_coords')
    ds_var_name = list(ds.data_vars)[0]
    ds = ds[ds_var_name]
    ds = ds.rename({'longitude': 'lon', 'latitude': 'lat'})
    # Drop 29 February so the time axis matches the leap-day-free CSV index.
    ds = ds.sel(time=~((ds['time.month'] == 2) & (ds['time.day'] == 29)))
    # Move longitudes to the -180..180 convention and restore ascending order.
    ds['lon'] = [lon_360_180(lon) for lon in ds['lon'].values]
    ds = ds.sortby('lon')
    ds = ds.sel(
        lat = slice(region_bounds['maxy'], region_bounds['miny']),
        lon = slice(region_bounds['minx'], region_bounds['maxx'])
    )
    # Keep only the first occurrence of every timestamp.
    _, index = np.unique(ds['time'], return_index = True)
    ds = ds.isel(time = index)

    # Reuse the cached weights when present; otherwise compute and cache them.
    if os.path.exists(era5_weights_path):
        regridder = xe.Regridder(
            ds,
            ds_grid,
            'bilinear',
            reuse_weights=True,
            filename = era5_weights_path
        )
    else:
        regridder = xe.Regridder(
            ds,
            ds_grid,
            'bilinear',
            reuse_weights=False
        )
        regridder.to_netcdf(era5_weights_path)

    ds_regrided = regridder(ds)
    start_time = time.time()
    ds_regrided.load()
    end_time = time.time()
    print(f'Time: {((end_time - start_time) / 60):.4f} mins')
    # The source files are only needed until the regridded array is in memory,
    # so close them after the load (and only once).
    ds.close()
    del ds

    def process(idx, row):
        """Fill one catchment's CSV for the current `var_name` from `ds_regrided`.

        For every node in the catchment's nodes_coords.csv, the nearest grid
        cell's full time series is written into the column named after the
        node. Values are assigned by position, so the regridded time axis must
        have the same length and order as the CSV's date index.
        """
        huc, gauge_id = row['huc_02'], row.name
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        csv_path = os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'ERA5', f"{var_name}.csv")
        data = pd.read_csv(csv_path, index_col = 0, parse_dates = True)
        for node_idx, node_row in nodes_coords.iterrows():
            lat, lon = node_row['lat'], node_row['lon']
            ds_window_loc = ds_regrided.sel(lat = lat, lon = lon, method = 'nearest')
            # Column labels read back from the CSV are strings.
            data.loc[:, str(node_idx)] = ds_window_loc.values
        data.to_csv(csv_path)

    with Parallel(n_jobs = 8, verbose = 0) as parallel:
        _ = parallel(delayed(process)(idx, row) for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)))

    # Release the loaded array before the next variable is read; it is the
    # large object here, not the lazy source handle.
    del ds_regrided
    gc.collect()

2m_temperature
Time: 5.8555 mins


100%|██████████| 10/10 [00:00<00:00, 21.31it/s]


evaporation
Time: 7.4110 mins


100%|██████████| 10/10 [00:02<00:00,  4.75it/s]


snowfall
Time: 6.6407 mins


100%|██████████| 10/10 [00:01<00:00,  8.62it/s]


surface_net_solar_radiation
Time: 6.5198 mins


100%|██████████| 10/10 [00:01<00:00,  8.68it/s]


surface_net_thermal_radiation
Time: 5.5614 mins


100%|██████████| 10/10 [00:00<00:00, 11.76it/s]


surface_pressure
Time: 6.6378 mins


100%|██████████| 10/10 [00:00<00:00, 12.37it/s]


total_precipitation
Time: 5.9092 mins


100%|██████████| 10/10 [00:00<00:00, 10.96it/s]


2m_dewpoint_temperature
Time: 4.6285 mins


100%|██████████| 10/10 [00:00<00:00, 8168.07it/s]


10m_u_component_of_wind
Time: 4.6888 mins


100%|██████████| 10/10 [00:00<00:00, 9547.70it/s]


10m_v_component_of_wind
Time: 4.6944 mins


100%|██████████| 10/10 [00:00<00:00, 8442.64it/s]


forecast_albedo
Time: 4.5303 mins


100%|██████████| 10/10 [00:00<00:00, 9400.05it/s]


potential_evaporation
Time: 4.4328 mins


100%|██████████| 10/10 [00:00<00:00, 8758.20it/s]


runoff
Time: 4.4816 mins


100%|██████████| 10/10 [00:00<00:00, 9619.96it/s]


snow_albedo
Time: 4.5534 mins


100%|██████████| 10/10 [00:00<00:00, 8360.18it/s]


snow_depth
Time: 4.4701 mins


100%|██████████| 10/10 [00:00<00:00, 10207.60it/s]


snowmelt
Time: 4.5027 mins


100%|██████████| 10/10 [00:00<00:00, 8859.96it/s]


sub_surface_runoff
Time: 4.4199 mins


100%|██████████| 10/10 [00:00<00:00, 7767.23it/s]


surface_runoff
Time: 4.4533 mins


100%|██████████| 10/10 [00:00<00:00, 6375.29it/s]


total_column_water
Time: 4.5690 mins


100%|██████████| 10/10 [00:00<00:00, 8623.16it/s]


volumetric_soil_water_layer_1
Time: 4.4949 mins


100%|██████████| 10/10 [00:00<00:00, 6144.60it/s]


volumetric_soil_water_layer_2
Time: 4.4934 mins


100%|██████████| 10/10 [00:00<00:00, 8024.30it/s]


volumetric_soil_water_layer_3
Time: 4.4980 mins


100%|██████████| 10/10 [00:00<00:00, 7835.43it/s]


volumetric_soil_water_layer_4
Time: 4.5548 mins


100%|██████████| 10/10 [00:00<00:00, 11140.25it/s]


### Static

In [13]:
var_names = [
    'static_soil_type',
    'static_high_vegetation_cover',
    'static_low_vegetation_cover',
    'static_type_of_high_vegetation',
    'static_type_of_low_vegetation'
]
ds_filenames = [
    'soil_type_static.nc',
    'high_vegetation_cover_static.nc',
    'low_vegetation_cover_static.nc',
    'type_of_high_vegetation_static.nc',
    'type_of_low_vegetation_static.nc'
]
# Class-code variables are stored as integers. The two vegetation-cover
# variables are documented by ERA5 as fractions (0-1); casting them to int
# would truncate almost every value to 0, so they are stored as floats.
categorical_var_names = {
    'static_soil_type',
    'static_type_of_high_vegetation',
    'static_type_of_low_vegetation',
}

for var_name, ds_filename in zip(var_names, ds_filenames):
    print(var_name)
    ds = xr.open_dataset(os.path.join(PATHS['RawData'], 'ERA5', ds_filename))
    ds_var_name = list(ds.data_vars)[0]
    ds = ds[ds_var_name]
    # Static field: keep the first time step and drop the leftover scalar coordinate.
    ds = ds.isel(time = 0)
    ds = ds.drop_vars('time')
    ds = ds.rename({'longitude': 'lon', 'latitude': 'lat'})
    # Move longitudes to the -180..180 convention and restore ascending order.
    ds['lon'] = [lon_360_180(lon) for lon in ds['lon'].values]
    ds = ds.sortby('lon')
    ds = ds.sel(
        lat = slice(region_bounds['maxy'], region_bounds['miny']),
        lon = slice(region_bounds['minx'], region_bounds['maxx'])
    )
    is_categorical = var_name in categorical_var_names

    def process(idx, row):
        """Write a one-row CSV with the nearest-cell value of `ds` for every node.

        `row['huc_02']` and `row.name` (gauge id) select the catchment; `idx`
        is unused. Class codes are written as int, cover fractions as float.
        """
        huc, gauge_id = row['huc_02'], row.name
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        data = pd.DataFrame(columns = nodes_coords.index, index = [0])
        for node_idx, node_row in nodes_coords.iterrows():
            lat, lon = node_row['lat'], node_row['lon']
            cell_value = ds.sel(lat = lat, lon = lon, method = 'nearest').values
            data.loc[0, node_idx] = int(cell_value) if is_categorical else float(cell_value)
        out_dir = os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'static', 'ERA5')
        # makedirs also creates the intermediate 'static' folder.
        os.makedirs(out_dir, exist_ok = True)
        data.to_csv(os.path.join(out_dir, f"{var_name}.csv"))

    for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)):
        process(idx, row)

    # Release the file handle before opening the next static field.
    ds.close()

static_soil_type


100%|██████████| 10/10 [00:00<00:00, 25.45it/s]


static_high_vegetation_cover


100%|██████████| 10/10 [00:00<00:00, 76.12it/s]


static_low_vegetation_cover


100%|██████████| 10/10 [00:00<00:00, 121.93it/s]


static_type_of_high_vegetation


100%|██████████| 10/10 [00:00<00:00, 123.23it/s]


static_type_of_low_vegetation


100%|██████████| 10/10 [00:00<00:00, 171.07it/s]


## HWSD

In [14]:
# HWSD variables to extract (one .nc4 file each).
var_names = ['S_CLAY', 'S_GRAVEL', 'S_SAND', 'S_SILT', 'T_CLAY', 'T_GRAVEL', 'T_SAND', 'T_SILT']

for var_name in var_names:
    print(var_name)
    ds = xr.open_dataset(os.path.join(PATHS['HWSD'], f'{var_name}.nc4'))
    ds_var_name = list(ds.data_vars)[0]
    ds = ds[ds_var_name]
    # Latitude is sliced miny -> maxy here (the opposite order to the other
    # datasets in this notebook).
    ds = ds.sel(
        lat = slice(region_bounds['miny'], region_bounds['maxy']),
        lon = slice(region_bounds['minx'], region_bounds['maxx'])
    )
    # NOTE(review): presumably converts percentages to fractions -- confirm units.
    ds = ds / 100
    ds.load()

    def process(idx, row):
        """Write a one-row CSV holding, per node, the mean of `ds` over the node's grid cell.

        The cell is the square of side `resolution` centred on the node's
        (lat, lon). `row['huc_02']` and `row.name` (gauge id) select the
        catchment; `idx` is unused.
        """
        huc, gauge_id = row['huc_02'], row.name
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        data = pd.DataFrame(columns = nodes_coords.index, index = [0])
        for node_idx, node_row in nodes_coords.iterrows():
            lat, lon = node_row['lat'], node_row['lon']
            cell = ds.sel(
                lat = slice(lat-resolution/2, lat+resolution/2),
                lon = slice(lon-resolution/2, lon+resolution/2)
            )
            # Plain mean: a NaN anywhere in the window makes the result NaN.
            data.loc[0, node_idx] = cell.values.mean()
        out_dir = os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'static', 'HWSD')
        # makedirs also creates the intermediate 'static' folder.
        os.makedirs(out_dir, exist_ok = True)
        data.to_csv(os.path.join(out_dir, f"{var_name}.csv"))

    for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)):
        process(idx, row)

    ds.close()
    del ds
    gc.collect()

S_CLAY


100%|██████████| 10/10 [00:00<00:00, 282.40it/s]


S_GRAVEL


100%|██████████| 10/10 [00:00<00:00, 281.14it/s]


S_SAND


100%|██████████| 10/10 [00:00<00:00, 342.88it/s]


S_SILT


100%|██████████| 10/10 [00:00<00:00, 342.92it/s]


T_CLAY


100%|██████████| 10/10 [00:00<00:00, 314.72it/s]


T_GRAVEL


100%|██████████| 10/10 [00:00<00:00, 358.66it/s]


T_SAND


100%|██████████| 10/10 [00:00<00:00, 354.51it/s]


T_SILT


100%|██████████| 10/10 [00:00<00:00, 358.98it/s]


## GLEAM

In [15]:
# GLEAM variables for which a per-catchment time-series table is prepared.
var_names = ['Ep', 'SMroot', 'SMsurf']

# Daily calendar for 1980-2020 with every 29 February removed.
dates = pd.date_range('1980-01-01', '2020-12-31', freq='D')
is_leap_day = (dates.month == 2) & (dates.day == 29)
dates = dates[~is_leap_day]
print(f"Number of dates: {len(dates)}")

def process(idx, row, var_name):
    """Write an empty dates x nodes table for one catchment and one GLEAM variable.

    `row['huc_02']` and `row.name` (gauge id) select the catchment; `idx` is
    unused. Columns are the node ids from the catchment's nodes_coords.csv,
    rows are `dates`; no values are filled in here.
    """
    huc, gauge_id = row['huc_02'], row.name
    nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
    out_dir = os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM')
    # makedirs also creates the intermediate 'dynamic' folder.
    os.makedirs(out_dir, exist_ok = True)
    placeholder = pd.DataFrame(index = dates, columns = nodes_coords.index)
    placeholder.to_csv(os.path.join(out_dir, f"{var_name}.csv"))

for var_name in var_names:
    print(var_name)
    with Parallel(n_jobs = 8, verbose = 0) as parallel:
        catchment_rows = tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph))
        _ = parallel(delayed(process)(idx, row, var_name) for idx, row in catchment_rows)

Number of dates: 14965
Ep


100%|██████████| 10/10 [00:00<00:00, 13617.87it/s]


SMroot


100%|██████████| 10/10 [00:00<00:00, 1390.36it/s]


SMsurf


100%|██████████| 10/10 [00:00<00:00, 2895.22it/s]


In [16]:
# One cached weights file is used for every GLEAM variable; this assumes all
# GLEAM variables share the same source grid.
gleam_weights_path = os.path.join(PATHS['Assets'], 'regridder_gleam_to_cwatm_30min.nc')

# islice(var_names, 0, None, 1) currently iterates over every variable.
for var_name in itertools.islice(var_names, 0, None, 1):
    print(var_name)
    ds = xr.open_mfdataset(os.path.join(PATHS['GLEAM'], var_name, "*.nc"), combine='by_coords')
    ds_var_name = list(ds.data_vars)[0]
    ds = ds[ds_var_name]
    # Drop 29 February so the time axis matches the leap-day-free CSV index.
    ds = ds.sel(time=~((ds['time.month'] == 2) & (ds['time.day'] == 29)))
    ds = ds.sel(
        lat = slice(region_bounds['maxy'], region_bounds['miny']),
        lon = slice(region_bounds['minx'], region_bounds['maxx'])
    )

    # Reuse the cached weights when present; otherwise compute and cache them.
    if os.path.exists(gleam_weights_path):
        regridder = xe.Regridder(
            ds,
            ds_grid,
            'bilinear',
            reuse_weights=True,
            filename = gleam_weights_path
        )
    else:
        regridder = xe.Regridder(
            ds,
            ds_grid,
            'bilinear',
            reuse_weights=False
        )
        regridder.to_netcdf(gleam_weights_path)

    ds_regrided = regridder(ds)
    start_time = time.time()
    ds_regrided.load()
    end_time = time.time()
    print(f'Time: {((end_time - start_time) / 60):.4f} mins')
    # The source files are only needed until the regridded array is in memory,
    # so close them after the load (and only once).
    ds.close()
    del ds

    def process(idx, row):
        """Fill one catchment's CSV for the current `var_name` from `ds_regrided`.

        For every node in the catchment's nodes_coords.csv, the nearest grid
        cell's full time series is written into the column named after the
        node. Values are assigned by position, so the regridded time axis must
        have the same length and order as the CSV's date index.
        """
        huc, gauge_id = row['huc_02'], row.name
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        csv_path = os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM', f"{var_name}.csv")
        data = pd.read_csv(csv_path, index_col = 0, parse_dates = True)
        for node_idx, node_row in nodes_coords.iterrows():
            lat, lon = node_row['lat'], node_row['lon']
            ds_window_loc = ds_regrided.sel(lat = lat, lon = lon, method = 'nearest')
            # Column labels read back from the CSV are strings.
            data.loc[:, str(node_idx)] = ds_window_loc.values
        data.to_csv(csv_path)

    with Parallel(n_jobs = 8, verbose = 0) as parallel:
        _ = parallel(delayed(process)(idx, row) for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)))

    # Release the loaded array before the next variable is read; it is the
    # large object here, not the lazy source handle.
    del ds_regrided
    gc.collect()

Ep
Time: 0.8309 mins


100%|██████████| 10/10 [00:00<00:00, 7249.06it/s]


SMroot
Time: 0.8188 mins


100%|██████████| 10/10 [00:00<00:00, 9393.74it/s]


SMsurf
Time: 0.8206 mins


100%|██████████| 10/10 [00:00<00:00, 6557.70it/s]


### Fix NaNs

In [17]:
# Repair NaNs in the saved GLEAM CSVs.
# For each variable: rebuild the regridded field in memory, find the catchments
# whose saved CSV contains any NaN, and refill the affected node columns from
# the regridded field in a window of +/- 1.5 * resolution around the node.
var_names = ['Ep', 'SMroot', 'SMsurf']
for var_name in var_names:
    # Same loading / cropping / regridding steps as the GLEAM extraction cell.
    ds = xr.open_mfdataset(os.path.join(PATHS['GLEAM'], var_name, f"*.nc"), combine='by_coords')
    ds_var_name = list(ds.data_vars)[0]
    ds = ds[ds_var_name]
    ds = ds.sel(time=~((ds['time.month'] == 2) & (ds['time.day'] == 29)))
    ds = ds.sel(
        lat = slice(region_bounds['maxy'], region_bounds['miny']), 
        lon = slice(region_bounds['minx'], region_bounds['maxx'])
    )
    if os.path.exists(os.path.join(PATHS['Assets'], 'regridder_gleam_to_cwatm_30min.nc')):
        regridder = xe.Regridder(
            ds, 
            ds_grid, 
            'bilinear', 
            reuse_weights=True, 
            filename = os.path.join(PATHS['Assets'], 'regridder_gleam_to_cwatm_30min.nc')
        )
    else:
        regridder = xe.Regridder(
            ds, 
            ds_grid, 
            'bilinear', 
            reuse_weights=False
        )
        regridder.to_netcdf(os.path.join(PATHS['Assets'], 'regridder_gleam_to_cwatm_30min.nc'))
    ds_regrided = regridder(ds)
    ds.close()
    start_time = time.time()
    ds_regrided.load()
    end_time = time.time()
    print(f'{var_name} (Time: {((end_time - start_time) / 60):.4f} mins)')

    # Loop over catchments and find ones with issues
    # (a catchment has an "issue" when its saved CSV contains at least one NaN).
    issues = []
    for idx, row in tqdm.tqdm(camels_graph.iterrows()):
        huc, gauge_id = row['huc_02'], row.name
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        data = pd.read_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM', f"{var_name}.csv"), index_col = 0, parse_dates = True)
        if data.isnull().values.any():
            issues.append([huc, gauge_id])
    issues = pd.DataFrame(issues, columns = ['huc_02', 'gauge_id'])
    print(f"Number of catchments with issues: {issues.shape[0]}")
    print("------")

    # Fix the catchments with issues
    for issue_idx, (huc, gauge_id) in enumerate(issues.values):
        print(issue_idx, huc, gauge_id)
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        data = pd.read_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM', f"{var_name}.csv"), index_col = 0, parse_dates = True)
        # Bookkeeping columns: whether the node's column has NaNs, and how many
        # cells in its window have a complete (NaN-free) series.
        nodes_coords['isNaN'] = False
        nodes_coords['nonNaNneighbours'] = 0
        # Loop over nodes and find the nodes with issues
        for node_idx in nodes_coords.index:
            if data[str(node_idx)].isnull().values.any():
                nodes_coords.loc[node_idx, 'isNaN'] = True
                # Rounded to 3 decimals to match the rounded ds_grid coordinates.
                node_lat = float(round(nodes_coords.loc[node_idx, 'lat'], 3))
                node_lon = float(round(nodes_coords.loc[node_idx, 'lon'], 3))
                # Window half-width is multiplier * resolution on each side of the node.
                multiplier = 1.5
                ds_slice = ds_regrided.sel(
                    lat = slice(node_lat+multiplier*resolution, node_lat-multiplier*resolution), 
                    lon = slice(node_lon-multiplier*resolution, node_lon+multiplier*resolution)
                    )
                # Reshape the window to a time x location table, one column per (lat, lon) cell.
                slice_df = ds_slice.to_dataframe(name = var_name).reset_index()
                slice_df['lat'] = slice_df['lat'].round(3)
                slice_df['lon'] = slice_df['lon'].round(3)
                slice_df['location'] = list(zip(slice_df['lat'], slice_df['lon']))
                slice_df = slice_df.pivot(index='time', columns='location', values=var_name)
                # A cell counts as NaN if any time step in its series is NaN.
                num_nan_nodes = slice_df.isnull().any(axis=0).sum()
                num_nonnan_nodes = len(slice_df.columns) - num_nan_nodes
                nodes_coords.loc[node_idx, 'nonNaNneighbours'] = num_nonnan_nodes
        # Process the NaN nodes with the most complete neighbourhoods first; values
        # written back into ds_regrided below can then serve later nodes.
        nodes_coords_sorted = nodes_coords.sort_values(by = 'nonNaNneighbours', ascending = False)
        nodes_coords_sorted = nodes_coords_sorted[nodes_coords_sorted['isNaN']]
        print(f"Number of nodes with NaN values: {nodes_coords_sorted.shape[0]}")
        
        for node_idx in tqdm.tqdm(nodes_coords_sorted.index):
            node_lat, node_lon = float(round(nodes_coords.loc[node_idx, 'lat'], 3)), float(round(nodes_coords.loc[node_idx, 'lon'], 3))
            multiplier = 1.5
            # The window is rebuilt here (not reused from the scan above) so it
            # reflects any values already written back into ds_regrided.
            ds_slice = ds_regrided.sel(
                lat = slice(node_lat+multiplier*resolution, node_lat-multiplier*resolution), 
                lon = slice(node_lon-multiplier*resolution, node_lon+multiplier*resolution)
                )
            slice_df = ds_slice.to_dataframe(name = var_name).reset_index()
            slice_df['lat'] = slice_df['lat'].round(3)
            slice_df['lon'] = slice_df['lon'].round(3)
            slice_df['location'] = list(zip(slice_df['lat'], slice_df['lon']))
            slice_df = slice_df.pivot(index='time', columns='location', values=var_name)
            # Column labels become the string form of each (lat, lon) tuple.
            # NOTE(review): the lookup below relies on that string being exactly
            # "(lat, lon)" with plain float formatting -- confirm for the
            # installed numpy/pandas versions.
            slice_df.columns = list(map(str, slice_df.columns))
            num_nonnan_nodes = len(slice_df.columns) - slice_df.isnull().any(axis=0).sum()
            # print(node_idx, (node_lat, node_lon), num_nonnan_nodes)
            # 9 presupposes a full 3 x 3 window; presumably smaller windows at the
            # domain edge always take the averaging branch -- verify.
            if num_nonnan_nodes == 9:
                # Every cell (including the node's own) is complete in the regridded
                # field: copy the node's own regridded series.
                replacement_values = slice_df.loc[:, f"({node_lat}, {node_lon})"]
                data.loc[:, str(node_idx)] = replacement_values
                nodes_coords_sorted.loc[node_idx, 'isNaN'] = False
            elif num_nonnan_nodes > 0:
                # Otherwise use the per-time-step mean over the window, ignoring NaNs,
                # and write it back into ds_regrided as well.
                replacement_values = np.nanmean(slice_df, axis = 1)
                data.loc[:, str(node_idx)] = replacement_values
                ds_regrided.loc[dict(lat = node_lat, lon = node_lon)] = replacement_values
                nodes_coords_sorted.loc[node_idx, 'isNaN'] = False
            # If no cell in the window is complete, the node is left unchanged.
        print(f"Number of nodes with NaN values: {nodes_coords_sorted['isNaN'].sum()}")
        print(issue_idx, huc, gauge_id, data.isnull().values.any())
        data.to_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'dynamic', 'GLEAM', f"{var_name}.csv"))
        print("------")

Ep (Time: 0.4826 mins)


10it [00:00, 52.39it/s]

Number of catchments with issues: 1
------
0 10 06191500





Number of nodes with NaN values: 1


100%|██████████| 1/1 [00:00<00:00,  9.85it/s]

Number of nodes with NaN values: 0
0 10 06191500 False





------
SMroot (Time: 0.4898 mins)


10it [00:00, 54.05it/s]

Number of catchments with issues: 1
------
0 10 06191500





Number of nodes with NaN values: 1


100%|██████████| 1/1 [00:00<00:00, 10.22it/s]

Number of nodes with NaN values: 0
0 10 06191500 False





------
SMsurf (Time: 0.4872 mins)


10it [00:00, 51.78it/s]

Number of catchments with issues: 1
------
0 10 06191500





Number of nodes with NaN values: 1


100%|██████████| 1/1 [00:00<00:00, 10.04it/s]

Number of nodes with NaN values: 0
0 10 06191500 False





------


## Solar Insolation

In [18]:
def solar_insolation(lat, lon, start_date, end_date):
    """Daily solar-noon, top-of-atmosphere-style insolation for one location.

    Parameters
    ----------
    lat : latitude in degrees.
    lon : longitude in degrees; accepted for interface symmetry but not used,
        because the hour angle is fixed at solar noon (0 degrees).
    start_date, end_date : anything ``pandas.to_datetime`` accepts; both ends
        are inclusive.

    Returns
    -------
    pandas.Series named 'Solar Insolation (kW/m²)', indexed by day with every
    29 February removed. Values are clipped at zero and expressed in kW/m².
    """
    solar_constant = 1361  # W/m^2

    days = pd.date_range(start=pd.to_datetime(start_date), end=pd.to_datetime(end_date), freq='D')
    leap_day_mask = (days.month == 2) & (days.day == 29)
    days = days[~leap_day_mask]

    # Quantities that do not change from day to day.
    lat_rad = np.radians(lat)
    hour_angle_deg = 0  # solar noon
    cos_hour_angle = np.cos(np.radians(hour_angle_deg))

    def _noon_flux(day_of_year):
        """Insolation in W/m^2 for one day of the year, never negative."""
        declination_deg = 23.45 * np.sin(np.radians((360 / 365) * (day_of_year - 81)))
        decl_rad = np.radians(declination_deg)
        cos_zenith = (np.sin(lat_rad) * np.sin(decl_rad) +
                      np.cos(lat_rad) * np.cos(decl_rad) * cos_hour_angle)
        # Earth-Sun distance correction times the cosine of the zenith angle.
        flux = solar_constant * (1 + 0.033 * np.cos(np.radians(360 * day_of_year / 365))) * cos_zenith
        # Sun below the horizon at noon -> no insolation.
        return 0 if flux < 0 else flux

    flux_per_day = [_noon_flux(day.day_of_year) for day in days]

    insolation_w = pd.Series(flux_per_day, index=days, name='Solar Insolation (kW/m²)')
    # W/m^2 -> kW/m^2
    return insolation_w / 1000

In [19]:
# Daily calendar for 1980-2020 with every 29 February removed.
dates = pd.date_range('1980-01-01', '2020-12-31', freq='D')
dates = dates[~((dates.month == 2) & (dates.day == 29))]

def process(idx, row):
    """Compute and save the solar-insolation table (dates x nodes) for one catchment.

    `row['huc_02']` and `row.name` (gauge id) select the catchment; `idx` is
    unused. The CSV is written directly into the catchment's graph_features
    folder as 'solar_insolation.csv'.
    """
    huc, gauge_id = row['huc_02'], row.name
    dataset_root = os.path.join(PATHS['devp_datasets'], DIRNAME)
    nodes_coords = pd.read_csv(os.path.join(dataset_root, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)

    data = pd.DataFrame(columns = nodes_coords.index, index = dates)
    for node_idx, lat, lon in nodes_coords[['lat', 'lon']].itertuples(name = None):
        node_series = solar_insolation(lat, lon, '1980-01-01', '2020-12-31')
        data.loc[:, node_idx] = node_series.values

    catchment_dir = os.path.join(dataset_root, 'graph_features', huc, gauge_id)
    os.makedirs(catchment_dir, exist_ok = True)
    data.to_csv(os.path.join(catchment_dir, 'solar_insolation.csv'))

with Parallel(n_jobs = 8, verbose = 0) as parallel:
    catchment_rows = tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph))
    _ = parallel(delayed(process)(idx, row) for idx, row in catchment_rows)

100%|██████████| 10/10 [00:00<00:00, 6254.55it/s]


## Time Encodings

In [20]:
def sine_time_encoding(start_date, end_date):
    """Build calendar features and their sine encodings for a daily date range.

    Parameters
    ----------
    start_date, end_date : bounds passed to ``pandas.date_range`` (inclusive).

    Returns
    -------
    pandas.DataFrame indexed by day (29 February removed) with the columns
    'month', 'weekofyear', 'dayofyear' and their sine transforms
    'sine_month', 'sine_weekofyear', 'sine_dayofyear', computed as
    sin(2*pi*value/period) with periods 12, 52 and 365 respectively.
    """
    calendar = pd.date_range(start=start_date, end=end_date, freq='D')
    leap_day_mask = (calendar.month == 2) & (calendar.day == 29)
    calendar = calendar[~leap_day_mask]

    # Raw calendar components.
    df = pd.DataFrame(index=calendar)
    df['month'] = df.index.month
    df['weekofyear'] = df.index.isocalendar().week
    df['dayofyear'] = df.index.dayofyear

    def _sine(value, max_val):
        """Map a cyclic value onto a sine wave with period `max_val`."""
        return np.sin(2 * np.pi * value / max_val)

    # Append one sine-encoded column per calendar component, in the same order.
    for column, period in (('month', 12), ('weekofyear', 52), ('dayofyear', 365)):
        df[f'sine_{column}'] = df[column].apply(_sine, max_val=period)

    return df

In [21]:
# The encoding table depends only on the calendar, so it is built once and
# the same table is saved for every catchment.
df_encoded = sine_time_encoding('1980-01-01', '2020-12-31')

def process(idx, row):
    """Save the shared time-encoding table into one catchment's feature folder.

    `row['huc_02']` and `row.name` (gauge id) select the folder; `idx` is unused.
    """
    huc, gauge_id = row['huc_02'], row.name
    catchment_dir = os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_features', huc, gauge_id)
    os.makedirs(catchment_dir, exist_ok = True)
    df_encoded.to_csv(os.path.join(catchment_dir, 'time_encodings.csv'))

# Runs serially; the joblib Parallel variant of this loop is disabled.
for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)):
    process(idx, row)

100%|██████████| 10/10 [00:01<00:00,  8.25it/s]


## Daymet

In [11]:
var_names = ['prcp', 'srad', 'swe', 'tmax', 'tmin', 'vp', 'dayl']

# Daily calendar for 1980-2020 with every 29 February removed.
dates = pd.date_range('1980-01-01', '2020-12-31', freq='D')
dates = dates[(dates.month != 2) | (dates.day != 29)]
print(f"Number of dates: {len(dates)}")

def process(idx, row, var_name):
    """Create an empty (dates x nodes) CSV for one gauge and one Daymet variable.

    The table has one row per entry of the module-level `dates` and one column
    per index value of the gauge's `nodes_coords.csv`; all cells are empty.
    `idx` is not used.
    """
    huc, gauge_id = row['huc_02'], row.name
    coords_file = os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_files', huc, gauge_id, 'nodes_coords.csv')
    nodes_coords = pd.read_csv(coords_file, index_col = 0)
    empty_table = pd.DataFrame(index = dates, columns = nodes_coords.index)

    # makedirs creates the intermediate 'dynamic' folder as well.
    daymet_dir = os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_features', huc, gauge_id, 'dynamic', 'Daymet')
    os.makedirs(daymet_dir, exist_ok = True)
    empty_table.to_csv(os.path.join(daymet_dir, f"{var_name}.csv"))

# One pass over all gauges per variable.
for var_name in var_names:
    print(var_name)
    with Parallel(n_jobs = 8, verbose = 0) as parallel:
        _ = parallel(
            delayed(process)(idx, row, var_name)
            for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph))
        )

Number of dates: 14965
prcp


100%|██████████| 10/10 [00:00<00:00, 44.63it/s]


srad


100%|██████████| 10/10 [00:00<00:00, 2653.45it/s]


swe


100%|██████████| 10/10 [00:00<00:00, 2813.46it/s]


tmax


100%|██████████| 10/10 [00:00<00:00, 3394.00it/s]


tmin


100%|██████████| 10/10 [00:00<00:00, 3508.41it/s]


vp


100%|██████████| 10/10 [00:00<00:00, 3340.74it/s]


dayl


100%|██████████| 10/10 [00:00<00:00, 3459.51it/s]


In [12]:
# 31 December of each leap year in 1980-2020, stamped at 12:00. These dates get
# an all-NaN field — presumably because the Daymet files omit them; TODO confirm.
leap_year_ends = (
    '1980-12-31', '1984-12-31', '1988-12-31', '1992-12-31',
    '1996-12-31', '2000-12-31', '2004-12-31', '2008-12-31',
    '2012-12-31', '2016-12-31', '2020-12-31',
)
missing_dates = np.array([day + 'T12:00:00' for day in leap_year_ends], dtype = 'datetime64')

# NaN-filled (time, lat, lon) array on the `lats` x `lons` grid.
ds_missing_dates = xr.DataArray(
    np.full((len(missing_dates), len(lats), len(lons)), np.nan),
    coords = [missing_dates, lats, lons],
    dims = ['time', 'lat', 'lon']
)

In [13]:
var_names = ['prcp', 'srad', 'swe', 'tmax', 'tmin', 'vp', 'dayl']

# Regridding weights are cached on disk and reused for every variable.
weights_path = os.path.join(PATHS['Assets'], 'regridder_daymet_to_cwatm_30min.nc')

# islice(..., 0, None, 1) iterates over every variable; presumably the start
# index is edited by hand to resume a partially completed run.
for var_name in itertools.islice(var_names, 0, None, 1):
    print(var_name)

    # Keep the Dataset handle: the regridded data is evaluated lazily, so the
    # source files must stay open until the last window has been loaded, and
    # the handle is needed to close them afterwards.
    ds_source = xr.open_mfdataset(os.path.join(PATHS['Daymet'], var_name, "*.nc"), combine='by_coords')
    ds = ds_source[var_name]
    ds = ds.rename({'x': 'lon', 'y': 'lat'})
    # Drop 29 February so every year has the same number of time steps.
    ds = ds.sel(time=~((ds['time.month'] == 2) & (ds['time.day'] == 29)))
    # Latitude is sliced from max to min (assumes a descending lat axis).
    ds = ds.sel(
        lat = slice(region_bounds['maxy'], region_bounds['miny']),
        lon = slice(region_bounds['minx'], region_bounds['maxx'])
    )

    if os.path.exists(weights_path):
        regridder = xe.Regridder(
            ds,
            ds_grid,
            'bilinear',
            reuse_weights=True,
            filename = weights_path
        )
    else:
        regridder = xe.Regridder(
            ds,
            ds_grid,
            'bilinear',
            reuse_weights=False
        )
        regridder.to_netcdf(weights_path)

    ds_regrided = regridder(ds)

    # Append the all-NaN placeholder dates and restore chronological order.
    ds_regrided = xr.concat([ds_regrided, ds_missing_dates], dim = 'time')
    ds_regrided = ds_regrided.sortby('time')
    print(f"timesteps: {len(ds_regrided['time'])}")

    # Work in windows of at most five years so only one window is in memory.
    for start_year in range(1980, 2020+1, 5):
        start_date = f"{start_year}-01-01"
        end_date = f"{min(start_year+4,2020)}-12-31"
        ds_window = ds_regrided.sel(time = slice(start_date, end_date)).copy()
        start_time = time.time()
        ds_window.load()
        end_time = time.time()
        print(start_date, end_date, f"Time: {(end_time - start_time)/60:.2f} mins")

        def process(idx, row):
            """Fill the current window of one gauge's CSV for `var_name`.

            Reads the gauge's existing table, writes the nearest-cell series
            of `ds_window` for every node into rows `start_date:end_date`, and
            saves the table back to the same path. `idx` is not used.
            """
            huc, gauge_id = row['huc_02'], row.name
            nodes_coords = pd.read_csv(os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
            csv_path = os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_features', huc, gauge_id, 'dynamic', 'Daymet', f"{var_name}.csv")
            data = pd.read_csv(csv_path, index_col = 0, parse_dates = True)
            for node_idx, node_row in nodes_coords.iterrows():
                lat, lon = node_row['lat'], node_row['lon']
                ds_window_loc = ds_window.sel(lat = lat, lon = lon, method = 'nearest')
                # Positional assignment: relies on the window holding exactly
                # one value per CSV row of this date range, in time order.
                data.loc[start_date:end_date, str(node_idx)] = ds_window_loc.values
            data.to_csv(csv_path)

        with Parallel(n_jobs = 8, verbose = 0) as parallel:
            _ = parallel(delayed(process)(idx, row) for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)))

        ds_window.close()
        del ds_window
        gc.collect()

    # All windows are loaded and written; the source files can be released now.
    ds_regrided.close()
    ds_source.close()
    del ds_source, ds, ds_regrided
    gc.collect()

prcp
timesteps: 14965
1980-01-01 1984-12-31 Time: 4.04 mins


100%|██████████| 10/10 [00:00<00:00, 5248.13it/s]


1985-01-01 1989-12-31 Time: 3.98 mins


100%|██████████| 10/10 [00:00<00:00, 6091.94it/s]


1990-01-01 1994-12-31 Time: 4.09 mins


100%|██████████| 10/10 [00:00<00:00, 7449.92it/s]


1995-01-01 1999-12-31 Time: 4.00 mins


100%|██████████| 10/10 [00:00<00:00, 6554.62it/s]


2000-01-01 2004-12-31 Time: 3.89 mins


100%|██████████| 10/10 [00:00<00:00, 4898.17it/s]


2005-01-01 2009-12-31 Time: 4.03 mins


100%|██████████| 10/10 [00:00<00:00, 6153.62it/s]


2010-01-01 2014-12-31 Time: 4.00 mins


100%|██████████| 10/10 [00:00<00:00, 4065.82it/s]


2015-01-01 2019-12-31 Time: 4.13 mins


100%|██████████| 10/10 [00:00<00:00, 7201.76it/s]


2020-01-01 2020-12-31 Time: 0.92 mins


100%|██████████| 10/10 [00:00<00:00, 3048.85it/s]


srad
timesteps: 14965
1980-01-01 1984-12-31 Time: 5.90 mins


100%|██████████| 10/10 [00:00<00:00, 10.80it/s]


1985-01-01 1989-12-31 Time: 6.36 mins


100%|██████████| 10/10 [00:00<00:00, 10.71it/s]


1990-01-01 1994-12-31 Time: 6.59 mins


100%|██████████| 10/10 [00:00<00:00, 11.17it/s]


1995-01-01 1999-12-31 Time: 6.44 mins


100%|██████████| 10/10 [00:01<00:00,  7.23it/s]


2000-01-01 2004-12-31 Time: 5.90 mins


100%|██████████| 10/10 [00:01<00:00,  8.51it/s]


2005-01-01 2009-12-31 Time: 6.11 mins


100%|██████████| 10/10 [00:01<00:00,  7.55it/s]


2010-01-01 2014-12-31 Time: 6.13 mins


100%|██████████| 10/10 [00:01<00:00,  6.58it/s]


2015-01-01 2019-12-31 Time: 6.27 mins


100%|██████████| 10/10 [00:01<00:00,  6.84it/s]


2020-01-01 2020-12-31 Time: 1.21 mins


100%|██████████| 10/10 [00:00<00:00, 2943.58it/s]


swe
timesteps: 14965
1980-01-01 1984-12-31 Time: 3.94 mins


100%|██████████| 10/10 [00:00<00:00, 5519.55it/s]


1985-01-01 1989-12-31 Time: 3.96 mins


100%|██████████| 10/10 [00:00<00:00, 5168.58it/s]


1990-01-01 1994-12-31 Time: 4.02 mins


100%|██████████| 10/10 [00:00<00:00, 6540.31it/s]


1995-01-01 1999-12-31 Time: 3.97 mins


100%|██████████| 10/10 [00:00<00:00, 6561.80it/s]


2000-01-01 2004-12-31 Time: 3.82 mins


100%|██████████| 10/10 [00:00<00:00, 7355.85it/s]


2005-01-01 2009-12-31 Time: 3.94 mins


100%|██████████| 10/10 [00:00<00:00, 6770.47it/s]


2010-01-01 2014-12-31 Time: 3.88 mins


100%|██████████| 10/10 [00:00<00:00, 7180.80it/s]


2015-01-01 2019-12-31 Time: 4.02 mins


100%|██████████| 10/10 [00:00<00:00, 7613.55it/s]


2020-01-01 2020-12-31 Time: 0.90 mins


100%|██████████| 10/10 [00:00<00:00, 3291.97it/s]


tmax
timesteps: 14965
1980-01-01 1984-12-31 Time: 4.85 mins


100%|██████████| 10/10 [00:00<00:00, 6604.16it/s]


1985-01-01 1989-12-31 Time: 4.91 mins


100%|██████████| 10/10 [00:00<00:00, 7124.69it/s]


1990-01-01 1994-12-31 Time: 5.02 mins


100%|██████████| 10/10 [00:01<00:00,  6.97it/s]


1995-01-01 1999-12-31 Time: 4.95 mins


100%|██████████| 10/10 [00:00<00:00, 5913.30it/s]


2000-01-01 2004-12-31 Time: 4.86 mins


100%|██████████| 10/10 [00:00<00:00, 7508.60it/s]


2005-01-01 2009-12-31 Time: 4.98 mins


100%|██████████| 10/10 [00:00<00:00, 8366.85it/s]


2010-01-01 2014-12-31 Time: 4.96 mins


100%|██████████| 10/10 [00:00<00:00, 6493.74it/s]


2015-01-01 2019-12-31 Time: 5.09 mins


100%|██████████| 10/10 [00:00<00:00, 10.97it/s]


2020-01-01 2020-12-31 Time: 1.17 mins


100%|██████████| 10/10 [00:00<00:00, 2820.46it/s]


tmin
timesteps: 14965
1980-01-01 1984-12-31 Time: 4.80 mins


100%|██████████| 10/10 [00:00<00:00, 9004.52it/s]


1985-01-01 1989-12-31 Time: 4.85 mins


100%|██████████| 10/10 [00:00<00:00, 7898.88it/s]


1990-01-01 1994-12-31 Time: 4.91 mins


100%|██████████| 10/10 [00:00<00:00, 7605.27it/s]


1995-01-01 1999-12-31 Time: 4.81 mins


100%|██████████| 10/10 [00:00<00:00, 2174.90it/s]


2000-01-01 2004-12-31 Time: 4.75 mins


100%|██████████| 10/10 [00:00<00:00, 5671.04it/s]


2005-01-01 2009-12-31 Time: 4.87 mins


100%|██████████| 10/10 [00:00<00:00, 7503.23it/s]


2010-01-01 2014-12-31 Time: 4.87 mins


100%|██████████| 10/10 [00:00<00:00, 8509.44it/s]


2015-01-01 2019-12-31 Time: 5.03 mins


100%|██████████| 10/10 [00:00<00:00, 10.17it/s]


2020-01-01 2020-12-31 Time: 1.17 mins


100%|██████████| 10/10 [00:00<00:00, 4404.86it/s]


vp
timesteps: 14965
1980-01-01 1984-12-31 Time: 6.45 mins


100%|██████████| 10/10 [00:01<00:00,  7.04it/s]


1985-01-01 1989-12-31 Time: 7.00 mins


100%|██████████| 10/10 [00:00<00:00, 10.19it/s]


1990-01-01 1994-12-31 Time: 7.15 mins


100%|██████████| 10/10 [00:01<00:00,  9.07it/s]


1995-01-01 1999-12-31 Time: 7.02 mins


100%|██████████| 10/10 [00:01<00:00,  6.94it/s]


2000-01-01 2004-12-31 Time: 6.84 mins


100%|██████████| 10/10 [00:01<00:00,  6.91it/s]


2005-01-01 2009-12-31 Time: 7.02 mins


100%|██████████| 10/10 [00:00<00:00, 10.44it/s]


2010-01-01 2014-12-31 Time: 7.15 mins


100%|██████████| 10/10 [00:01<00:00,  6.72it/s]


2015-01-01 2019-12-31 Time: 7.41 mins


100%|██████████| 10/10 [00:01<00:00,  6.78it/s]


2020-01-01 2020-12-31 Time: 1.28 mins


100%|██████████| 10/10 [00:00<00:00, 2708.27it/s]


dayl
timesteps: 14965
1980-01-01 1984-12-31 Time: 2.71 mins


100%|██████████| 10/10 [00:00<00:00, 6098.14it/s]


1985-01-01 1989-12-31 Time: 2.75 mins


100%|██████████| 10/10 [00:00<00:00, 7246.55it/s]


1990-01-01 1994-12-31 Time: 2.84 mins


100%|██████████| 10/10 [00:00<00:00, 6247.10it/s]


1995-01-01 1999-12-31 Time: 2.77 mins


100%|██████████| 10/10 [00:00<00:00, 7401.28it/s]


2000-01-01 2004-12-31 Time: 2.70 mins


100%|██████████| 10/10 [00:00<00:00, 6800.10it/s]


2005-01-01 2009-12-31 Time: 2.78 mins


100%|██████████| 10/10 [00:00<00:00, 6721.64it/s]


2010-01-01 2014-12-31 Time: 2.70 mins


100%|██████████| 10/10 [00:00<00:00, 2155.35it/s]


2015-01-01 2019-12-31 Time: 2.93 mins


100%|██████████| 10/10 [00:00<00:00, 6183.55it/s]


2020-01-01 2020-12-31 Time: 0.85 mins


100%|██████████| 10/10 [00:00<00:00, 3007.75it/s]


In [14]:
var_names = ['prcp', 'srad', 'swe', 'tmax', 'tmin', 'vp', 'dayl']

# Gap-fill every Daymet table in place: each NaN is replaced by the mean of the
# available values in a 15-row window centred on it (min_periods = 1, so a
# single valid neighbour is enough). Non-missing values are left untouched.
for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)):
    huc, gauge_id = row['huc_02'], row.name
    daymet_dir = os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_features', huc, gauge_id, 'dynamic', 'Daymet')
    for var_name in var_names:
        csv_path = os.path.join(daymet_dir, f"{var_name}.csv")
        data = pd.read_csv(csv_path, index_col = 0, parse_dates = True)
        # Rolling mean is computed per column; fillna aligns on index and columns.
        data = data.fillna(data.rolling(15, min_periods = 1, center = True).mean())
        data.to_csv(csv_path)

10it [00:10,  1.10s/it]


## Terrain Attributes

In [12]:
from shapely.geometry import Polygon
import rioxarray

def coords_to_polygon(lon, lat, resolution):
    """Return the square cell of side `resolution` centred on (lon, lat).

    Corner coordinates are rounded to 3 decimals. The ring is listed as
    (min lon, min lat), (min lon, max lat), (max lon, max lat), (max lon, min lat).
    """
    half_res = resolution / 2
    lon_min, lon_max = round(lon - half_res, 3), round(lon + half_res, 3)
    lat_min, lat_max = round(lat - half_res, 3), round(lat + half_res, 3)
    corners = [
        (lon_min, lat_min),
        (lon_min, lat_max),
        (lon_max, lat_max),
        (lon_max, lat_min),
    ]
    return Polygon(corners)
def tile_filename_to_coords(filename):
    """Parse the (lon, lat) pair encoded at the start of a tile file name.

    The expected layout is ``{n|s}{dd}{e|w}{ddd}...``, for example
    ``n30w120_elv.tif``. 'n' and 'e' give positive values, 's' and 'w'
    negative ones. Hemisphere letters are matched case-insensitively.

    Parameters
    ----------
    filename : str
        Base name of the tile (no directory part).

    Returns
    -------
    tuple of int
        ``(lon, lat)`` in whole degrees.

    Raises
    ------
    ValueError
        If the name is too short, a hemisphere letter is not n/s or e/w, or
        the degree fields are not digits. Previously an unknown letter was
        silently interpreted as south/west.
    """
    prefix = filename[:7]
    if len(prefix) < 7:
        raise ValueError(f"Tile file name too short to parse: {filename!r}")

    n_s, lat_digits = prefix[0].lower(), prefix[1:3]
    e_w, lon_digits = prefix[3].lower(), prefix[4:7]
    if n_s not in ('n', 's') or e_w not in ('e', 'w'):
        raise ValueError(f"Unrecognised hemisphere letters in tile file name: {filename!r}")
    if not (lat_digits.isdigit() and lon_digits.isdigit()):
        raise ValueError(f"Non-numeric degrees in tile file name: {filename!r}")

    lat, lon = int(lat_digits), int(lon_digits)
    lat = lat if n_s == 'n' else -lat
    lon = lon if e_w == 'e' else -lon
    return (lon, lat)

In [13]:
import itertools
# Per-cell summary statistics (mean, std, quartiles) of each terrain variable,
# computed from the raster tiles that overlap each gauge's grid cells.
var_names = ['elv', 'slope_percentage', 'slope_riserun', 'slope_degrees', 'slope_radians', 'aspect', 'curvature', 'planform_curvature', 'profile_curvature', 'upa', 'wth']
# valid_tiles = ['n30w150', 'n30w120', 'n30w090']

# Collects 'var_name-huc_02-gauge_id' strings for every gauge that raised.
issues = []
# islice(..., 0, None, 1) iterates over every variable; presumably the start
# index is edited by hand to resume a partial run.
for var_name in itertools.islice(var_names,0,None,1):
    print(var_name)
    # Index all tiles of this variable and build their footprints. Each tile is
    # treated as a 5 x 5 degree square whose lower-left corner comes from its name.
    tiles_paths = sorted(glob.glob(os.path.join(PATHS['MERIT-Hydro'], var_name, '**', '*.tif'), recursive=True))
    # tiles_paths = [tile for tile in tiles_paths if os.path.basename(os.path.dirname(tile)).split('_')[-1] in valid_tiles]
    tiles_filenames = [os.path.basename(tile) for tile in tiles_paths]
    # NOTE(review): tiles_names is not used anywhere below in this cell.
    tiles_names = [tile.split('_')[0] for tile in tiles_filenames]
    tiles_lower_left_corner = [tile_filename_to_coords(tile) for tile in tiles_filenames]
    tiles_polygons = [Polygon([(lon, lat), (lon + 5, lat), (lon + 5, lat + 5), (lon, lat + 5)]) for lon, lat in tiles_lower_left_corner]

    def process(idx, row):
        """Write per-node statistics of `var_name` for one gauge.

        Builds the union of the gauge's grid-cell squares, opens and merges
        every tile intersecting it, then for each node takes the raster values
        inside the node's cell and stores their NaN-aware mean, std and
        25/50/75 % quantiles. The result (statistics x nodes) is saved as
        `static/MERIT-Hydro/{var_name}.csv`. `idx` is not used.

        Raises IndexError when no tile intersects the gauge footprint; the
        calling loop catches and records it.
        """
        huc, gauge_id = row['huc_02'], row.name
        nodes_coords = pd.read_csv(os.path.join(SAVE_PATH, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
        data = pd.DataFrame(columns = nodes_coords.index, index = ['mean', 'std', '25%', '50%', '75%'])
        # Footprint of the gauge: union of the squares of all its nodes.
        cell_polygons = [coords_to_polygon(row['lon'], row['lat'], resolution) for _, row in nodes_coords.iterrows()]
        catmt_polygon = cell_polygons[0]
        for polygon in cell_polygons[1:]:
            catmt_polygon = catmt_polygon.union(polygon)
        # Keep only the tiles whose footprint touches the gauge footprint.
        intersected_tiles = []
        for tile_polygon, tile_path in zip(tiles_polygons, tiles_paths):
            if tile_polygon.intersects(catmt_polygon):
                intersected_tiles.append(tile_path)
        # Merge the selected tiles into a single array.
        # NOTE(review): the individual tile handles are not closed explicitly;
        # only the combined array is closed at the end.
        ds = rioxarray.open_rasterio(intersected_tiles[0])
        for tile in intersected_tiles[1:]:
            ds = ds.combine_first(rioxarray.open_rasterio(tile))
        ds = ds.sel(band=1)
        # Sort the x and y coordinates to be ascending
        # (the slices below run from the lower to the upper bound).
        ds = ds.sortby('x', ascending=True)
        ds = ds.sortby('y', ascending=True)
        for node_idx, node_row in nodes_coords.iterrows():
            lat, lon = node_row['lat'], node_row['lon']
            # ds_node = ds.rio.clip_box(lon - resolution/2, lat - resolution/2, lon + resolution/2, lat + resolution/2)
            ds_node = ds.sel(x = slice(lon - resolution/2, lon + resolution/2), y = slice(lat - resolution/2, lat + resolution/2))
            # Mask values equal to the raster's nodata marker so they are
            # ignored by the nan-aware statistics.
            # NOTE(review): assumes ds.rio.nodata is still set after combine_first/sel — confirm.
            ds_node = ds_node.where(ds_node != ds.rio.nodata)
            ds_node_values = ds_node.values.flatten()
            mean = np.nanmean(ds_node_values)
            std = np.nanstd(ds_node_values)
            q25 = np.nanquantile(ds_node_values, 0.25)
            q50 = np.nanquantile(ds_node_values, 0.50)
            q75 = np.nanquantile(ds_node_values, 0.75)
            data.loc['mean', node_idx] = mean
            data.loc['std', node_idx] = std
            data.loc['25%', node_idx] = q25
            data.loc['50%', node_idx] = q50
            data.loc['75%', node_idx] = q75
        # NOTE(review): the first makedirs is redundant; the second creates 'static' too.
        os.makedirs(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'static'), exist_ok = True)
        os.makedirs(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'static', 'MERIT-Hydro'), exist_ok = True)
        data.to_csv(os.path.join(SAVE_PATH, "graph_features", huc, gauge_id, 'static', 'MERIT-Hydro', f"{var_name}.csv"))

        ds.close()
        del ds
        gc.collect()

    # Run serially; a failure for one gauge is recorded and the loop continues.
    for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)):
        try:
            process(idx, row)
        except Exception as e:
            # Entries are '-'-joined so they can be split into columns later.
            issues.append(f"{var_name}-{row['huc_02']}-{row.name}")
            print(f"Error: {var_name}-{row['huc_02']}-{row.name}. {e}")

elv


100%|██████████| 10/10 [00:25<00:00,  2.58s/it]


slope_percentage


100%|██████████| 10/10 [00:20<00:00,  2.08s/it]


slope_riserun


100%|██████████| 10/10 [00:19<00:00,  1.94s/it]


slope_degrees


100%|██████████| 10/10 [00:20<00:00,  2.01s/it]


slope_radians


100%|██████████| 10/10 [00:19<00:00,  1.99s/it]


aspect


100%|██████████| 10/10 [00:20<00:00,  2.02s/it]


curvature


100%|██████████| 10/10 [00:19<00:00,  1.99s/it]


planform_curvature


100%|██████████| 10/10 [00:19<00:00,  1.91s/it]


profile_curvature


100%|██████████| 10/10 [00:20<00:00,  2.09s/it]


upa


100%|██████████| 10/10 [00:19<00:00,  1.99s/it]


wth


100%|██████████| 10/10 [00:13<00:00,  1.39s/it]


In [14]:
# Number of (variable, gauge) combinations recorded as failed in `issues`.
len(issues)

0

In [15]:
# Split each 'var_name-huc_02-gauge_id' entry into its three fields and
# tabulate them, one failed combination per row.
issues_df = pd.DataFrame(
    [entry.split('-') for entry in issues],
    columns = ['var_name', 'huc_02', 'gauge_id']
)
issues_df

Unnamed: 0,var_name,huc_02,gauge_id


In [16]:
# Failed gauges for the 'elv' variable only.
issues_df[issues_df['var_name'] == 'elv']

Unnamed: 0,var_name,huc_02,gauge_id


## Spatial Encodings

In [13]:
def process(idx, row):
    """Write scaled longitude/latitude encodings for every node of one gauge.

    Longitude is passed through sin(2*pi*(lon + 180)/360); latitude is
    rescaled linearly so that -60 maps to 0 and 90 maps to 1. The output CSV
    has rows 'lon_transformed' and 'lat_transformed' and one column per node.
    `idx` is not used.
    """
    lat_min, lat_max = -60, 90

    def encode_lon(lon_deg):
        return np.sin(2 * np.pi * (lon_deg+180) / 360)

    def encode_lat(lat_deg):
        return (lat_deg - lat_min)/(lat_max - lat_min)

    huc, gauge_id = row['huc_02'], row.name
    coords_file = os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_files', huc, gauge_id, 'nodes_coords.csv')
    nodes_coords = pd.read_csv(coords_file, index_col = 0)

    data = pd.DataFrame(columns = nodes_coords.index, index = ['lon_transformed', 'lat_transformed'])
    for node_idx, node_row in nodes_coords.iterrows():
        data.loc['lon_transformed', node_idx] = encode_lon(node_row['lon'])
        data.loc['lat_transformed', node_idx] = encode_lat(node_row['lat'])

    gauge_dir = os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_features', huc, gauge_id)
    os.makedirs(gauge_dir, exist_ok = True)
    data.to_csv(os.path.join(gauge_dir, 'spatial_encodings.csv'))

# One task per gauge on 8 joblib workers.
with Parallel(n_jobs = 8, verbose = 0) as parallel:
    _ = parallel(
        delayed(process)(idx, row)
        for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph))
    )

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 129.23it/s]


## Upstream Area (uparea)

In [14]:
# Load the first data variable of the upstream-area file, cropped to the
# study region, into memory.
uparea_file = os.path.join(PATHS['gis_ldd'], 'CWatM_30min/upstream_area_km2.nc')
uparea = xr.open_dataset(uparea_file)
ds_varname = list(uparea.data_vars)[0]
uparea = uparea[ds_varname].sel(
    lat = slice(region_bounds['maxy'], region_bounds['miny']),
    lon = slice(region_bounds['minx'], region_bounds['maxx'])
)
uparea.load()

def process(idx, row):
    """Write the nearest-cell `uparea` value of every node of one gauge.

    The output CSV has a single row (index 0) and one column per node.
    `idx` is not used.
    """
    huc, gauge_id = row['huc_02'], row.name
    coords_file = os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_files', huc, gauge_id, 'nodes_coords.csv')
    nodes_coords = pd.read_csv(coords_file, index_col = 0)

    data = pd.DataFrame(columns = nodes_coords.index, index = [0])
    for node_idx, node_row in nodes_coords.iterrows():
        nearest_cell = uparea.sel(lat = node_row['lat'], lon = node_row['lon'], method = 'nearest')
        data.loc[0, node_idx] = nearest_cell.values.item()

    gauge_dir = os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_features', huc, gauge_id)
    os.makedirs(gauge_dir, exist_ok = True)
    data.to_csv(os.path.join(gauge_dir, 'uparea.csv'))

with Parallel(n_jobs = 8, verbose = 0) as parallel:
    _ = parallel(
        delayed(process)(idx, row)
        for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph))
    )

# Release the grid once all gauges are written.
uparea.close()
del uparea
gc.collect()

100%|██████████| 10/10 [00:00<00:00, 1497.32it/s]


897

## USGS

In [15]:
def process(idx, row):
    """Add an area-normalised discharge column to one gauge's USGS record.

    Reads the gauge CSV, names its single data column 'Q_ft3s', derives
    'Q_mm' from it and writes both columns to the gauge's feature folder.
    The conversion divides by 3.28084**3 (ft^3 -> m^3), by the basin area
    times 1e6, and multiplies by 3600*24*1000 (seconds per day x mm per m).
    Assumes `area_geospa_fabric` is in km^2, which makes 'Q_mm' a depth in
    mm per day — TODO confirm units. `idx` is not used.
    """
    huc, gauge_id = row['huc_02'], row.name
    basin_area = row['area_geospa_fabric']

    ft3_per_m3 = 3.28084**3
    area_to_m2 = 1e6
    mm_per_m_times_s_per_day = 3600*24*1000

    source_csv = os.path.join(PATHS['USGS'], 'CAMELS-US', huc, 'csv', f'{gauge_id}.csv')
    usgs_data = pd.read_csv(source_csv, index_col = 0, parse_dates = True)
    usgs_data.columns = ['Q_ft3s']
    discharge_m3s = usgs_data['Q_ft3s'] / ft3_per_m3
    usgs_data['Q_mm'] = (discharge_m3s / (basin_area * area_to_m2)) * mm_per_m_times_s_per_day

    gauge_dir = os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_features', huc, gauge_id)
    os.makedirs(gauge_dir, exist_ok = True)
    usgs_data.to_csv(os.path.join(gauge_dir, 'USGS.csv'))

# One task per gauge on 8 joblib workers.
with Parallel(n_jobs = 8, verbose = 0) as parallel:
    _ = parallel(
        delayed(process)(idx, row)
        for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph))
    )

100%|██████████| 10/10 [00:00<00:00, 6824.45it/s]


## GloFAS Parameter Maps

In [16]:
# Fields read from "Catchments_morphology_and_river_network":
# - chanbnkf_Global_03min.nc (channel bankfull depth, m);
# - chanflpn_Global_03min.nc (width of the floodplain, m);
# - changrad_Global_03min.nc (channel longitudinal gradient, m/m);
# - chanlength_Global_03min.nc (channel length within a pixel, m);
# - chanman_Global_03min.nc (channel Manning's roughness coefficient, m^(1/3)s^(-1));
# - chans_Global_03min.nc (channel side slope, m/m);
# - chanbw_Global_03min.nc (channel bottom width, m);

# Fields read from "Land_use":
# - fracforest_Global_03min.nc (fraction of forest for each grid-cell, -);
# - fracirrigated_Global_03min.nc (fraction of irrigated crops [except rice] for each grid-cell, -);
# - fracrice_Global_03min.nc (fraction of rice crops for each grid-cell, -);
# - fracsealed_Global_03min.nc (fraction of urban area for each grid-cell, -);
# - fracwater_Global_03min.nc (fraction of inland water for each grid-cell, -);
# - fracother_Global_03min.nc (fraction of other land cover for each grid-cell, -);
Parameter_Maps = os.path.join(PATHS['GloFAS'], 'LISFLOOD_Parameter_Maps')

# (sub-directory, variables) pairs; both groups are processed identically.
glofas_groups = [
    ('Catchments_morphology_and_river_network',
     ['chanbnkf', 'chanflpn', 'changrad', 'chanlength', 'chanman', 'chans', 'chanbw']),
    ('Land_use',
     ['fracforest', 'fracirrigated', 'fracrice', 'fracsealed', 'fracwater', 'fracother']),
]

for group_dir, var_names in glofas_groups:
    for var_name in var_names:
        print(var_name)
        ds = xr.open_dataset(os.path.join(Parameter_Maps, group_dir, f"{var_name}_Global_03min.nc"))['Band1']
        # Latitude is sliced from max to min (assumes a descending lat axis).
        ds = ds.sel(
            lat = slice(region_bounds['maxy'], region_bounds['miny']),
            lon = slice(region_bounds['minx'], region_bounds['maxx'])
        )
        ds.load()

        def process(idx, row):
            """Write the cell-mean of `var_name` for every node of one gauge.

            For each node, averages all pixels of `ds` whose coordinates fall
            within half a `resolution` of the node centre in both directions.
            The output CSV has a single row (index 0) and one column per node.
            `idx` is not used.
            """
            huc, gauge_id = row['huc_02'], row.name
            nodes_coords = pd.read_csv(os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_files', huc, gauge_id, 'nodes_coords.csv'), index_col = 0)
            data = pd.DataFrame(columns = nodes_coords.index, index = [0])
            for node_idx, node_row in nodes_coords.iterrows():
                lat, lon = node_row['lat'], node_row['lon']
                ds_window_loc = ds.sel(
                    lat = slice(lat + 0.5*resolution, lat - 0.5*resolution),
                    lon = slice(lon - 0.5*resolution, lon + 0.5*resolution)
                ).mean()
                data.loc[0, node_idx] = ds_window_loc.values.item()
            glofas_dir = os.path.join(PATHS['devp_datasets'], DIRNAME, 'graph_features', huc, gauge_id, 'static', 'GloFAS')
            os.makedirs(glofas_dir, exist_ok = True)
            data.to_csv(os.path.join(glofas_dir, f"{var_name}.csv"))

        with Parallel(n_jobs = 8, verbose = 0) as parallel:
            _ = parallel(delayed(process)(idx, row) for idx, row in tqdm.tqdm(camels_graph.iterrows(), total=len(camels_graph)))

        # Release the map before loading the next one (previously skipped for
        # the land-use group).
        ds.close()
        del ds
        gc.collect()

chanbnkf


100%|██████████| 10/10 [00:00<00:00, 3291.46it/s]


chanflpn


100%|██████████| 10/10 [00:00<00:00, 4145.39it/s]


changrad


100%|██████████| 10/10 [00:00<00:00, 9915.61it/s]


chanlength


100%|██████████| 10/10 [00:00<00:00, 11241.77it/s]


chanman


100%|██████████| 10/10 [00:00<00:00, 11072.61it/s]


chans


100%|██████████| 10/10 [00:00<00:00, 9560.76it/s]


chanbw


100%|██████████| 10/10 [00:00<00:00, 3739.57it/s]


fracforest


100%|██████████| 10/10 [00:00<00:00, 8950.71it/s]


fracirrigated


100%|██████████| 10/10 [00:00<00:00, 10116.51it/s]


fracrice


100%|██████████| 10/10 [00:00<00:00, 9058.97it/s]


fracsealed


100%|██████████| 10/10 [00:00<00:00, 9474.37it/s]


fracwater


100%|██████████| 10/10 [00:00<00:00, 11187.79it/s]


fracother


100%|██████████| 10/10 [00:00<00:00, 11428.62it/s]
