In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from scripts.nc_proc import nc_by_extent
from scripts.geom_proc import (
    create_gdf, poly_from_multipoly, getSquareVertices, polygon_area)
from scripts.loaders import multi_var_nc

import gc
from dask import config as dask_cfg
import numpy as np
import pandas as pd
import glob
from shapely.geometry import Polygon
from pathlib import Path
import geopandas as gpd
import xarray as xr
# Show xarray objects with their HTML representation in cell output.
xr.set_options(display_style="html")
# Run a garbage-collection pass; its return value is what this cell displays.
gc.collect()


0

In [4]:
# Bounding box in degrees (not referenced again in the visible cells).
coord_limits = {
    'max_lon': 179.,
    'min_lon': 19.,
    'max_lat': 71.,
    'min_lat': 40.,
}

# Load the watershed layer.
russia_basin_path = "./data/ws_21_02_2023_final.gpkg"
russia_ws = gpd.read_file(russia_basin_path)

# Row-wise conditions shared by both selections below:
#   * db_dif lies outside the [-15, 15] interval
#   * new_area is above 100
db_dif_out_of_range = (russia_ws['db_dif'] > 15) | (russia_ws['db_dif'] < -15)
area_above_limit = russia_ws['new_area'] > 100

# Watersheds with an out-of-range db_dif (and new_area > 100).
ws_to_redraw = russia_ws[db_dif_out_of_range & area_above_limit]
ws_to_redraw = ws_to_redraw.reset_index(drop=True)

# Watersheds with an in-range db_dif (and new_area > 100).
# NOTE(review): `~` applies only to the db_dif condition, so rows with
# new_area <= 100 end up in neither frame -- confirm this is intended.
final_ws = russia_ws[(~db_dif_out_of_range) & area_above_limit]
final_ws = final_ws.reset_index(drop=True)

In [5]:
# Keep only the columns needed downstream. The explicit .copy() makes the
# result an independent frame, so the column assignment below cannot be
# treated as a write into a slice of the previous frame
# (SettingWithCopyWarning).
final_ws = final_ws[['gauge_id', 'name', 'name_en',
                     'new_area', 'ais_dif', 'db_dif', 'geometry']].copy()

# Pass every geometry through poly_from_multipoly (project helper) so each
# watershed is stored as the value that helper returns.
final_ws['geometry'] = [poly_from_multipoly(ws)
                        for ws in final_ws['geometry']]


In [22]:
# Persist the cleaned watersheds; this file is read back later in the notebook.
# The driver is named explicitly so the output format does not depend on
# extension-based inference of the installed geopandas version.
final_ws.to_file('./data/russia_ws.gpkg', driver='GPKG')

In [7]:
# Root folder of the meteorological archives. Kept as a plain string because
# other cells interpolate it into f-strings.
meteo_path = '/home/anton/dima_experiments/geo_data/meteorology'

# One folder per data product, all located under the root above.
era5_land = Path(meteo_path, 'era5-land', 'russia')
era5 = Path(meteo_path, 'era5', 'russia')
imerg = Path(meteo_path, 'imerg_year')
gpcp = Path(meteo_path, 'gpcp_year')
gleam = Path(meteo_path, 'gleam_vars')
mswep = Path(meteo_path, 'mswep')

In [119]:
from scripts.grid_calculator import Gridder

# Watersheds written earlier in the notebook by final_ws.to_file(...).
russia_ws = gpd.read_file('./data/russia_ws.gpkg')

# Per-dataset settings:
#   'res'    -- value handed to Gridder as `half_grid_resolution`
#   'f_path' -- result of multi_var_nc(<dataset folder>); it is consumed below
#               as a mapping {variable name: list of .nc paths}
#               (assumed to be a dict -- confirm against scripts.loaders)
ds_description = {
    'era5_land': {'res': 0.05,
                  'f_path': multi_var_nc(era5_land)},
    'era5': {'res': 0.125,
             'f_path': multi_var_nc(era5)},
    'imerg': {'res': 0.05,
              'f_path': multi_var_nc(imerg)},
    'gpcp': {'res': 0.25,
             'f_path': multi_var_nc(gpcp)},
    'gleam': {'res': 0.125,
              'f_path': multi_var_nc(gleam)},
    'mswep': {'res': 0.05,
              'f_path': multi_var_nc(mswep)}}

# Output folder is the same for every Gridder, so build it once.
great_db_path = Path(f'{meteo_path}/great_db')

# Pair each gauge id with its geometry directly instead of looking the
# geometry up by positional counter, which only works with a 0..n-1 index.
for gauge_id, ws_geometry in zip(russia_ws['gauge_id'],
                                 russia_ws['geometry']):

    for dataset, settings in ds_description.items():

        grid_res = settings['res']

        # .items() is required: iterating the mapping itself yields only the
        # keys, which cannot be unpacked into (variable, pathes).
        for variable, pathes in settings['f_path'].items():

            meteo_grid = Gridder(half_grid_resolution=grid_res,
                                 ws_geom=ws_geometry,
                                 gauge_id=gauge_id,
                                 path_to_save=great_db_path,
                                 nc_pathes=pathes,
                                 var=variable)

            meteo_grid.grid_value_ws()


Unnamed: 0,code,name_ru,name_en,geometry
0,48075,Р.ВОДЛА - Д.ВОДЛА,R.VODLA - D.VODLA,"POLYGON ((37.30708 63.24042, 37.30708 63.24125..."
1,48079,Р.ВОДЛА - Г.ПУДОЖ,R.VODLA - G.PUDOJ,"POLYGON ((37.30708 63.24042, 37.30708 63.24125..."
2,72043,Р.ТОСНА - СТ.ТОСНО,R.TOSNA - ST.TOSNO,"POLYGON ((30.91125 59.55875, 30.91125 59.55625..."
3,72155,Р.ПАША - НИЖЕ Д.ДУБРОВО,R.PASA - NIJE D.DUBROVO,"POLYGON ((34.78625 59.78208, 34.78625 59.78292..."
4,72156,Р.ПАША - С.ЧАСОВЕНСКОЕ,R.PASA - S.CASOVENSKOE,"POLYGON ((34.78625 59.78208, 34.78625 59.78292..."
...,...,...,...,...
825,5072,р.Кульдур - пос.Кульдур,r.Kul_dur - pos.Kul_dur,"POLYGON ((131.77484 49.31712, 131.77484 49.316..."
826,5348,р.Сукпай - мет.ст.Сукпай,r.Sukpaj - met.st.Sukpaj,"POLYGON ((137.10734 47.54129, 137.10650 47.541..."
827,5420,р.Силинка - г.Солнечный,r.Silinka - g.Solnecnyj,"POLYGON ((136.39567 50.69795, 136.39567 50.698..."
828,6329,р.Унаха - с.Унаха,r.Unaha - s.Unaha,"POLYGON ((126.68567 55.67045, 126.68567 55.669..."


In [None]:
from tqdm import tqdm

for dataset, settings in ds_description.items():

    grid_res = settings['res']

    # Announce the dataset once, not once per gauge, so the progress bar is
    # not interleaved with hundreds of identical lines.
    print(f'Weighted calculations for {dataset}')

    gauges = zip(russia_ws['gauge_id'], russia_ws['geometry'])

    # A zip/enumerate object has no length, so the total is passed explicitly;
    # without it tqdm can only show a bare counter.
    for gauge_id, ws_geometry in tqdm(gauges, total=len(russia_ws)):

        for variable, pathes in settings['f_path'].items():
            # Placeholder: no computation is performed for the variable yet.
            pass

In [21]:
# Build a single Gridder for the first watershed of `final_ws` and one
# ERA5-Land total-precipitation file.
# NOTE(review): ds_description uses res=0.05 for era5_land while this cell
# passes 0.125 -- confirm which value is intended here.
test_nc_pathes = [
    '/home/anton/dima_experiments/geo_data/meteorology/era5-land/russia/total_precipitation/tp_2008.nc',
]
test_save_dir = Path('./delete_me')

test = Gridder(
    half_grid_resolution=0.125,
    ws_geom=final_ws.loc[0, 'geometry'],
    gauge_id=final_ws.loc[0, 'gauge_id'],
    path_to_save=test_save_dir,
    nc_pathes=test_nc_pathes,
    var='tp',
)


Weights exist with resolution of 0.125


In [19]:
# Call grid_value_ws() on the `test` Gridder instance built in the cell above.
test.grid_value_ws()

In [7]:


}

In [113]:
# First watershed of the layer is used as the working example.
temp_geom = russia_ws.loc[0, 'geometry']
# The watershed layer written earlier in this notebook stores the gauge
# identifier in 'gauge_id' (the column the batch loop also reads); it has no
# 'code' column.
gauge_id = russia_ws.loc[0, 'gauge_id']
path_to_save = Path('/home/anton/dima_experiments/geo_data/meteo_grids')
path_to_save.mkdir(exist_ok=True, parents=True)

# watershed boundaries geometry
ws_shape = create_gdf(temp_geom)

# Dataset settings come from `ds_description`, the only settings mapping that
# is defined in this notebook (`dataset_res` is not defined anywhere).
era5_land_cfg = ds_description['era5_land']
# netcdf with data where value correspond to lat, lon in middle of cell
nc_data = xr.open_mfdataset(era5_land_cfg['f_path']['total_precipitation'])
# resolution of netcdf (the value passed on as half_grid_resolution / grid_res)
grid_res = era5_land_cfg['res']

In [109]:
def grid_weights(nc_data: xr.Dataset,
                 ws_geom: Polygon, gauge_id: str,
                 half_grid_resolution: float,
                 path_to_save: Path):
    """Compute area weights of grid cells over a watershed and save them.

    The dataset is cut to the watershed with ``nc_by_extent``; a square
    polygon is then built around every (lat, lon) node of the cut grid and
    intersected with the watershed. The weight of a cell is the area of that
    intersection divided by the watershed area (when the intersection is
    passed through ``poly_from_multipoly``, which per the original comments
    selects the biggest polygon of a MultiPolygon). Cells without an
    intersection get no weight, and rows/columns made only of such cells are
    dropped.

    The result is written to
    ``{path_to_save}/weights_{half_grid_resolution}/{gauge_id}.nc``.

    Args:
        nc_data (xr.Dataset): gridded data with ``lat`` and ``lon`` coordinates.
        ws_geom (Polygon): watershed boundary.
        gauge_id (str): gauge identifier, used as the output file name.
        half_grid_resolution (float): half of the grid step; passed as ``h``
            to ``getSquareVertices`` and as ``grid_res`` to ``nc_by_extent``.
        path_to_save (Path): folder in which the weights folder is created.

    Returns:
        None. The weights are only written to disk.
    """
    # watershed boundaries geometry as geodataframe (gpd.overlay works on frames)
    ws_gdf = create_gdf(ws_geom)

    with dask_cfg.set(**{'array.slicing.split_large_chunks': True}):
        mask_nc = nc_by_extent(nc=nc_data,
                               shape=ws_geom,
                               grid_res=half_grid_resolution)

    # get lat, lon which help define area for intersection
    nc_lat, nc_lon = mask_nc.lat.values, mask_nc.lon.values
    grid_shape = (len(nc_lat), len(nc_lon))

    # area of the whole watershed, the denominator of every weight
    ws_area = polygon_area(geo_shape=ws_gdf.loc[0, 'geometry'])

    # cells start as "no intersection / zero weight" and are filled in below
    weight_values = np.zeros(grid_shape, dtype=float)
    mask_values = np.zeros(grid_shape, dtype=bool)

    for row, lat in enumerate(nc_lat):
        for col, lon in enumerate(nc_lon):
            # emulate the grid cell as a square centred on the node;
            # phi is the rotation angle
            cell = create_gdf(
                Polygon(getSquareVertices(mm=(lon, lat),
                                          h=half_grid_resolution,
                                          phi=0)))
            try:
                overlap = gpd.overlay(df1=ws_gdf,
                                      df2=cell,
                                      how='intersection')
            except KeyError:
                # as in the original implementation, a KeyError raised by
                # overlay is treated as "no intersection"
                continue

            if len(overlap) == 0:
                continue

            # reduce a possible MultiPolygon to a single Polygon
            section = create_gdf(
                poly_from_multipoly(overlap.loc[0, 'geometry']))
            if section.empty:
                continue

            mask_values[row, col] = True
            weight_values[row, col] = polygon_area(
                geo_shape=section.loc[0, 'geometry']) / ws_area

    # Both arrays carry the same lat/lon coordinates, so the masking below is
    # aligned by label rather than only by position.
    grid_coords = {'lat': nc_lat, 'lon': nc_lon}
    inter_mask = xr.DataArray(data=mask_values,
                              dims=['lat', 'lon'],
                              coords=grid_coords)
    weights = xr.DataArray(data=weight_values,
                           dims=['lat', 'lon'],
                           coords=grid_coords,
                           name='weights')

    # drop rows/columns with no intersecting cell; remaining gaps become 0
    weights = weights.where(inter_mask, drop=True)
    weights = weights.fillna(0)

    weight_folder = Path(f'{path_to_save}/weights_{half_grid_resolution}')
    weight_folder.mkdir(exist_ok=True, parents=True)
    weights.to_netcdf(f'{weight_folder}/{gauge_id}.nc')


In [107]:
# Compute and store the grid weights for the example watershed
# (temp_geom / gauge_id) using the opened nc_data and its grid_res.
grid_weights(nc_data=nc_data,
             ws_geom=temp_geom,
             gauge_id=gauge_id,
             half_grid_resolution=grid_res,
             path_to_save=path_to_save)

In [110]:
def grid_value_ws(nc_pathes: list,
                  grid_res: float, weights: xr.DataArray,
                  ws_shape: Polygon, gauge_id: str,
                  var_folder: str, path_to_save: str,
                  aggregation_type: str = 'sum', scale_factor: float = 1.):
    """Aggregate a gridded variable over a watershed with per-cell weights.

    The files are opened as one dataset, limited to 2008-2020, cut to the
    watershed with ``nc_by_extent``, masked to the cells whose weight is
    non-zero and reduced over ``lat``/``lon`` with a weighted sum or mean.
    Only the first data variable of the dataset is aggregated.

    Args:
        nc_pathes (list): list of pathes to .nc files

        grid_res (float): Half of grid degree value
                         to perform weight calculus

        weights (xr.DataArray): per-cell weights on the ``lat``/``lon`` grid;
                         cells with zero weight are masked out

        ws_shape (Polygon): Shape of area of interest

        gauge_id (str): id for the gauge with corresponded watershed
                         (currently not used by the function)

        var_folder (str): sub-folder of ``path_to_save`` that is created

        path_to_save (Path): place where aggregated file will be stored
                         (the folder is created; nothing is written to it
                         at the moment)

        aggregation_type (str): ``'sum'`` or ``'mean'``

        scale_factor (float): currently not used by the function

    Returns:
        pd.DataFrame: indexed by ``date`` with a single column named after
        the aggregated variable.

    Raises:
        ValueError: if ``aggregation_type`` is neither 'sum' nor 'mean'.
    """
    if aggregation_type not in ('sum', 'mean'):
        raise ValueError(
            "Sorry, only sum and mean aggregations are allowed! "
            f"You insert {aggregation_type}")

    # the context manager closes the underlying files once the values have
    # been loaded into the DataFrame
    with xr.open_mfdataset(nc_pathes) as opened_nc:
        nc_data = opened_nc.sel(time=slice('2008', '2020'))
        # select variable from netcdf
        var = list(nc_data.data_vars)[0]

        # use mask on net_cdf
        with dask_cfg.set(**{'array.slicing.split_large_chunks': True}):
            mask_nc = nc_by_extent(nc=nc_data,
                                   shape=ws_shape,
                                   grid_res=grid_res)

        # cells with a non-zero weight belong to the watershed
        inter_mask = weights.astype(bool)
        # create final intersection
        ws_nc = mask_nc.where(inter_mask, drop=True)

        final_save = Path(f'{path_to_save}/{var_folder}/')
        final_save.mkdir(exist_ok=True, parents=True)

        # 'sum' and 'mean' differ only in the reducer that is called
        reducer = getattr(ws_nc.weighted(weights=weights), aggregation_type)
        aggregated = reducer(dim=['lat', 'lon'])

        res_df = pd.DataFrame({'date': ws_nc.time.values,
                               var: aggregated[var].values})
        res_df = res_df.set_index('date')
        # res_df.to_csv(f'{final_save}/{gauge_id}.csv')

    gc.collect()

    return res_df


In [115]:
# Weights file of the example watershed, at the location grid_weights()
# writes to: {path_to_save}/weights_{half_grid_resolution}/{gauge_id}.nc.
# Deriving it from the variables keeps the weights in sync with temp_geom.
weights_path = path_to_save / f'weights_{grid_res}' / f'{gauge_id}.nc'

# `ds_description` is the settings mapping defined in this notebook
# (`dataset_res` is not defined anywhere).
grid_value_ws(nc_pathes=ds_description['era5_land']['f_path']['total_precipitation'],
              grid_res=grid_res,
              weights=xr.open_dataarray(weights_path),
              ws_shape=temp_geom, gauge_id=gauge_id,
              var_folder='delete_me',
              path_to_save='./data/')

Unnamed: 0_level_0,tp
date,Unnamed: 1_level_1
2008-01-01,0.002252
2008-01-02,0.000399
2008-01-03,0.000198
2008-01-04,0.000280
2008-01-05,0.000439
...,...
2020-12-27,0.001127
2020-12-28,0.001452
2020-12-29,0.001166
2020-12-30,0.022823


In [None]:
# Same grid_weights call, with the same arguments, as the earlier cell;
# running it overwrites the weights file for this gauge_id.
grid_weights(nc_data=nc_data,
             ws_geom=temp_geom,
             gauge_id=gauge_id,
             half_grid_resolution=grid_res,
             path_to_save=path_to_save)