In [3]:
import pandas as pd
import numpy as np

import tqdm.notebook as tq

import fiona
import geopandas as gpd
from shapely.geometry import Polygon, MultiPolygon

import glob
import os
import multiprocessing as mp

In [4]:
# Path to the shapefile with the watershed polygons that will be processed.
# Alternative input path kept for reference:
# shape_file_path = '/mnt/d/education/HSI/aspirantura/CAMELS_ru/files/openf_gauges_watersheds/watersheds_openf.shp'
shape_file_path = '/home/dima/Documents/education/HSI/aspirantura/Dissertation/conus_data/basin_set_full_res/HCDN_nhru_final_671.shp'
# Path to the downloaded HydroATLAS geodatabase (BasinATLAS_v10.gdb)
gdb_file_path = '/home/dima/Documents/education/HSI/aspirantura/Dissertation/files/BasinATLAS/BasinATLAS_v10.gdb/'
# Directory where the resulting files will be stored
path_to_save = '/home/dima/Documents/education/HSI/aspirantura/Dissertation/conus_data/featureXtractor'

In [5]:
# Read the watershed shapefile, keep only the identifier, geometry and area
# columns, and expose the identifier under the name `gauge_id`.
big_shape = (
    gpd.read_file(shape_file_path)[['hru_id', 'geometry', 'AREA']]
    .rename(columns={"hru_id": "gauge_id"})
)

# Gauge identifiers are handled as 8-character strings; identifiers that lost
# their leading zeros are left-padded back to 8 characters. Unlike a single
# '0' prefix, zfill restores any number of missing zeros and never lengthens
# an identifier that is already 8 or more characters long.
big_shape['gauge_id'] = big_shape['gauge_id'].astype(str).str.zfill(8)
big_shape

Unnamed: 0,gauge_id,geometry,AREA
0,01013500,"MULTIPOLYGON (((-68.35650 46.90311, -68.35612 ...",2.303988e+09
1,01022500,"POLYGON ((-67.97836 44.61310, -67.97800 44.613...",6.203873e+08
2,01030500,"MULTIPOLYGON (((-67.83991 45.36614, -67.83955 ...",3.676155e+09
3,01031500,"MULTIPOLYGON (((-69.33810 45.12317, -69.33800 ...",7.665447e+08
4,01047000,"POLYGON ((-70.10847 45.21669, -70.10858 45.216...",9.049562e+08
...,...,...,...
666,14309500,"POLYGON ((-123.81322 42.89103, -123.81312 42.8...",2.263143e+08
667,14316700,"POLYGON ((-122.49936 43.47688, -122.49972 43.4...",5.880250e+08
668,14325000,"POLYGON ((-124.07751 42.89822, -124.07716 42.8...",4.449257e+08
669,14362250,"POLYGON ((-123.15128 42.19624, -123.15118 42.1...",4.387790e+07


#### Main functions

In [26]:
def polygon_area(lats: list, lons: list, radius = 6378137):
    """
    Compute the area of a spherical polygon, assuming a spherical Earth.

    The area is evaluated as a line integral over the polygon boundary
    (Green's theorem) in polar coordinates centred on the point
    (lat=0, lon=0). Only the ring described by ``lats``/``lons`` is used.

    Args:
        lats (list): Vertex latitudes in degrees.
        lons (list): Vertex longitudes in degrees, same length as ``lats``.
        radius (int, optional): Sphere radius. Defaults to 6378137.
            Pass ``None`` to get the area as a fraction of the sphere.

    Returns:
        float: ``fraction * 4 * pi * radius**2 / 10**6`` when ``radius`` is
        given (i.e. square kilometres for a radius expressed in metres),
        otherwise the fraction of the sphere's surface covered by the
        polygon. An empty vertex list yields ``0.0``.
    """
    lats = np.deg2rad(np.asarray(lats, dtype=float))
    lons = np.deg2rad(np.asarray(lons, dtype=float))

    # nothing to integrate over
    if lats.size == 0:
        return 0.0

    # Close the ring. Both coordinates have to be compared: an open ring whose
    # first and last vertex merely share a latitude is still open.
    if lats[0] != lats[-1] or lons[0] != lons[-1]:
        lats = np.append(lats, lats[0])
        lons = np.append(lons, lons[0])

    # colatitudes relative to (0,0)
    a = np.sin(lats / 2) ** 2 + np.cos(lats) * np.sin(lons / 2) ** 2
    colat = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    # azimuths relative to (0,0)
    az = np.arctan2(np.cos(lats) * np.sin(lons), np.sin(lats)) % (2 * np.pi)

    # azimuth increments along the boundary, wrapped into [-pi, pi)
    daz = np.diff(az)
    daz = (daz + np.pi) % (2 * np.pi) - np.pi

    # colatitude at the midpoint of every boundary segment
    deltas = np.diff(colat) / 2
    colat = colat[0:-1] + deltas

    # Perform integral
    integrands = (1 - np.cos(colat)) * daz

    # fraction of the sphere enclosed by the boundary
    area = abs(np.sum(integrands)) / (4 * np.pi)

    # the boundary splits the sphere in two parts; report the smaller one
    area = min(area, 1 - area)

    if radius is not None:
        # scaled to the sphere of the given radius; 10**6 converts m^2 to km^2
        return area * 4 * np.pi * radius**2 / 10**6
    # fraction of the sphere's total area (dimensionless, hence not rescaled)
    return area
    
def select_big_from_MP(WS_geometry):
    """
    Reduce a MultiPolygon to its largest member polygon.

    The size of every member is measured with ``polygon_area`` on the
    polygon's exterior ring.

    Args:
        WS_geometry: A shapely geometry.

    Returns:
        The member polygon with the largest exterior-ring area when
        ``WS_geometry`` is a MultiPolygon; otherwise ``WS_geometry`` itself,
        unchanged. An empty MultiPolygon is also returned unchanged.
    """
    if not isinstance(WS_geometry, MultiPolygon):
        return WS_geometry

    # Multi-part geometries are no longer sequences in Shapely 2.0;
    # `.geoms` works on both Shapely 1.x and 2.x.
    parts = list(WS_geometry.geoms)
    if not parts:
        return WS_geometry

    part_areas = [polygon_area(lats = part.exterior.coords.xy[1],
                               lons = part.exterior.coords.xy[0])
                  for part in parts]
    return parts[int(np.argmax(part_areas))]

### Попытка считать площадь каждого класса

In [14]:
# For every class column (a column whose '_'-separated name contains 'cl'),
# sum `weight_area` per class value; every entry becomes a one-element column
# named "<column>_<class value>".
class_columns = [name for name in gdf.columns if 'cl' in name.split('_')]

dict_for_class = {
    name + '_' + str(class_value): [class_area]
    for name in class_columns
    for class_value, class_area in gdf.groupby([name])['weight_area'].sum().items()
}

# express the summed areas as a percentage of the watershed polygon area
pd.DataFrame.from_dict(dict_for_class)/find_POLY_area(gdf_your_WS)*100

Unnamed: 0,clz_cl_smj_10,cls_cl_smj_49,cls_cl_smj_53,glc_cl_smj_4,pnv_cl_smj_4,wet_cl_smj_-9999,tbi_cl_smj_5,tec_cl_smj_351,fmh_cl_smj_5,fmh_cl_smj_7,fec_cl_smj_103,fec_cl_smj_121,lit_cl_smj_2,lit_cl_smj_3,lit_cl_smj_11
0,99.554313,0.011651,99.542662,99.554313,99.554313,99.554313,99.554313,99.554313,99.31084,0.243472,99.31084,0.243472,99.166224,0.222338,0.16575


In [30]:
# Look at the `lka_pc_use` column of a previously exported hydrology table
exported_hydro = pd.read_csv(
    '/home/dima/Documents/education/HSI/aspirantura/Dissertation/conus_data/EA_LSTM/featureXtractor/camels_hydro.csv',
    sep=';',
)
exported_hydro['lka_pc_use']

210.0

In [39]:
# Run featureXtractor for every watershed, one after another, and store the
# result under the watershed's gauge_id.
output = {}

for row in range(len(big_shape)):
    watershed = big_shape.loc[row, 'geometry']
    extracted = featureXtractor(user_ws=watershed, gdb_file_path=gdb_file_path)
    output[big_shape.loc[row, 'gauge_id']] = extracted

In [None]:
def _write_category_csv(frames, gauge_ids, csv_path, min_non_na=None):
    """Stack per-watershed frames, drop incomplete rows and write a ';'-separated CSV."""
    # Keying every frame by its gauge id keeps the id attached to its rows,
    # so rows removed by dropna() cannot shift or mismatch the identifiers.
    table = pd.concat(frames, keys=gauge_ids)
    if min_non_na is None:
        table = table.dropna()
    else:
        table = table.dropna(thresh=min_non_na)
    kept_ids = list(table.index.get_level_values(0))
    table = table.reset_index(drop=True)
    table.insert(loc=0, column='gauge_id', value=kept_ids)
    table.to_csv(csv_path, index=False, sep=';')


def save_geo_files_to_disk(shape_with_data, list_of_values, path_to_save):
    """
    Write the extracted attributes to one ';'-separated CSV per category.

    Args:
        shape_with_data: Table with a ``gauge_id`` column; row ``i`` describes
            the same watershed as ``list_of_values[i]``.
        list_of_values: One ``(fin, union_geometry, ...)`` entry per watershed.
            ``fin`` is the list of six frames ordered as hydro, physio,
            climate, landcover, soil_geo, urban; entries whose first element
            is a float (NaN, i.e. nothing was extracted) are skipped.
        path_to_save: Output directory, created when missing.

    Raises:
        ValueError: If the two inputs differ in length or no watershed has
            extracted data.

    Files written: camels_hydro.csv, camels_physio.csv, camels_climate.csv,
    camels_urban.csv, camels_landcover.csv, camels_soil_geo.csv and
    geometry_HydroATLAS_subB.csv. Rows containing NaN are dropped (landcover
    rows are kept when they hold at least 5 non-NaN values); the ``gauge_id``
    column always refers to the watershed the row was computed for.
    """
    if len(shape_with_data) != len(list_of_values):
        raise ValueError('shape_with_data and list_of_values must have the same length')

    os.makedirs(path_to_save, exist_ok=True)

    # keep only the watersheds for which attributes were extracted
    valid = [(gauge_id, entry)
             for gauge_id, entry in zip(shape_with_data.gauge_id, list_of_values)
             if not isinstance(entry[0], float)]
    if not valid:
        raise ValueError('no watershed has extracted HydroATLAS data to save')

    valid_ids = [gauge_id for gauge_id, _ in valid]

    # (position inside `fin`, file name, dropna threshold)
    categories = [
        (0, 'camels_hydro.csv', None),
        (1, 'camels_physio.csv', None),
        (2, 'camels_climate.csv', None),
        (5, 'camels_urban.csv', None),
        # landcover frames legitimately contain NaN in some columns, so only
        # rows with fewer than 5 non-NaN values are dropped
        (3, 'camels_landcover.csv', 5),
        (4, 'camels_soil_geo.csv', None),
    ]
    for position, file_name, min_non_na in categories:
        _write_category_csv(frames=[entry[0][position] for _, entry in valid],
                            gauge_ids=valid_ids,
                            csv_path='{}/{}'.format(path_to_save, file_name),
                            min_non_na=min_non_na)

    # geometry of the merged HydroATLAS sub-basins of every valid watershed
    geometry_df = gpd.GeoDataFrame({'gauge_id': valid_ids,
                                    'geometry': [entry[1] for _, entry in valid]})
    geometry_df.to_csv('{}/geometry_HydroATLAS_subB.csv'.format(path_to_save), index = False, sep=';')

In [4]:
def filter_HydroATLAS_sub_basins(WS_own, HydroATLAS_data, min_overlap=0.2):
    """
    Select the HydroATLAS sub-basins that sufficiently overlap a watershed.

    A sub-basin is kept when the area of its intersection with the watershed,
    divided by the area of the sub-basin itself, is greater than
    ``min_overlap``. Multi-part geometries (watershed, sub-basin and
    intersection) are reduced to their largest part before areas are
    measured, and areas are computed with ``polygon_area`` on exterior rings.

    Args:
        WS_own: Geometry of the watershed from your GeoDataFrame of watersheds.
        HydroATLAS_data: GeoDataFrame read from a layer of the geodatabase.
        min_overlap (float, optional): Minimal intersection / sub-basin area
            ratio for a sub-basin to be selected. Defaults to 0.2.

    Returns:
        list: Rows of ``HydroATLAS_data`` (one Series per selected sub-basin);
        empty when nothing overlaps enough.
    """

    def select_big_from_MP(WS_geometry):
        # largest part of a MultiPolygon, any other geometry is passed through
        if not isinstance(WS_geometry, MultiPolygon):
            return WS_geometry
        # `.geoms` instead of iterating the geometry: works on Shapely 1.x and 2.x
        parts = list(WS_geometry.geoms)
        if not parts:
            return WS_geometry
        part_areas = [exterior_area(part) for part in parts]
        return parts[int(np.argmax(part_areas))]

    def exterior_area(polygon):
        # area enclosed by the exterior ring of a polygon
        return polygon_area(lats = polygon.exterior.coords.xy[1],
                            lons = polygon.exterior.coords.xy[0])

    # single-row GeoDataFrame with the watershed, needed by gpd.overlay
    own_polygon = select_big_from_MP(WS_own)
    gdf_your_WS = gpd.GeoDataFrame({'geometry': gpd.GeoSeries([own_polygon])})
    gdf_your_WS = gdf_your_WS.set_crs('EPSG:4326')

    intersected_sub_basins = list()

    # the loop is positional, hence .iloc rather than label-based access
    for HydroATLAS_row in range(len(HydroATLAS_data)):

        # sub-basin taken from the geodatabase layer
        sub_basin = select_big_from_MP(HydroATLAS_data.geometry.iloc[HydroATLAS_row])
        gdf_HydroATLAS_WS = gpd.GeoDataFrame(
            {'geometry': gpd.GeoSeries([sub_basin])}).set_crs('EPSG:4326')

        # intersect basins
        res_intersection = gpd.overlay(gdf_your_WS, gdf_HydroATLAS_WS, how='intersection')
        if len(res_intersection) == 0:
            # the sub-basin does not touch the watershed at all
            continue

        overlap = select_big_from_MP(res_intersection.geometry.iloc[0])

        # keep the sub-basin only when a large enough share of it lies
        # inside the watershed
        if exterior_area(overlap) / exterior_area(sub_basin) > min_overlap:
            intersected_sub_basins.append(HydroATLAS_data.iloc[HydroATLAS_row])

    return intersected_sub_basins


def split_by_categories(df_ecm, df_me, df_mo):
    """
    Regroup aggregated HydroATLAS columns into six thematic tables.

    For every category the matching columns are picked from ``df_ecm``,
    ``df_me`` and ``df_mo`` (in that order, each keeping its own column
    order) and joined side by side.

    Args:
        df_ecm: Frame with the columns aggregated by mean after rescaling.
        df_me: Frame with the columns aggregated by mean.
        df_mo: Frame with the columns aggregated by mode.

    Returns:
        list: ``[df_HYDRO, df_PHYSIO, df_CLIMATE, df_LANDCOVER, df_SOIL_GEO,
        df_URBAN]``.
    """
    # two-digit suffixes used by the numbered HydroATLAS columns
    monthes = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
    land_cover_classes = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22']
    wetland_classes = ['01', '02', '03', '04', '05', '06', '07', '08', '09']

    def expand(template, suffixes):
        # one column name per suffix, e.g. 'tmp_dc_s{}' -> 'tmp_dc_s01', ...
        return [template.format(suffix) for suffix in suffixes]

    hydrology_variables = ['inu_pc_ult', 'lka_pc_use', 'lkv_mc_usu',
                           'rev_mc_usu', 'dor_pc_pva', 'gwt_cm_sav']

    physiography_variables = ['ele_mt_sav', 'slp_dg_sav', 'sgr_dk_sav']

    climate_variables = (['clz_cl_smj', 'cls_cl_smj']
                         + expand('tmp_dc_s{}', monthes)
                         + expand('pre_mm_s{}', monthes)
                         + expand('pet_mm_s{}', monthes)
                         + expand('aet_mm_s{}', monthes)
                         + ['ari_ix_sav']
                         + expand('cmi_ix_s{}', monthes)
                         + expand('snw_pc_s{}', monthes))

    landcover_variables = (['glc_cl_smj']
                           + expand('glc_pc_s{}', land_cover_classes)
                           + ['pnv_cl_smj', 'wet_cl_smj']
                           + expand('wet_pc_s{}', wetland_classes)
                           + ['for_pc_sse', 'crp_pc_sse', 'pst_pc_sse',
                              'ire_pc_sse', 'gla_pc_sse', 'prm_pc_sse',
                              'tbi_cl_smj', 'tec_cl_smj'])

    soil_and_geo_variables = (['cly_pc_sav', 'slt_pc_sav', 'snd_pc_sav',
                               'soc_th_sav', 'swc_pc_syr']
                              + expand('swc_pc_s{}', monthes)
                              + ['lit_cl_smj', 'kar_pc_sse', 'ero_kh_sav'])

    urban_variables = ['urb_pc_sse', 'hft_ix_s93', 'hft_ix_s09']

    sources = (df_ecm, df_me, df_mo)

    def collect(variable_names):
        # columns of every source frame that belong to the category, side by side
        wanted = set(variable_names)
        pieces = [frame[[name for name in frame.columns if name in wanted]]
                  for frame in sources]
        return pd.concat(pieces, axis = 1)

    return [collect(category) for category in (hydrology_variables,
                                               physiography_variables,
                                               climate_variables,
                                               landcover_variables,
                                               soil_and_geo_variables,
                                               urban_variables)]

def get_HydroATLAS_for_WS(WS, WS_index, path_to_HydroATLAS, layers_from_HydroATLAS):
    """
    Aggregate HydroATLAS attributes over the sub-basins of one watershed.

    The geodatabase layer is read with the watershed geometry as a mask, the
    sub-basins are filtered with ``filter_HydroATLAS_sub_basins`` and their
    attributes are aggregated: class columns by mode, all other columns by
    mean (some of them after rescaling, see the comments in the body).

    Args:
        WS: Your data with watersheds (GeoDataFrame).
        WS_index: Index of the watershed in ``WS.geometry``.
        path_to_HydroATLAS: Path to the BasinATLAS_v10.gdb file.
        layers_from_HydroATLAS: Geodatabase layer to read.

    Returns:
        tuple: ``(fin, union_geometry, list_of_goodies)`` where ``fin`` is the
        list of category frames produced by ``split_by_categories``,
        ``union_geometry`` is the largest polygon of the union of the selected
        sub-basins and ``list_of_goodies`` is the GeoDataFrame of selected
        sub-basins. All three are NaN when no sub-basin was selected.
    """
    from shapely.ops import unary_union

    def select_big_from_MP(WS_geometry):
        # largest part of a MultiPolygon, any other geometry is passed through
        if not isinstance(WS_geometry, MultiPolygon):
            return WS_geometry
        # `.geoms` instead of iterating the geometry: works on Shapely 1.x and 2.x
        parts = list(WS_geometry.geoms)
        if not parts:
            return WS_geometry
        part_areas = [polygon_area(lats = part.exterior.coords.xy[1],
                                   lons = part.exterior.coords.xy[0])
                      for part in parts]
        return parts[int(np.argmax(part_areas))]

    def expand(template, suffixes):
        # one column name per suffix, e.g. 'tmp_dc_s{}' -> 'tmp_dc_s01', ...
        return [template.format(suffix) for suffix in suffixes]

    # two-digit suffixes used by the numbered HydroATLAS columns
    monthes = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
    land_cover_classes = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22']
    natural_vegetation = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15']
    wetland_classes = ['01', '02', '03', '04', '05', '06', '07', '08', '09']

    # Read the chosen geodatabase layer, restricted to the watershed
    gdf = gpd.read_file(path_to_HydroATLAS,
                        mask = WS.geometry[WS_index], layer=layers_from_HydroATLAS,
                        ignore_geometry=False)

    list_of_goodies = filter_HydroATLAS_sub_basins(WS.geometry[WS_index], gdf)

    if len(list_of_goodies) == 0:
        # nothing overlaps the watershed: NaN placeholders
        # (np.nan rather than the np.NaN alias, which NumPy 2.0 removed)
        return np.nan, np.nan, np.nan

    list_of_goodies = gpd.GeoDataFrame(pd.DataFrame(list_of_goodies)).set_crs('EPSG:4326').reset_index(drop = True)

    # The union is handed to select_big_from_MP directly, so a multi-part
    # union always resolves to its largest polygon.
    union_geometry = select_big_from_MP(unary_union(list(list_of_goodies.geometry)))

    # Columns are grouped by the way they are aggregated over the sub-basins:
    # classes by their mode, everything else by the mean.

    # values which will be aggregated by mean
    columns_MEAN = (['inu_pc_ult', 'lka_pc_use', 'lkv_mc_usu', 'rev_mc_usu', 'dor_pc_pva',
                     'gwt_cm_sav', 'ele_mt_sav', 'slp_dg_sav', 'sgr_dk_sav']
                    + expand('tmp_dc_s{}', monthes)
                    + expand('pre_mm_s{}', monthes)
                    + expand('pet_mm_s{}', monthes)
                    + expand('aet_mm_s{}', monthes)
                    + expand('snw_pc_s{}', monthes)
                    + expand('glc_pc_s{}', land_cover_classes)
                    + expand('pnv_pc_s{}', natural_vegetation)
                    + expand('wet_pc_s{}', wetland_classes)
                    + ['for_pc_sse', 'crp_pc_sse', 'pst_pc_sse', 'ire_pc_sse', 'gla_pc_sse',
                       'prm_pc_sse', 'cly_pc_sav', 'slt_pc_sav', 'snd_pc_sav', 'soc_th_sav',
                       'swc_pc_syr']
                    + expand('swc_pc_s{}', monthes)
                    + ['kar_pc_sse', 'ero_kh_sav', 'urb_pc_sse'])

    # values which will be aggregated by mode
    columns_MODE = ['clz_cl_smj', 'cls_cl_smj', 'glc_cl_smj', 'pnv_cl_smj',
                    'wet_cl_smj', 'tbi_cl_smj', 'tec_cl_smj', 'lit_cl_smj']

    # index values aggregated by mean; ari_ix and cmi_ix are rescaled first,
    # the hft_ix columns are averaged as they are
    cmi_columns = expand('cmi_ix_s{}', monthes)
    columns_EXTRA_CALC_MEAN = ['ari_ix_sav'] + cmi_columns + ['hft_ix_s93', 'hft_ix_s09']

    # Explicit copies are rescaled, so the selected sub-basins stay untouched
    # and no pandas chained-assignment warning has to be silenced globally.
    df_EXTRA_CALC_MEAN = list_of_goodies[columns_EXTRA_CALC_MEAN].copy()
    # aridity index is a value between 0 and 100; in the current version of
    # HydroATLAS (v 1.0) it varies between 0 and 1000
    df_EXTRA_CALC_MEAN['ari_ix_sav'] = df_EXTRA_CALC_MEAN['ari_ix_sav'] / 10
    # climate moisture index is a value between -1 and 1; in the current
    # version of HydroATLAS (v 1.0) it varies between -100 and 100
    df_EXTRA_CALC_MEAN[cmi_columns] = df_EXTRA_CALC_MEAN[cmi_columns] / 100

    df_MEAN = list_of_goodies[columns_MEAN].copy()
    # in some regions of North-West Russia the average value for January is
    # -83, so temperatures are assumed to be stored multiplied by 10
    tmp_columns = expand('tmp_dc_s{}', monthes)
    df_MEAN[tmp_columns] = df_MEAN[tmp_columns] / 10

    # Open question: maybe the "missing class" (-9999) of wetlands should be
    # kept instead of being treated as NaN.
    df_MODE = list_of_goodies[columns_MODE].replace(-9999, np.nan).mode()

    # the means are Series; turn them into one-row frames before regrouping
    fin = split_by_categories(df_ecm = df_EXTRA_CALC_MEAN.mean().to_frame().T,
                              df_me = df_MEAN.mean().to_frame().T,
                              df_mo = df_MODE)

    return fin, union_geometry, list_of_goodies

In [5]:
def parallelize_function(WS, path_to_HydroATLAS, layer_small):
    """
    Build the argument tuples for processing every watershed separately.

    Args:
        WS: GeoDataFrame of watersheds.
        path_to_HydroATLAS: Path to the BasinATLAS gdb.
        layer_small: Geodatabase layer with the smallest sub-basins.

    Returns:
        list: One ``(WS, row, path_to_HydroATLAS, layer_small)`` tuple for
        every row number of ``WS``.
    """
    return [(WS, row, path_to_HydroATLAS, layer_small)
            for row in range(len(WS))]

In [6]:
# The last layer listed in the geodatabase is taken as the layer with the
# smallest-scale sub-basins.
layer_small = fiona.listlayers(gdb_file_path)[-1]

# One (WS, WS_index, path_to_HydroATLAS, layers_from_HydroATLAS) tuple per watershed
data = parallelize_function(big_shape, gdb_file_path, layer_small)

In [7]:
# Run the HydroATLAS extraction for every watershed in parallel.

# Use half of the available CPUs so the system is not overwhelmed, but never
# fewer than one worker (a pool cannot be created with zero processes).
function_processors = max(1, mp.cpu_count() // 2)

# Every element of `data` is (WS, WS_index, path_to_HydroATLAS, layers_from_HydroATLAS).
# The context manager releases the workers even when a task fails.
with mp.Pool(function_processors) as process_pool:
    pending = [process_pool.apply_async(get_HydroATLAS_for_WS, task_args)
               for task_args in data]
    # Results are collected in submission order, so `output[i]` belongs to
    # `data[i]`, and the progress bar advances as results actually arrive.
    output = [task.get() for task in tq.tqdm(pending)]

  0%|          | 0/671 [00:00<?, ?it/s]

In [14]:
def _write_category_csv(frames, gauge_ids, csv_path, min_non_na=None):
    """Stack per-watershed frames, drop incomplete rows and write a ';'-separated CSV."""
    # Keying every frame by its gauge id keeps the id attached to its rows,
    # so rows removed by dropna() cannot shift or mismatch the identifiers.
    table = pd.concat(frames, keys=gauge_ids)
    if min_non_na is None:
        table = table.dropna()
    else:
        table = table.dropna(thresh=min_non_na)
    kept_ids = list(table.index.get_level_values(0))
    table = table.reset_index(drop=True)
    table.insert(loc=0, column='gauge_id', value=kept_ids)
    table.to_csv(csv_path, index=False, sep=';')


def save_geo_files_to_disk(shape_with_data, list_of_values, path_to_save):
    """
    Write the extracted attributes to one ';'-separated CSV per category.

    Args:
        shape_with_data: Table with a ``gauge_id`` column; row ``i`` describes
            the same watershed as ``list_of_values[i]``.
        list_of_values: One ``(fin, union_geometry, ...)`` entry per watershed.
            ``fin`` is the list of six frames ordered as hydro, physio,
            climate, landcover, soil_geo, urban; entries whose first element
            is a float (NaN, i.e. nothing was extracted) are skipped.
        path_to_save: Output directory, created when missing.

    Raises:
        ValueError: If the two inputs differ in length or no watershed has
            extracted data.

    Files written: camels_hydro.csv, camels_physio.csv, camels_climate.csv,
    camels_urban.csv, camels_landcover.csv, camels_soil_geo.csv and
    geometry_HydroATLAS_subB.csv. Rows containing NaN are dropped (landcover
    rows are kept when they hold at least 5 non-NaN values); the ``gauge_id``
    column always refers to the watershed the row was computed for.
    """
    if len(shape_with_data) != len(list_of_values):
        raise ValueError('shape_with_data and list_of_values must have the same length')

    os.makedirs(path_to_save, exist_ok=True)

    # keep only the watersheds for which attributes were extracted
    valid = [(gauge_id, entry)
             for gauge_id, entry in zip(shape_with_data.gauge_id, list_of_values)
             if not isinstance(entry[0], float)]
    if not valid:
        raise ValueError('no watershed has extracted HydroATLAS data to save')

    valid_ids = [gauge_id for gauge_id, _ in valid]

    # (position inside `fin`, file name, dropna threshold)
    categories = [
        (0, 'camels_hydro.csv', None),
        (1, 'camels_physio.csv', None),
        (2, 'camels_climate.csv', None),
        (5, 'camels_urban.csv', None),
        # landcover frames legitimately contain NaN in some columns, so only
        # rows with fewer than 5 non-NaN values are dropped
        (3, 'camels_landcover.csv', 5),
        (4, 'camels_soil_geo.csv', None),
    ]
    for position, file_name, min_non_na in categories:
        _write_category_csv(frames=[entry[0][position] for _, entry in valid],
                            gauge_ids=valid_ids,
                            csv_path='{}/{}'.format(path_to_save, file_name),
                            min_non_na=min_non_na)

    # geometry of the merged HydroATLAS sub-basins of every valid watershed
    geometry_df = gpd.GeoDataFrame({'gauge_id': valid_ids,
                                    'geometry': [entry[1] for _, entry in valid]})
    geometry_df.to_csv('{}/geometry_HydroATLAS_subB.csv'.format(path_to_save), index = False, sep=';')

In [15]:
# Write the per-category CSV tables and the sub-basin geometries to disk
save_geo_files_to_disk(big_shape, output, path_to_save)