#### This code summarizes information about the GRanD dam dataset
Specifically this code counts number of dams within each level 12 HydroBASIN

In [4]:
#Import needed packages
import geopandas as gpd
from geopandas.tools import sjoin
from timeit import default_timer as timer
import pandas as pd


In [5]:
#HydroBASINS regions to loop through
regions = ['af','ar','as','au','eu','gr','na','sa','si']

#Read GRanD data into geodataframe
grand_file_nm = 'data/var/GRanD_Version_1_3/GRanD_dams_v1_3.shp'
grand_gdf = gpd.read_file(grand_file_nm)

#For each region do a spatial join to link GRanD dams to the basin
appending_data = []
for region in regions:
    start= timer()
    basin_file = f'data/HydroSHEDS/HydroBASINS/basins/hybas_{region}_lev12_v1c/hybas_{region}_lev12_v1c.shp'
    gdf_basins = gpd.read_file(basin_file)
    #Specifying within vs. intersection saves a lot of processing time
    spatial_join_gdf = sjoin(grand_gdf,gdf_basins,how='left',op='within')
    
    #convert to pandas so we can concat regional dataframes (which isn't available in geopandas)
    spatial_join_df = pd.DataFrame(spatial_join_gdf)
    with_data = spatial_join_df.loc[spatial_join_df['HYBAS_ID'].notnull()]
    
    appending_data.append(with_data)
    
    #Just for reference of how long each region takes to process
    proc_time = str(timer()-start) 
    print(f'Seconds taken for {region}: {proc_time}') 
    
#Concatenate spatially joined data from each region into final global dataframe
all_data = pd.concat(appending_data)

Seconds taken for af: 52.21339047500078
Seconds taken for ar: 11.36803035399862
Seconds taken for as: 36.836592231000395
Seconds taken for au: 20.20569632600018
Seconds taken for eu: 32.19158324599994
Seconds taken for gr: 4.369947941002465
Seconds taken for na: 28.387256097001227
Seconds taken for sa: 31.922468537002715
Seconds taken for si: 23.20956921800098


In [7]:
#Select id columns and columns we would like to summarize
summarize_prep = all_data[['PFAF_ID','HYBAS_ID','CAP_MCM']]
#add count field that we can use to count dams per basin
summarize_prep['count'] = 1
#sum number of dams and max storage capacity of reservoirs for each basin with GRanD dams
summarize_df = summarize_prep.groupby(['PFAF_ID','HYBAS_ID']).sum()
summarize_df.reset_index(inplace=True)
summarize_df = summarize_df.rename(columns={'count':'grand_dam_count','CAP_MCM':'grand_cap_mcm_sum','PFAF_ID':'pfaf_12'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [8]:
#Export to csv
outfile_name = 'output/global_hb12_grand_stats.csv'
summarize_df.to_csv(outfile_name, sep=',')

In [10]:
#Test to make sure all dams are accounted for.  Total dam count should equal number of dam records in initial GRanD geodataframe
summarize_df['all']=1
test_count = summarize_df.groupby(['all']).sum()
num_grand_dams, num_col = grand_gdf.shape
summary_count_of_dams = test_count.iloc[0]['grand_dam_count']
if num_grand_dams == summary_count_of_dams:
    print ('All dams accounted for!  YAY!!!')
elif num_grand_dams < summary_count_of_dams:
    num_dif = str(summary_count_of_dams-num_grand_dams)
    print (f'Count is higher than number of dams in GRanD. {num_dif} to many dams in count.')
elif num_grand_dams > summary_count_of_dams:
    num_dif = str(num_grand_dams-summary_count_of_dams)
    print (f'Count is lower than number of dams in GRanD. {num_dif} dams not accounted for.')

All dams accounted for!  YAY!!!


In [None]:
#test to make sure there is a 1:1 relation between HYBAS_ID and PFAF_ID in level 12 basins
for region in regions:
    start= timer()
    basin_file = f'data/HydroSHEDS/HydroBASINS/basins/hybas_{region}_lev12_v1c/hybas_{region}_lev12_v1c.shp'
    gdf_basins = gpd.read_file(basin_file)
    test =gdf_basins[['HYBAS_ID','PFAF_ID']]
    print(any(summarize_df['HYBAS_ID'].duplicated()))