# Dataset Demo

Create plots using the BCUB dataset.

First, we'll showcase the land cover sets by filtering the Vancouver Island basins for a non-overlapping set of basins in the order of $\sim 10 \text{km}^2$, and we'll plot a heat map to show the greatest change in forest cover.  We'll use the parquet format to demonstrate the benefit of having multiple geometry support.

In the `.parquet` file, there are three geometry columns:

* **pour point**: "centroid_geometry"
* **basin polygon**: "basin_geometry"
* **basin centroid**: "geometry"

When importing with GeoPandas, we must specify the active geometry column, and we can switch between active columns by using the `set_geometry()` attribute function.  Neat!

In [1]:
import os
import pandas as pd
import geopandas as gpd

In [2]:
region_code = 'VCI'

In [3]:
BASE_DIR = os.path.dirname(os.getcwd())
BASIN_DIR = os.path.join(BASE_DIR, f'processed_data/derived_basins/{region_code}/')
print(BASIN_DIR)

/home/danbot2/code_5820/large_sample_hydrology/bcub/processed_data/derived_basins/VCI/


In [4]:
df = gpd.read_parquet(os.path.join(BASIN_DIR, f'{region_code}_basins_R0.parquet'))
# df = gpd.read_parquet(os.path.join(foo, f'{region_code}_basin_geometries.parquet'))
print(f'There are {len(df)} basins in the {region_code} parquet file.  The active geometry column at import is "geometry" (basin centroid)')
# reset the active geometry column to the basin polygon
df = df.set_geometry('basin_geometry')
# we need to reproject to EPSG 3857 for plotting
df = df.to_crs(4326)
df.head()

There are 20205 basins in the VCI parquet file.  The active geometry column at import is "geometry" (basin centroid)


Unnamed: 0,ID,drainage_area_km2,ppt_lon_m_3005,ppt_lat_m_3005,ppt_acc,Perimeter_km,id,Elevation_m,Aspect_deg,Slope_deg,region_code,geometry,basin_geometry,centroid_geometry
0,1,3.949954,1201799.0,427554.466512,8003,10.940573,0,66.900002,46,11.1,VCI,POINT (1201798.561 427554.467),"POLYGON ((-123.27636 48.84762, -123.27525 48.8...",POINT (1200247.022 428332.173)
1,2,288.898192,938147.0,610947.838529,586658,124.44186,1,767.0,274,21.4,VCI,POINT (938147.013 610947.839),"POLYGON ((-126.87787 50.51039, -126.87245 50.5...",POINT (947186.158 597192.641)
2,3,156.154631,942030.1,601228.988324,316972,81.185388,2,737.0,270,22.200001,VCI,POINT (942030.115 601228.988),"POLYGON ((-126.81345 50.42395, -126.79732 50.4...",POINT (947798.351 592087.202)
3,4,8.502101,870337.0,567656.795721,17258,13.441895,3,343.0,189,23.5,VCI,POINT (870336.953 567656.796),"POLYGON ((-127.82461 50.1468, -127.82278 50.14...",POINT (869586.486 569892.508)
4,5,2.328269,872733.4,565704.150018,4705,8.308658,4,255.300003,145,23.0,VCI,POINT (872733.382 565704.15),"POLYGON ((-127.78031 50.1188, -127.77857 50.11...",POINT (872524.31 567150.402)


In [5]:
# import the basin attributes for VCI
attribute_path = os.path.join(BASE_DIR, 'processed_data/basin_attributes/', 'BCUB_attributes_20240630.csv')
print(attribute_path)
attributes = pd.read_csv(attribute_path)

/home/danbot2/code_5820/large_sample_hydrology/bcub/processed_data/basin_attributes/BCUB_attributes_20240630.csv


In [6]:
total_basins = 0
region_codes = list(set(attributes['region_code']))
n = 0
for rc in region_codes:
    n += 1
    region_attrs = attributes[attributes['region_code'] == rc].copy()
    nan_rows = region_attrs[region_attrs.isna().any(axis=1)].copy()
    total_basins += len(region_attrs)
    print(f'{n}. {rc} has attributes describing {len(region_attrs)} basins, with {len(nan_rows)} rows having missing values.')
print(f'{total_basins} basins in total.')

1. HGW has attributes describing 5531 basins, with 0 rows having missing values.
2. FRA has attributes describing 167616 basins, with 0 rows having missing values.
3. WWA has attributes describing 43190 basins, with 0 rows having missing values.
4. 08C has attributes describing 38779 basins, with 0 rows having missing values.
5. LRD has attributes describing 164237 basins, with 0 rows having missing values.
6. VCI has attributes describing 20205 basins, with 0 rows having missing values.
7. 08B has attributes describing 48717 basins, with 0 rows having missing values.
8. 08G has attributes describing 23998 basins, with 0 rows having missing values.
9. PCR has attributes describing 105114 basins, with 0 rows having missing values.
10. 10E has attributes describing 70105 basins, with 0 rows having missing values.
11. 08A has attributes describing 39331 basins, with 6 rows having missing values.
12. 08D has attributes describing 19136 basins, with 0 rows having missing values.
13. CLR has

In [7]:
# filter for region_code == 'VCI'
attributes = attributes[attributes['region_code'] == region_code]
attributes.columns

Index(['id', 'region_code', 'drainage_area_km2', 'elevation_m', 'aspect_deg',
       'slope_deg', 'land_use_forest_frac_2010', 'land_use_forest_frac_2015',
       'land_use_forest_frac_2020', 'land_use_shrubs_frac_2010',
       'land_use_shrubs_frac_2015', 'land_use_shrubs_frac_2020',
       'land_use_grass_frac_2010', 'land_use_grass_frac_2015',
       'land_use_grass_frac_2020', 'land_use_wetland_frac_2010',
       'land_use_wetland_frac_2015', 'land_use_wetland_frac_2020',
       'land_use_crops_frac_2010', 'land_use_crops_frac_2015',
       'land_use_crops_frac_2020', 'land_use_urban_frac_2010',
       'land_use_urban_frac_2015', 'land_use_urban_frac_2020',
       'land_use_water_frac_2010', 'land_use_water_frac_2015',
       'land_use_water_frac_2020', 'land_use_snow_ice_frac_2010',
       'land_use_snow_ice_frac_2015', 'land_use_snow_ice_frac_2020',
       'logk_ice_x100', 'k_stdev_x100', 'porosity_x100', 'soil_flag',
       'permafrost_flag', 'prcp', 'tmax', 'tmin', 'vp', 'swe',

In [8]:
attributes['forest_change_2010_to_2020'] = attributes['land_use_forest_frac_2020'] - attributes['land_use_forest_frac_2010']

In [9]:
attributes[['ppt_lat_m_3005', 'ppt_lon_m_3005']] = attributes[['ppt_lat_m_3005', 'ppt_lon_m_3005']].round(0)
attributes.sort_values(['ppt_lon_m_3005', 'ppt_lat_m_3005'], inplace=True)
attributes.tail()

Unnamed: 0,id,region_code,drainage_area_km2,elevation_m,aspect_deg,slope_deg,land_use_forest_frac_2010,land_use_forest_frac_2015,land_use_forest_frac_2020,land_use_shrubs_frac_2010,...,high_prcp_duration,low_prcp_duration,geometry_flag,inside_pct_area_flag,outside_pct_area_flag,centroid_x,centroid_y,ppt_lon_m_3005,ppt_lat_m_3005,forest_change_2010_to_2020
28775,703834,VCI,3,192,134,16.1,84,84,84,5,...,1.017,4.171,0,0,0,1236965.0,409735.340729,1237235.0,408760.0,0
1241834,699045,VCI,5,199,131,15.8,84,85,85,5,...,1.017,4.171,0,0,0,1236775.0,409498.624962,1237257.0,408738.0,1
215146,705333,VCI,1,172,212,12.8,98,98,98,1,...,1.0,4.148,0,0,0,1237362.0,398320.138155,1237523.0,397222.0,0
1248038,690477,VCI,5,163,137,14.7,79,80,79,6,...,1.017,4.171,0,0,0,1236918.0,409378.011407,1237567.0,408117.0,0
110126,691515,VCI,3,143,105,11.7,79,79,78,6,...,1.0,4.25,0,0,0,1237962.0,410481.602015,1239365.0,409803.0,-1


In [117]:
min_area, max_area = 2, 5
label = 'forest_change_2010_to_2020'
label = 'low_prcp_duration'
filtered_basins = df[(df['drainage_area_km2'] <= max_area) & (df['drainage_area_km2'] > min_area)].copy()
filtered_basins[['ppt_lat_m_3005', 'ppt_lon_m_3005']] = filtered_basins[['ppt_lat_m_3005', 'ppt_lon_m_3005']].round(0)
for i, row in filtered_basins.iterrows():
    match = attributes[(attributes['ppt_lon_m_3005'] == row['ppt_lon_m_3005']) & (attributes['ppt_lat_m_3005'] == row['ppt_lat_m_3005'])].copy()
    if len(match) == 1:
        filtered_basins.loc[i, label] = match[label].values[0]
    else:
        print('no match found')


In [118]:
# attributes = attributes[(attributes['drainage_area_km2'] <= max_area) & (attributes['drainage_area_km2'] > min_area)].copy()
filtered_basins.head()
filtered_basins.set_geometry('basin_geometry', inplace=True)
filtered_basins = filtered_basins.to_crs(4326)

In [119]:
# filtered_polygons = df.iloc[filtered_ids, :].copy()
fb_out = filtered_basins.copy()[[c for c in filtered_basins.columns if c not in ['centroid_geometry', 'geometry']]]
fb_out.to_file('data/VCI_forest_change.geojson')

In [120]:
print(filtered_basins[[label]].max(), filtered_basins[[label]].min())

low_prcp_duration    4.795
dtype: float64 low_prcp_duration    2.977
dtype: float64


In [121]:
filtered_basins = filtered_basins.to_crs(3857)
filtered_basins.geometry = filtered_basins.simplify(200)


In [122]:
filtered_basins.columns
filtered_basins = filtered_basins[['drainage_area_km2', 'basin_geometry', label]]

In [123]:
from bokeh.models import ColorBar, LinearColorMapper, GeoJSONDataSource
from bokeh.io import output_notebook, show
from bokeh.palettes import RdYlGn
from bokeh.plotting import figure
import json
import xyzservices.providers as xyz
# Output to Jupyter Notebook
output_notebook()

In [124]:
geojson_data = json.loads(filtered_basins.to_json())
geo_source = GeoJSONDataSource(geojson=json.dumps(geojson_data))

tiles = xyz['USGS']['USTopo']
tiles = xyz['CartoDB']['Positron']

In [125]:
print(filtered_basins[label].min(), filtered_basins[label].max())

2.977 4.795


In [126]:
# Define a color mapper for continuous data
color_mapper = LinearColorMapper(
    palette=RdYlGn[9][::-1], 
    low=filtered_basins[label].min(), 
    high=filtered_basins[label].max()
)

In [129]:
p = figure(
    title="", 
    width=800, height=600,
    tools='pan,wheel_zoom,reset,hover,save', active_scroll='wheel_zoom',
    x_axis_type="mercator", y_axis_type="mercator",
    x_axis_label='Longitude', y_axis_label='Latitude',
)
p.add_tile(tiles)
# Add polygons to the plot
p.patches('xs', 'ys', source=geo_source,
          fill_color={'field': label, 'transform': color_mapper},
          fill_alpha=0.7, line_alpha=0, line_width=0.5)
# Add a color bar
color_bar = ColorBar(color_mapper=color_mapper, label_standoff=12, location=(0,0), 
                     # title=r'Forest Cover Change [%]',
                     title=r'Low Precipitation Duration [days]',
                    title_text_font_size='14pt',  # Adjust the title font size
                    title_text_font_style='bold',  # Make the title bold if desired
                     title_text_font='Bitstream Charter',
                     major_label_text_font='Bitstream Charter',
                     major_label_text_font_size='16pt',
                    )
p.xaxis.axis_label_text_font_size = '16pt'
p.yaxis.axis_label_text_font_size = '16pt'
p.xaxis.major_label_text_font_size = '16pt'
p.yaxis.major_label_text_font_size = '16pt'
p.yaxis.axis_label_text_font = "Bitstream Charter"
p.xaxis.axis_label_text_font = "Bitstream Charter"
p.xaxis.major_label_text_font = "Bitstream Charter"
p.yaxis.major_label_text_font = "Bitstream Charter"

p.add_layout(color_bar, 'right')
p.grid.visible = False

In [130]:
# Show the plot
show(p)