# Dataset Flex

Create plots using the BCUB dataset.

First, we'll showcase the land cover data by filtering the Vancouver Island basins for a non-overlapping set of basins on the order of $\sim 10 \text{ km}^2$, and we'll plot a heat map to show where forest cover has changed the most.  We'll use the parquet format to demonstrate the benefit of its support for multiple geometry columns.

In the `.parquet` file, there are three geometry columns:

* **pour point**: "centroid_geometry"
* **basin polygon**: "basin_geometry"
* **basin centroid**: "geometry"

When importing with GeoPandas, we must specify the active geometry column, and we can switch between active columns by using the `set_geometry()` method.  Neat!

In [None]:
import os
import pandas as pd
import geopandas as gpd
import geoviews as gv
# import geoviews.feature as gf
# Activate the Bokeh plotting backend for GeoViews.
gv.extension('bokeh')

In [None]:
# All data paths are resolved relative to the parent of the current working
# directory (i.e. the notebook is expected to run from a sub-folder).
BASE_DIR = os.path.split(os.getcwd())[0]
# Folder holding the per-region basin parquet files.
BASIN_DIR = os.path.join(
    BASE_DIR,
    'processed_data/BCUB_files/basin_polygons/',
)

In [None]:
# Region to load; the parquet file name is derived from this code.
region_code = 'VCI'
basin_file = os.path.join(BASIN_DIR, f'{region_code}_basins.parquet')
df = gpd.read_parquet(basin_file)
print(f'There are {len(df)} basins in the {region_code} parquet file.  The active geometry column at import is "geometry" (basin centroid)')
# Reproject the active geometry column to EPSG:4326.  Only the active column
# is transformed; to work with the basin polygons instead, switch first with
# df.set_geometry('basin_geometry').
df = df.to_crs(4326)
df.head()

In [None]:
# Read the basin attribute table from CSV; it is narrowed down to the
# selected region in a later cell.
attributes_dir = os.path.join(BASE_DIR, 'processed_data/basin_attributes/')
attribute_path = os.path.join(attributes_dir, 'BCUB_attributes_20240117.csv')
print(attribute_path)
attributes = pd.read_csv(attribute_path)

In [None]:
# Report, for every region in the attribute table, how many basins it
# describes and how many of those rows contain at least one missing value.
#
# Region codes are sorted so the numbering is identical between runs (the
# iteration order of a set of strings is not), and rows with a missing
# region_code are skipped rather than reported as an empty "nan" region.
region_codes = sorted(attributes['region_code'].dropna().unique())
# Group once instead of re-scanning the whole table for every region.
region_groups = attributes.groupby('region_code')
total_basins = 0
n = 0
for n, rc in enumerate(region_codes, start=1):
    region_attrs = region_groups.get_group(rc)
    # A row counts as "missing" if any of its columns is NaN.
    n_missing = int(region_attrs.isna().any(axis=1).sum())
    total_basins += len(region_attrs)
    print(f'{n}. {rc} has attributes describing {len(region_attrs)} basins, with {n_missing} rows having missing values.')
print(f'{total_basins} basins in total.')

In [None]:
# Keep only the attribute rows belonging to `region_code`.  An explicit copy
# is taken because later cells add and modify columns on this frame; without
# it those writes target a slice of the full table and pandas emits
# SettingWithCopyWarning.
attributes = attributes[attributes['region_code'] == region_code].copy()
attributes.columns

In [None]:
# Change in forest fraction between the two land-use snapshots
# (2020 value minus 2010 value).
forest_2010 = attributes['land_use_forest_frac_2010']
forest_2020 = attributes['land_use_forest_frac_2020']
attributes['forest_change_2010_to_2020'] = forest_2020 - forest_2010

In [None]:
# Round the pour point coordinates to whole units so they can be compared
# for equality against the (identically rounded) basin coordinates.
coord_cols = ['ppt_lat_m_3005', 'ppt_lon_m_3005']
attributes[coord_cols] = attributes[coord_cols].round(0)
# Order rows by longitude first, then latitude.
attributes.sort_values(by=['ppt_lon_m_3005', 'ppt_lat_m_3005'], inplace=True)
attributes.tail()

In [None]:
# Area bounds for the basins to plot (min exclusive, max inclusive).
# NOTE(review): presumably km^2 -- confirm against the parquet schema.
max_area, min_area = 5, 2
# Attribute to map onto the basin polygons.  'forest_change_2010_to_2020' is
# the other candidate; when switching, also update the hard-coded colorbar
# label in the plotting cell.
label = 'low_prcp_duration'
# Pour point coordinates used as the join key between basins and attributes.
key_cols = ['ppt_lon_m_3005', 'ppt_lat_m_3005']

filtered_basins = df[(df['area'] <= max_area) & (df['area'] > min_area)].copy()
# Round so the coordinates compare equal to the rounded attribute coordinates.
filtered_basins[key_cols] = filtered_basins[key_cols].round(0)

# Only attribute rows with a complete, unique pour point key can be matched
# unambiguously; duplicated keys are dropped entirely (keep=False), which
# mirrors the previous "exactly one match" rule.
lookup = (
    attributes[key_cols + [label]]
    .dropna(subset=key_cols)
    .drop_duplicates(subset=key_cols, keep=False)
)
# A many-to-one left merge keeps one output row per basin, in basin order,
# so the result can be assigned back positionally.  This replaces a per-row
# scan of the whole attribute table.
matched = filtered_basins[key_cols].merge(
    lookup, on=key_cols, how='left', validate='many_to_one', indicator=True
)
filtered_basins[label] = matched[label].to_numpy()

# Report basins that could not be matched (no attribute row, or an ambiguous
# duplicate key); their `label` value is left as NaN.
n_unmatched = int((matched['_merge'] == 'left_only').sum())
if n_unmatched:
    print(f'{n_unmatched} of {len(filtered_basins)} filtered basins have no unique attribute match.')


In [None]:
# Make the basin polygons the active geometry and reproject them to
# EPSG:4326 for plotting (to_crs only transforms the active geometry column).
filtered_basins = filtered_basins.set_geometry('basin_geometry').to_crs(4326)
# Preview goes last so the notebook actually displays it.
filtered_basins.head()

In [None]:
# filtered_polygons = df.iloc[filtered_ids, :].copy()
# filtered_polygons.head()

In [None]:
# Show the value range of the plotted attribute (maximum first, then minimum).
label_values = filtered_basins[[label]]
print(label_values.max(), label_values.min())

In [None]:
# Styling for the basin polygons: fill colour is driven by the `label`
# column.  The colorbar title is hard-coded, so keep it in sync with `label`.
polygon_style = dict(
    color=label,
    cmap='RdYlGn',
    line_color=None,
    colorbar=True,
    clabel='Low precipitation duration [days]',
)
polygons_element = gv.Polygons(filtered_basins).opts(**polygon_style)
# Overlay the polygons on a light basemap.
basemap = gv.tile_sources.CartoLight()
plot = basemap * polygons_element
plot.opts(width=800, height=600)
