## Motivation & Set-Up

### My goal is to compare different methods for selecting the optimal number of k-means clusters across two different ecological regions. 


In [63]:
import os       # file paths
import pickle   # for saving/loading data
import re       # regular expressions
import warnings # for ignoring warnings


import cartopy.crs as ccrs  # plotting maps
import earthaccess          # accessing Earth data API
import earthpy as et        # working with spatial data
import geopandas as gpd     # working with geospatial data
import geoviews as gv       # interactive geospatial visualizations
import holoviews as hv      # interactive plotting
import hvplot.pandas        # plotting with pandas dataframes
import hvplot.xarray        # plotting with xarray datasets
hv.extension('bokeh')
import pyproj
import pyogrio              # working with geospatial vector data   

import numpy as np          # working with arrays
import pandas as pd         # working with dataframes
import rioxarray as rxr     # working with geospatial raster data
import rioxarray.merge as rxrmerge  # merging geospatial raster data
from tqdm.notebook import tqdm      # progress bars
from ipywidgets import IntProgress  # progress bars
from IPython.display import display     # displaying widgets
import xarray as xr                 # working with multi-dimensional arrays
from shapely.geometry import Polygon    # working with geometric shapes
from sklearn.cluster import KMeans      # for clustering data
from sklearn.preprocessing import StandardScaler    # scaling data
from kneed import KneeLocator           # working with elbow test

import requests
import zipfile

### GDAL HTTP settings: retry count and delay between retries for remote reads
os.environ.update(
    GDAL_HTTP_MAX_RETRY="5",
    GDAL_HTTP_RETRY_DELAY="1",
)

### silence all warnings for the rest of the session
warnings.simplefilter(action='ignore')

> Set directory paths and caching decorator

In [2]:
# root folder for everything this notebook caches or downloads;
# edit the sub-folder names to match your local file layout
cache_dir = os.path.join(
    et.io.HOME, *("Data", "Earth Analytics", "clustering-portfolio"))

In [3]:
### make the caching decorator
def cached(func_key, override=False):
    """
    A decorator to cache function results as pickle files

    Results are stored in ``<cache_dir>/jars/<key>.pickle`` where
    ``cache_dir`` is the notebook-level cache directory and ``<key>`` is
    ``func_key``, optionally suffixed with the call's ``cache_key`` keyword.

    Parameters
    ==========
    func_key: str
      File basename used to save pickled results
    override: bool
      When True, re-compute even if the results are already stored

    Returns
    =======
    function
      A decorator that adds load-or-compute behaviour to a function
    """
    ### imported here so this cell works without editing the import cell
    import functools

    def compute_and_cache_decorator(compute_function):
        """
        Wrap the compute function with caching

        Parameters
        ==========
        compute_function: function
          The function to run and cache results
        """
        ### keep the wrapped function's name and docstring
        @functools.wraps(compute_function)
        def compute_and_cache(*args, **kwargs):
            """
            Perform a computation and cache, or load cached result.

            Parameters
            ==========
            args
              Positional arguments for the compute function
            kwargs
              Keyword arguments for the compute function. If a non-None
              'cache_key' is present it is appended to the cache file name;
              all kwargs (including 'cache_key') are still passed on to the
              compute function.
            """
            ### Add an identifier from the particular function call.
            ### An explicit cache_key=None means "no identifier", and
            ### non-string identifiers are converted so the join cannot fail
            cache_key = kwargs.get('cache_key')
            if cache_key is not None:
                key = '_'.join((func_key, str(cache_key)))
            else:
                key = func_key

            ### pickle file lives in the "jars" sub-directory of cache_dir
            path = os.path.join(cache_dir, 'jars', f'{key}.pickle')

            ### Load the cached result unless we were asked to override it
            ### NOTE: pickle.load can execute code; only use cache files
            ### that this notebook wrote itself
            if os.path.exists(path) and not override:
                with open(path, 'rb') as file:
                    return pickle.load(file)

            ### Make jars directory if needed
            os.makedirs(os.path.dirname(path), exist_ok=True)

            ### Run the compute function as the user did
            result = compute_function(*args, **kwargs)

            ### Pickle to a temporary file first and move it into place only
            ### once the dump succeeded, so a failed dump can never leave a
            ### truncated cache file that breaks later loads
            tmp_path = f'{path}.tmp'
            try:
                with open(tmp_path, 'wb') as file:
                    pickle.dump(result, file)
                os.replace(tmp_path, path)
            finally:
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)

            return result

        return compute_and_cache

    return compute_and_cache_decorator

## WBD (Watershed Boundary Dataset) Boundaries

In [9]:
# hydrologic unit level to load; selects the WBDHU<level> shapefile and the
# huc<level> column used below
HUC_LEVEL = 12

# local folder that receives the downloaded archive and its extracted files
download_dir = os.path.join(cache_dir, "downloads", "WBD_08_HU2_Shape")
os.makedirs(download_dir, exist_ok=True)

### NOTE(review): override=True forces a recompute on every call, so the
### pickle written by the decorator is never read back -- confirm whether
### this is intentional or left over from debugging
@cached(f'wpd_08_hu{HUC_LEVEL}_gdf', override=True)
def read_wpd(wpd_filename, cache_key=None):
    """
    Download a WBD shapefile archive into `download_dir` and read it

    The archive is downloaded only when it is not already on disk; it is
    then extracted into `download_dir` and the `WBDHU{HUC_LEVEL}.shp` layer
    inside the extracted "Shape" folder is read.

    Parameters
    ==========
    wpd_filename: str
      Basename (without ".zip") of the archive to fetch from the staged
      products URL built below
    cache_key: str, optional
      Identifier the `cached` decorator appends to the pickle file name;
      not used inside this function

    Returns
    =======
    gpd.GeoDataFrame
      Contents of the WBDHU{HUC_LEVEL} shapefile

    Raises
    ======
    requests.HTTPError
      If the download request returns an error status
    """

    # URL to zip file
    url = f"https://prd-tnm.s3.amazonaws.com/StagedProducts/Hydrography/WBD/HU2/Shape/{wpd_filename}.zip"

    # local zip path
    zip_path = os.path.join(download_dir, f"{wpd_filename}.zip")

    # download if not already downloaded
    if not os.path.exists(zip_path):

        # stream into a ".part" file and rename only after the download
        # finished: an interrupted download must not leave a truncated zip
        # behind, because the existence check above would never retry it
        part_path = f"{zip_path}.part"
        try:
            # timeout = (connect, read) seconds so a stalled server cannot
            # hang the notebook forever
            with requests.get(url, stream=True, timeout=(10, 300)) as r:
                r.raise_for_status()
                with open(part_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
            os.replace(part_path, zip_path)
        finally:
            if os.path.exists(part_path):
                os.remove(part_path)

    # unzip into download_dir
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(download_dir)

    # shapefile path inside the extracted folder
    shp_path = os.path.join(download_dir, "Shape", f"WBDHU{HUC_LEVEL}.shp")

    # read shapefile
    gdf = gpd.read_file(shp_path, engine="pyogrio")
    return gdf

In [None]:
# download (if needed) and read the WBD_08_HU2_Shape archive, then display
# the resulting GeoDataFrame to confirm it loaded
wpd_gdf = read_wpd("WBD_08_HU2_Shape")
wpd_gdf

Unnamed: 0,tnmid,metasource,sourcedata,sourceorig,sourcefeat,loaddate,referenceg,areaacres,areasqkm,states,...,name,hutype,humod,tohuc,noncontrib,noncontr_1,shape_Leng,shape_Area,ObjectID,geometry
0,{8AFB1AF9-7296-4303-89DE-14CD073B859A},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,535297540579,29441.81,119.15,LA,...,Gourd Bayou-Youngs Bayou,S,"LE,ID,DD",080500011308,0.0,0.0,,,1,"POLYGON ((-92.00021 32.53586, -91.99994 32.535..."
1,{916A17A6-B4A0-4FD7-9BB8-FFD1936B15B2},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,535512,11406.67,46.16,LA,...,Hams Creek,S,ID,080802050104,0.0,0.0,,,2,"POLYGON ((-93.37574 30.58982, -93.3747 30.5891..."
2,{493C7EC1-2F1C-4B84-AFFB-6F6868A9868E},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,547190559640,29138.21,117.92,LA,...,Caney Creek-Bayou D'Arbonne,S,NM,080402060503,0.0,0.0,,,3,"POLYGON ((-93.07761 32.88752, -93.07784 32.887..."
3,{49A3C087-B460-4F97-9D99-78CBB675248B},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,7741778285,17759.39,71.87,AR,...,L'Aigle Creek-Saline River,S,NM,080402020206,0.0,0.0,,,4,"POLYGON ((-92.08947 33.29383, -92.0897 33.2938..."
4,{0FB41498-11EA-4AB1-AF05-E2A8E5E2E274},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,1628466,98564.62,398.88,LA,...,West Cote Blanche Bay,W,NM,080801030800,0.0,0.0,,,5,"POLYGON ((-91.62408 29.73947, -91.62195 29.737..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2636,{9E524E78-2605-48CB-A41F-618AFCDF513D},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,695171692611,9132.46,36.96,MS,...,Widow Bayou-Big Sunflower River,S,"AW,LE,TF",080302071707,0.0,0.0,,,2637,"POLYGON ((-90.76273 32.97428, -90.76209 32.973..."
2637,{C11913D9-C534-4755-884C-4CAD470ED143},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,7728081842,20587.50,83.31,AR,...,Lindermans Lake-Bayou Des Arc,S,LE,080203010306,0.0,0.0,,,2638,"POLYGON ((-91.73427 34.99197, -91.7342 34.9923..."
2638,{3EEBF422-01AC-4322-A63C-24C0A34E1E4F},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,714675,21405.46,86.62,MO,...,Brewer Lake,S,"LE,DD,IT,TF",080103000102,0.0,0.0,,,2639,"POLYGON ((-89.13715 36.97285, -89.13387 36.970..."
2639,{4734715C-0F4A-4211-BBAE-86605B20B79A},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,547104553857,38848.09,157.21,LA,...,Blounts Creek-Calcasieu River,S,ID,080802030302,0.0,0.0,,,2640,"POLYGON ((-92.75965 31.12593, -92.75881 31.125..."


In [15]:
# list the available columns (the HUC code column is used for filtering below)
wpd_gdf.columns

Index(['tnmid', 'metasource', 'sourcedata', 'sourceorig', 'sourcefeat',
       'loaddate', 'referenceg', 'areaacres', 'areasqkm', 'states', 'huc12',
       'name', 'hutype', 'humod', 'tohuc', 'noncontrib', 'noncontr_1',
       'shape_Leng', 'shape_Area', 'ObjectID', 'geometry'],
      dtype='object')

> The HUC 12 area we initially considered is 080902030506, just south of New Orleans. I plan to pick a nearby region within the same larger HUC 8 area, 08090203, but first I need to add a huc8 column.

In [16]:
# the HUC8 code is the first 8 characters of the HUC12 code
wpd_gdf['huc8'] = wpd_gdf['huc12'].str.slice(0, 8)


In [20]:
# keep only the HUC12 rows that belong to the HUC8 region of interest
huc8_code = "08090203"
wpd_sub = wpd_gdf.loc[wpd_gdf['huc8'].eq(huc8_code)]

wpd_sub

Unnamed: 0,tnmid,metasource,sourcedata,sourceorig,sourcefeat,loaddate,referenceg,areaacres,areasqkm,states,...,hutype,humod,tohuc,noncontrib,noncontr_1,shape_Leng,shape_Area,ObjectID,geometry,huc8
13,{05D9DFD7-8B31-43D9-89E9-753B0BD0BF7E},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,1628039559852,14549.47,58.88,LA,...,D,"GC,TF",080902030702,0.0,0.0,,,14,"POLYGON ((-89.75972 29.84024, -89.75749 29.840...",8090203
139,{A8E00DBB-488D-4A09-AB70-ACE37E07E7DD},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,5583061629919,32730.99,132.46,LA,...,F,NM,080902030400,0.0,0.0,,,140,"POLYGON ((-89.62701 30.14982, -89.62697 30.149...",8090203
228,{ABA4F6E1-8607-47AD-9476-0431E5779706},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,558302560269,24313.37,98.39,LA,...,F,GC,080902030800,0.0,0.0,,,229,"POLYGON ((-89.70818 29.72359, -89.70676 29.723...",8090203
555,{FF72496C-9776-4025-B913-F3F0A6B808FD},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,558302,23652.9,95.72,LA,...,D,"LE,GC",080902030505,0.0,0.0,,,556,"POLYGON ((-89.98997 29.77, -89.98945 29.76865,...",8090203
717,{FEC68B1D-76FD-4C80-816B-A70A7615DA41},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,534652533690,25536.55,103.34,LA,...,S,"LE,TF,UA",080902030103,0.0,0.0,,,718,"POLYGON ((-90.15505 30.01946, -90.15644 29.959...",8090203
915,{FA42A3B0-3B37-48DA-9C60-F64A2455B809},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,560223,23095.47,93.46,LA,...,D,"BC,GC",080902030800,0.0,0.0,,,916,"POLYGON ((-89.60389 29.82618, -89.60091 29.824...",8090203
947,{910C4369-7D03-4B59-BB84-A53822D1489D},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,554577532674,39972.91,161.76,LA,...,D,GC,080902030702,0.0,0.0,,,948,"POLYGON ((-89.79177 29.7609, -89.78905 29.7594...",8090203
1016,{57EE97C1-00EF-4288-BFCE-17FB7205BBD1},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,16273251628217,39165.11,158.5,LA,...,W,NM,080902030800,0.0,0.0,,,1017,"POLYGON ((-89.27418 30.02194, -89.27501 30.025...",8090203
1066,{C4358575-2325-4BCA-97F4-F8B894DD2FF6},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,532770559668,27204.0,110.09,LA,...,F,"LE,BC,TF",080902030302,0.0,0.0,,,1067,"POLYGON ((-89.94415 30.0758, -89.9409 30.07838...",8090203
1472,{5D9D29E6-DCCB-49B4-9174-E3FDDA29E869},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,559872560760,10652.63,43.11,LA,...,D,NM,080902030603,0.0,0.0,,,1473,"POLYGON ((-89.45708 29.97146, -89.45493 29.970...",8090203


In [None]:
# original huc12 code and the two candidate regions to highlight
past_huc_code = "080902030506"
candidate_huc_codes = ("080902030502", "080902030501")

# fill colour per highlighted HUC12; every other region falls back to lightgrey
highlight_colors = {
    past_huc_code: 'grey',
    **dict.fromkeys(candidate_huc_codes, 'yellow'),
}

# separate gdf for plotting so wpd_sub itself is not modified
wpd_plot = wpd_sub.copy()
wpd_plot['color'] = wpd_plot['huc12'].map(highlight_colors).fillna('lightgrey')

# convert the lon/lat corners of the view window to EPSG:3857, the
# coordinates used for xlim/ylim below
project = pyproj.Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True)

x0, y0 = project.transform(-90.1, 29.5)
x1, y1 = project.transform(-89.6, 30.0)

hv_map = wpd_plot.hvplot(
    geo=True,
    color='color',        # use the color column
    line_color='black',
    line_width=0.5,
    alpha = .4,
    width=600,
    height=600,
    tiles = 'EsriImagery',
    hover_cols=['huc12']

# lat and long bounds
).opts(
    xlim = (x0, x1),
    ylim = (y0, y1)
)

hv_map

> The HUC12 regions north of our original area of study will be my focus. Region 080902030502 seems most similar, but likely with a higher percentage of land cover. Region 080902030501 looks more urban, which could be interesting as well once the workflow is established. I anticipate '01' will have the highest variability between clusters and between runs of k-means. '02' will be the primary focus for evaluating k-means selection, since I believe it will have slightly more variability than the more water-dominated '06' region initially considered. 

In [43]:
### select the row(s) of the original study watershed, then "dissolve" to
### merge the geometries of all matching rows into a single feature
delta6_gdf = (
    wpd_gdf
    .loc[wpd_gdf[f'huc{HUC_LEVEL}'].isin(['080902030506'])]
    .dissolve()
)

delta6_gdf

Unnamed: 0,geometry,tnmid,metasource,sourcedata,sourceorig,sourcefeat,loaddate,referenceg,areaacres,areasqkm,...,name,hutype,humod,tohuc,noncontrib,noncontr_1,shape_Leng,shape_Area,ObjectID,huc8
0,"POLYGON ((-89.97047 29.74687, -89.96593 29.750...",{E942B72E-599E-48F5-908A-EA5265701C14},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,536881539539,37355.86,151.17,...,Manuel Canal-Spanish Lake,D,GC,80902030508,0.0,0.0,,,2560,8090203


In [44]:
### select the row(s) of the primary focus watershed, then "dissolve" to
### merge the geometries of all matching rows into a single feature
delta2_gdf = (
    wpd_gdf
    .loc[wpd_gdf[f'huc{HUC_LEVEL}'].isin(['080902030502'])]
    .dissolve()
)

delta2_gdf

Unnamed: 0,geometry,tnmid,metasource,sourcedata,sourceorig,sourcefeat,loaddate,referenceg,areaacres,areasqkm,...,name,hutype,humod,tohuc,noncontrib,noncontr_1,shape_Leng,shape_Area,ObjectID,huc8
0,"POLYGON ((-89.90424 29.85416, -89.87284 29.854...",{B32BD6BC-5CAE-4E86-84C0-86BA836C14BD},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,560529,17903.81,72.45,...,Creedmore Canal-Fourty Arpent Canal,D,"LE,TF,GC",80902030505,0.0,0.0,,,1631,8090203


In [None]:
### select the row(s) of the more urban watershed, then "dissolve" to
### merge the geometries of all matching rows into a single feature
urban_gdf = (
    wpd_gdf
    .loc[wpd_gdf[f'huc{HUC_LEVEL}'].isin(['080902030501'])]
    .dissolve()
)

urban_gdf

Unnamed: 0,geometry,tnmid,metasource,sourcedata,sourceorig,sourcefeat,loaddate,referenceg,areaacres,areasqkm,...,name,hutype,humod,tohuc,noncontrib,noncontr_1,shape_Leng,shape_Area,ObjectID,huc8
0,"POLYGON ((-90.02668 29.9636, -90.02595 29.9663...",{31101660-B283-4D40-8AC3-62A1A2487FD4},{511D2AC8-11BA-45FC-AB98-F69D693D4C44},Watershed Boundary Dataset (WBD),Natural Resources and Conservation Service and...,,2024-08-15,560090,20037.95,81.09,...,Bayou Terre aux Boeufs-Fourty Arpent Canal,D,"LE,TF",80902030502,0.0,0.0,,,1857,8090203


## Multi-Spectral Data

In [33]:
### Log in to earthaccess (needed before searching for / opening granules)
### NOTE(review): persist=True presumably stores the credentials for later
### sessions -- confirm against the earthaccess documentation
earthaccess.login(persist = True)

<earthaccess.auth.Auth at 0x1175f14d0>

In [45]:
### Search for the HLS granules that cover the focus watershed
results = earthaccess.search_data(
    short_name='HLSL30',                            # which HLS dataset to search
    cloud_hosted=True,                              # cloud-hosted data only
    bounding_box=tuple(delta2_gdf.total_bounds),    # extent of the watershed boundary
    temporal=('2024-06', '2024-08'),                # date range of the imagery
)

In [46]:
# display the granule search results
results

[Collection: {'EntryTitle': 'HLS Landsat Operational Land Imager Surface Reflectance and TOA Brightness Daily Global 30m v2.0'}
 Spatial coverage: {'HorizontalSpatialDomain': {'Geometry': {'GPolygons': [{'Boundary': {'Points': [{'Longitude': -89.79864173, 'Latitude': 29.70347853}, {'Longitude': -89.76643746, 'Latitude': 30.69278312}, {'Longitude': -90.91181412, 'Latitude': 30.71627038}, {'Longitude': -90.93262544, 'Latitude': 29.72659663}, {'Longitude': -89.79864173, 'Latitude': 29.70347853}]}}]}}}
 Temporal coverage: {'RangeDateTime': {'BeginningDateTime': '2024-06-07T16:31:11.509Z', 'EndingDateTime': '2024-06-07T16:31:11.509Z'}}
 Size(MB): 169.50417041778564
 Data: ['https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSL30.020/HLS.L30.T15RYP.2024159T163111.v2.0/HLS.L30.T15RYP.2024159T163111.v2.0.B10.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSL30.020/HLS.L30.T15RYP.2024159T163111.v2.0/HLS.L30.T15RYP.2024159T163111.v2.0.SAA.tif', 'https://data.l

In [39]:
### make a function to process all the granules from the earthaccess search
### and extract information for each granule
def get_earthaccess_links(results):
    """
    Open every granule and tabulate one row per raster file it contains

    Parameters
    ==========
    results: list
      Granule search results (as returned by earthaccess.search_data);
      each item must expose its metadata under the 'umm' key

    Returns
    =======
    gpd.GeoDataFrame
      One row per .tif file whose name matches the tile/band pattern, with
      columns datetime (granule start time), tile_id, band, url (the opened
      file-like object returned by earthaccess.open) and geometry (granule
      footprint polygon, EPSG:4326)

    Raises
    ======
    ValueError
      If no file in any granule matches the expected file name pattern
    """

    ### make and display a progress bar
    progress_bar = IntProgress(min=0, max = len(results), description='Open granules:')
    display(progress_bar)

    ### use a regular expression to extract tile_id and band from .tif files
    url_re = re.compile(
        r'\.(?P<tile_id>\w+)\.\d+T\d+\.v\d\.\d\.(?P<band>[A-Za-z0-9]+)\.tif')

    ### accumulate one record (dict) per file; the table is built once at
    ### the end instead of concatenating many one-row GeoDataFrames
    link_records = []

    ### loop over granules to extract info
    for granule in results:

        ### locate metadata (UMM = universal metadata model)
        info_dict = granule['umm']

        ### extract date/time
        granule_datetime = pd.to_datetime(
            info_dict['TemporalExtent']['RangeDateTime']['BeginningDateTime']
        )

        ### extract boundary coordinates for granule
        points = (
            info_dict
            ['SpatialExtent']['HorizontalSpatialDomain']['Geometry']['GPolygons'][0]
            ['Boundary']['Points']
        )

        ### make polygon using coordinate points for granule
        geometry = Polygon(
            [(point['Longitude'],
              point['Latitude']) for point in points]
        )

        ### open the granule's files
        files = earthaccess.open([granule])

        ### loop through each file in the granule
        for file in files:

            ### parse tile id and band out of the file name
            match = url_re.search(file.full_name)

            ### files that do not match the pattern are skipped
            if match is not None:
                link_records.append(
                    dict(
                        # timestamp
                        datetime = granule_datetime,

                        # unique tile ID
                        tile_id = match.group('tile_id'),

                        # band
                        band = match.group('band'),

                        # opened file-like object for the raster
                        url = file,

                        # polygon
                        geometry = geometry,
                    )
                )

        ### update progress bar after each granule is done
        progress_bar.value += 1

    ### fail with a clear message instead of an opaque concat/constructor error
    if not link_records:
        raise ValueError(
            'No raster files matching the expected file name pattern were '
            'found in the search results'
        )

    ### combine into a single gdf and return it
    return gpd.GeoDataFrame(link_records, geometry='geometry', crs='EPSG:4326')

In [40]:
# inspect the first granule from the search results
test_granule = results[0]

test_granule

In [41]:
# look at the granule's 'umm' metadata dictionary (the same structure
# get_earthaccess_links reads the date, footprint and id from)
info_dict = test_granule['umm']
info_dict

{'TemporalExtent': {'RangeDateTime': {'BeginningDateTime': '2024-06-07T16:31:11.509Z',
   'EndingDateTime': '2024-06-07T16:31:11.509Z'}},
 'GranuleUR': 'HLS.L30.T15RYP.2024159T163111.v2.0',
 'AdditionalAttributes': [{'Name': 'LANDSAT_PRODUCT_ID',
   'Values': ['LC08_L1TP_022039_20240607_20240607_02_RT']},
  {'Name': 'CLOUD_COVERAGE', 'Values': ['10']},
  {'Name': 'MGRS_TILE_ID', 'Values': ['15RYP']},
  {'Name': 'SPATIAL_COVERAGE', 'Values': ['100']},
  {'Name': 'SPATIAL_RESOLUTION', 'Values': ['30.0']},
  {'Name': 'SPATIAL_RESAMPLING_ALG', 'Values': ['Cubic Convolution']},
  {'Name': 'HLS_PROCESSING_TIME', 'Values': ['2024-06-21T18:26:46Z']},
  {'Name': 'SENSING_TIME', 'Values': ['2024-06-07T16:31:11.5093050Z']},
  {'Name': 'HORIZONTAL_CS_NAME', 'Values': ['UTM, WGS84, UTM ZONE 15']},
  {'Name': 'ULX', 'Values': ['699960.0']},
  {'Name': 'ULY', 'Values': ['3400020.0']},
  {'Name': 'ADD_OFFSET', 'Values': ['0']},
  {'Name': 'REF_SCALE_FACTOR', 'Values': ['0.0001']},
  {'Name': 'THERM_SC

In [None]:
# build the granule link table once and keep a copy on disk:
# later runs read the saved GeoPackage instead of re-opening every granule
# NOTE(review): the `url` column holds file-like objects -- confirm it
# survives the round trip through GPKG in a usable form
links_path = os.path.join(cache_dir, "downloads", 'links_file_df.gpkg')

if not os.path.exists(links_path):
    links_file_df = get_earthaccess_links(results)
    os.makedirs(os.path.dirname(links_path), exist_ok=True)
    links_file_df.to_file(links_path, driver = "GPKG")
else:
    links_file_df = gpd.read_file(links_path)

In [48]:
# preview the first rows of the link table
links_file_df.head()

Unnamed: 0,datetime,tile_id,band,url,geometry
0,2024-06-07 16:31:11.509000+00:00,T15RYP,B10,"<File-like object HTTPFileSystem, https://data...","POLYGON ((-89.79864 29.70348, -89.76644 30.692..."
1,2024-06-07 16:31:11.509000+00:00,T15RYP,SAA,"<File-like object HTTPFileSystem, https://data...","POLYGON ((-89.79864 29.70348, -89.76644 30.692..."
2,2024-06-07 16:31:11.509000+00:00,T15RYP,VZA,"<File-like object HTTPFileSystem, https://data...","POLYGON ((-89.79864 29.70348, -89.76644 30.692..."
3,2024-06-07 16:31:11.509000+00:00,T15RYP,B06,"<File-like object HTTPFileSystem, https://data...","POLYGON ((-89.79864 29.70348, -89.76644 30.692..."
4,2024-06-07 16:31:11.509000+00:00,T15RYP,B09,"<File-like object HTTPFileSystem, https://data...","POLYGON ((-89.79864 29.70348, -89.76644 30.692..."


In [49]:
### compute reflectance rasters from the search results and a watershed
### boundary, caching the result with the `cached` decorator.
### NOTE: the default cache file name does not depend on the boundary, so
### pass cache_key=<name> when calling with a different boundary; otherwise
### the result cached for the first boundary is returned again
@cached('delta_reflectance_da_df')
def compute_reflectance_da(search_results, boundary_gdf, cache_key=None):
    """
    Open every spectral band of every granule, crop it to the boundary's
    bounding box, and apply the granule's cloud mask.

    Parameters
    ==========
    search_results: list
        Granule search results, passed on to get_earthaccess_links
    boundary_gdf: gpd.GeoDataFrame
        Boundary whose bounding box is used to crop the data
    cache_key: str, optional
        Identifier the `cached` decorator appends to the cache file name;
        not used in the computation itself

    Returns
    =======
    pd.DataFrame
        One row per spectral band file (bands whose name starts with 'B'),
        holding the link-table columns plus a `da` column with the cropped,
        scaled, cloud-masked DataArray
    """

    ### open raster from url, apply scale factor, and crop to the boundary
    def open_dataarray(url, scale = 1, masked = True):
        """Open a raster, multiply it by `scale`, and crop it to the boundary's bounding box"""

        # open raster data
        da = rxr.open_rasterio(url, masked = masked).squeeze() * scale

        # a raster's CRS is only known once it is open, so the boundary is
        # reprojected to match each raster individually
        bounds = boundary_gdf.to_crs(da.rio.crs).total_bounds

        # crop raster to bounding box
        return da.rio.clip_box(*bounds)

    ### build a boolean mask from the quality (Fmask) raster
    def compute_quality_mask(da, mask_bits = (1, 2, 3)):
        """Return True where all of the selected quality bits are zero"""

        # unpack each 8-bit value into its individual bits (least
        # significant bit first) along a new trailing axis
        bits = np.unpackbits(
            da.astype(np.uint8).values,
            bitorder = 'little'
        ).reshape(da.shape + (-1,))

        # select desired bits (a list is needed here: a tuple would be
        # interpreted as a multi-dimensional index)
        selected_bits = bits[..., list(mask_bits)]

        # pixel is kept only when all selected bits are zero
        return np.all(selected_bits == 0, axis=-1)

    ### grab metadata
    file_df = get_earthaccess_links(search_results)

    # one single-row frame per processed band
    granule_da_rows = []

    # group the files by granule (datetime and tile_id)
    group_iter = file_df.groupby(['datetime', 'tile_id'])

    ## loop through each granule and its files
    for (granule_datetime, tile_id), granule_df in tqdm(group_iter):

        # print status
        print(f'Processing granule {tile_id} {granule_datetime}')

        # find the granule's cloud mask file (Fmask)
        cloud_mask_url = (
            granule_df.loc[granule_df.band == 'Fmask', 'url']
            .values[0])

        # open the cloud mask raster unmasked and unscaled so its bits stay intact
        fmask_da = open_dataarray(cloud_mask_url, masked = False)

        ### compute cloud mask
        cloud_mask = compute_quality_mask(fmask_da)

        # loop through each file in the granule
        for _, row in granule_df.iterrows():

            # only process spectral bands
            if row.band.startswith('B'):

                # open band's raster and scale to reflectance data
                band_cropped = open_dataarray(row.url, scale = .0001)

                # name the raster by the band
                band_cropped.name = row.band

                # apply the cloud mask to the raster
                row['da'] = band_cropped.where(cloud_mask)

                ### append the row to granule_da_rows
                granule_da_rows.append(row.to_frame().T)

    ### reassemble the metadata df
    return pd.concat(granule_da_rows)


In [50]:
# build (or load from the pickle cache) the reflectance table for the
# delta2 watershed
reflectance_da_df = compute_reflectance_da(results, delta2_gdf)

IntProgress(value=0, description='Open granules:', max=44)

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Processing granule T15RYN 2024-06-07 16:31:11.509000+00:00
Processing granule T15RYP 2024-06-07 16:31:11.509000+00:00
Processing granule T16RBT 2024-06-07 16:31:11.509000+00:00
Processing granule T16RBU 2024-06-07 16:31:11.509000+00:00
Processing granule T15RYN 2024-06-15 16:31:19.154000+00:00
Processing granule T15RYP 2024-06-15 16:31:19.154000+00:00
Processing granule T16RBT 2024-06-15 16:31:19.154000+00:00
Processing granule T16RBU 2024-06-15 16:31:19.154000+00:00
Processing granule T15RYN 2024-06-23 16:31:21.277000+00:00
Processing granule T15RYP 2024-06-23 16:31:21.277000+00:00
Processing granule T16RBT 2024-06-23 16:31:21.277000+00:00
Processing granule T16RBU 2024-06-23 16:31:21.277000+00:00
Processing granule T15RYN 2024-07-01 16:31:17.338000+00:00
Processing granule T15RYP 2024-07-01 16:31:17.338000+00:00
Processing granule T16RBT 2024-07-01 16:31:17.338000+00:00
Processing granule T16RBU 2024-07-01 16:31:17.338000+00:00
Processing granule T15RYN 2024-07-09 16:31:29.187000+00:

In [52]:
# preview the first rows of the per-granule reflectance table
reflectance_da_df.head()

Unnamed: 0,datetime,tile_id,band,url,geometry,da
45,2024-06-07 16:31:11.509000+00:00,T15RYN,B04,"<File-like object HTTPFileSystem, https://data...","POLYGON ((-89.82661214 28.80213717, -89.795837...",[[<xarray.DataArray 'B04' ()> Size: 4B\narray(...
48,2024-06-07 16:31:11.509000+00:00,T15RYN,B06,"<File-like object HTTPFileSystem, https://data...","POLYGON ((-89.82661214 28.80213717, -89.795837...",[[<xarray.DataArray 'B06' ()> Size: 4B\narray(...
49,2024-06-07 16:31:11.509000+00:00,T15RYN,B03,"<File-like object HTTPFileSystem, https://data...","POLYGON ((-89.82661214 28.80213717, -89.795837...",[[<xarray.DataArray 'B03' ()> Size: 4B\narray(...
50,2024-06-07 16:31:11.509000+00:00,T15RYN,B07,"<File-like object HTTPFileSystem, https://data...","POLYGON ((-89.82661214 28.80213717, -89.795837...",[[<xarray.DataArray 'B07' ()> Size: 4B\narray(...
51,2024-06-07 16:31:11.509000+00:00,T15RYN,B02,"<File-like object HTTPFileSystem, https://data...","POLYGON ((-89.82661214 28.80213717, -89.795837...",[[<xarray.DataArray 'B02' ()> Size: 4B\narray(...


## Merge Temporal Data

In [53]:
### merge granules per acquisition date, then composite across dates
### end result: single, composite reflectance image for each spectral band
### NOTE(review): `cached` is defined elsewhere in this notebook; presumably it
### stores/reloads the result under the key 'delta_reflectance_da' -- confirm
@cached('delta_reflectance_da')
def merge_and_composite_arrays(granule_da_df):
    """
    Build one median-composite reflectance array per spectral band.

    Parameters
    ----------
    granule_da_df : DataFrame-like
        Table with (at least) the columns 'band', 'datetime' and 'da';
        each 'da' entry is a raster array that rioxarray can merge.

    Returns
    -------
    The per-band composites concatenated along a 'band' dimension.
    Each composite is named 'reflectance'.
    """
    band_composites = []

    # one pass per spectral band label (e.g. 'B04')
    for band_label, band_rows in granule_da_df.groupby('band'):

        daily_mosaics = []

        # one mosaic per acquisition datetime: stitch that date's granules together
        for _, same_date_rows in band_rows.groupby('datetime'):
            mosaic = rxrmerge.merge_arrays(list(same_date_rows.da))

            # keep strictly positive values only; zero and negative pixels
            # become NaN (treated here as no-data / invalid)
            daily_mosaics.append(mosaic.where(mosaic > 0))

        # stack the mosaics along a new 'datetime' dimension and
        # reduce it with a per-pixel median
        stacked = xr.concat(daily_mosaics, dim = 'datetime')
        band_composite = stacked.median('datetime')

        # band coordinate = numeric part of the label (everything after the first character)
        band_composite['band'] = int(band_label[1:])

        # the name becomes the column name when converted to a dataframe
        band_composite.name = 'reflectance'

        band_composites.append(band_composite)

    # combine all bands into a single array along 'band'
    return xr.concat(band_composites, dim = 'band')

In [54]:
# build the per-band median composite from the per-granule table
reflectance_da = merge_and_composite_arrays(reflectance_da_df)

In [55]:
# display the composite returned by merge_and_composite_arrays
reflectance_da

## K-Means Clustering

In [59]:
# build the modelling table in a single chain:
# one row per pixel location, one column per spectral band
model_df = (
    reflectance_da
    # long format: one row per (band, pixel) combination
    .to_dataframe()
    # keep only the reflectance values
    .reflectance
    # wide format: spread the 'band' index level into columns
    .unstack('band')
    # discard bands 10 and 11
    # NOTE(review): presumably bands not wanted as clustering features -- confirm
    .drop(columns = [10, 11])
    # remove pixels with missing data in any remaining band
    .dropna()
)

# confirm correct formatting
model_df

Unnamed: 0_level_0,band,1,2,3,4,5,6,7,9
y,x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3.307669e+06,813185.60451,0.00990,0.01710,0.04060,0.03450,0.13160,0.05700,0.03160,0.00080
3.307669e+06,813215.60451,0.00870,0.01550,0.03100,0.02740,0.07870,0.03700,0.02210,0.00100
3.307669e+06,813245.60451,0.00795,0.01440,0.02890,0.02380,0.06700,0.02890,0.01630,0.00100
3.307669e+06,813275.60451,0.00830,0.01180,0.02170,0.01870,0.04830,0.02200,0.01410,0.00090
3.307669e+06,813305.60451,0.01420,0.02075,0.03495,0.03350,0.05825,0.03675,0.02405,0.00070
...,...,...,...,...,...,...,...,...,...
3.297439e+06,800165.60451,0.01805,0.02475,0.04320,0.03590,0.06010,0.04285,0.02365,0.00090
3.297439e+06,800195.60451,0.01780,0.02400,0.04310,0.03850,0.05365,0.02650,0.01885,0.00085
3.297439e+06,800225.60451,0.01840,0.02320,0.04015,0.03315,0.05390,0.02730,0.01900,0.00075
3.297409e+06,799595.60451,0.02630,0.03300,0.05740,0.05440,0.14260,0.10370,0.05620,0.00080


> Now we can start to evaluate the optimal number of k-means clusters with the elbow method, which plots 'inertia' against the number of clusters. Inertia is the sum of squared distances from each point to its assigned cluster center. Inertia is a measure of cluster 'quality', but it always decreases as k grows: if we increase k until it equals the number of data points, we can theoretically over-fit to every data point. Instead, we look for the 'elbow', the value of k beyond which adding more clusters starts to decrease inertia more slowly.

In [75]:
# one seed for both the random sub-sample and k-means, so that a
# Restart & Run All reproduces the same inertia curve and elbow
RANDOM_SEED = 123

# select only feature columns
# NOTE(review): in the displayed model_df, x/y live in the index rather than the
# columns, so this difference() likely only re-sorts the band columns -- confirm
feature_cols = model_df.columns.difference(['x','y'])
Kmodel_unscaled = model_df[feature_cols]

# scale features to impact clustering equally
scaler = StandardScaler()
Kmodel = scaler.fit_transform(Kmodel_unscaled)


# set random sample size to avoid running k-means on full data for each k
sample_size = 20000
if Kmodel.shape[0] > sample_size:
    # seeded generator instead of the unseeded global np.random state
    rng = np.random.default_rng(RANDOM_SEED)
    sample_idx = rng.choice(Kmodel.shape[0], size=sample_size, replace=False)
    Ksample = Kmodel[sample_idx]
else:
    Ksample = Kmodel

# elbow test
# initialize inertia and k range (k = 1 .. 14)
inertia = []
k_range = range(1, 15)

# fit k-means once per candidate k and record the within-cluster sum of squares
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=RANDOM_SEED, n_init=10)
    kmeans.fit(Ksample)
    inertia.append(kmeans.inertia_)

# df for plot
inertia_df = pd.DataFrame({
    'k': list(k_range),
    'inertia': inertia
})

# locate the elbow of the (convex, decreasing) inertia curve;
# knee.knee is None when no elbow can be identified
knee = KneeLocator(k_range, inertia, curve='convex', direction='decreasing')
optimal_k = knee.knee
print(f"Optimal number of clusters suggested by elbow method: {optimal_k}")

line = inertia_df.hvplot.line(
    x='k',
    y='inertia',
    title='Elbow Method for Optimal k',
    xlabel='Number of clusters (k)',
    ylabel='Inertia (Within-cluster sum of squares)'
)

points = inertia_df.hvplot.scatter(
    x='k',
    y='inertia',
    size=8
)

k_val = optimal_k
if k_val is None:
    # no elbow detected: show the curve without a highlight instead of
    # failing with an IndexError on an empty selection
    inertia_val = None
    highlight = None
    elbow_plot = line * points
else:
    # mark the suggested k in red on top of the curve
    inertia_val = inertia_df.loc[inertia_df['k'] == k_val, 'inertia'].values[0]
    highlight = hv.Points([(k_val, inertia_val)]).opts(color='red', size=10)
    elbow_plot = line * points * highlight

elbow_plot

Optimal number of clusters suggested by elbow method: 4


In [76]:
# make sure the output folder exists before writing; on a fresh checkout
# "img/" may be missing and the save would fail
os.makedirs("img", exist_ok=True)
hv.save(elbow_plot, "img/elbow_output.html")