import libraries and read data

In [1]:
# we have to install the development version for the time being
#!pip uninstall eodal -y
#!pip install git+https://github.com/lukasValentin/eodal@landsat-dev

#!pip install --upgrade git+https://github.com/EOA-team/eodal
#!pip install --upgrade planetary-computer

# Print EOdal version
#import eodal
#print("Version of EOdal: " + eodal.__version__)

In [2]:
# Import general libraries
from pathlib import Path

import numpy as np
from matplotlib import pyplot as plt

# Import EOdal libraries
from eodal.core.band import Band
from eodal.core.raster import RasterCollection

# Import scikit-learn libraries
import sklearn
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering
from sklearn.cluster import estimate_bandwidth
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics import pairwise_distances
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [3]:
# Year of the Landsat median composite to cluster.
# The value is interpolated into the input raster path and into the names of
# the GeoTiff / PNG files written by the later cells.
year = 2021

In [4]:
# Path of the median composite GeoTiff for the selected year.
# Raw f-string: the backslashes of the Windows path must be taken literally and
# never be parsed as escape sequences (unrecognised escapes such as "\M" or "\d"
# raise a SyntaxWarning on recent Python versions).
fpath_raster = Path(rf'S:\MSc_23_TimckeFinn\data\EOdal\landsat_median_composite_{year}_smallAOI.tif')

# Initialize a RasterCollection from the multi-band .tif file.
# The aliases give every band a short name ('blue', 'red', ...) that later
# cells use to address the bands.
composite = RasterCollection.from_multi_band_raster(
    fpath_raster=fpath_raster,
    band_names_dst=['blue_median', 'green_median', 'red_median',
                    'nir08_median', 'swir16_median', 'swir22_median'],
    band_aliases=["blue", "green", "red", "nir_1", "swir_1", "swir_2"],
)

# Calculate the spectral indices (SI); inplace=True adds each index to
# `composite` as an additional band.
for si_name in ('NDVI', 'NDWI', 'EVI', 'CI_GREEN', 'MSAVI', 'BSI'):
    composite.calc_si(si_name, inplace=True)

# Print band summaries (displayed as the cell output)
composite.band_summaries()

Unnamed: 0,min,mean,std,max,count,geometry,band_name
0,0.07744,0.08081,0.001985,0.095985,26928.0,"POLYGON ((874635.000 54285.000, 874635.000 582...",blue_median
1,0.08288,0.089742,0.004197,0.10794,26928.0,"POLYGON ((874635.000 54285.000, 874635.000 582...",green_median
2,0.07888,0.084373,0.003764,0.1142,26928.0,"POLYGON ((874635.000 54285.000, 874635.000 582...",red_median
3,0.08757,0.180042,0.024032,0.24867,26928.0,"POLYGON ((874635.000 54285.000, 874635.000 582...",nir08_median
4,0.078815,0.12362,0.014078,0.1833,26928.0,"POLYGON ((874635.000 54285.000, 874635.000 582...",swir16_median
5,0.07617,0.095905,0.007857,0.15162,26928.0,"POLYGON ((874635.000 54285.000, 874635.000 582...",swir22_median
6,0.018176,0.355873,0.069525,0.494187,26928.0,"POLYGON ((874635.000 54285.000, 874635.000 582...",NDVI
7,-0.455743,-0.32898,0.067513,0.002109,26928.0,"POLYGON ((874635.000 54285.000, 874635.000 582...",NDWI
8,0.008038,0.220245,0.05008,0.359439,26928.0,"POLYGON ((874635.000 54285.000, 874635.000 582...",EVI
9,-0.004209,1.003759,0.23413,1.674734,26928.0,"POLYGON ((874635.000 54285.000, 874635.000 582...",CI_GREEN


In [5]:
# Create a DataFrame from the RasterCollection.
# Judging by the summary displayed below, it holds one column per band and
# spectral index; later cells refer to it as a GeoDataFrame.
gdf = composite.to_dataframe()

# Descriptive statistics of every column (displayed as the cell output)
gdf.describe()

Unnamed: 0,blue_median,green_median,red_median,nir08_median,swir16_median,swir22_median,NDVI,NDWI,EVI,CI_GREEN,MSAVI,BSI
count,26928.0,26928.0,26928.0,26928.0,26928.0,26928.0,26928.0,26928.0,26928.0,26928.0,26928.0,26928.0
mean,0.08081,0.089742,0.084373,0.180042,0.12362,0.095905,0.355873,-0.32898,0.220245,1.003759,0.158992,-0.111808
std,0.001985,0.004197,0.003764,0.024032,0.014078,0.007857,0.069526,0.067514,0.050081,0.234134,0.037198,0.028711
min,0.07744,0.08288,0.07888,0.08757,0.078815,0.07617,0.018176,-0.455743,0.008038,-0.004209,0.005467,-0.19475
25%,0.079165,0.08595,0.08127,0.170084,0.11572,0.090594,0.344024,-0.360051,0.204336,0.940021,0.146305,-0.129043
50%,0.08042,0.088865,0.08324,0.1816,0.122463,0.093943,0.36608,-0.34002,0.224457,1.030394,0.161715,-0.116296
75%,0.08214,0.093205,0.08683,0.19427,0.13307,0.101301,0.389352,-0.319733,0.247541,1.125247,0.17954,-0.101717
max,0.095985,0.10794,0.1142,0.24867,0.1833,0.15162,0.494187,0.002109,0.359439,1.674734,0.267498,0.053107


In [6]:
# Feature columns used for clustering: the six Landsat bands followed by the
# six spectral indices calculated above.
selected_columns = ['blue_median', 'green_median', 'red_median',
                    'nir08_median', 'swir16_median', 'swir22_median',
                    'NDVI', 'NDWI', 'EVI', 'CI_GREEN', 'MSAVI', 'BSI']

# Extract the feature columns from the GeoDataFrame into a NumPy array
features = gdf[selected_columns].values

# Rescale every feature column to the range [0, 1] (MinMaxScaler default).
# fit_transform() learns the per-column min/max and applies them in one step.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features)

# Replace the original feature values in the GeoDataFrame with the scaled values.
# NOTE: from here on `gdf` no longer contains the unscaled reflectances / indices;
# the unscaled array is still available as `features`.
gdf[selected_columns] = scaled_features

# Summary of the scaled columns (displayed as the cell output)
gdf[selected_columns].describe()


Unnamed: 0,blue_median,green_median,red_median,nir08_median,swir16_median,swir22_median,NDVI,NDWI,EVI,CI_GREEN,MSAVI,BSI
count,26928.0,26928.0,26928.0,26928.0,26928.0,26928.0,26928.0,26928.0,26928.0,26928.0,26928.0,26928.0
mean,0.181708,0.273835,0.155528,0.574006,0.428822,0.261562,0.709431,0.276865,0.603889,0.600359,0.585906,0.334635
std,0.107018,0.167472,0.106574,0.149177,0.134737,0.104141,0.146059,0.147459,0.142519,0.139453,0.141962,0.115838
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.093017,0.122506,0.067667,0.51219,0.353209,0.19117,0.68454,0.209003,0.558615,0.562395,0.537488,0.2651
50%,0.16069,0.238827,0.123443,0.583675,0.417739,0.235553,0.730875,0.252753,0.615876,0.616223,0.596296,0.316527
75%,0.253438,0.412011,0.225085,0.662322,0.519261,0.333085,0.779765,0.297062,0.681567,0.672718,0.664323,0.375351
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Get the colors from the "Accent" colormap
cmap = plt.get_cmap('Accent')
# Colour list of the colormap; the plotting cell slices it (colors[:n]) to
# build a discrete colormap with one colour per cluster.
colors = cmap.colors
# Integer codes 0..7 — presumably one per colour of the colormap (TODO confirm);
# not referenced by any other cell visible in this notebook.
values = np.arange(8)

In [8]:
## Cluster the scaled features with four unsupervised models (MeanShift,
## K-means, Spectral Clustering, Gaussian Mixture Model). The labels of every
## model are stored as a column in `gdf` and appended to `composite` as a band.

# desired number of clusters for K-means, GMM and Spectral Clustering
n = 6

# one seed shared by all stochastic models so that a re-run yields the same labels
random_state = 42

# set to True to print internal validation scores for K-means and GMM
# (off by default; NOTE(review): the silhouette score can be slow on this many samples)
compute_scores = False


def _add_cluster_band(labels, band_name):
    """Store cluster labels in `gdf` and append them to `composite` as a band.

    Parameters
    ----------
    labels : array-like
        One cluster label per row of `gdf`.
    band_name : str
        Name used for the new `gdf` column and for the new band.

    Returns
    -------
    Band
        The band created from the column and added to `composite`.
    """
    gdf[band_name] = labels
    band = Band.from_vector(vector_features=gdf,
                            band_name_src=band_name,
                            geo_info=composite['blue'].geo_info,
                            band_name_dst=band_name)
    composite.add_band(band)
    return band


def _print_cluster_scores(method, labels):
    """Print silhouette, Calinski-Harabasz and Davies-Bouldin scores of `labels`.

    Each line has the layout "<method>, <n>, <score name>, <value>" and the
    scores are computed on `scaled_features`.
    """
    silhouette = silhouette_score(scaled_features, labels)
    calinski_harabasz = calinski_harabasz_score(scaled_features, labels)
    davies_bouldin = davies_bouldin_score(scaled_features, labels)
    print(f'{method}, {n}, Silhouette score, {silhouette}')
    print(f'{method}, {n}, Calinski-Harabasz score, {calinski_harabasz}')
    print(f'{method}, {n}, Davies-Bouldin score, {davies_bouldin}')


## Use MeanShift for Clustering:

bandwidth_est = estimate_bandwidth(scaled_features)
print(f'estimated bandwidth: {bandwidth_est}')

# bandwidth used by the model; assign a number here to override the estimate
bandwidth = bandwidth_est

# create the model and fit it to the data
ms = MeanShift(bandwidth=bandwidth)
ms.fit(scaled_features)

# the number of clusters is determined by MeanShift itself, not by `n`
meanshift_labels = ms.labels_
MeanShift_cluster = _add_cluster_band(meanshift_labels, f'MeanShift_cluster_{bandwidth}')

## Use K-means for Clustering:

# create the model and fit it to the data
km = KMeans(n_clusters=n, n_init='auto', random_state=random_state)
km.fit(scaled_features)

# Get the cluster labels
labels = km.labels_

if compute_scores:
    _print_cluster_scores('kmeans', labels)

KMeans_cluster = _add_cluster_band(labels, f'KMeans_{n}_cluster')

## Use Spectral Clustering for Clustering:

n_clusters = n  # Number of clusters you want to find
affinity_matrix = 'nearest_neighbors'  # Type of affinity matrix ('nearest_neighbors' or 'rbf')

# Create a SpectralClustering instance
spectral_clustering = SpectralClustering(n_clusters=n_clusters,
                                         affinity=affinity_matrix,
                                         random_state=random_state)

# Fit and predict the clusters
cluster_labels = spectral_clustering.fit_predict(scaled_features)

print("Cluster labels:", cluster_labels)

Spectral_cluster = _add_cluster_band(cluster_labels, f'Spectral_{n_clusters}_cluster')

## Use Gaussian Mixture Models for Clustering:

# create the model and fit it to the data
gmm = GaussianMixture(n_components=n, random_state=random_state)
gmm.fit(scaled_features)

# Assign every sample to its most likely mixture component
gmm_labels = gmm.predict(scaled_features)

if compute_scores:
    _print_cluster_scores('gmm', gmm_labels)

# `GMM_cluster` is the Band; the raw labels stay available as `gmm_labels`
GMM_cluster = _add_cluster_band(gmm_labels, f'GMM_{n}_cluster')

estimated bandwidth: 0.34840480434280363
Cluster labels: [4 4 1 ... 1 1 1]


In [9]:
# Save the composite (with all bands added to it by the previous cells) as GeoTiff.
# Raw f-string: the backslashes of the Windows path are taken literally instead
# of being parsed as (unrecognised) escape sequences.
composite.to_rasterio(rf'S:\MSc_23_TimckeFinn\data\EOdal\landsat_median_composite_{year}_smallAOI_2_si_{n}_clusters.tif')

In [10]:
# Plot every clustering result next to the RGB composite and save each figure
# as PNG for a quick view. The figures are closed after saving, so nothing is
# rendered inline.

# discrete colormap and colorbar ticks for the models fitted with `n` clusters
# (K-means, GMM, Spectral Clustering; n_clusters was set equal to n)
cmap = plt.cm.colors.ListedColormap(colors[:n])
ticks = list(range(n))

# MeanShift determines its own number of clusters, so its ticks are derived
# from the fitted model instead of reusing the ticks of the n-cluster models
meanshift_ticks = list(range(len(ms.cluster_centers_)))

# Raw f-strings keep the backslashes of the Windows paths literal.
cluster_plots = [
    # (band to plot, title of the right panel, output PNG, colormap, colorbar ticks)
    (f'KMeans_{n}_cluster',
     f'KMeans_{n}_cluster',
     rf'S:\MSc_23_TimckeFinn\data\python_outputs\cluster\KMeans_{n}_cluster_{year}.png',
     cmap, ticks),
    (f'MeanShift_cluster_{bandwidth}',
     f'MeanShift_cluster with bandwith: {bandwidth}',
     rf'S:\MSc_23_TimckeFinn\data\python_outputs\cluster\MS_cluster_{year}_{bandwidth}.png',
     'Accent', meanshift_ticks),
    (f'GMM_{n}_cluster',
     f'GMM_{n}_cluster',
     rf'S:\MSc_23_TimckeFinn\data\python_outputs\cluster\GMM_{n}_cluster_{year}.png',
     cmap, ticks),
    (f'Spectral_{n_clusters}_cluster',
     f'Spectral_{n_clusters}_cluster',
     rf'S:\MSc_23_TimckeFinn\data\python_outputs\cluster\Spectral_{n_clusters}_cluster_{year}.png',
     cmap, ticks),
]

for plot_band_name, plot_title, plot_path, plot_cmap, plot_ticks in cluster_plots:
    fig, axs = plt.subplots(1, 2, figsize=(20, 10), sharey=True)

    # left panel: true-colour composite for visual reference
    composite.plot_multiple_bands(["red", "green", "blue"], ax=axs[0])
    axs[0].set_title(f'RGB {year}')

    # right panel: cluster labels of the current model
    composite.plot_band(plot_band_name, colormap=plot_cmap, discrete_values=False,
                        user_defined_ticks=plot_ticks, ax=axs[1])
    axs[1].set_title(plot_title)

    # save as PNG for quick view, then release the figure
    fig.savefig(plot_path)
    plt.close(fig)

  cb.set_ticklabels(user_defined_ticks)
