Import libraries and read data

In [1]:
# For the time being EOdal has to be installed from a development build.
# Uncomment the install commands that apply before running the notebook:
#!pip uninstall eodal -y
#!pip install git+https://github.com/lukasValentin/eodal@landsat-dev

#!pip install --upgrade git+https://github.com/EOA-team/eodal
#!pip install --upgrade planetary-computer

# Report which EOdal version this kernel actually imports
import eodal
print(f"Version of EOdal: {eodal.__version__}")

Version of EOdal: 0.2.2


In [2]:
# Import general libraries
from pathlib import Path
from matplotlib import pyplot as plt
import numpy as np

# Import scikit-learn libraries
import sklearn
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics import pairwise_distances
from sklearn.cluster import MeanShift
from sklearn.preprocessing import StandardScaler

# Import EOdal libraries
import eodal
from eodal.core.band import Band
from eodal.core.raster import RasterCollection

In [3]:
# Year of the Landsat median composite to process. The value is interpolated
# into the input GeoTIFF file name and into every PNG/GeoTIFF output file name
# written further down in this notebook.
year = 2023

In [4]:
# Read the yearly Landsat median composite (multi-band GeoTIFF) from disk.
# The Windows path is written as a *raw* f-string: in a normal string literal
# "\M", "\d", "\E" and "\l" are invalid escape sequences, which Python only
# tolerates with a warning. The resulting path is unchanged.
fpath_raster = Path(rf'S:\MSc_23_TimckeFinn\data\EOdal\landsat_median_composite_{year}.tif')

# Initialize a RasterCollection from the .tif file, passing one alias per band
composite = RasterCollection.from_multi_band_raster(
    fpath_raster=fpath_raster,
    band_aliases=["blue", "green", "red", "nir_1", "swir_1", "swir_2"],
)
composite.band_summaries()

TypeError: 'NoneType' object is not subscriptable

In [None]:
# Mask the composite using the 'blue_median' band as mask band and 1e20 as the
# value to mask (presumably the no-data fill value written into the composite
# -- TODO confirm against the script that produced the GeoTIFF).
fill_value = 1.000000e+20
composite.mask(mask='blue_median', mask_values=fill_value, inplace=True)

# Show the per-band statistics after masking
composite.band_summaries()

## Feature extraction

In [None]:
# Compute a set of spectral indices (SI); each one is added to the
# RasterCollection in place, in the order listed here.
for index_name in ('NDVI', 'NDWI', 'EVI', 'CI_GREEN', 'MSAVI', 'BSI'):
    composite.calc_si(index_name, inplace=True)

# Show the per-band statistics, now including the index bands
composite.band_summaries()

In [None]:
# Plot three false-colour band combinations side by side
fig, axs = plt.subplots(1, 3, figsize=(20, 10), sharey=True)

# One (bands, title) entry per panel, drawn from left to right
band_combinations = [
    # Also called the near-infrared (NIR) composite: near-infrared (5), red (4)
    # and green (3). Because chlorophyll reflects near-infrared light, this
    # composition is useful for analyzing vegetation; areas in red have better
    # vegetation health. Dark areas are water and urban areas are white.
    (["nir08_median", "red_median", "green_median"], "Color Infrared"),
    # SWIR-1 (6), near-infrared (5) and blue (2). Commonly used for crop
    # monitoring because of the short-wave and near-infrared bands. Healthy
    # vegetation appears dark green, bare earth has a magenta hue.
    (["swir16_median", "nir08_median", "blue_median"], "Agriculture"),
    # SWIR-2 (7), SWIR-1 (6) and red (4). Vegetation is displayed in shades of
    # green: darker shades indicate denser vegetation, sparse vegetation has
    # lighter shades. Urban areas are blue and soils have shades of brown.
    (["swir22_median", "swir16_median", "red_median"], "Short-Wave Infrared"),
]

for ax, (bands, title) in zip(axs, band_combinations):
    composite.plot_multiple_bands(bands, ax=ax)
    ax.set_title(title)

In [None]:
# Plot every band of the composite next to an RGB view of the same scene.

# Colormap used for each known band. A band without an entry here still gets
# its figure and RGB panel, but the right-hand panel stays empty.
band_colormaps = {
    'blue_median': "Blues",
    'green_median': "Greens",
    'red_median': "Reds",
    'nir08_median': "Greys_r",
    'swir16_median': "Greys_r",
    'swir22_median': "Greys_r",
    'NDVI': "YlGn",
    'NDWI': "GnBu",
    'EVI': "YlGn",
    'CI_GREEN': "YlGn",
    'MSAVI': "YlGn",
    'BSI': "pink_r",
}

for band in composite.band_names:
    fig, axs = plt.subplots(1, 2, figsize=(20, 10), sharey=True)
    rgb_ax, band_ax = axs

    # left panel: RGB reference
    composite.plot_multiple_bands(["red", "green", "blue"], ax=rgb_ax)
    rgb_ax.set_title("RGB")

    # right panel: the single band in its dedicated colormap
    colormap = band_colormaps.get(band)
    if colormap is not None:
        composite.plot_band(band, colormap=colormap, ax=band_ax)
        band_ax.set_title(band)


In [None]:
# Convert the RasterCollection into a (Geo)DataFrame
gdf = composite.to_dataframe()

# Feature space: the six median reflectance bands followed by the six spectral indices
reflectance_columns = ['blue_median', 'green_median', 'red_median', 'nir08_median', 'swir16_median', 'swir22_median']
index_columns = ['NDVI', 'NDWI', 'EVI', 'CI_GREEN', 'MSAVI', 'BSI']
selected_columns = reflectance_columns + index_columns

# Preview the first rows of the selected features
gdf[selected_columns].head()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Pull the feature columns out of the GeoDataFrame as a plain NumPy array
features = gdf[selected_columns].values

# Create the MinMaxScaler and fit it to the feature data (fit() returns the scaler itself)
scaler = MinMaxScaler().fit(features)

# Rescale the features with the fitted scaler
scaled_features = scaler.transform(features)

# Overwrite the original feature columns in the GeoDataFrame with the scaled values
gdf[selected_columns] = scaled_features

# Preview the scaled features
gdf[selected_columns].head()

In [None]:
# Integer codes 0..7
values = np.arange(8)

# Colour list of matplotlib's "Accent" colormap; the clustering cells slice
# this list to build one discrete colormap per number of clusters.
cmap = plt.get_cmap('Accent')
colors = cmap.colors

In [None]:
## Use Spectral Clustering for Clustering:

from sklearn.cluster import SpectralClustering

# Settings shared by every run (hoisted out of the loop, they never change)
affinity_matrix = 'nearest_neighbors'  # Type of affinity matrix ('nearest_neighbors' or 'rbf')
random_state = 42  # Random state for reproducibility

# NOTE(review): 'scaled_features' presumably holds one row per raster pixel;
# spectral clustering can become very slow / memory hungry for that many
# samples -- confirm this is feasible for the full composite.
for n in range(3, 7, 1):

    n_clusters = n  # Number of clusters to find in this run

    # Create a SpectralClustering instance
    spectral_clustering = SpectralClustering(n_clusters=n_clusters, affinity=affinity_matrix, random_state=random_state)

    # Fit the model on the scaled features and get one cluster label per row
    cluster_labels = spectral_clustering.fit_predict(scaled_features)

    print("Cluster labels:", cluster_labels)

    # Assign cluster labels to GeoDataFrame
    gdf[f'Spectral_{n_clusters}_cluster'] = cluster_labels

    # Create a new Band instance from that GeoDataFrame column, reusing the
    # geo-information of the 'blue' band
    Spectral_cluster = Band.from_vector(vector_features = gdf,
                                    band_name_src = f'Spectral_{n_clusters}_cluster',
                                    geo_info = composite['blue'].geo_info,
                                    band_name_dst = f'Spectral_{n_clusters}_cluster')

    # Add the Spectral_{n}_cluster band to the RasterCollection composite
    composite.add_band(Spectral_cluster)

    # Plot the clusters next to the RGB view, one palette colour per cluster
    fig, axs = plt.subplots(1, 2, figsize = (20, 10), sharey=True)
    cmap = plt.cm.colors.ListedColormap(colors[:n_clusters])
    ticks = list(range(n_clusters))

    composite.plot_multiple_bands(["red", "green", "blue"], ax=axs[0])
    axs[0].set_title(f'RGB {year}')

    composite.plot_band(f'Spectral_{n_clusters}_cluster', colormap=cmap, discrete_values = False, user_defined_ticks = ticks, ax=axs[1])
    axs[1].set_title(f'Spectral_{n_clusters}_cluster')

    # Save as PNG for quick view. Raw f-string: the backslashes of the Windows
    # path must not be read as (invalid) escape sequences such as "\p" or "\S".
    fig.savefig(rf'S:\MSc_23_TimckeFinn\data\python_outputs\cluster\Spectral_{n_clusters}_cluster_{year}.png')

In [None]:
## Use K-means for Clustering:

# Run KMeans once for each desired number of clusters (3 to 6)
for n in range(3, 7, 1):

    # Create the model. random_state is fixed so that repeated runs give the
    # same clusters (same seed as used for the spectral clustering).
    km = KMeans(n_clusters = n, n_init = 'auto', random_state = 42)

    # fit the model to the data
    km.fit(scaled_features)

    # Get the cluster labels
    labels = km.labels_

    # Optional cluster-quality scores (disabled)
    #silhouette = silhouette_score(scaled_features, labels)
    #calinski_harabasz = calinski_harabasz_score(scaled_features, labels)
    #davies_bouldin = davies_bouldin_score(scaled_features, labels)

    # Print the scores
    #print(f'kmeans, {n}, Silhouette score, {silhouette}')
    #print(f'kmeans, {n}, Calinski-Harabasz score, {calinski_harabasz}')
    #print(f'kmeans, {n}, Davies-Bouldin score, {davies_bouldin}')

    # Add the cluster labels to the GeoDataFrame
    gdf[f'KMeans_{n}_cluster'] = labels

    # Create a new Band instance from that GeoDataFrame column, reusing the
    # geo-information of the 'blue' band
    KMeans_cluster = Band.from_vector(vector_features = gdf,
                                 band_name_src = f'KMeans_{n}_cluster',
                                 geo_info = composite['blue'].geo_info,
                                 band_name_dst = f'KMeans_{n}_cluster'
                                 )

    # Add the KMeans_{n}_cluster band to the RasterCollection composite
    composite.add_band(KMeans_cluster)

    # Plot the clusters next to the RGB view, one palette colour per cluster
    fig, axs = plt.subplots(1, 2, figsize = (20, 10), sharey=True)
    cmap = plt.cm.colors.ListedColormap(colors[:n])
    ticks = list(range(n))

    composite.plot_multiple_bands(["red", "green", "blue"], ax=axs[0])
    axs[0].set_title(f'RGB {year}')

    composite.plot_band(f'KMeans_{n}_cluster', colormap=cmap, discrete_values = False, user_defined_ticks = ticks, ax=axs[1])
    axs[1].set_title(f'KMeans_{n}_cluster')

    # Save as PNG for quick view. Raw f-string: the backslashes of the Windows
    # path must not be read as (invalid) escape sequences such as "\p" or "\K".
    fig.savefig(rf'S:\MSc_23_TimckeFinn\data\python_outputs\cluster\KMeans_{n}_cluster_{year}.png')


In [None]:
## Use Gaussian Mixture Models for Clustering:

# Run a Gaussian mixture model once for each desired number of components (3 to 6)
for n in range(3, 7, 1):

    # Create the model. random_state is fixed so that repeated runs give the
    # same clusters (same seed as used for the spectral clustering).
    gmm = GaussianMixture(n_components = n, random_state = 42)

    # fit the model to the data
    gmm.fit(scaled_features)

    # Predict one cluster label per row. Kept under its own name so that
    # 'GMM_cluster' below always refers to the Band, never to the label array.
    gmm_labels = gmm.predict(scaled_features)

    # Optional cluster-quality scores (disabled). They must use the GMM labels,
    # not the 'labels' variable left over from the KMeans cell.
    #silhouette = silhouette_score(scaled_features, gmm_labels)
    #calinski_harabasz = calinski_harabasz_score(scaled_features, gmm_labels)
    #davies_bouldin = davies_bouldin_score(scaled_features, gmm_labels)

    # Print the scores
    #print(f'gmm, {n}, Silhouette score, {silhouette}')
    #print(f'gmm, {n}, Calinski-Harabasz score, {calinski_harabasz}')
    #print(f'gmm, {n}, Davies-Bouldin score, {davies_bouldin}')

    # Assign cluster labels to GeoDataFrame
    gdf[f'GMM_{n}_cluster'] = gmm_labels

    # Create a new Band instance from that GeoDataFrame column, reusing the
    # geo-information of the 'blue' band
    GMM_cluster = Band.from_vector(vector_features = gdf,
                                    band_name_src = f'GMM_{n}_cluster',
                                    geo_info = composite['blue'].geo_info,
                                    band_name_dst = f'GMM_{n}_cluster')

    # Add the GMM_{n}_cluster band to the RasterCollection composite
    composite.add_band(GMM_cluster)

    # Plot the clusters next to the RGB view, one palette colour per cluster
    fig, axs = plt.subplots(1, 2, figsize = (20, 10), sharey=True)
    cmap = plt.cm.colors.ListedColormap(colors[:n])
    ticks = list(range(n))

    composite.plot_multiple_bands(["red", "green", "blue"], ax=axs[0])
    axs[0].set_title(f'RGB {year}')

    composite.plot_band(f'GMM_{n}_cluster', colormap=cmap, discrete_values = False, user_defined_ticks = ticks, ax=axs[1])
    axs[1].set_title(f'GMM_{n}_cluster')

    # Save as PNG for quick view. Raw f-string: the backslashes of the Windows
    # path must not be read as (invalid) escape sequences such as "\p" or "\G".
    fig.savefig(rf'S:\MSc_23_TimckeFinn\data\python_outputs\cluster\GMM_{n}_cluster_{year}.png')

In [None]:
## Use MeanShift for Clustering:
#from sklearn.cluster import estimate_bandwidth

#bandwidth_est = estimate_bandwidth(scaled_features)
#print(f'estimated bandwidth: {bandwidth_est}')

#define bandwidth for model
#bandwidth = 0.3

# create model
#ms = MeanShift(bandwidth=bandwidth)

# fit the model to the data
#ms.fit(scaled_features)

# Assign cluster labels
#cluster_labels = ms.labels_

# Assign cluster labels to GeoDataFrame
#gdf[f'MeanShift_cluster_{bandwidth}'] = cluster_labels

# add column cluster_mean_shift as a new Band into the RasterCollection composite
#MeanShift_cluster = Band.from_vector(vector_features = gdf,
#                                band_name_src = f'MeanShift_cluster_{bandwidth}',
#                                geo_info = composite['blue'].geo_info,
#                                band_name_dst=f'MeanShift_cluster_{bandwidth}')

#composite.add_band(MeanShift_cluster)

#plot clusters
#fig, axs = plt.subplots(1, 2, figsize = (20, 10), sharey=True)

#composite.plot_multiple_bands(["red", "green", "blue"], ax=axs[0])
#axs[0].set_title(f'RGB {year}')

#composite.plot_band(f'MeanShift_cluster_{bandwidth}', colormap='Accent', discrete_values = False, user_defined_ticks = ticks, ax=axs[1])
#axs[1].set_title(f'MeanShift_cluster with bandwith: {bandwidth}')

#save as PNG for quick view
#fig.savefig(f'S:\MSc_23_TimckeFinn\data\python_outputs\cluster\MS_cluster_{year}_{bandwidth}.png')


In [None]:
# Save as GeoTiff.
# The paths are raw f-strings so that the backslashes of the Windows path are
# not read as (invalid) escape sequences such as "\M", "\d", "\E" or "\l".

# Optional: export only the last GMM cluster band (disabled)
#GMM_cluster.to_rasterio(rf'S:\MSc_23_TimckeFinn\data\EOdal\GMM_6_cluster_label_{year}.tif')

# Export the whole RasterCollection, including the cluster bands added above
composite.to_rasterio(rf'S:\MSc_23_TimckeFinn\data\EOdal\landsat_median_composite_cluster_{year}.tif')