This notebook filters training data using the k-means clustering method. The filtered training data are then used to train a classifier and produce a land cover classification map for 2021.

### load packages

In [None]:
%matplotlib inline
import os
import datacube
import warnings
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import rioxarray
from odc.io.cgroups import get_cpu_quota
from datacube.utils.cog import write_cog
from odc.algo import xr_geomedian
from deafrica_tools.datahandling import load_ard
from deafrica_tools.bandindices import calculate_indices
from deafrica_tools.classification import collect_training_data
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from rasterio.enums import Resampling
from random_sampling import random_sampling

### parameters and file paths

In [None]:
# Input file locations and attribute settings
training_signature_path = 'Results/Training_features_Rwanda.txt'

rf2017_path = 'Results/rwanda_landcover_2015_scheme_ii_classes_merged.tif'

crs = 'epsg:32735'  # WGS84/UTM Zone 35S
measurements = [
    'blue', 'green', 'red',
    'red_edge_1', 'red_edge_2', 'red_edge_3',
    'nir_1', 'nir_2', 'swir_1', 'swir_2',
    'NDVI',
]
class_attr = 'LC_Class_I'  # class label in integer format
# Class label first, followed by every measurement suffixed with 0..5
column_names = [class_attr] + [
    f'{band}_{step}' for band in measurements for step in range(6)
]

### load training data, tiles and reference map

In [None]:
# Read the space-delimited training feature table, keep only the class label
# and the expected feature columns, and store the class label as integers.
# The cast is done inside the chain rather than by assigning to a column of
# the subset, which avoids pandas' SettingWithCopyWarning and keeps the cell
# idempotent on re-run.
training_features = (
    pd.read_csv(training_signature_path, sep=' ')
    .loc[:, column_names]  # select attributes
    .astype({class_attr: int})
)
print('land cover survey points 2017:\n',training_features)

In [None]:
# Open the reference land cover raster, cast it to unsigned 8-bit integers and
# drop the singleton "band" dimension
reference_ds = xr.open_dataset(rf2017_path, engine="rasterio")
reference_ds = reference_ds.astype(np.uint8).squeeze("band", drop=True)
# Reproject the raster to the working CRS with nearest-neighbour resampling
# (appropriate for categorical class values), then keep the data variable only
reference_map = reference_ds.rio.reproject(
    dst_crs=crs,
    resolution=30,
    resampling=Resampling.nearest,
).band_data
print('Reference land cover classifcation raster:\n',reference_map)

### define queries and feature layer function

In [None]:
# Datacube query settings: time range, output CRS and pixel resolution
query = dict(
    time=('2021-01', '2021-12'),
    output_crs=crs,
    resolution=(-10, 10),
)
# define a function that generates the feature layers
def feature_layers(query):
    """Build stacked multi-temporal feature layers for a datacube query.

    Loads the ``s2_l2a`` product with ``load_ard``, appends NDVI, reduces every
    two-month interval to a geomedian and flattens the time dimension into
    separate variables named ``<measurement>_<k>``, where ``k`` is the position
    of the interval along the time axis.

    Parameters
    ----------
    query : dict
        Keyword arguments forwarded to ``load_ard`` (e.g. time range, output
        CRS and resolution).

    Returns
    -------
    xarray.Dataset or None
        One variable per measurement and time step; ``None`` when there is
        nothing to stack.
    """
    measurements = ['blue','green','red','red_edge_1','red_edge_2', 'red_edge_3','nir_1','nir_2','swir_1','swir_2']
    # connect to the datacube
    dc = datacube.Datacube(app='feature_layers')
    ds = load_ard(dc=dc,
                  products=['s2_l2a'],
                  measurements=measurements,
                  group_by='solar_day',
                  verbose=False,
#                   mask_filters=[("opening", 2)], # morphological opening by 2 pixels to remove small masked regions
                  **query)
    ds = calculate_indices(ds,
                           index=['NDVI'],
                           drop=False,
                           satellite_mission='s2')
    # calculate geomedians within each two-month interval
    ds = ds.resample(time='2MS').map(xr_geomedian)
    # `sizes` is the supported mapping of dimension name -> length;
    # mapping-style access through `Dataset.dims` is deprecated in xarray
    n_time = ds.sizes['time']
    # one renamed 2-D layer per (measurement, time step), measurement-major
    layers = [
        ds[name].isel(time=k).rename(name + '_' + str(k))
        for name in list(ds.keys())
        for k in range(n_time)
    ]
    if not layers:
        return None
    # Merge once instead of pairwise inside the loop. compat='override' keeps
    # the first layer's value for conflicting variables (here the scalar time
    # coordinate), matching the result of merging incrementally.
    return xr.merge(layers, compat='override')

### Generate random samples - stratified by class

In [None]:
# Distinct class labels present in the training data
lc_classes = training_features[class_attr].unique()
n_class = len(lc_classes)
print('land cover classes:\n',lc_classes)
n_samples = 1000  # number of random samples per class

In [None]:
out_fname = 'Results/Random_samples_Rwanda.geojson'
# Draw stratified random sample points from the reference map. The sample
# size comes from n_samples so that it is configured in a single place rather
# than repeated as a literal.
gpd_random_samples = random_sampling(
    da=reference_map,
    n=n_samples,
    sampling='stratified_random',
    min_sample_n=50,
    out_fname=out_fname,
    class_attr=class_attr,
    drop_value=0,
)
# store class labels as integers and tag the points with the raster's CRS
gpd_random_samples[class_attr] = gpd_random_samples[class_attr].astype(int)
gpd_random_samples = gpd_random_samples.set_crs(reference_map.rio.crs)

### Extract features for the random samples

In [None]:
# detect the number of CPUs
# get_cpu_quota() can return None when no CPU quota is set, and a fractional
# quota below 0.5 would round to 0; fall back to the machine's CPU count and
# always use at least one worker.
cpu_quota = get_cpu_quota()
if cpu_quota:
    ncpus = max(1, round(cpu_quota))
else:
    ncpus = os.cpu_count() or 1
print('ncpus = '+str(ncpus))
# collect training data: extract the feature layers at each random sample point
column_names, model_input = collect_training_data(
    gdf=gpd_random_samples,
    dc_query=query,
    ncpus=ncpus,
    field=class_attr,
    zonal_stats=None,
    feature_func=feature_layers,
    return_coords=False)

In [None]:
# Assemble the extracted sample features into a table
rand_samples_features = pd.DataFrame(model_input, columns=column_names)
# Name and location of the output file
output_file = 'Results/Random_samples_features_Rwanda.txt'
# Write a space-delimited text file with a header row and no index column
rand_samples_features.to_csv(
    output_file,
    sep=' ',
    header=True,
    index=None,
)

### K-Means clustering

In [None]:
def find_clusters_KMeans(data,min_cluster=5,max_cluster=20):
    """Fit k-means to ``data``, optionally searching for the best cluster count.

    When ``min_cluster == max_cluster`` a single model with that many clusters
    is fitted. Otherwise one model is fitted for every cluster count in
    ``[min_cluster, max_cluster]`` and the one with the highest
    Calinski-Harabasz score is kept (the first one wins on ties).

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Samples to cluster.
    min_cluster : int, default 5
        Smallest number of clusters to try.
    max_cluster : int, default 20
        Largest number of clusters to try.

    Returns
    -------
    tuple
        ``(n_cluster_optimal, kmeans_model_optimal, labels_optimal)``: the
        selected number of clusters, the fitted ``KMeans`` model and the
        cluster label predicted for each row of ``data``.

    Raises
    ------
    ValueError
        If ``min_cluster`` is smaller than 1 or larger than ``max_cluster``.
    """
    # An empty search range used to return a None model silently, which only
    # failed later with an unrelated AttributeError; fail early instead.
    if min_cluster < 1 or max_cluster < min_cluster:
        raise ValueError(
            'Invalid cluster range: min_cluster=%s, max_cluster=%s'
            % (min_cluster, max_cluster))

    if min_cluster==max_cluster:
        print('Implementing kmeans clustering with number of clusters: ',max_cluster)
        kmeans_model_optimal = KMeans(n_clusters=max_cluster, random_state=1).fit(data)
        labels_optimal = kmeans_model_optimal.predict(data)
        return max_cluster,kmeans_model_optimal,labels_optimal

    highest_score = None
    n_cluster_optimal = min_cluster
    kmeans_model_optimal = None
    labels_optimal = None
    for n_cluster in range(min_cluster,max_cluster+1):
        kmeans_model = KMeans(n_clusters=n_cluster, random_state=1).fit(data)
        labels = kmeans_model.predict(data)
        score = metrics.calinski_harabasz_score(data, labels)
        print('Calinski-Harabasz score for ',n_cluster,' clusters is: ',score)
        # the first candidate is always accepted; later ones must strictly improve
        if kmeans_model_optimal is None or highest_score < score:
            highest_score = score
            n_cluster_optimal = n_cluster
            kmeans_model_optimal = kmeans_model
            labels_optimal = labels
    print('Best number of clusters: %s'%(n_cluster_optimal))
    return n_cluster_optimal,kmeans_model_optimal,labels_optimal

In [None]:
frequency_threshold=0.1 # threshold of cluster frequency
# Number of clusters to use per class value; None (or a class missing from
# this dict) means the number is searched for in the range 5-20.
optimal_clusters={1:None, 5: None, 7: None,  9: None, 10: None, 11: None,  12: None,  13: None,  14: None}
# Feature columns are selected by name so that the k-means model is fitted
# (random samples) and applied (training points) on exactly the same columns
# in the same order. Positional slices that differ between the two tables
# make predict() fail with a feature-count mismatch.
feature_columns=[name for name in rand_samples_features.columns if name!=class_attr]
filtered_per_class=[] # filtered training data, one frame per class
for class_value in lc_classes: # filtering training data for each class
    print('Processing class ',class_value)
    rand_features_single_class=rand_samples_features[rand_samples_features[class_attr]==class_value].reset_index(drop=True)
    np_rand_features=rand_features_single_class[feature_columns].to_numpy()
    # .get() so that a class absent from optimal_clusters falls back to the search
    n_cluster_fixed=optimal_clusters.get(class_value)
    if n_cluster_fixed is None:
        n_cluster_optimal,kmeans_model_optimal,labels_optimal=find_clusters_KMeans(np_rand_features,min_cluster=5,max_cluster=20)
    else:
        n_cluster_optimal,kmeans_model_optimal,labels_optimal=find_clusters_KMeans(np_rand_features,min_cluster=n_cluster_fixed,max_cluster=n_cluster_fixed)

    # subset original training points for this class
    td_single_class=training_features[training_features[class_attr]==class_value].reset_index(drop=True)
    print('Number of training pints for the class: ',len(td_single_class))
    np_td_single_class=td_single_class[feature_columns].to_numpy()
    # predict clustering labels
    labels_kmeans = kmeans_model_optimal.predict(np_td_single_class)
    # append clustering results to the training points
    td_single_class['cluster']=labels_kmeans

    # share of this class's training points that fall in each point's cluster
    cluster_frequency=td_single_class['cluster'].map(td_single_class['cluster'].value_counts(normalize=True))
    td_single_class['cluster_frequency']=cluster_frequency # append as a column
    # keep only points whose cluster holds at least frequency_threshold of the class
    td_single_class_filtered=td_single_class[td_single_class['cluster_frequency']>=frequency_threshold]
    print('Number of training data after filtering: ',len(td_single_class_filtered))

    filtered_per_class.append(td_single_class_filtered)

# concatenate once after the loop instead of growing the frame on every iteration
training_features_filtered=pd.concat(filtered_per_class) if filtered_per_class else None

### Export filtered training features

In [None]:
output_file = "Results/Training_features_Rwanda_filtered.txt"
# Drop only the two helper columns added during filtering. A positional slice
# of the last five columns would also discard three real feature columns.
training_features_filtered.drop(columns=['cluster','cluster_frequency']).to_csv(output_file, header=True, index=None, sep=' ')