This notebook implements extraction and filtering of training data on a per-class basis using a reference land cover map and k-means clustering. It includes the following steps:
1. Generate a set of randomly distributed samples for each class using the reference land cover map
2. Extract features of the random samples
3. Fit k-means clustering models using the features of the random samples
4. Extract features for training points of each class
5. Predict clustering labels of the training points
6. Filter training points based on cluster frequency
7. Merge filtered points for all classes and export filtered data 

The filtered training data will then be used to train a classifier and produce a land cover classification map for 2021.

### load packages and get number of cpus

In [None]:
%matplotlib inline
import os
import datacube
import warnings
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
from odc.io.cgroups import get_cpu_quota
from odc.algo import xr_geomedian
from deafrica_tools.datahandling import load_ard
from deafrica_tools.bandindices import calculate_indices
from deafrica_tools.classification import collect_training_data
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from rasterio.enums import Resampling
from random_sampling import random_sampling # adapted from function by Chad Burton: https://gist.github.com/cbur24/04760d645aa123a3b1817b07786e7d9f

# get number of cpus
# get_cpu_quota() can return None (or a non-positive value) when no cgroup CPU
# limit is available; fall back to the machine's CPU count in that case and
# never report fewer than one worker.
cpu_quota = get_cpu_quota()
if cpu_quota is None or cpu_quota <= 0:
    ncpus = os.cpu_count() or 1
else:
    ncpus = max(1, round(cpu_quota))
print('ncpus = '+str(ncpus))

### input files and parameters

In [None]:
# file paths and attributes
traning_points_path = 'Results/manual_number_random_training_points_scheme_ii_2015.geojson' # training points extracted from reference map
ref_map_path='Results/rwanda_landcover_2015_scheme_ii_classes_merged.tif' # reference land cover map (class merged)
class_name = 'LC_Class_I' # class label in integer format
output_crs='epsg:32735' # WGS84/UTM Zone 35S

# Load training points and reproject
training_data_unfiltered = gpd.read_file(traning_points_path).to_crs(output_crs) # read training points as geopandas dataframe
# keep only the class label and geometry; take an explicit copy so the dtype
# conversion below writes to an independent frame rather than to a column
# subset of the original (avoids pandas' SettingWithCopyWarning)
training_data_unfiltered = training_data_unfiltered[[class_name, 'geometry']].copy()
training_data_unfiltered[class_name] = training_data_unfiltered[class_name].astype(int)
print('land cover training points (unfiltered):\n',training_data_unfiltered)

# further merge classes if needed
# training_data_unfiltered.loc[training_data_unfiltered[class_name]==2,class_name]=1 # Open Forest (2) merged with Dense Forest (1) as Forest (1)
# training_data_unfiltered.loc[training_data_unfiltered[class_name]==8,class_name]=6 # Wooded Grassland (8) merged with Open Grassland (6) as Grassland (6)

# load reference classification map as unsigned 8-bit integers and drop the
# singleton "band" dimension
ref_map_raster = xr.open_dataset(ref_map_path,engine="rasterio").astype(np.uint8).squeeze("band", drop=True)
# reproject the raster to the output CRS at 10 m resolution; nearest-neighbour
# resampling keeps the class values categorical
ref_map_raster = ref_map_raster.rio.reproject(resolution=10, dst_crs=output_crs, resampling=Resampling.nearest)
print('Initial random forest classifcation raster:\n',ref_map_raster) # note: 255 is nodata

### define query and feature layer function

In [None]:
# bands requested in the ODC query
measurements = [
    'blue', 'green', 'red',
    'red_edge_1', 'red_edge_2', 'red_edge_3',
    'nir_1', 'nir_2',
    'swir_1', 'swir_2',
]

# ODC query: time range, bands, output projection and pixel size
query = dict(
    time=('2021-01', '2021-12'),
    measurements=measurements,
    output_crs=output_crs,
    resolution=(-10, 10),
)
# define a function to generate feature layers
def feature_layers(query):
    """Build multi-temporal feature layers for one ODC query.

    Loads ``s2_l2a`` data with ``load_ard``, appends NDVI, reduces every
    two-month window (``'2MS'`` resampling) with ``xr_geomedian`` and then
    flattens the time dimension so that each measurement/time-step pair
    becomes its own variable named ``<measurement>_<time index>``.

    Parameters
    ----------
    query : dict
        Keyword arguments forwarded to ``load_ard``.

    Returns
    -------
    xarray.Dataset
        One variable per measurement and time step. As in the original
        implementation, a single layer is returned as a bare DataArray and
        ``None`` is returned when there is nothing to stack.
    """
    # connect to the datacube
    dc = datacube.Datacube(app='feature_layers')
    # optional: pass mask_filters=[("opening", 2)] for a morphological opening
    # by 2 pixels to remove small masked regions
    ds = load_ard(dc=dc,
                  products=['s2_l2a'],
                  group_by='solar_day',
                  verbose=False,
                  **query)
    # calculate NDVI (kept alongside the original bands)
    ds = calculate_indices(ds,
                           index=['NDVI'],
                           drop=False,
                           satellite_mission='s2')

    # calculate geomedians within each two-month interval
    ds = ds.resample(time='2MS').map(xr_geomedian)

    # interpolate nodata using mean of previous and next observation
#     ds=ds.interpolate_na(dim='time',method='linear',use_coordinate=False,fill_value='extrapolate')

    # stack multi-temporal measurements and rename them;
    # ``sizes`` is used because mapping-style access on ``Dataset.dims`` is
    # deprecated in recent xarray releases
    n_time = ds.sizes['time']
    layers = [
        ds[measurement].isel(time=k).rename(measurement + '_' + str(k))
        for measurement in ds.data_vars
        for k in range(n_time)
    ]

    if not layers:
        return None
    if len(layers) == 1:
        return layers[0]
    # a single merge replaces the repeated pairwise merges; with
    # compat='override' conflicting variables (e.g. the scalar ``time``
    # coordinate of each slice) are taken from the first layer
    return xr.merge(layers, compat='override')

### per-class training data filter using k-means clustering

In [None]:
# parameters and variables for training data filtering
lc_classes = training_data_unfiltered[class_name].unique()  # class labels
n_samples = 1000  # number of random samples per class to train kmeans clusterer
zonal_stats = None
scaler = StandardScaler()  # standard scaler for input data standardisation
frequency_threshold = 0.1  # threshold of cluster frequency
td2021_filtered = None  # initialise filtered training data


def _fit_optimal_kmeans(features, cluster_range=range(2, 10), random_state=1):
    """Fit a KMeans model per candidate cluster count and keep the best one.

    The best model is the one with the highest Calinski-Harabasz score on
    ``features``; on ties the smaller cluster count (seen first) is kept.

    Parameters
    ----------
    features : array-like
        Standardised feature matrix used for fitting and scoring.
    cluster_range : iterable of int
        Candidate numbers of clusters.
    random_state : int
        Seed passed to ``KMeans``.

    Returns
    -------
    tuple
        ``(model, labels, n_clusters)`` of the best candidate.
    """
    best_score = -np.inf
    best_model, best_labels, best_n = None, None, None
    for n_cluster in cluster_range:
        candidate = KMeans(n_clusters=n_cluster, random_state=random_state)
        # fit_predict fits the model and returns the labels of the training
        # samples in one pass
        labels = candidate.fit_predict(features)
        score = metrics.calinski_harabasz_score(features, labels)
        print('Calinski-Harabasz score for ',n_cluster,' clusters is: ',score)
        if best_model is None or score > best_score:
            best_score = score
            best_model, best_labels, best_n = candidate, labels, n_cluster
    return best_model, best_labels, best_n


# the reference raster is identical for every class, so read the data array once
ref_map_da = ref_map_raster.band_data

# filtering training data for each class
for i in lc_classes:
    print('Processing class ',i)

    # generate randomly distributed samples to train and optimise a kmeans clusterer
    da_mask = ref_map_da.where(ref_map_da==i, np.nan)  # replace other class values as nan so they won't be sampled
    gpd_samples = random_sampling(da_mask,n_samples,sampling='stratified_random',manual_class_ratios=None,out_fname=None)
    gpd_samples = gpd_samples.drop(columns=['spatial_ref','class'])  # drop attributes derived from random_sampling function
    gpd_samples[class_name] = int(i)  # add attribute field so that we can use collect_training_data function
    if gpd_samples.crs is None:  # set crs if it somehow is lost
        gpd_samples = gpd_samples.set_crs(output_crs)
    print('radomly sampled points for class ',i,'\n',gpd_samples)

    # extract features for the samples (uses the worker count detected at the
    # top of the notebook instead of a hard-coded value)
    column_names, sampled_data = collect_training_data(gdf=gpd_samples,
                                                       dc_query=query,
                                                       ncpus=ncpus,
                                                       field=class_name,
                                                       zonal_stats=zonal_stats,
                                                       feature_func=feature_layers,
                                                       clean=True,
                                                       return_coords=False)
    # normalise features before clustering (column 0 is the class label)
    sampled_data = scaler.fit_transform(sampled_data[:,1:])

    # fit kmeans model using the sample training data, choosing the number of
    # clusters by Calinski-Harabasz index
    kmeans_model_optimal, labels_optimal, n_cluster_optimal = _fit_optimal_kmeans(sampled_data)
    print('Best number of clusters for class %s: %s'%(i,n_cluster_optimal))

    # extract features for training points of this class
    td_single_class = training_data_unfiltered[training_data_unfiltered[class_name]==i].reset_index(drop=True)
    print('Number of training data collected: ',len(td_single_class))
    column_names, model_input = collect_training_data(gdf=td_single_class,
                                                      dc_query=query,
                                                      ncpus=ncpus,
                                                      field=class_name,
                                                      zonal_stats=zonal_stats,
                                                      feature_func=feature_layers,
                                                      clean=True,
                                                      return_coords=True)  # extract features of training points
    print('Number of training points after removing Nans and Infs: ',model_input.shape[0])

    # nothing left to cluster for this class: skip it rather than letting the
    # scaler fail on an empty array and abort the remaining classes
    if model_input.shape[0] == 0:
        print('No valid training points left for class ',i,', skipping')
        continue

    # first convert the training data to pandas
    td_single_class_filtered = pd.DataFrame(data=model_input, columns=column_names)

    # then to geopandas dataframe; the last two columns hold the x and y coordinates
    td_single_class_filtered = gpd.GeoDataFrame(td_single_class_filtered,
                                                geometry=gpd.points_from_xy(model_input[:,-2], model_input[:,-1],
                                                                            crs=output_crs))
    # normalise features before clustering (drop class label and coordinates),
    # using the scaler fitted on the random samples of this class
    model_input = scaler.transform(model_input[:,1:-2])

    # predict clustering labels
    labels_kmeans = kmeans_model_optimal.predict(model_input)

    # append clustering labels to training dataframe
    td_single_class_filtered['cluster'] = labels_kmeans

    # calculate and append cluster frequency of each cluster
    # alternative: use the cluster frequencies of the random samples instead
#     labels_optimal=pd.DataFrame(data=labels_optimal,columns=['cluster']) # calculate cluster frequencies of the random samples
#     cluster_frequency=td_single_class_filtered['cluster'].map(labels_optimal['cluster'].value_counts(normalize=True))
    cluster_frequency = td_single_class_filtered['cluster'].map(td_single_class_filtered['cluster'].value_counts(normalize=True))
    td_single_class_filtered['cluster_frequency'] = cluster_frequency

    # filter by cluster frequency
    td_single_class_filtered = td_single_class_filtered[td_single_class_filtered['cluster_frequency']>=frequency_threshold]
    print('Number of training data after filtering: ',len(td_single_class_filtered))

    # export filtered training data for this class as geojson file
#     td_single_class_filtered.to_file('Results/stratified_random_training_points_scheme_ii_2021_filtered_class_'+str(i)+'.geojson', driver="GeoJSON")
    td_single_class_filtered.to_file('Results/manual_random_training_points_scheme_ii_2021_filtered_class_'+str(i)+'.geojson', driver="GeoJSON")
    # append the filtered training points of this class to final filtered training data
    if td2021_filtered is None:
        td2021_filtered = td_single_class_filtered
    else:
        td2021_filtered = pd.concat([td2021_filtered, td_single_class_filtered])

### export filtered training data

In [None]:
# show the merged, filtered training data for all classes before writing it out
print('filtered training data for 2021:\n',td2021_filtered)

# output locations (alternative file names kept for reference)
# geojson_file = 'Results/stratified_random_training_points_scheme_ii_2021_filtered.geojson'
geojson_file = 'Results/manual_random_training_points_scheme_ii_2021_filtered.geojson'
# output_file = "Results/stratified_random_training_points_scheme_ii_2021_filtered.txt"
output_file = "Results/manual_random_training_points_scheme_ii_2021_filtered.txt"

# write the geojson file first, then the space-separated txt file
td2021_filtered.to_file(geojson_file, driver="GeoJSON")
td2021_filtered.to_csv(output_file, header=True, index=None, sep=' ')