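This notebook implements FAO's training data filtering method using k-means clustering: for each land cover class, a k-means model is fitted to pixels randomly sampled from a reference classification map, the training points of that class are assigned to the resulting clusters, and points that fall in clusters holding less than a set fraction of the class's points are discarded.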

### load packages and get number of cpus

In [None]:
%matplotlib inline
import os
import datacube
import warnings
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import rioxarray
from odc.io.cgroups import get_cpu_quota
from odc.algo import xr_geomedian
from deafrica_tools.datahandling import load_ard
from deafrica_tools.bandindices import calculate_indices
from deafrica_tools.classification import collect_training_data
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from rasterio.enums import Resampling
from random_sampling import random_sampling # adapted from function by Chad Burton: https://gist.github.com/cbur24/04760d645aa123a3b1817b07786e7d9f

# get_cpu_quota() returns None when no cgroup CPU limit is set (or it cannot be read),
# which would make round() fail; fall back to the machine's CPU count in that case.
cpu_quota=get_cpu_quota()
if cpu_quota is None:
    cpu_quota=os.cpu_count() or 1
# a fractional quota below 0.5 would round to 0 workers, so always keep at least one
ncpus=max(1,round(cpu_quota))
print('ncpus = '+str(ncpus))

### load input file paths and set parameters

In [None]:
# --- input files and attributes ---
# alternative training points: 'Data/trainning_samples_FNDS_II_SOM_2016.geojson'
traning_points_path = 'Results/stratified_random_training_points_lulc_2016_balanced.geojson'
# alternative reference map: 'Data/Landcover_map_ODC_Brazil_2015_2016.tif'
rf2017_path='Data/moz_lulc2016_28082019_final.tif'
tiles_shp='Data/Mozambique_tiles_biggest1.shp'
class_name = 'LC_Class_I' # attribute holding the class label in integer format
crs='epsg:32736' # WGS84/UTM Zone 36S

# --- reference land cover survey points ---
# read as a geopandas dataframe, reproject to the working CRS and keep only label + geometry
training_data2017 = gpd.read_file(traning_points_path).to_crs(crs)[[class_name,'geometry']]
print('land cover survey points 2017:\n',training_data2017)

# --- tiles and their bounding boxes ---
tiles=gpd.read_file(tiles_shp).to_crs(crs)
tile_bboxes=tiles.bounds
print('tile boundaries for Mozambique: \n',tile_bboxes)

# --- initial classification map ---
# open with the rasterio engine, cast to uint8, drop the length-1 band dimension
# and keep the `band_data` variable as a DataArray
# (optional, currently disabled: reproject the raster with
#  rf_2017_raster.rio.reproject(resolution=10, dst_crs=crs, resampling=Resampling.nearest))
rf_2017_raster = (
    xr.open_dataset(rf2017_path,engine="rasterio")
    .astype(np.uint8)
    .squeeze("band", drop=True)
    .band_data
)
print('Reference land cover classifcation raster:\n',rf_2017_raster) # note: 255 is nodata

# --- class labels present in the survey points ---
lc_classes=training_data2017[class_name].unique()
print('land cover classes:\n',lc_classes)

### define queries and feature layer function

In [None]:
# fill_nan_value=-999 # value to replace nans in query results (currently unused)

# spectral bands requested from the datacube
measurements = [
    'blue','green','red',
    'red_edge_1','red_edge_2', 'red_edge_3',
    'nir_1','nir_2',
    'swir_1','swir_2',
]

# datacube query shared by every data extraction in this notebook:
# one calendar year, loaded in the working CRS at 10 m resolution
query = dict(
    time=('2021-01', '2021-12'),
    measurements=measurements,
    output_crs=crs,
    resolution=(-10, 10),
)
# define a function to generate feature layers
def feature_layers(query):
    """Build the stack of feature layers for one datacube query.

    Loads ``s2_l2a`` data with ``load_ard``, adds an NDVI band, reduces every
    two-month window (``2MS`` resampling) with ``xr_geomedian`` and then
    flattens the time axis so that each band / time-step pair becomes its own
    variable named ``<band>_<time step index>``.

    Parameters
    ----------
    query : dict
        Keyword arguments forwarded unchanged to ``load_ard`` (e.g. time
        range, measurements, output CRS and resolution).

    Returns
    -------
    xarray.Dataset
        One variable per band and two-month time step, ordered band by band.
    """
    # connect to the datacube
    dc = datacube.Datacube(app='feature_layers')
    # optional, currently disabled: mask_filters=[("opening", 2)] applies a
    # morphological opening by 2 pixels to remove small masked regions
    ds = load_ard(dc=dc,
                  products=['s2_l2a'],
                  group_by='solar_day',
                  verbose=False,
                  **query)
    ds = calculate_indices(ds,
                           index=['NDVI'],
                           drop=False,
                           satellite_mission='s2')
    # calculate geomedians within each two-month interval
    ds = ds.resample(time='2MS').map(xr_geomedian)
    # `sizes` is the stable way to read a dimension length (`Dataset.dims[...]` is being phased out)
    n_time = ds.sizes['time']
    # one renamed 2D layer per band and time step
    layers = [
        ds[band].isel(time=k).rename(band + '_' + str(k))
        for band in ds.data_vars
        for k in range(n_time)
    ]
    # merge once instead of re-merging the growing stack for every layer;
    # compat='override' keeps the first layer's scalar `time` coordinate instead of
    # raising on the differing time stamps of the individual layers
    return xr.merge(layers, compat='override')

### per-class training data filter using k-means clustering

In [None]:
n_samples=1000 # number of random samples to optimise number of clusters for kmeans
zonal_stats = None
scaler = StandardScaler() # standard scaler for input data standardisation (re-fitted for every class)
frequency_threshold=0.05 # training points in clusters with a lower relative frequency are discarded
td2021_filtered=None # filtered training data of all classes processed so far

# filter the training data of each class separately
for i in lc_classes:
    print('Processing class ',i)
    # number of pixels of this class in the whole reference raster
    n_total=np.sum(rf_2017_raster.to_numpy()==i)
    if n_total==0:
        # without this guard the per-tile share below divides by zero and the class
        # later fails on a missing sample table
        warnings.warn('Class %s does not occur in the reference raster; skipping it'%i)
        continue

    # generate randomly sampled data to fit and optimise a kmeans clusterer;
    # the samples are distributed over the tiles in proportion to the class area in each tile
    tile_samples=[]
    for n in range(len(tile_bboxes)):
        print('stratified random sampling from tile ',n)
        da_mask=rf_2017_raster.rio.clip([tiles.iloc[n].geometry],crs=crs,drop=True)
        da_mask=da_mask.rio.reproject(dst_crs=crs,resampling=Resampling.nearest)
        # share of the sample budget for this tile (a float, passed on as-is)
        n_samples_tile=n_samples*np.sum(da_mask.to_numpy()==i)/n_total
        if n_samples_tile>0:
            tile_samples.append(random_sampling(da_mask,n_samples_tile,sampling='manual',
                                                manual_class_ratios={str(i):n_samples_tile},out_fname=None))
    if not tile_samples:
        # the class exists in the raster but in none of the tiles
        warnings.warn('No random samples could be drawn for class %s within the tiles; skipping it'%i)
        continue
    gpd_samples=pd.concat(tile_samples)
    gpd_samples=gpd_samples.reset_index(drop=True).drop(columns=['spatial_ref','class']) # drop attributes derived from random_sampling function
    gpd_samples[class_name]=i # add attribute field so that we can use collect_training_data function
    if gpd_samples.crs is None:
        gpd_samples=gpd_samples.set_crs(crs)
    print('radomly sampled points for class ',i,'\n',gpd_samples)

    # extract feature data for the random samples
    column_names, sampled_data = collect_training_data(gdf=gpd_samples,
                                                       dc_query=query,
                                                       ncpus=ncpus,
                                                       field=class_name,
                                                       zonal_stats=zonal_stats,
                                                       feature_func=feature_layers,
                                                       return_coords=False)
    # standardise features; the first column is skipped (presumably the class label - TODO confirm)
    scaler=scaler.fit(sampled_data[:,1:])
    sampled_data=scaler.transform(sampled_data[:,1:])

    # fit kmeans models on the sampled data and keep the one with the
    # highest Calinski-Harabasz index (first one wins on ties)
    highest_score=-np.inf
    n_cluster_optimal=2
    kmeans_model_optimal=None # initialise optimal model parameters
    labels_optimal=None
    for n_cluster in range(2,10):
        kmeans_model = KMeans(n_clusters=n_cluster, random_state=1).fit(sampled_data)
        labels=kmeans_model.predict(sampled_data)
        score=metrics.calinski_harabasz_score(sampled_data, labels)
        print('Calinski-Harabasz score for ',n_cluster,' clusters is: ',score)
        if score>highest_score:
            highest_score=score
            n_cluster_optimal=n_cluster
            kmeans_model_optimal=kmeans_model
            labels_optimal=labels
    print('Best number of clusters for class %s: %s'%(i,n_cluster_optimal))

    # subset original training points for this class
    td_single_class=training_data2017[training_data2017[class_name]==i].reset_index(drop=True)
    print('Number of training data collected: ',len(td_single_class))
    column_names, model_input = collect_training_data(gdf=td_single_class,
                                                      dc_query=query,
                                                      ncpus=ncpus,
                                                      field=class_name,
                                                      zonal_stats=zonal_stats,
                                                      feature_func=feature_layers,
                                                      clean=True,
                                                      return_coords=True)
    print('Number of training data after removing Nans and Infs: ',model_input.shape[0])
    # first convert the training data to pandas
    td_single_class_filtered=pd.DataFrame(data=model_input,columns=column_names)
    # then to a geopandas dataframe; the last two columns are used as x and y coordinates
    td_single_class_filtered=gpd.GeoDataFrame(td_single_class_filtered,
                                    geometry=gpd.points_from_xy(model_input[:,-2], model_input[:,-1],
                                                                crs=crs))
    # standardise with the scaler fitted on the random samples (first and last two columns excluded)
    model_input=scaler.transform(model_input[:,1:-2])
    # predict clustering labels
    labels_kmeans = kmeans_model_optimal.predict(model_input)
    # append clustering results to pixel coordinates
    td_single_class_filtered['cluster']=labels_kmeans
    # append frequency of each cluster, computed among the training points themselves
    # (the cluster frequencies of the random samples, labels_optimal, are an unused alternative)
    cluster_frequency=td_single_class_filtered['cluster'].map(td_single_class_filtered['cluster'].value_counts(normalize=True))
    td_single_class_filtered['cluster_frequency']=cluster_frequency
    # filter by cluster frequency
    td_single_class_filtered=td_single_class_filtered[td_single_class_filtered['cluster_frequency']>=frequency_threshold]
    print('Number of training data after filtering: ',len(td_single_class_filtered))
    # export filtered training data for this class as geojson file
    # (a shapefile export would run into the 10-character limit for attribute names)
    td_single_class_filtered.to_file('Results/stratified_random_training_points_lulc_2016_balanced_2021_filtered_class_'+str(i)+'.geojson', driver="GeoJSON")
    # append the filtered training points of this class to final filtered training data
    if td2021_filtered is None:
        td2021_filtered=td_single_class_filtered
    else:
        td2021_filtered=pd.concat([td2021_filtered, td_single_class_filtered])
        

### export filtered training data

In [None]:
# show the combined, filtered training data of all classes
print('filtered training data for 2021:\n',td2021_filtered)

# write all classes to a single GeoJSON file
# (previous output name: 'Results/landcover_td2021_filtered.geojson')
td2021_filtered.to_file(
    'Results/stratified_random_training_points_lulc_2016_balanced_2021_filtered.geojson',
    driver="GeoJSON",
)

# also write the table as a space-separated text file with a header row and no index column
# (previous output name: "Results/landcover_td2021_filtered.txt")
output_file = "Results/stratified_random_training_points_lulc_2016_balanced_2021_filtered.txt"
td2021_filtered.to_csv(
    output_file,
    sep=' ',
    header=True,
    index=None,
)