This notebook implements FAO's training data filtering method using k-means clustering.

### load packages

In [None]:
%matplotlib inline
import os
import datacube
import warnings
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import rioxarray
from odc.io.cgroups import get_cpu_quota
from datacube.utils.cog import write_cog
from odc.algo import xr_geomedian
from deafrica_tools.datahandling import load_ard
from deafrica_tools.bandindices import calculate_indices
from deafrica_tools.classification import collect_training_data
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from rasterio.enums import Resampling
from random_sampling import random_sampling

### parameters and file paths

In [None]:
# --- input locations ---
# per-point training features (space-delimited text file)
training_signature_path = 'Results/Mozambique_training_features.txt'
# reference land cover classification raster
rf2017_path = 'Results/moz_lulc2016_28082019_final_remapped.tif'
# tile shapefile to stratify the random sampling to fit into memory
tiles_shp = 'Data/Mozambique_tiles_biggest1.shp'

# --- attributes ---
output_crs = 'epsg:32736'  # WGS84/UTM Zone 36S
measurements = ['blue', 'green', 'red',
                'red_edge_1', 'red_edge_2', 'red_edge_3',
                'nir_1', 'nir_2', 'swir_1', 'swir_2', 'NDVI']
class_name = 'LC_Class_I'  # class label in integer format

# Column layout: the class label first, then one column per measurement and
# per odd time index (1, 3, ..., 11), named '<measurement>_<index>'.
column_names = [class_name] + [
    band + '_' + str(step)
    for band in measurements
    for step in range(1, 12, 2)
]

### load training data, tiles and reference map

In [None]:
# Read the space-delimited training features, keep only the class label and
# the feature columns listed in column_names, and store the label as integer.
training_data2017 = (
    pd.read_csv(training_signature_path, sep=' ')
    [column_names]
    .assign(**{class_name: lambda frame: frame[class_name].astype(int)})
)
print('land cover survey points 2017:\n',training_data2017)

In [None]:
# Load the tile polygons and bring them into the working CRS.
tiles = gpd.read_file(tiles_shp)
tiles = tiles.to_crs(output_crs)

# Bounding box of every tile (one row per tile).
tile_bboxes = tiles.bounds
print('tile boundaries for Mozambique: \n',tile_bboxes)

In [None]:
# Open the reference classification, cast it to uint8, drop the singleton
# 'band' dimension, reproject to the working CRS at 30 (CRS units) with
# nearest-neighbour resampling, and keep the 'band_data' variable.
rf_2017_raster = (
    xr.open_dataset(rf2017_path, engine="rasterio")
    .astype(np.uint8)
    .squeeze("band", drop=True)
    .rio.reproject(resolution=30, dst_crs=output_crs, resampling=Resampling.nearest)
    .band_data
)
print('Reference land cover classifcation raster:\n',rf_2017_raster)

### define queries and feature layer function

In [None]:
# Switch between the two ways of producing the feature layers: loading the
# 'gm_s2_rolling' product directly (True) or computing geomedians from
# Sentinel-2 analysis ready data (False).
gm_s2_available=False

# Bands requested from the datacube; NDVI is added afterwards by
# calculate_indices inside feature_layers.
measurements = ['blue','green','red','red_edge_1','red_edge_2', 'red_edge_3','nir_1','nir_2','swir_1','swir_2']
resolution = (-10,10)


def _stack_time_steps(ds, time_positions, suffixes):
    """Flatten a multi-temporal dataset into one variable per (measurement, time step).

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset with a 'time' dimension.
    time_positions : iterable of int
        Integer positions along 'time' to extract, in output order.
    suffixes : iterable
        One suffix per position; each output variable is named
        '<measurement>_<suffix>'.

    Returns
    -------
    xarray.Dataset
        Dataset holding the extracted variables, ordered by measurement and
        then by time position.
    """
    time_positions=list(time_positions)
    suffixes=list(suffixes)
    stacked=[]
    for measurement_name in list(ds.data_vars):
        for position,suffix in zip(time_positions,suffixes):
            variable_name=measurement_name+'_'+str(suffix)
            stacked.append(ds[measurement_name].isel(time=position).rename(variable_name))
    # merge once; 'override' skips the comparison of the differing scalar time coordinates
    return xr.merge(stacked,compat='override')


if gm_s2_available:
    query = {
        'time': ('2021-01', '2021-12'),
        'measurements': measurements,
        'output_crs': output_crs,
        'resolution': resolution
    }
    # define a function to feature layers
    def feature_layers(query):
        """Load the 'gm_s2_rolling' product for `query` and stack every second time step.

        Parameters
        ----------
        query : dict
            Keyword arguments forwarded to `dc.load` (includes 'measurements').

        Returns
        -------
        xarray.Dataset
            One variable per measurement (plus NDVI) and selected time step,
            named '<measurement>_<position>'.
        """
        # connect to the datacube so we can access DE Africa data
        dc = datacube.Datacube(app='rolling geomedians')
        # load rolling geomedians; 'measurements' is already part of `query`,
        # so it must not be passed a second time as an explicit keyword
        ds = dc.load(product='gm_s2_rolling',
                     group_by='solar_day',**query)
        ds = calculate_indices(ds,
                           index=['NDVI'],
                           drop=False,
                           satellite_mission='s2')
        n_time=ds.sizes['time']
        # NOTE(review): positions 1, 3, 5, ... are the 2nd, 4th, 6th, ... time
        # steps of the loaded data; the original comment described them as
        # 2021-01, 2021-03, ..., 2021-11 - confirm against the product's time labels.
        positions=range(1,n_time,2)
        return _stack_time_steps(ds,positions,positions)
else:
    query = {
        'time': ('2020-12', '2021-12'),
        'measurements': measurements,
        'output_crs': output_crs,
        'resolution': resolution
    }
    # define a function to feature layers
    def feature_layers(query):
        """Load Sentinel-2 ARD for `query`, compute six geomedians and stack them.

        Parameters
        ----------
        query : dict
            Keyword arguments forwarded to `load_ard`.

        Returns
        -------
        xarray.Dataset
            One variable per measurement (plus NDVI) and time window, named
            '<measurement>_<2*k+1>' for window k = 0..5 so that the names match
            those produced when gm_s2_available is True.
        """
        dc = datacube.Datacube(app='rolling geomedians')
        # load Sentinel-2 analysis ready data
        ds = load_ard(dc=dc,
                      products=['s2_l2a'],
                      group_by='solar_day',
                      verbose=False,
                      **query)
        ds = calculate_indices(ds,
                               index=['NDVI'],
                               drop=False,
                               satellite_mission='s2')
        # calculate rolling geomedians, one per time window; each result is
        # labelled with the start of its window
        time_slices=[('2020-12','2021-02'),('2021-02','2021-04'),('2021-04','2021-06'),
                     ('2021-06','2021-08'),('2021-08','2021-10'),('2021-10','2021-12')]
        geomedians=[
            xr_geomedian(ds.sel(time=slice(start,end))).assign_coords({'time':start})
            for start,end in time_slices
        ]
        ds_rolling=xr.concat(geomedians,dim='time')
        # stack multi-temporal measurements and rename them
        n_time=ds_rolling.sizes['time']
        positions=range(n_time)
        suffixes=[2*k+1 for k in positions] # to keep consistent with above case
        return _stack_time_steps(ds_rolling,positions,suffixes)

### Generate random samples - stratified by class and generated per tile

In [None]:
n_samples = 1000  # number of random samples per class

# distinct class labels present in the training data
lc_classes = training_data2017[class_name].unique()
print('land cover classes:\n',lc_classes)

n_class = len(lc_classes)
n_tile = len(tile_bboxes)

In [None]:
# Stratified random sampling of the reference map: n_samples points per class,
# distributed over the tiles in proportion to the number of pixels of that
# class in each tile. The result is cached on disk as GeoJSON.
path_gpd_samples='Results/stratified_random_samples.geojson'
if os.path.exists(path_gpd_samples):
    # reuse the samples written by a previous run
    gpd_samples=gpd.read_file(path_gpd_samples)
else:
    # calculate number of pixels belong to each class for each tile
    n_class_tiles=np.zeros((n_class,n_tile))
    for j in range(n_tile):
        print('calculating stasts for tile ',j)
        tile_raster=rf_2017_raster.rio.clip([tiles.iloc[j].geometry],crs=output_crs,drop=True,from_disk=True)
        tile_values=tile_raster.to_numpy() # convert once per tile rather than once per class
        for i in range(n_class):
            n_class_tiles[i,j]=np.sum(tile_values==lc_classes[i])
        del tile_raster,tile_values
    n_class_total=np.sum(n_class_tiles,axis=1) # total number of pixels belong to each class

    # extract random samples from each tile
    samples_per_tile=[]
    for j in range(n_tile):
        print('Extracting random samples for tile ',j)
        tile_raster = rf_2017_raster.rio.clip([tiles.iloc[j].geometry],crs=output_crs,drop=True,from_disk=True)
        # number of samples per class for this tile (truncated to an integer);
        # a class with no pixels anywhere in the reference map gets 0 samples
        # instead of triggering a 0/0 division
        n_samples_tile=[int((n_samples*n_class_tiles[i,j])/n_class_total[i]) if n_class_total[i]>0 else 0
                        for i in range(n_class)]
        n_samples_tile_valid={str(lc_classes[i]):n_samples_tile[i] for i in range(n_class) if n_samples_tile[i]>0} # remove class absent on this tile
        gpd_samples_tile=random_sampling(tile_raster,n_samples_tile,sampling='manual',
                                         manual_class_ratios=n_samples_tile_valid,
                                         out_fname=None)
        samples_per_tile.append(gpd_samples_tile)
        del tile_raster
    # concatenate all tiles in a single call
    gpd_samples=pd.concat(samples_per_tile)

    gpd_samples[class_name]=gpd_samples['class'].astype(int) # add class attribute field
    gpd_samples=gpd_samples.reset_index(drop=True).drop(columns=['spatial_ref','class']) # drop attribute derived from random_sampling function
    if gpd_samples.crs is None:
        gpd_samples=gpd_samples.set_crs(output_crs)
    # export to disk
    gpd_samples.to_file(path_gpd_samples, driver="GeoJSON")
# keep only the class label and the point geometry
gpd_samples=gpd_samples[[class_name,'geometry']]
print('radomly sampled points for all classes: \n',gpd_samples)

### Extract features for the random samples

In [None]:
# Extract the feature layers at the random sample locations, or reuse the
# table cached by a previous run.
output_file = 'Results/stratified_random_samples_features.txt'
if os.path.exists(output_file):
    rand_samples_features=pd.read_csv(output_file,sep=' ')
else:
    # get_cpu_quota() returns None when no CPU quota is set; fall back to the
    # machine's CPU count and never request fewer than one worker
    cpu_quota=get_cpu_quota()
    ncpus=max(1,round(cpu_quota)) if cpu_quota else (os.cpu_count() or 1)
    print('ncpus = '+str(ncpus))
    # extract data for the random samples; the returned column names are kept
    # under their own name so the config-level `column_names` list is not
    # overwritten depending on whether the cache file exists
    sample_column_names, model_input = collect_training_data(gdf=gpd_samples,
                                                        dc_query=query,
                                                        ncpus=ncpus,
                                                        field=class_name, 
                                                        zonal_stats=None,
                                                        feature_func=feature_layers,
                                                        return_coords=False)
    rand_samples_features=pd.DataFrame(data=model_input,columns=sample_column_names)
    #Export files to disk
    rand_samples_features.to_csv(output_file, header=True, index=None, sep=' ')

### kmeans clustering and filtering

In [None]:
def find_clusters_KMeans(data,min_cluster=5,max_cluster=20):
    """Fit k-means to `data` and return the best model over a range of cluster counts.

    When `min_cluster == max_cluster` a single model with that many clusters is
    fitted. Otherwise one model is fitted for every cluster count in
    [min_cluster, max_cluster] and the one with the highest Calinski-Harabasz
    score is kept (the first one wins on ties).

    Parameters
    ----------
    data : array-like
        Samples to cluster, passed unchanged to `KMeans.fit`.
    min_cluster : int
        Smallest number of clusters to try.
    max_cluster : int
        Largest number of clusters to try (inclusive).

    Returns
    -------
    tuple
        (n_cluster_optimal, kmeans_model_optimal, labels_optimal): the selected
        number of clusters, the fitted KMeans model and the labels it predicts
        for `data`.

    Raises
    ------
    ValueError
        If `min_cluster` is greater than `max_cluster`; previously this returned
        a None model and None labels without any warning.
    """
    if min_cluster>max_cluster:
        raise ValueError('min_cluster (%s) must not exceed max_cluster (%s)'%(min_cluster,max_cluster))

    if min_cluster==max_cluster:
        print('Implementing kmeans clustering with number of clusters: ',max_cluster)
        kmeans_model_optimal = KMeans(n_clusters=max_cluster, random_state=1).fit(data)
        labels_optimal=kmeans_model_optimal.predict(data)
        return max_cluster,kmeans_model_optimal,labels_optimal

    highest_score=None
    n_cluster_optimal=min_cluster
    kmeans_model_optimal=None
    labels_optimal=None
    for n_cluster in range(min_cluster,max_cluster+1):
        kmeans_model = KMeans(n_clusters=n_cluster, random_state=1).fit(data)
        labels=kmeans_model.predict(data)
        score=metrics.calinski_harabasz_score(data, labels)
        print('Calinski-Harabasz score for ',n_cluster,' clusters is: ',score)
        # the first candidate is always accepted; later ones only if strictly better
        if (highest_score is None) or (highest_score<score):
            highest_score=score
            n_cluster_optimal=n_cluster
            kmeans_model_optimal=kmeans_model
            labels_optimal=labels
    print('Best number of clusters: %s'%(n_cluster_optimal))
    return n_cluster_optimal,kmeans_model_optimal,labels_optimal

In [None]:
# Per class: cluster the random reference samples with k-means, assign every
# training point of that class to a cluster, and keep only the training points
# whose cluster holds at least `frequency_threshold` of the class's points.
scaler = StandardScaler() # standard scaler for input data standardisation
frequency_threshold=0.1 # threshold of cluter frequency
# fixed number of clusters per class; None (or a class not listed) triggers a
# search over 5..20 clusters
optimal_clusters={12:None, 61:None, 41:None, 72:None, 74:None, 31:None, 51:None, 44:None,
                  75:None, 21:None, 71:None, 70:None, 11:None}

# Select features by column name so both tables feed the scaler and the model
# with the same features in the same order, instead of relying on both tables
# sharing an identical positional layout.
feature_columns=[col for col in training_data2017.columns if col!=class_name]
missing_columns=[col for col in feature_columns if col not in rand_samples_features.columns]
if missing_columns:
    raise KeyError('random sample features are missing columns: '+', '.join(missing_columns))

filtered_per_class=[]
for class_value in lc_classes:
    print('Processing class ',class_value)
    rand_features_single_class=rand_samples_features[rand_samples_features[class_name]==class_value].reset_index(drop=True)
    np_rand_features=rand_features_single_class[feature_columns].to_numpy()
    # standardise features (the scaler is refitted for every class)
    scaler.fit(np_rand_features)
    np_rand_features=scaler.transform(np_rand_features)

    n_cluster_fixed=optimal_clusters.get(class_value) # None when unset or class not listed
    if n_cluster_fixed is None:
        n_cluster_optimal,kmeans_model_optimal,labels_optimal=find_clusters_KMeans(np_rand_features,min_cluster=5,max_cluster=20)
    else:
        n_cluster_optimal,kmeans_model_optimal,labels_optimal=find_clusters_KMeans(np_rand_features,min_cluster=n_cluster_fixed,max_cluster=n_cluster_fixed)

    # subset original training points for this class
    td_single_class=training_data2017[training_data2017[class_name]==class_value].reset_index(drop=True)
    print('Number of training pints for the class: ',len(td_single_class))
    np_td_single_class=td_single_class[feature_columns].to_numpy()
    # standardise features with the scaler fitted on the random samples
    np_td_single_class=scaler.transform(np_td_single_class)
    # predict clustering labels
    labels_kmeans = kmeans_model_optimal.predict(np_td_single_class)
    # append clustering results to the training points
    td_single_class['cluster']=labels_kmeans

    cluster_frequency=td_single_class['cluster'].map(td_single_class['cluster'].value_counts(normalize=True)) # calculate cluster frequencies for the training samples
    td_single_class['cluster_frequency']=cluster_frequency # append as a column
    td_single_class_filtered=td_single_class[td_single_class['cluster_frequency']>=frequency_threshold] # filter by cluster frequency
    print('Number of training data after filtering: ',len(td_single_class_filtered))

    # collect the filtered training points of this class
    filtered_per_class.append(td_single_class_filtered)

# final filtered training data for all classes (None when there were no classes)
training_features_filtered=pd.concat(filtered_per_class) if filtered_per_class else None

### export filtered training data

In [None]:
# Write the filtered training table as a space-delimited text file with a
# header row and without the index column.
output_file = "Results/Mozambique_training_features_filtered.txt"
print('filtered training data for 2021:\n',training_features_filtered)
training_features_filtered.to_csv(
    output_file,
    sep=' ',
    header=True,
    index=None,
)