This notebook filters training data on a per-class basis using a reference/baseline land cover map based on k-means clustering. It incudes steps:
1. Generate a set of randomly distributed samples for a class using the reference/baseline land cover map
2. Extract features of the random samples
3. Fit a kmeans clustering model
4. Predict clustering labels of the training data
5. Filter training points based on cluster frequency
6. Merge filtered points for all classes and export filtered data 

The filtered training data will then be used to train a classifier and produce a land cover classification map in 2021.

In [None]:
%matplotlib inline
import os
import datacube
import warnings
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
from odc.io.cgroups import get_cpu_quota
from odc.algo import xr_geomedian
from deafrica_tools.datahandling import load_ard
from deafrica_tools.bandindices import calculate_indices
from deafrica_tools.classification import collect_training_data
from sklearn.cluster import KMeans
from sklearn import metrics
from rasterio.enums import Resampling
from random_sampling import random_sampling # adapted from function by Chad Burton: https://gist.github.com/cbur24/04760d645aa123a3b1817b07786e7d9f

# get number of cpus
ncpus=round(get_cpu_quota())
print('ncpus = '+str(ncpus))

# file paths and attributes
rf2021_path='Results/landcover2021baseline.tif' # baseline classification map
training_data2021='Results/landcover_training_data_2021_GEE.txt' # training data 2021
class_name = 'LC_Class_I' # class label in integer format
output_crs='epsg:32735' # output crs: WGS84/UTM Zone 35S

# load baseline classification map
rf_2021_raster = xr.open_dataset(rf2021_path,engine="rasterio").astype(np.uint8).squeeze("band", drop=True)
# reproject the raster
rf_2021_raster= rf_2021_raster.rio.reproject(resolution=10, dst_crs=output_crs,resampling=Resampling.nearest)
print('baseline classifcation raster:\n',rf_2021_raster)

# load training data as geodataframe
gdf_training_data2021= gpd.read_file(training_data2021,delimiter=' ').to_crs(output_crs)

In [5]:
# define ODC query
measurements = ['blue','green','red','red_edge_1','red_edge_2', 'red_edge_3','nir_1','nir_2','swir_1','swir_2']
query = {
    'time': ('2021-01', '2021-12'),
    'measurements': measurements,
    'output_crs': output_crs,
    'resolution': (-10, 10)
}

# define a function to feature layers
def feature_layers(query): 
    #connect to the datacube
    dc = datacube.Datacube(app='feature_layers')
    # query bands
    ds = load_ard(dc=dc,
                  products=['s2_l2a'],
                  group_by='solar_day',
                  verbose=False,
                  **query)
    # calculate NDVI
    ds = calculate_indices(ds,
                           index=['NDVI'],
                           drop=False,
                           satellite_mission='s2')
    # scale NDVI
    ds['NDVI']=ds['NDVI']*10000
    # calculate geomedians within each two-month interval
    ds=ds.resample(time='2MS').map(xr_geomedian)
    # interpolate nodata using mean of previous and next observation
    ds=ds.interpolate_na(dim='time',method='linear',use_coordinate=False,fill_value='extrapolate').astype(np.int16)
    # replace nan with a value so that the collect_training_data function will work
#     ds=ds.fillna(fill_nan_value)
    # stack multi-temporal measurements and rename them
    n_time=ds.dims['time']
    list_measurements=list(ds.keys())
    list_stack_measures=[]
    for j in range(len(list_measurements)):
        for k in range(n_time):
            variable_name=list_measurements[j]+'_'+str(k)
            # print ('Stacking band ',list_measurements[j],' at time ',k)
            measure_single=ds[list_measurements[j]].isel(time=k).rename(variable_name)
            list_stack_measures.append(measure_single)
    ds_stacked=xr.merge(list_stack_measures,compat='override')
    return ds_stacked

In [6]:
# parameters and variables for training data filtering
lc_classes=gdf_training_data2021[class_name].unique() # get class labels
n_samples=1000 # number of random samples to optimise number of clusters for kmeans
zonal_stats = None
td2021_filtered=None # initialise filtered training data
frequency_threshold=0.05 # threshold of cluter frequency

# filtering training data for each class
for i in lc_classes:
    print('Processing class ',i)
    
    # generate spatially randomly distributed samples
    da_mask=rf_2021_raster.band_data # get data array
    da_mask=da_mask.where(da_mask==i,np.nan) # replace other class values as nan so they won't be sampled
    gpd_samples=random_sampling(da_mask,n_samples,sampling='stratified_random',manual_class_ratios=None,out_fname=None) # stratified random sampling
    gpd_samples=gpd_samples.drop(columns=['spatial_ref','class']) # drop the unused attributes derived from random_sampling function
    gpd_samples[class_name]=i.astype(int) # add class attribute field so that we can use collect_training_data function
    if gpd_samples.crs is None:
        gpd_samples=gpd_samples.set_crs(output_crs) # set crs if it somehow is lost
    print('radomly sampled points for class ',i,'\n',gpd_samples)
    
    # extract data for the random samples
    column_names, sampled_data = collect_training_data(gdf=gpd_samples,
                                                          dc_query=query,
                                                          ncpus=30,
#                                                           ncpus=1,
                                                          field=class_name, 
                                                          zonal_stats=zonal_stats,
                                                          feature_func=feature_layers,
                                                          return_coords=False)
    
    # fit and find optimal kmeans model using the sample training data
    # first find optimal number of clusters which corresponds to maximum Calinski-Harabasz score
    sampled_data=sampled_data[:,1:] # exclude the class column which won't be used for clustering
    highest_score=-999 # initialise score
    n_cluster_optimal=2 # initialise number of clusters
    kmeans_model_optimal=None # initialise optimal model
    for n_cluster in range(2,6): # loop to find optimal clusterer
        kmeans_model = KMeans(n_clusters=n_cluster, random_state=1).fit(sampled_data)
        labels=kmeans_model.predict(sampled_data)
        score=metrics.calinski_harabasz_score(sampled_data, labels)
        print('Calinski-Harabasz score for ',n_cluster,' clusters is: ',score)
        if (highest_score==-999)or(highest_score<score):
            highest_score=score
            n_cluster_optimal=n_cluster
            kmeans_model_optimal=kmeans_model
    print('Best number of clusters for class %s: %s'%(i,n_cluster_optimal))
    
    # filter out points with class attribute inconsistent with baseline/reference land cover map
    td_single_class_filtered=gdf_training_data2021[gdf_training_data2021[class_name]==i] # subset training points assigned to this class
    reference_class=da_mask.sel(x=td_single_class_filtered.centroid.x, 
                                y=td_single_class_filtered.centroid.y, 
                                method="nearest").squeeze() # extract the data on the map based on closest coordinates
    td_single_class_filtered['ref_class']=np.diagonal(reference_class) # append class attribute
    td_single_class_filtered=td_single_class_filtered[td_single_class_filtered['ref_class']==i] # filter out points of different class than reference map
    print('Number of training data after filtering using reference map: ',len(td_single_class_filtered))
    
    # predict clusters of the training points using the optimal clusterer
    input_data=td_single_class_filtered[column_names] # exclude unused attributes/columns 
    labels_kmeans = kmeans_model_optimal.predict(input_data.to_numpy()[:,1:]) # exclude the class column which won't be used for clustering
    td_single_class_filtered['cluster']=labels_kmeans # append clustering results
    
    # filter minor clusters by cluster frequency
    cluster_frequency=td_single_class_filtered['cluster'].map(td_single_class_filtered['cluster'].value_counts(normalize=True)) # calculate cluster frequencies
    td_single_class_filtered['cluster_frequency']=cluster_frequency # append cluster frequency attribute
    td_single_class_filtered=td_single_class_filtered[td_single_class_filtered['cluster_frequency']>=frequency_threshold] # remove points with cluster frequency below treshold
    print('Number of training data after filtering based on frequency: ',len(td_single_class_filtered))
    
    # export filtered training data for this class as geojson file
    td_single_class_filtered.to_file('Results/landcover_td2021_filtered_class_'+str(i)+'.geojson', driver="GeoJSON")
    
    # append the filtered training points of this class to final filtered training data (all classes)
    if td2021_filtered is None:
        td2021_filtered=td_single_class_filtered
    else:
        td2021_filtered=pd.concat([td2021_filtered, td_single_class_filtered])
        
# save filtered training data for all classes
print('filtered training data:\n',td2021_filtered)
td2021_filtered.to_file('Results/landcover_td2021_filtered.geojson', driver="GeoJSON")

# export the filtered training data for all classes as txt file
output_file = "Results/landcover_td2021_filtered.txt"
td2021_filtered.to_csv(output_file, header=True, index=None, sep=' ')

Processing class  1.0
Class 1.0: sampling at 1000 coordinates
radomly sampled points for class  1.0 
                            geometry  LC_Class_I
0    POINT (562566.582 6746136.183)           1
1    POINT (600666.582 6661816.183)           1
2    POINT (542386.582 6737866.183)           1
3    POINT (542716.582 6667676.183)           1
4    POINT (682666.582 6677616.183)           1
..                              ...         ...
995  POINT (545446.582 6753356.183)           1
996  POINT (531116.582 6719206.183)           1
997  POINT (551496.582 6744056.183)           1
998  POINT (532696.582 6683606.183)           1
999  POINT (592076.582 6786006.183)           1

[1000 rows x 2 columns]
Collecting training data in parallel mode


  0%|          | 0/1000 [00:00<?, ?it/s]

Percentage of possible fails after run 1 = 0.0 %
Removed 0 rows wth NaNs &/or Infs
Output shape:  (1000, 67)
Calinski-Harabasz score for  2  clusters is:  449.6185325600296
Calinski-Harabasz score for  3  clusters is:  420.3190018740609
Calinski-Harabasz score for  4  clusters is:  349.7360664604567
Calinski-Harabasz score for  5  clusters is:  305.96864591571796
Best number of clusters for class 1.0: 2
Number of training data after filtering using reference map:  54
Number of training data after filtering based on frequency:  54
Processing class  7.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Class 7.0: sampling at 1000 coordinates
radomly sampled points for class  7.0 
                            geometry  LC_Class_I
0    POINT (614106.582 6699396.183)           7
1    POINT (680306.582 6727516.183)           7
2    POINT (647836.582 6771976.183)           7
3    POINT (546476.582 6727066.183)           7
4    POINT (585756.582 6733776.183)           7
..                              ...         ...
995  POINT (631296.582 6799836.183)           7
996  POINT (594576.582 6795626.183)           7
997  POINT (630136.582 6822906.183)           7
998  POINT (628476.582 6760536.183)           7
999  POINT (593286.582 6715866.183)           7

[1000 rows x 2 columns]
Collecting training data in parallel mode


  0%|          | 0/1000 [00:00<?, ?it/s]

Percentage of possible fails after run 1 = 0.0 %
Removed 0 rows wth NaNs &/or Infs
Output shape:  (1000, 67)
Calinski-Harabasz score for  2  clusters is:  274.680044033439
Calinski-Harabasz score for  3  clusters is:  250.71896485677942
Calinski-Harabasz score for  4  clusters is:  211.07946627898912
Calinski-Harabasz score for  5  clusters is:  183.65239253645424
Best number of clusters for class 7.0: 2
Number of training data after filtering using reference map:  389
Number of training data after filtering based on frequency:  389


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Processing class  2.0
Class 2.0: sampling at 1000 coordinates
radomly sampled points for class  2.0 
                            geometry  LC_Class_I
0    POINT (531726.582 6665546.183)           2
1    POINT (597496.582 6783976.183)           2
2    POINT (537446.582 6736946.183)           2
3    POINT (554116.582 6636376.183)           2
4    POINT (596906.582 6661576.183)           2
..                              ...         ...
995  POINT (542806.582 6731776.183)           2
996  POINT (551376.582 6713266.183)           2
997  POINT (597226.582 6771006.183)           2
998  POINT (532476.582 6688986.183)           2
999  POINT (613656.582 6785696.183)           2

[1000 rows x 2 columns]
Collecting training data in parallel mode


  0%|          | 0/1000 [00:00<?, ?it/s]

Percentage of possible fails after run 1 = 0.0 %
Removed 0 rows wth NaNs &/or Infs
Output shape:  (1000, 67)
Calinski-Harabasz score for  2  clusters is:  874.6949797696825
Calinski-Harabasz score for  3  clusters is:  685.6921038788735
Calinski-Harabasz score for  4  clusters is:  559.2286652606849
Calinski-Harabasz score for  5  clusters is:  488.7561467763753
Best number of clusters for class 2.0: 2
Number of training data after filtering using reference map:  492
Number of training data after filtering based on frequency:  492


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Processing class  10.0
Class 10.0: sampling at 1000 coordinates
radomly sampled points for class  10.0 
                            geometry  LC_Class_I
0    POINT (694076.582 6757586.183)          10
1    POINT (689486.582 6677936.183)          10
2    POINT (593616.582 6693516.183)          10
3    POINT (653406.582 6698546.183)          10
4    POINT (590396.582 6621456.183)          10
..                              ...         ...
995  POINT (686266.582 6790206.183)          10
996  POINT (638046.582 6773816.183)          10
997  POINT (594006.582 6629026.183)          10
998  POINT (694576.582 6731036.183)          10
999  POINT (566756.582 6769716.183)          10

[1000 rows x 2 columns]
Collecting training data in parallel mode


  0%|          | 0/1000 [00:00<?, ?it/s]

Percentage of possible fails after run 1 = 0.0 %
Removed 0 rows wth NaNs &/or Infs
Output shape:  (1000, 67)
Calinski-Harabasz score for  2  clusters is:  419.969063439913
Calinski-Harabasz score for  3  clusters is:  450.62158189540736
Calinski-Harabasz score for  4  clusters is:  383.0545854692194
Calinski-Harabasz score for  5  clusters is:  352.2681206161454
Best number of clusters for class 10.0: 3
Number of training data after filtering using reference map:  1011
Number of training data after filtering based on frequency:  1011


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Processing class  9.0
Class 9.0: sampling at 1000 coordinates
radomly sampled points for class  9.0 
                            geometry  LC_Class_I
0    POINT (574696.582 6756976.183)           9
1    POINT (571626.582 6688316.183)           9
2    POINT (568146.582 6713006.183)           9
3    POINT (616726.582 6743026.183)           9
4    POINT (614416.582 6785966.183)           9
..                              ...         ...
995  POINT (626226.582 6767996.183)           9
996  POINT (593006.582 6760236.183)           9
997  POINT (608536.582 6688306.183)           9
998  POINT (548166.582 6699056.183)           9
999  POINT (659296.582 6788726.183)           9

[1000 rows x 2 columns]
Collecting training data in parallel mode


  0%|          | 0/1000 [00:00<?, ?it/s]

Percentage of possible fails after run 1 = 0.0 %
Removed 0 rows wth NaNs &/or Infs
Output shape:  (1000, 67)
Calinski-Harabasz score for  2  clusters is:  486.15135016391827
Calinski-Harabasz score for  3  clusters is:  438.13766977054496
Calinski-Harabasz score for  4  clusters is:  356.08493395382544
Calinski-Harabasz score for  5  clusters is:  311.84972809475187
Best number of clusters for class 9.0: 2
Number of training data after filtering using reference map:  345
Number of training data after filtering based on frequency:  345


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Processing class  12.0
Class 12.0: sampling at 1000 coordinates
radomly sampled points for class  12.0 
                            geometry  LC_Class_I
0    POINT (624126.582 6805446.183)          12
1    POINT (524506.582 6716326.183)          12
2    POINT (714916.582 6776046.183)          12
3    POINT (563776.582 6776426.183)          12
4    POINT (606146.582 6795096.183)          12
..                              ...         ...
995  POINT (539126.582 6654876.183)          12
996  POINT (572506.582 6769886.183)          12
997  POINT (544306.582 6752856.183)          12
998  POINT (622596.582 6684546.183)          12
999  POINT (634566.582 6683426.183)          12

[1000 rows x 2 columns]
Collecting training data in parallel mode


  0%|          | 0/1000 [00:00<?, ?it/s]

  data = nangeomedian_pcm(xx_data, **kw)


Percentage of possible fails after run 1 = 0.0 %
Removed 0 rows wth NaNs &/or Infs
Output shape:  (1000, 67)
Calinski-Harabasz score for  2  clusters is:  872.6168801501209
Calinski-Harabasz score for  3  clusters is:  929.1007756690589
Calinski-Harabasz score for  4  clusters is:  801.7783376259185
Calinski-Harabasz score for  5  clusters is:  704.0620664263389
Best number of clusters for class 12.0: 3
Number of training data after filtering using reference map:  98
Number of training data after filtering based on frequency:  98
Processing class  6.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Class 6.0: sampling at 1000 coordinates
radomly sampled points for class  6.0 
                            geometry  LC_Class_I
0    POINT (649406.582 6786386.183)           6
1    POINT (642086.582 6762106.183)           6
2    POINT (556296.582 6640796.183)           6
3    POINT (658276.582 6676146.183)           6
4    POINT (560126.582 6650746.183)           6
..                              ...         ...
995  POINT (645036.582 6757196.183)           6
996  POINT (572466.582 6649876.183)           6
997  POINT (606426.582 6743466.183)           6
998  POINT (603186.582 6744836.183)           6
999  POINT (540556.582 6751206.183)           6

[1000 rows x 2 columns]
Collecting training data in parallel mode


  0%|          | 0/1000 [00:00<?, ?it/s]

Percentage of possible fails after run 1 = 0.0 %
Removed 0 rows wth NaNs &/or Infs
Output shape:  (1000, 67)
Calinski-Harabasz score for  2  clusters is:  1275.7316621291757
Calinski-Harabasz score for  3  clusters is:  1267.8402522840008
Calinski-Harabasz score for  4  clusters is:  1071.2714567823873
Calinski-Harabasz score for  5  clusters is:  901.998263234913
Best number of clusters for class 6.0: 2
Number of training data after filtering using reference map:  165
Number of training data after filtering based on frequency:  165


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Processing class  4.0
Class 4.0: sampling at 1000 coordinates
radomly sampled points for class  4.0 
                            geometry  LC_Class_I
0    POINT (566666.582 6764666.183)           4
1    POINT (628286.582 6824436.183)           4
2    POINT (603016.582 6800586.183)           4
3    POINT (563846.582 6645256.183)           4
4    POINT (559006.582 6716366.183)           4
..                              ...         ...
995  POINT (635096.582 6800906.183)           4
996  POINT (641506.582 6811946.183)           4
997  POINT (591856.582 6767676.183)           4
998  POINT (581426.582 6624816.183)           4
999  POINT (611886.582 6735756.183)           4

[1000 rows x 2 columns]
Collecting training data in parallel mode


  0%|          | 0/1000 [00:00<?, ?it/s]

Percentage of possible fails after run 1 = 0.0 %
Removed 0 rows wth NaNs &/or Infs
Output shape:  (1000, 67)
Calinski-Harabasz score for  2  clusters is:  327.0477412489597
Calinski-Harabasz score for  3  clusters is:  265.1471117570227
Calinski-Harabasz score for  4  clusters is:  226.4567437749835
Calinski-Harabasz score for  5  clusters is:  201.22782385523516
Best number of clusters for class 4.0: 2
Number of training data after filtering using reference map:  566
Number of training data after filtering based on frequency:  566


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Processing class  14.0
Class 14.0: sampling at 1000 coordinates
radomly sampled points for class  14.0 
                            geometry  LC_Class_I
0    POINT (558536.582 6644266.183)          14
1    POINT (537326.582 6706766.183)          14
2    POINT (570326.582 6791476.183)          14
3    POINT (589536.582 6641636.183)          14
4    POINT (570366.582 6791256.183)          14
..                              ...         ...
995  POINT (567116.582 6638566.183)          14
996  POINT (560166.582 6631566.183)          14
997  POINT (531026.582 6673936.183)          14
998  POINT (626916.582 6808026.183)          14
999  POINT (566926.582 6628686.183)          14

[1000 rows x 2 columns]
Collecting training data in parallel mode


  0%|          | 0/1000 [00:00<?, ?it/s]

Percentage of possible fails after run 1 = 0.0 %
Removed 0 rows wth NaNs &/or Infs
Output shape:  (1000, 67)
Calinski-Harabasz score for  2  clusters is:  382.27341619360374
Calinski-Harabasz score for  3  clusters is:  335.2928472820683
Calinski-Harabasz score for  4  clusters is:  298.2932886791621
Calinski-Harabasz score for  5  clusters is:  271.2812999473915
Best number of clusters for class 14.0: 2
Number of training data after filtering using reference map:  17
Number of training data after filtering based on frequency:  17
filtered training data:
       LC_Class_I  blue_0  blue_1  blue_2  blue_3  blue_4  blue_5  green_0  \
0            1.0   530.0   526.0   772.0   876.0   640.0   602.0    862.0   
2            1.0   923.0   835.0   834.0   918.0  1020.0  1108.0   1185.0   
3            1.0   616.0   525.0   574.0   603.0   676.0   884.0    875.0   
6            1.0   885.0   652.0   821.0   988.0   957.0   836.0   1211.0   
7            1.0  1267.0  1093.0  1062.0  1128.0  128

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
