This notebook implements an outlier-removal method, based on the Local Outlier Factor (LOF), to filter training data.

### Load packages and get the number of CPUs

In [None]:
%matplotlib inline
import os
import warnings

import datacube
import geopandas as gpd
import numpy as np
import pandas as pd
import xarray as xr
from datacube.utils.cog import write_cog
from odc.io.cgroups import get_cpu_quota
from rasterio.enums import Resampling
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler

# Number of CPUs available to this kernel.
# get_cpu_quota() returns None when no cgroup CPU limit applies (e.g. when running
# outside a resource-limited container), in which case round() would raise a
# TypeError; fall back to the machine's CPU count. Always report at least one CPU,
# because a fractional quota below 0.5 would otherwise round down to zero.
cpu_quota = get_cpu_quota()
if cpu_quota:
    ncpus = max(1, round(cpu_quota))
else:
    ncpus = os.cpu_count() or 1
print('ncpus = '+str(ncpus))

### load input files and set parameters

In [None]:
# ---- input files ----
training_signature_path = 'Results/train_poly_848_20171124_signatures_2021.geojson'  # extracted training features
rf2017_path = 'Data/moz_lulc2016_28082019_final.tif'  # reference map

# ---- projection and attribute names ----
crs = 'epsg:32736'  # WGS84/UTM Zone 36S
measurements = ['blue', 'green', 'red', 'red_edge_1', 'red_edge_2', 'red_edge_3',
                'nir_1', 'swir_1', 'swir_2', 'NDVI']
measurements_MAD = ['smad', 'emad', 'bcmad']
class_name = 'Class_I'  # class label in integer format

# Attribute columns to keep: the class label first, then six suffixed columns
# (_0 .. _5) per band measurement and two (_0, _1) per MAD measurement.
column_names = (
    [class_name]
    + [f'{band}_{idx}' for band in measurements for idx in range(6)]
    + [f'{mad}_{idx}' for mad in measurements_MAD for idx in range(2)]
)

# class value -> class name, before merging
dict_map = {
    11: 'Tree crops',
    12: 'Field crops',
    21: 'Forest plantations',
    31: 'Grassland',
    33: 'Shrubland',
    41: 'Aquatic or regularly flooded shrublands',
    42: 'Aquatic or regularly flooded herbaceous vegetation',
    44: 'Water body',
    51: 'Settlements',
    61: 'Bare soils',
    62: 'Bare rocks',
    70: 'Mangrove',
    71: 'Mecrusse',
    72: 'Closed broadleaved (Semi-) evergreen mountaineous forest',
    73: 'Gallery forest',
    74: 'Broadleaved (Semi-) deciduous closed forest',
    75: 'Mopane',
    76: 'Open broadleaved (Semi-) evergreen mountaineous forest',
    77: 'Coastal open woody vegetation',
    78: 'Mopane open',
    79: 'Miombo open',
}

# class value -> class name, after merging
dict_map_merged = {
    11: 'Tree crops',
    12: 'Field crops',
    21: 'Forest plantations',
    31: 'Grassland',
    41: 'Aquatic or regularly flooded herbaceous vegetation',
    44: 'Water body',
    51: 'Settlements',
    61: 'Bare soils',
    70: 'Mangrove',
    71: 'Mecrusse',
    72: 'Broadleaved (Semi-) evergreen forest',
    74: 'Broadleaved (Semi-) deciduous forest',
    75: 'Mopane',
}

# Read the extracted training features as a GeoDataFrame, reproject them to the
# working CRS and keep only the class label, the feature columns and the geometry.
column_names.append('geometry')
training_data2017 = gpd.read_file(training_signature_path)
training_data2017 = training_data2017.to_crs(crs)
training_data2017 = training_data2017[column_names]
training_data2017[class_name] = training_data2017[class_name].astype(int)  # class labels as integers
print('land cover survey points 2017:\n', training_data2017)

# Open the reference classification map, cast it to uint8 and drop the band dimension
reference_map = xr.open_dataset(rf2017_path, engine="rasterio")
reference_map = reference_map.astype(np.uint8).squeeze("band", drop=True)
# Reproject to the working CRS (nearest-neighbour resampling) and keep only the data variable
rf_2017_raster = reference_map.rio.reproject(
    resolution=30,
    dst_crs=crs,
    resampling=Resampling.nearest,
).band_data
print('Reference land cover classifcation raster:\n', rf_2017_raster)  # note: 255 is nodata

### merge classes

In [None]:
# Original class values and the merged class each one maps to.
# The two lists are index-aligned, so original_class must stay an ordered list:
# replacing it with dict_map.keys() (a non-subscriptable view with a different
# order) would break the lookup below and pair classes with the wrong targets.
original_class=[11, 12, 21, 31,33, 41,42, 44, 51, 61,62, 70, 71, 72,73,76,77, 75,78, 74,79]
mapped_class=[11, 12, 21, 31, 31,41,41, 44, 51, 61, 61,70, 71, 72,72,72,72, 75,75, 74,74]
class_merge_map=dict(zip(original_class,mapped_class)) # original value -> merged value

# sanity checks: the mapping must cover every original class and produce only merged classes
assert len(original_class)==len(mapped_class), 'original_class and mapped_class must be index-aligned'
assert set(class_merge_map)==set(dict_map), 'class mapping does not cover the classes in dict_map'
assert set(class_merge_map.values())==set(dict_map_merged), 'merged values do not match dict_map_merged'

rf_2017_mapped=rf_2017_raster.copy()
crs_copy_2017=rf_2017_raster.rio.crs

# merge classes on reference map
# (the condition is always evaluated on the untouched input raster, so a value that
# has already been remapped is never remapped a second time)
for source_value,merged_value in class_merge_map.items():
    rf_2017_mapped=xr.where(rf_2017_raster==source_value,merged_value,rf_2017_mapped)
if rf_2017_mapped.rio.crs is None: # reassign crs which was lost during last step of using xr.where
    rf_2017_mapped.rio.write_crs(crs_copy_2017,inplace=True)
if rf_2017_mapped.rio.crs!=crs: # reproject the merged reference map if needed
    rf_2017_mapped=rf_2017_mapped.rio.reproject(resolution=30, dst_crs=crs,resampling=Resampling.nearest)

# # export class-merged reference map
# write_cog(rf_2017_mapped, 'Results/moz_lulc2016_28082019_final_remapped.tif', overwrite=True)

# merge classes on training data, using the same mapping as for the reference map
# (only classes whose value actually changes need to be rewritten; their source and
# target values do not overlap, so the order of the updates does not matter)
for source_value,merged_value in class_merge_map.items():
    if source_value!=merged_value:
        training_data2017.loc[training_data2017[class_name]==source_value,class_name]=merged_value

## export class-merged training data
# training_data2017.to_file('Results/train_poly_848_20171124_signatures_2021_remapped.geojson', driver="GeoJSON")

# get merged class labels
lc_classes=training_data2017[class_name].unique()
print('land cover classes:\n',lc_classes)

### filter out a pre-defined percentage of samples using the Local Outlier Factor (LOF)

In [None]:
# Filter the training data class by class with the Local Outlier Factor (LOF),
# write one GeoJSON per class, then combine all classes and export the result.
lof_contamination=0.15 # fraction of samples per class assumed to be outliers (encoded as '15pct' in the file names)
output_prefix='Results/train_poly_848_20171124_signatures_2021_force_15pct_filtered'
scaler = StandardScaler() # standard scaler for input data standardisation
filtered_per_class=[] # filtered training data of each class, concatenated once after the loop

# filtering training data for each class
for i in lc_classes:
    print('Processing class ',i)
    # subset original training points for this class
    td_single_class=training_data2017[training_data2017[class_name]==i].reset_index(drop=True)
    print('Number of training data collected: ',len(td_single_class))
    if len(td_single_class)<2:
        # LOF needs at least one neighbour besides the sample itself, so a class with
        # fewer than two samples cannot be scored; keep its samples unfiltered
        print('Too few samples for LOF, keeping all samples of class ',i)
        y_pred=np.ones(len(td_single_class),dtype=int)
    else:
        # feature columns only: drop the class label (first) and the geometry (last),
        # then standardise them before computing neighbour distances
        model_input=scaler.fit_transform(td_single_class.iloc[:,1:-1].to_numpy(dtype=float))
        clf = LocalOutlierFactor(contamination=lof_contamination)
        y_pred=clf.fit_predict(model_input) # 1 = inlier, -1 = outlier
    # append prediction to features dataframe and keep the inliers only
    td_single_class_filtered=td_single_class.copy()
    td_single_class_filtered['LOF_pred']=y_pred
    td_single_class_filtered=td_single_class_filtered[td_single_class_filtered['LOF_pred']==1]
    print('Number of training data after filtering: ',len(td_single_class_filtered))
    # save filtered results for single class
    td_single_class_filtered.to_file(output_prefix+'_class_'+str(i)+'.geojson', driver="GeoJSON")
    filtered_per_class.append(td_single_class_filtered)

# combine the filtered training points of all classes in a single concatenation
td2021_filtered=pd.concat(filtered_per_class)

# remove NaNs which were somehow exported as zeros during extraction of training data
# NOTE(review): these rows still take part in the scaling and LOF fit above; consider
# dropping them before filtering - left unchanged here to keep the outputs identical.
td2021_filtered=td2021_filtered.loc[(td2021_filtered!=0).all(axis=1)].reset_index(drop=True)
print('training data after removing nans\n',td2021_filtered)

# save training data for all classes
td2021_filtered.to_file(output_prefix+'.geojson', driver="GeoJSON")

# export the filtered training data as txt file
output_file = "Results/train_poly_848_20171124_signatures_2021_force_15pct_filtered.txt"
td2021_filtered.to_csv(output_file, header=True, index=None, sep=' ')