This notebook uses the scikit-learn Local Outlier Factor (LOF) method to filter training data, as an alternative to k-means filtering.

### load packages and get number of cpus

In [None]:
%matplotlib inline
import datacube
import warnings
import numpy as np
import geopandas as gpd
import pandas as pd
import xarray as xr
from odc.io.cgroups import get_cpu_quota
from datacube.utils.cog import write_cog
from sklearn.preprocessing import StandardScaler
from rasterio.enums import Resampling
from sklearn.neighbors import LocalOutlierFactor

import os

# Number of CPUs available to this session.
# get_cpu_quota() may return None when no CPU quota is configured; in that
# case fall back to the machine's CPU count. The result is floored at 1 so a
# fractional quota below 0.5 is not rounded down to zero workers.
cpu_quota = get_cpu_quota()
ncpus = max(1, round(cpu_quota)) if cpu_quota else (os.cpu_count() or 1)
print('ncpus = '+str(ncpus))

### set parameters

In [None]:
# input file locations
training_signature_path = 'Results/Mozambique_training_features.txt'
rf2017_path = 'Results/moz_lulc2016_28082019_final_remapped.tif'
# tile shapefile (used to stratify the random sampling in the k-means workflow)
tiles_shp = 'Data/Mozambique_tiles_biggest1.shp'

crs = 'epsg:32736'  # WGS84/UTM Zone 36S
measurements = [
    'blue', 'green', 'red',
    'red_edge_1', 'red_edge_2', 'red_edge_3',
    'nir_1', 'nir_2',
    'swir_1', 'swir_2',
    'NDVI',
]
class_name = 'LC_Class_I'  # class label in integer format

# Columns to keep from the training table: the class label first, followed by
# '<measurement>_<k>' for k = 0..5 for every measurement, in measurement order.
column_names = [class_name] + [
    f'{band}_{step}' for band in measurements for step in range(6)
]

### load training features

In [None]:
# Read the space-separated training feature table, keep only the class label
# and the selected feature columns, and cast the class label to integer.
# astype({...}) returns a new DataFrame, so no column is assigned in place on
# the column-subset result (that pattern can raise SettingWithCopyWarning).
training_data2017 = (
    pd.read_csv(training_signature_path, sep=' ')[column_names]  # select attributes
    .astype({class_name: int})
)
print('land cover survey points 2017:\n',training_data2017)
lc_classes = training_data2017[class_name].unique()  # get class labels

### filter out a proportion of samples using LOF

In [None]:
# Filter the training data class by class with Local Outlier Factor (LOF):
# within each class the features are standardised, LOF labels every sample as
# inlier (1) or outlier (-1), and only the inliers are kept.
scaler = StandardScaler()  # standard scaler for input data standardisation
proportion_filter = 0.15  # 15% samples were assumed outliers
filtered_per_class = []  # inliers of each class; concatenated once after the loop
# filtering training data for each class
for i in lc_classes:
    print('Processing class ',i)
    # subset original training points for this class
    td_single_class = training_data2017[training_data2017[class_name]==i].reset_index(drop=True)
    print('Number of training data collected: ',len(td_single_class))
    # Standardise the features before outlier detection. Features are selected
    # by dropping the label column by name rather than by position, so every
    # feature column (including the last one) is passed to LOF.
    model_input = scaler.fit_transform(td_single_class.drop(columns=[class_name]).to_numpy())
    clf = LocalOutlierFactor(contamination=proportion_filter)
    y_pred = clf.fit_predict(model_input)
    # append prediction to features dataframe and keep the inliers only
    td_single_class_filtered = td_single_class.copy()
    td_single_class_filtered['LOF_pred'] = y_pred
    td_single_class_filtered = td_single_class_filtered[td_single_class_filtered['LOF_pred']==1]
    print('Number of training data after filtering: ',len(td_single_class_filtered))
    # collect the filtered training points of this class
    filtered_per_class.append(td_single_class_filtered)

# final filtered training data: all classes stacked in processing order
td2021_filtered = pd.concat(filtered_per_class)

# remove NaNs which were somehow exported as zeros during extraction of training data
# NOTE(review): the zero test covers every column, including the class label
# and 'LOF_pred'; a class coded as 0 would be removed entirely - confirm that
# no class uses label 0.
# NOTE(review): rows containing zeros still take part in the LOF fit above;
# consider dropping them before filtering - confirm the intended order.
td2021_filtered = td2021_filtered.loc[(td2021_filtered!=0).all(axis=1)].reset_index(drop=True)
print('training data after removing nans\n',td2021_filtered)

### export filtered training features

In [None]:
# Show the filtered table, then write it as a space-separated text file with
# a header row and without the row index.
print('filtered training data for 2021:\n', td2021_filtered)
output_file = "Results/Mozambique_training_features_filtered_LOF.txt"
td2021_filtered.to_csv(output_file, sep=' ', index=False)