This notebook extracts training features from Sentinel-2 multispectral images in the Open Data Cube (ODC), using (unfiltered) training data from a previous year. The features include bi-monthly geomedians of the Sentinel-2 bands and semi-annual Median Absolute Deviations (MADs).

### load packages and get number of cpus

In [None]:
%matplotlib inline
import os
import datacube
import warnings
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
from odc.io.cgroups import get_cpu_quota
from odc.algo import xr_geomedian
from deafrica_tools.datahandling import load_ard
from deafrica_tools.bandindices import calculate_indices
from deafrica_tools.classification import collect_training_data

# Number of worker processes available to this session.
# get_cpu_quota() yields no value when the container has no CPU quota
# configured, so fall back to the machine's CPU count in that case and never
# go below one worker.
cpu_quota = get_cpu_quota()
if cpu_quota is None:
    ncpus = os.cpu_count() or 1
else:
    ncpus = max(1, round(cpu_quota))
print('ncpus = '+str(ncpus))

### input files and parameters

In [None]:
# file paths and attributes
traning_points_path = 'Data/train_poly_848_20171124.shp' # please replace with your own training data
class_name = 'Class_I' # class label in integer format
crs='epsg:32736' # WGS84/UTM Zone 36S
zonal_stats = None

# read the training data as a geopandas dataframe and reproject it to `crs`
training_points_2017 = gpd.read_file(traning_points_path).to_crs(crs)
# keep only the class label and geometry; take an explicit copy so that the
# column assignment below modifies an independent dataframe and does not
# trigger pandas' SettingWithCopyWarning
training_points_2017 = training_points_2017[[class_name, 'geometry']].copy()
# make sure the class labels are integers
training_points_2017[class_name] = training_points_2017[class_name].astype(int)
print('Training points in 2017:\n', training_points_2017)

### define query and feature layer function

In [None]:
# datacube query shared by all loads: time range, output projection and
# pixel size (y, x) in units of the output CRS
query = dict(
    time=('2021-01', '2021-12'),
    output_crs=crs,
    resolution=(-10, 10),
)
# define the functions that build the feature layers
def _stack_time_steps(ds):
    """Split every variable of ``ds`` along ``time`` into separate layers.

    Returns a list of DataArrays, one per (variable, time step), ordered by
    variable first and time step second, each named ``<variable>_<k>`` where
    ``k`` is the zero-based position along the ``time`` dimension.
    """
    n_time = ds.sizes['time']
    return [
        ds[name].isel(time=k).rename(f'{name}_{k}')
        for name in ds.keys()
        for k in range(n_time)
    ]


def feature_layers(query):
    """Build the per-pixel feature stack for a datacube query.

    The stack contains, in this order:

    * geomedians of the Sentinel-2 bands and NDVI for every two-month
      interval in the query period, and
    * the MAD layers (``smad``, ``emad``, ``bcmad``) of every time step of
      the ``gm_s2_semiannual`` product returned for the same query.

    Parameters
    ----------
    query : dict
        Keyword arguments forwarded unchanged to ``load_ard`` and
        ``dc.load`` (e.g. time range, output CRS, resolution, extent).

    Returns
    -------
    xarray.Dataset
        One variable per (measurement, time step), named
        ``<measurement>_<k>``; the ``time`` dimension is removed.
    """
    measurements = ['blue','green','red','red_edge_1','red_edge_2', 'red_edge_3','nir_1','swir_1','swir_2']
    measurements_MAD = ['smad','emad','bcmad']

    # connect to the datacube
    dc = datacube.Datacube(app='feature_layers')

    # load the Sentinel-2 observations
    # (optional: pass mask_filters=[("opening", 2)] for a morphological
    # opening by 2 pixels to remove small masked regions)
    ds = load_ard(dc=dc,
                  products=['s2_l2a'],
                  measurements=measurements,
                  group_by='solar_day',
                  verbose=False,
                  **query)
    ds = calculate_indices(ds,
                           index=['NDVI'],
                           drop=False,
                           satellite_mission='s2')

    # calculate geomedians within each two-month interval
    ds = ds.resample(time='2MS').map(xr_geomedian)
    # turn every (measurement, time step) pair into its own layer
    layers = _stack_time_steps(ds)

    # load semiannual MADs and stack their time steps the same way
    ds_mads = dc.load(product='gm_s2_semiannual',
                      measurements=measurements_MAD,
                      **query)
    layers.extend(_stack_time_steps(ds_mads))

    # Merge all layers in a single call instead of once per layer.
    # compat='override' is required because every layer carries a different
    # scalar `time` coordinate; the one of the first layer is kept.
    return xr.merge(layers, compat='override')

### extract features

In [None]:
# extract training features
# use the number of CPUs detected at the top of the notebook rather than a
# hard-coded worker count, so the cell also runs on smaller machines
column_names, model_input = collect_training_data(gdf=training_points_2017,
                                                  dc_query=query,
                                                  ncpus=ncpus,
                                                  field=class_name,
                                                  zonal_stats=zonal_stats,
                                                  feature_func=feature_layers,
                                                  return_coords=True)
print('Number of training data after removing Nans and Infs: ',model_input.shape[0])
# first convert the training data to pandas
training_data_2017=pd.DataFrame(data=model_input,columns=column_names)
# then to a geopandas dataframe; the point geometry is built from the last
# two columns of model_input (x, y), which are present because
# return_coords=True was requested above
training_data_2017=gpd.GeoDataFrame(training_data_2017,
                                geometry=gpd.points_from_xy(model_input[:,-2], model_input[:,-1],
                                                            crs=crs))

### export training features

In [None]:
output_file='Results/train_poly_848_20171124_signatures_2021.geojson'
# create the output folder if needed; writing fails when it does not exist
os.makedirs(os.path.dirname(output_file), exist_ok=True)
training_data_2017.to_file(output_file, driver="GeoJSON")
# also save as a space-separated txt file (header row, no index column)
output_file ='Results/train_poly_848_20171124_signatures_2021.txt'
training_data_2017.to_csv(output_file, header=True, index=False, sep=' ')