This notebook extracts training features from the Open Data Cube (ODC) of Sentinel-2 multispectral images, using (unfiltered) training data from a previous year. The features include three-month rolling geomedians of Sentinel-2 bands.

### load packages and get number of cpus

In [None]:
%matplotlib inline
import datacube
import warnings
import numpy as np
import geopandas as gpd
import pandas as pd
import xarray as xr
from odc.io.cgroups import get_cpu_quota
from odc.algo import xr_geomedian
from deafrica_tools.plotting import map_shapefile
from deafrica_tools.datahandling import load_ard
from deafrica_tools.bandindices import calculate_indices
from deafrica_tools.classification import collect_training_data
from datacube.utils.cog import write_cog
import os

### input files and parameters

In [None]:
# Shapefile holding the training data
training_data_path = 'Data/train_poly_848_20171124.shp'
# Reference land-cover map.
# Note: this data provided by FNDS is too large to be uploaded to github
input_map_path = 'Data/moz_lulc2016_28082019_final.tif'
# Name of the attribute that stores the class label in integer format
class_attr = 'Class_I'
# Output coordinate reference system: WGS84/UTM Zone 36S
output_crs = 'epsg:32736'

### load input data

In [None]:
# Read the training shapefile into a geopandas dataframe and keep only
# the class label and geometry columns
columns_to_keep = [class_attr, 'geometry']
training_data = gpd.read_file(training_data_path)[columns_to_keep]
# Display the first five rows
training_data.head()

### class merging for training data and reference map

In [None]:
# Class name -> integer class code used in the training data and reference map
dict_map = {
    'Tree crops': 11,
    'Field crops': 12,
    'Forest plantations': 21,
    'Grassland': 31,
    'Shrubland': 33,
    'Aquatic or regularly flooded shrublands': 41,
    'Aquatic or regularly flooded herbaceous vegetation': 42,
    'Water body': 44,
    'Settlements': 51,
    'Bare soils': 61,
    'Bare rocks': 62,
    'Mangrove': 70,
    'Mecrusse': 71,
    'Closed broadleaved (Semi-) evergreen mountaineous forest': 72,
    'Gallery forest': 73,
    'Broadleaved (Semi-) deciduous closed forest': 74,
    'Mopane': 75,
    'Open broadleaved (Semi-) evergreen mountaineous forest': 76,
    'Coastal open woody vegetation': 77,
    'Mopane open': 78,
    'Miombo open': 79,
}

# Classes to merge, written as {absorbed class: class it is merged into}.
# No target class is itself absorbed, so the order of application is irrelevant.
training_class_merges = {
    'Shrubland': 'Grassland',
    'Aquatic or regularly flooded herbaceous vegetation': 'Aquatic or regularly flooded shrublands',
    'Bare rocks': 'Bare soils',
    'Gallery forest': 'Closed broadleaved (Semi-) evergreen mountaineous forest',
    'Open broadleaved (Semi-) evergreen mountaineous forest': 'Closed broadleaved (Semi-) evergreen mountaineous forest',
    'Coastal open woody vegetation': 'Closed broadleaved (Semi-) evergreen mountaineous forest',
    'Mopane open': 'Mopane',
    'Miombo open': 'Broadleaved (Semi-) deciduous closed forest',
}
for absorbed_class, merged_class in training_class_merges.items():
    is_absorbed = training_data[class_attr] == dict_map[absorbed_class]
    training_data.loc[is_absorbed, class_attr] = dict_map[merged_class]

# optionally export class merged training data
out_folder = 'Results'
# exist_ok avoids the check-then-create race and is a no-op if the folder exists
os.makedirs(out_folder, exist_ok=True)
training_data.to_file(
    os.path.join(out_folder, 'train_poly_848_20171124_class_merged.geojson'),
    driver="GeoJSON",
)

In [None]:
# Open the reference map as unsigned 8-bit values and collapse the dataset
# into a single DataArray without length-one dimensions
classification_map = (
    xr.open_dataset(input_map_path, engine="rasterio")
    .astype(np.uint8)
    .to_array()
    .squeeze()
)

# Apply the same class merging as for the training data:
# (absorbed class, class it is merged into). No target class is itself
# absorbed, so replacing one absorbed class at a time gives the same result
# as testing several absorbed classes in one condition.
map_class_merges = [
    ('Shrubland', 'Grassland'),
    ('Aquatic or regularly flooded herbaceous vegetation', 'Aquatic or regularly flooded shrublands'),
    ('Bare rocks', 'Bare soils'),
    ('Gallery forest', 'Closed broadleaved (Semi-) evergreen mountaineous forest'),
    ('Open broadleaved (Semi-) evergreen mountaineous forest', 'Closed broadleaved (Semi-) evergreen mountaineous forest'),
    ('Coastal open woody vegetation', 'Closed broadleaved (Semi-) evergreen mountaineous forest'),
    ('Mopane open', 'Mopane'),
    ('Miombo open', 'Broadleaved (Semi-) deciduous closed forest'),
]
for absorbed_class, merged_class in map_class_merges:
    # keep pixels that are not the absorbed class, overwrite the rest
    keep_pixel = classification_map != dict_map[absorbed_class]
    classification_map = classification_map.where(keep_pixel, dict_map[merged_class])

In [None]:
# Write the class-merged reference map to disk as a Cloud Optimized GeoTIFF
# for later use, replacing any existing file
remapped_map_path = 'Results/moz_lulc2016_28082019_final_remapped.tif'
write_cog(classification_map, remapped_map_path, overwrite=True)

### check if S2 rolling geomedian products are fully available

In [None]:
# Connect to the datacube and list the names of the indexed products
dc = datacube.Datacube(app='check rolling geomedians')
list_products = list(dc.list_products()['name'])

# Default to "not available" so the flag is always defined, including when
# the rolling geomedian product is not indexed at all
gm_s2_available = False
if 'gm_s2_rolling' in list_products:
    # Small test area and a three-month window, used only to probe the product
    check_query = {"x": (36.602, 36.603),
                   "y": (-16.665, -16.664),
                   "time": ("2021-01-01", "2021-03-01"),
                   "output_crs": output_crs,
                   "resolution": (-10, 10)}
    # Probe the same product that was tested for above
    ds_check = dc.load(product="gm_s2_rolling",
                       **check_query)
    # An empty load result has no 'time' dimension; count it as zero time steps
    gm_s2_available = ds_check.sizes.get('time', 0) > 1

### define query and feature layer function

In [None]:
# Sentinel-2 bands to load and the output pixel resolution
measurements = ['blue', 'green', 'red', 'red_edge_1', 'red_edge_2', 'red_edge_3',
                'nir_1', 'nir_2', 'swir_1', 'swir_2']
resolution = (-10, 10)


def _stack_time_steps(ds, time_positions, suffixes):
    """Turn selected time steps of every variable into separate variables.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset with a 'time' dimension.
    time_positions : iterable of int
        Integer positions along 'time' to extract from each variable.
    suffixes : iterable
        One label per position; each output variable is named
        '<variable>_<label>'.

    Returns
    -------
    xarray.Dataset
        One variable per (input variable, time position) pair, ordered by
        input variable first and time position second.
    """
    time_positions = list(time_positions)
    suffixes = list(suffixes)
    layers = [
        ds[name].isel(time=position).rename(f'{name}_{suffix}')
        for name in list(ds.keys())
        for position, suffix in zip(time_positions, suffixes)
    ]
    # every layer carries its own scalar 'time' coordinate; 'override' keeps
    # the first one instead of raising on the conflict
    return xr.merge(layers, compat='override')


if gm_s2_available:
    query = {
        'time': ('2021-01', '2021-12'),
        'measurements': measurements,
        'output_crs': output_crs,
        'resolution': resolution
    }

    def feature_layers(query):
        """Build feature layers from the 'gm_s2_rolling' product.

        Loads the product for ``query``, adds NDVI, and returns every second
        time step (positions 1, 3, 5, ...) of each variable as a separate
        variable named '<variable>_<position>'.

        Parameters
        ----------
        query : dict
            Keyword arguments forwarded to ``dc.load``.

        Returns
        -------
        xarray.Dataset
            The stacked single-date feature layers.
        """
        # connect to the datacube so we can access DE Africa data
        dc = datacube.Datacube(app='rolling geomedians')
        # Merge the defaults into the query instead of passing them as
        # separate keywords: 'measurements' is already a key of the query, and
        # supplying it twice raises "got multiple values for keyword argument"
        load_args = {'measurements': measurements, 'group_by': 'solar_day', **query}
        # load rolling geomedians
        ds = dc.load(product='gm_s2_rolling', **load_args)
        ds = calculate_indices(ds,
                               index=['NDVI'],
                               drop=False,
                               satellite_mission='s2')
        # With 12 time steps this selects positions 1, 3, ..., 11, i.e. six
        # layers per variable.
        # NOTE(review): confirm which calendar months these positions map to
        positions = range(1, ds.sizes['time'], 2)
        return _stack_time_steps(ds, positions, positions)
else:
    query = {
        'time': ('2020-12', '2021-12'),
        'measurements': measurements,
        'output_crs': output_crs,
        'resolution': resolution
    }

    def feature_layers(query):
        """Build feature layers from Sentinel-2 analysis ready data.

        Loads 's2_l2a' for ``query``, adds NDVI, computes one geomedian per
        three-month window and returns each window of each variable as a
        separate variable named '<variable>_<2*i+1>', where i is the window
        index, so the names match those of the rolling-product variant.

        Parameters
        ----------
        query : dict
            Keyword arguments forwarded to ``load_ard``.

        Returns
        -------
        xarray.Dataset
            The stacked single-date feature layers.
        """
        dc = datacube.Datacube(app='rolling geomedians')
        # load Sentinel-2 analysis ready data
        ds = load_ard(dc=dc,
                      products=['s2_l2a'],
                      group_by='solar_day',
                      verbose=False,
                      **query)
        ds = calculate_indices(ds,
                               index=['NDVI'],
                               drop=False,
                               satellite_mission='s2')
        # calculate rolling geomedians, each labelled with its window start
        time_slices = [('2020-12', '2021-02'), ('2021-02', '2021-04'), ('2021-04', '2021-06'),
                       ('2021-06', '2021-08'), ('2021-08', '2021-10'), ('2021-10', '2021-12')]
        ds_rolling = xr.concat(
            [xr_geomedian(ds.sel(time=slice(start, end))).assign_coords({'time': start})
             for start, end in time_slices],
            dim='time')
        # stack multi-temporal measurements and rename them
        n_time = ds_rolling.sizes['time']
        # suffixes 1, 3, 5, ... keep names consistent with the rolling case
        return _stack_time_steps(ds_rolling,
                                 range(n_time),
                                 [2 * k + 1 for k in range(n_time)])

### extract features

In [None]:
# Report the CPU quota of this session, rounded to a whole number
ncpus = round(get_cpu_quota())
print(f'ncpus = {ncpus}')

# Extract the features for every training geometry with feature_layers
training_args = dict(
    gdf=training_data,
    dc_query=query,
    ncpus=1,  # adapt here based on your sandbox instance
    field=class_attr,
    zonal_stats=None,
    feature_func=feature_layers,
    return_coords=True,
)
column_names, model_input = collect_training_data(**training_args)

### export training features

In [None]:
# Put the extracted features into a table with one named column per feature
pd_training_features = pd.DataFrame(data=model_input, columns=column_names)
# Build point geometries from the last two columns of model_input
# (presumably the x/y coordinates added by return_coords=True - TODO confirm)
x_values = model_input[:, -2]
y_values = model_input[:, -1]
feature_points = gpd.points_from_xy(x_values, y_values, crs=output_crs)
gpd_training_features = gpd.GeoDataFrame(pd_training_features, geometry=feature_points)
# Name and location of the output file
output_file = "Results/Mozambique_training_features.txt"
# Write a space-separated text file with a header row and no index column
gpd_training_features.to_csv(output_file, header=True, index=None, sep=' ')