This notebook extracts training features from the Open Data Cube (ODC) of Sentinel-2 multispectral images, using unfiltered) training data in a previous year. The features include three-month rolling geomedians of Sentinel-2 bands.

### load packages and get number of cpus

In [13]:
%matplotlib inline
import datacube
import warnings
import numpy as np
import geopandas as gpd
import pandas as pd
import xarray as xr
from odc.io.cgroups import get_cpu_quota
from odc.algo import xr_geomedian
from deafrica_tools.plotting import map_shapefile
from deafrica_tools.datahandling import load_ard
from deafrica_tools.bandindices import calculate_indices
from deafrica_tools.classification import collect_training_data
from datacube.utils.cog import write_cog
import os

### input files and parameters

In [14]:
training_data_path = 'Data/train_poly_848_20171124.shp'
input_map_path='Data/moz_lulc2016_28082019_final.tif'
class_attr = 'Class_I' # class label in integer format
output_crs='epsg:32736' # WGS84/UTM Zone 36S

### load input data

In [15]:
# Load input data shapefile
training_data= gpd.read_file(training_data_path) # read training points as geopandas dataframe
training_data=training_data[[class_attr,'geometry']] # select attributes
# Plot first five rows
training_data.head()

Unnamed: 0,Class_I,geometry
0,51,"POLYGON ((1617790.363 -1879014.857, 1617787.18..."
1,76,"POLYGON ((1632621.302 -1883379.171, 1632620.43..."
2,42,"POLYGON ((1626447.303 -1892631.641, 1626447.01..."
3,76,"POLYGON ((1619226.349 -1895421.518, 1619226.05..."
4,41,"POLYGON ((1624004.000 -1903092.189, 1624003.70..."


### class merging for training data and reference map

In [16]:
dict_map={'Tree crops': 11, 'Field crops': 12, 'Forest plantations': 21, 'Grassland': 31, 'Shrubland': 33, 'Aquatic or regularly flooded shrublands': 41,
 'Aquatic or regularly flooded herbaceous vegetation': 42, 'Water body': 44, 'Settlements': 51, 'Bare soils': 61,
 'Bare rocks': 62, 'Mangrove': 70, 'Mecrusse': 71, 'Closed broadleaved (Semi-) evergreen mountaineous forest': 72,
 'Gallery forest': 73, 'Broadleaved (Semi-) deciduous closed forest': 74, 'Mopane': 75, 'Open broadleaved (Semi-) evergreen mountaineous forest': 76,
 'Coastal open woody vegetation': 77, 'Mopane open': 78, 'Miombo open': 79}

training_data.loc[training_data[class_attr]==dict_map['Shrubland'],class_attr]=dict_map['Grassland']
training_data.loc[training_data[class_attr]==dict_map['Aquatic or regularly flooded herbaceous vegetation'],
                  class_attr]=dict_map['Aquatic or regularly flooded shrublands']
training_data.loc[training_data[class_attr]==dict_map['Bare rocks'],class_attr]=dict_map['Bare soils']
training_data.loc[(training_data[class_attr]==dict_map['Gallery forest'])|
                  (training_data[class_attr]==dict_map['Open broadleaved (Semi-) evergreen mountaineous forest'])
                  |(training_data[class_attr]==dict_map['Coastal open woody vegetation']),class_attr]=dict_map['Closed broadleaved (Semi-) evergreen mountaineous forest']
training_data.loc[training_data[class_attr]==dict_map['Mopane open'],class_attr]=dict_map['Mopane']
training_data.loc[training_data[class_attr]==dict_map['Miombo open'],class_attr]=dict_map['Broadleaved (Semi-) deciduous closed forest']

# optionally export class merged training data
out_folder='Results'
if not os.path.exists(out_folder):
    os.makedirs(out_folder)
training_data.to_file('Results/train_poly_848_20171124_class_merged.geojson', driver="GeoJSON")

In [18]:
classification_map=xr.open_dataset(input_map_path,engine="rasterio").astype(np.uint8)
classification_map=classification_map.to_array().squeeze()
classification_map=classification_map.where(classification_map!=dict_map['Shrubland'],dict_map['Grassland'])
classification_map=classification_map.where(classification_map!=dict_map['Aquatic or regularly flooded herbaceous vegetation'],
                                            dict_map['Aquatic or regularly flooded shrublands'])
classification_map=classification_map.where(classification_map!=dict_map['Bare rocks'],
                                            dict_map['Bare soils'])
classification_map=classification_map.where(classification_map!=dict_map['Gallery forest'],
                                            dict_map['Open broadleaved (Semi-) evergreen mountaineous forest'])
classification_map=classification_map.where(classification_map!=dict_map['Coastal open woody vegetation'],
                                            dict_map['Closed broadleaved (Semi-) evergreen mountaineous forest'])
classification_map=classification_map.where(classification_map!=dict_map['Mopane open'],
                                            dict_map['Mopane'])
classification_map=classification_map.where(classification_map!=dict_map['Miombo open'],
                                            dict_map['Broadleaved (Semi-) deciduous closed forest'])

In [19]:
# export class merged map for later use
write_cog(classification_map, 'Results/moz_lulc2016_28082019_final_remapped.tif', overwrite=True)

PosixPath('Results/moz_lulc2016_28082019_final_remapped.tif')

In [20]:
np.unique(classification_map)

array([ 0, 11, 12, 21, 24, 26, 31, 41, 44, 51, 61, 70, 71, 72, 74, 75, 76],
      dtype=uint8)

### define query and feature layer function

In [None]:
#set up our inputs to collect_training_data
zonal_stats = None
# Set up the inputs for the ODC query
time = ('2021')
# using spectral bands with 10~20 m spatial resolution
measurements = ['blue','green','red','red_edge_1','red_edge_2', 'red_edge_3','nir_1','nir_2','swir_1','swir_2']
resolution = (-10,10)
query = {
    'time': time,
    'measurements': measurements,
    'output_crs': output_crs,
    'resolution': resolution
}
# define a function to feature layers
def feature_layers(query): 
    # connect to the datacube so we can access DE Africa data
    dc = datacube.Datacube(app='rolling geomedians')
    
    # load Sentinel-2 analysis ready data
    ds = load_ard(dc=dc,
                  products=['s2_l2a'],
                  group_by='solar_day',
                  verbose=False,
                  **query)
    
    # calculate NDVI
    ds = calculate_indices(ds,
                           index=['NDVI'],
                           drop=False,
                           satellite_mission='s2')

    # calculate bi-monthly geomedian
    ds=ds.resample(time='2MS').map(xr_geomedian)
    
    # stack multi-temporal measurements and rename them
    n_time=ds.dims['time']
    list_measurements=list(ds.keys())
    list_stack_measures=[]
    for j in range(len(list_measurements)):
        for k in range(n_time):
            variable_name=list_measurements[j]+'_'+str(k)
            measure_single=ds[list_measurements[j]].isel(time=k).rename(variable_name)
            list_stack_measures.append(measure_single)
    ds_stacked=xr.merge(list_stack_measures,compat='override')
    return ds_stacked

### extract features

In [None]:
# detect the number of CPUs
ncpus=round(get_cpu_quota())
print('ncpus = '+str(ncpus))

# collect training data
column_names, model_input = collect_training_data(
    gdf=training_data, # replace with gdf=training_data if you are extracting all the training data
    dc_query=query,
    ncpus=ncpus,
    field=class_attr,
    zonal_stats=zonal_stats,
    feature_func=feature_layers,
    return_coords=True)

### export training features

In [None]:
# convert the data to geopandas dataframe
pd_training_features=pd.DataFrame(data=model_input,columns=column_names)
#set the name and location of the output file
output_file = "Results/Mozambique_training_features.txt"
#Export files to disk
pd_training_features.to_csv(output_file, header=True, index=None, sep=' ')