This notebook extracts training features (feature layers) from Sentinel-2 multispectral imagery in the Open Data Cube (ODC) at the locations of the (unfiltered) 2021 training points. The extracted training signatures are then used to train a classifier and produce a reference/baseline land cover map, which in turn is used to filter the training data.

In [1]:
%matplotlib inline
import os
import datacube
import warnings
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
from odc.io.cgroups import get_cpu_quota
from odc.algo import geomedian_with_mads, xr_geomedian
from deafrica_tools.datahandling import load_ard
from deafrica_tools.bandindices import calculate_indices
from deafrica_tools.classification import collect_training_data

# get number of cpus
# get_cpu_quota() returns None when no cgroup CPU quota is set (e.g. outside a
# container), which would make round() fail; fall back to the machine's CPU
# count in that case and never report fewer than one CPU.
cpu_quota = get_cpu_quota()
if cpu_quota is not None:
    ncpus = max(1, round(cpu_quota))
else:
    ncpus = os.cpu_count() or 1
print('ncpus = '+str(ncpus))

# file paths and attributes
# NOTE: the variable name 'traning_points_path' is kept as-is (including its
# spelling) because other cells may reference it.
traning_points_path = 'Data/landcover_td2021.shp' # training data 2021
shrubland_points_path = 'Data/signatures_shrublandSurvey.shp' # shrubland survey points
lesotho_shp='Data/Lesotho_boundaries.shp' # Lesotho boundary shapefile
class_name = 'LC_Class_I' # class label in integer format
output_crs='epsg:32735' # output crs: WGS84/UTM Zone 35S
crs='epsg:4326' # input crs: WGS84

# Load the 2021 training points, reproject them and keep only label + geometry
training_data2021 = gpd.read_file(traning_points_path).to_crs(crs)
training_data2021 = training_data2021[[class_name, 'geometry']]

# Load the shrubland survey points; .copy() so the column assignment below
# works on an independent frame rather than a view of the file contents
df_shrubs = gpd.read_file(shrubland_points_path).to_crs(crs)
df_shrubs = df_shrubs[['land_cover', 'geometry']].copy()

# Translate the survey's text labels into the integer class codes used by the training data
dict_map={'Shrubland':9,'Trees':4,'Grassland':10,'Irrigated_Agriculture':14}
df_shrubs[class_name] = df_shrubs['land_cover'].map(dict_map)

# .map() yields NaN for any label missing from dict_map; report it instead of
# letting unlabelled points slip silently into the merged training set
n_unmapped = int(df_shrubs[class_name].isna().sum())
if n_unmapped > 0:
    print('WARNING: '+str(n_unmapped)+' shrubland survey points have a land_cover value not present in dict_map')
df_shrubs = df_shrubs[[class_name, 'geometry']]

# Merge both point sets into a single frame with a fresh 0..n-1 index
training_data2021 = pd.concat([training_data2021, df_shrubs]).reset_index(drop=True)
print('merged reference land cover survey points:\n',training_data2021)

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


ncpus = 62
merged reference land cover survey points:
       LC_Class_I                    geometry
0              1  POINT (28.13225 -29.91072)
1              1  POINT (28.58356 -30.07347)
2              1  POINT (28.64013 -29.58167)
3              1  POINT (28.35737 -30.04837)
4              1  POINT (28.62472 -29.58144)
...          ...                         ...
3576           9  POINT (28.49727 -30.05149)
3577           9  POINT (28.49652 -30.04917)
3578           9  POINT (28.49811 -30.05004)
3579           9  POINT (28.49828 -30.05074)
3580           9  POINT (27.85065 -29.42685)

[3581 rows x 2 columns]


In [2]:
# ODC query shared by every training point
zonal_stats = None  # handed to collect_training_data below; no zonal statistic is requested
measurements = [
    'blue', 'green', 'red',
    'red_edge_1', 'red_edge_2', 'red_edge_3',
    'nir_1', 'nir_2',
    'swir_1', 'swir_2',
]
query = dict(
    time=('2021-01', '2021-12'),   # the whole of 2021
    measurements=measurements,
    output_crs=output_crs,
    resolution=(-10, 10),          # pixel size in output_crs units
)

# define a function to generate feature layers
def feature_layers(query):
    """Build the stack of feature layers for one ODC query.

    Loads Sentinel-2 (``s2_l2a``) data with ``load_ard``, appends NDVI scaled
    by 10000, composites every two-month interval with a geomedian, fills
    gaps along time by linear interpolation, and finally flattens the time
    dimension so that each (measurement, period) pair becomes its own variable.

    Parameters
    ----------
    query : dict
        Keyword arguments forwarded unchanged to ``load_ard``.

    Returns
    -------
    xarray.Dataset
        One variable per measurement and two-month period, named
        ``<measurement>_<k>`` where ``k`` is the zero-based period index.
        Values are cast to ``int16``.
    """
    # connect to the datacube
    dc = datacube.Datacube(app='feature_layers')
    # query bands
    ds = load_ard(dc=dc,
                  products=['s2_l2a'],
                  group_by='solar_day',
                  verbose=False,
                  **query)
    # calculate NDVI, keeping the original bands alongside it
    ds = calculate_indices(ds,
                           index=['NDVI'],
                           drop=False,
                           satellite_mission ='s2')
    # scale NDVI by 10000 (presumably so it survives the int16 cast below)
    ds['NDVI'] = ds['NDVI']*10000
    # calculate geomedians within each two-month interval
    ds = ds.resample(time='2MS').map(xr_geomedian)
    # fill nodata along time by linear interpolation over the time-step index
    # (use_coordinate=False), extrapolating at the ends, then cast to int16
    ds = ds.interpolate_na(dim='time',
                           method='linear',
                           use_coordinate=False,
                           fill_value='extrapolate').astype(np.int16)
    # stack multi-temporal measurements and rename them;
    # Dataset.sizes is the supported name -> length mapping
    # (the mapping behaviour of Dataset.dims is deprecated in xarray)
    n_time = ds.sizes['time']
    list_stack_measures = [
        ds[measurement].isel(time=k).rename(measurement+'_'+str(k))
        for measurement in list(ds.keys())
        for k in range(n_time)
    ]
    # compat='override' skips conflict checks, needed because each layer still
    # carries its own scalar time coordinate
    ds_stacked = xr.merge(list_stack_measures, compat='override')
    return ds_stacked

# extract features of training data
# Use at most 35 workers, but never more than the CPUs detected for this
# session (and never fewer than one), so the cell also runs on smaller machines.
column_names, model_input = collect_training_data(
    gdf=training_data2021,
    dc_query=query,
    ncpus=max(1, min(ncpus, 35)),
    field=class_name,
    zonal_stats=zonal_stats,
    feature_func=feature_layers,
    return_coords=True)
print(column_names)
print(np.array_str(model_input, precision=2, suppress_small=True))

# make sure the output folder exists before writing anything into it
os.makedirs('Results', exist_ok=True)

# export the extracted training features as a space-delimited txt file
# (the variable name is kept for compatibility; no filtering has been applied yet)
training_data2021_filtered = pd.DataFrame(data=model_input, columns=column_names)
output_file = "Results/landcover_training_data_2021_GEE.txt"
training_data2021_filtered.to_csv(output_file, header=True, index=False, sep=' ')

# export the same table as a GeoJSON, rebuilding point geometries from the
# x/y coordinate columns returned by collect_training_data (return_coords=True)
training_data2021_filtered = gpd.GeoDataFrame(
    training_data2021_filtered,
    geometry=gpd.points_from_xy(training_data2021_filtered.x_coord,
                                training_data2021_filtered.y_coord,
                                crs=output_crs))
output_file = "Results/landcover_training_data_2021_GEE.geojson"
# name the driver explicitly instead of relying on file-extension inference
training_data2021_filtered.to_file(output_file, driver='GeoJSON')

Collecting training data in parallel mode


  0%|          | 0/3581 [00:00<?, ?it/s]

  data = nangeomedian_pcm(xx_data, **kw)


Percentage of possible fails after run 1 = 0.0 %
Removed 0 rows wth NaNs &/or Infs
Output shape:  (3581, 69)
['LC_Class_I', 'blue_0', 'blue_1', 'blue_2', 'blue_3', 'blue_4', 'blue_5', 'green_0', 'green_1', 'green_2', 'green_3', 'green_4', 'green_5', 'red_0', 'red_1', 'red_2', 'red_3', 'red_4', 'red_5', 'red_edge_1_0', 'red_edge_1_1', 'red_edge_1_2', 'red_edge_1_3', 'red_edge_1_4', 'red_edge_1_5', 'red_edge_2_0', 'red_edge_2_1', 'red_edge_2_2', 'red_edge_2_3', 'red_edge_2_4', 'red_edge_2_5', 'red_edge_3_0', 'red_edge_3_1', 'red_edge_3_2', 'red_edge_3_3', 'red_edge_3_4', 'red_edge_3_5', 'nir_1_0', 'nir_1_1', 'nir_1_2', 'nir_1_3', 'nir_1_4', 'nir_1_5', 'nir_2_0', 'nir_2_1', 'nir_2_2', 'nir_2_3', 'nir_2_4', 'nir_2_5', 'swir_1_0', 'swir_1_1', 'swir_1_2', 'swir_1_3', 'swir_1_4', 'swir_1_5', 'swir_2_0', 'swir_2_1', 'swir_2_2', 'swir_2_3', 'swir_2_4', 'swir_2_5', 'NDVI_0', 'NDVI_1', 'NDVI_2', 'NDVI_3', 'NDVI_4', 'NDVI_5', 'x_coord', 'y_coord']
[[      1.     530.     526. ...    4405.  655055.