In [1]:
%matplotlib inline
import os
import datacube
import warnings
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
from odc.io.cgroups import get_cpu_quota
from odc.algo import xr_geomedian
from deafrica_tools.datahandling import load_ard
from deafrica_tools.bandindices import calculate_indices
from deafrica_tools.classification import collect_training_data

# Number of CPUs available to this kernel, used to size the parallel extraction below.
# get_cpu_quota() returns None when no cgroup CPU limit applies (e.g. outside a
# container), so fall back to the machine's CPU count; clamp to at least 1 so that a
# fractional quota below 0.5 does not round down to zero workers.
_cpu_quota = get_cpu_quota()
if _cpu_quota is None:
    _cpu_quota = os.cpu_count() or 1
ncpus = max(1, round(_cpu_quota))
print('ncpus = '+str(ncpus))

# --- Input configuration -------------------------------------------------
# GeoJSON file holding the labelled training points.
traning_points_path = 'Data/trainning_samples_FNDS_II_SOM_2016.geojson'
# Attribute column that stores the class label (integer format).
class_name = 'LC_Class_I'
# Target projection: WGS84 / UTM Zone 36S.
crs = 'epsg:32736'
# Forwarded unchanged to collect_training_data as its zonal_stats argument.
zonal_stats = None

# Read the training points into a GeoDataFrame, reproject them to `crs`,
# and keep only the class label and geometry columns.
training_points_2016 = (
    gpd.read_file(traning_points_path)
    .to_crs(crs)[[class_name, 'geometry']]
)
print('Training points in 2016:\n', training_points_2016)

# Bands requested from the datacube for every training point.
measurements = [
    'blue', 'green', 'red',
    'red_edge_1', 'red_edge_2', 'red_edge_3',
    'nir_1', 'nir_2',
    'swir_1', 'swir_2',
]

# Datacube query shared by all training points; it is handed to
# collect_training_data as `dc_query` and reaches feature_layers as `query`.
# NOTE(review): the training-point file is labelled 2016 while imagery is
# queried for 2017 -- confirm this offset is intended.
query = dict(
    time=('2017-01', '2017-12'),
    measurements=measurements,
    output_crs=crs,
    resolution=(-10, 10),
)
def feature_layers(query):
    """Build the multi-temporal feature stack for one datacube query.

    Loads the 's2_l2a' product through ``load_ard``, adds NDVI, fills masked
    observations along time, reduces the series to two-monthly geomedians and
    finally flattens the time dimension into separate variables.

    Parameters
    ----------
    query : dict
        Datacube query; it is unpacked as keyword arguments into ``load_ard``
        (time range, measurements, output CRS, resolution, spatial extent).

    Returns
    -------
    xarray.Dataset
        One variable per (measurement, time step) pair, named
        ``"<measurement>_<k>"`` where ``k`` is the index of the two-month
        interval. Variables are ordered measurement-major, i.e. all time
        steps of the first measurement come first.
    """
    # connect to the datacube
    dc = datacube.Datacube(app='feature_layers')
    ds = load_ard(dc=dc,
                  products=['s2_l2a'],
                  group_by='solar_day',
                  verbose=False,
                  **query)
    # keep the original bands (drop=False) and add NDVI alongside them
    ds = calculate_indices(ds,
                           index=['NDVI'],
                           drop=False,
                           satellite_mission='s2')
    # Fill nodata by linear interpolation along time. use_coordinate=False treats
    # the observations as equally spaced (acquisition dates are ignored), and
    # fill_value='extrapolate' also fills gaps at the start/end of the series.
    ds = ds.interpolate_na(dim='time', method='linear',
                           use_coordinate=False, fill_value='extrapolate')
    # calculate geomedians within each two-month interval
    ds = ds.resample(time='2MS').map(xr_geomedian)

    # Stack multi-temporal measurements: one renamed 2-D layer per
    # (measurement, time step). `sizes` is used instead of the deprecated
    # mapping behaviour of `Dataset.dims`.
    n_time = ds.sizes['time']
    layers = [
        ds[name].isel(time=k).rename(name + '_' + str(k))
        for name in ds.data_vars
        for k in range(n_time)
    ]
    # Merge once rather than once per layer. compat='override' keeps the scalar
    # `time` coordinate of the first layer instead of raising on the differing
    # time stamps of the individual layers.
    return xr.merge(layers, compat='override')

# Extract the feature_layers output at every training point in parallel.
# The worker count is derived from the detected CPU count instead of being
# hard-coded, leaving one CPU free for the notebook kernel (this gives the
# same 30 workers on a 31-CPU machine) and never dropping below 1.
column_names, model_input = collect_training_data(
    gdf=training_points_2016,
    dc_query=query,
    ncpus=max(1, ncpus - 1),
    field=class_name,
    zonal_stats=zonal_stats,
    feature_func=feature_layers,
    return_coords=True,
)
print('Number of training data after removing Nans and Infs: ',model_input.shape[0])
training_data_2016 = pd.DataFrame(data=model_input, columns=column_names)

# Export the filtered training data as a space-delimited text file.
# Create the output folder first so a missing directory cannot discard the
# result of the long-running extraction above.
output_file = "Results/Mozambique_landcover_td2017.txt"
os.makedirs(os.path.dirname(output_file), exist_ok=True)
training_data_2016.to_csv(output_file, header=True, index=False, sep=' ')

ncpus = 31
Training points in 2016:
       LC_Class_I                         geometry
0              3   POINT (225040.711 8330067.212)
1              3   POINT (232906.211 8289456.399)
2              3   POINT (264976.866 8243769.646)
3              3   POINT (265932.457 8310425.297)
4              5   POINT (259119.061 8241013.601)
...          ...                              ...
2494           5  POINT (1293515.402 8572615.673)
2495           5  POINT (1298558.374 8598529.543)
2496           3  POINT (1285206.553 8469601.675)
2497           5  POINT (1316834.224 8283879.633)
2498           5  POINT (1284216.688 8260744.431)

[2499 rows x 2 columns]
Collecting training data in parallel mode


  0%|          | 0/2499 [00:00<?, ?it/s]

CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)


Percentage of possible fails after run 1 = 0.0 %
Removed 0 rows wth NaNs &/or Infs
Output shape:  (2499, 69)
Number of training data after removing Nans and Infs:  2499
