# Testing

* **Products used:** ls8_sr, s2_l2a


### Background 

### Description


***

## Getting started


### Load packages
Load key Python packages and any supporting functions for the analysis.

In [16]:
import os

import datacube
import datacube.utils.rio
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xr
from datacube.utils.geometry import assign_crs
from deafrica_tools.bandindices import calculate_indices
from deafrica_tools.classification import collect_training_data
from deafrica_tools.datahandling import load_ard
from deafrica_tools.plotting import map_shapefile
from odc.io.cgroups import get_cpu_quota
from scipy import stats
from sklearn.metrics import mean_absolute_error

# Apply datacube's default rasterio configuration for this session
# (aws='auto', cloud_defaults=True). This will speed up loading data.
datacube.utils.rio.set_default_rio_config(aws='auto', cloud_defaults=True)

### Analysis parameters


In [2]:
# Training points: file to read and the column holding the class label
path = 'data/harmonization_training_sample.geojson'
field = 'Class'

# Inputs for the ODC query.
# `time` is a plain string: parentheses without a trailing comma do not make a tuple.
time = '2020'
measurements = ['red', 'nir']
resolution = (-30, 30)
output_crs = 'epsg:6933'

# Number of points to randomly draw from the input file
sample_size = 10000

In [3]:
# Number of worker processes to use for the parallel training-data extraction.
# get_cpu_quota() is documented as returning None when no CPU quota is set (or it
# cannot be read), in which case round() would raise a TypeError. Fall back to the
# machine's CPU count, and never request fewer than one worker.
cpu_quota = get_cpu_quota()
if cpu_quota is not None and cpu_quota > 0:
    ncpus = max(1, round(cpu_quota))
else:
    ncpus = os.cpu_count() or 1
print(f'ncpus = {ncpus}')

ncpus = 31


## View the selected location
The next cells load the training points, draw a random sample of them, and display the sample on an interactive map.
Each point is coloured by its `Class` value.
Zoom in and out to get a better understanding of where the samples are located.
Hovering over a point on the map will show its attributes.

In [4]:
# Load the input points (GeoJSON) and draw a reproducible random subsample.
# A fixed seed means a fresh "Restart & Run All" selects the same points.
sample_seed = 42
input_data = gpd.read_file(path)

# DataFrame.sample raises a ValueError when asked for more rows than exist
# (without replacement), so only subsample when the file is larger than sample_size.
if len(input_data) > sample_size:
    input_data = input_data.sample(sample_size, random_state=sample_seed)
input_data = input_data.reset_index(drop=True)

# Save out the sample.
# NOTE(review): if `path` points at this same file, this overwrites the input
# with the sample -- confirm that is intended.
input_data.to_file('data/harmonization_training_sample.geojson')
input_data.head()

Unnamed: 0,Class,geometry
0,0,POINT (21.03863 -6.91308)
1,1,POINT (-2.92536 14.26489)
2,1,POINT (23.57781 13.61702)
3,0,POINT (9.45360 -1.28441)
4,0,POINT (48.19668 -14.14469)


In [5]:
# Report how many sampled points carry each class label (1 -> crop, 0 -> noncrop)
n_crop = len(input_data[input_data.Class == 1])
n_noncrop = len(input_data[input_data.Class == 0])
print(f'crop = {n_crop}')
print(f'noncrop = {n_noncrop}')

crop = 3387
noncrop = 6613


In [6]:
# Interactive map of the sampled points, coloured by the `field` column using the 'viridis' colormap
input_data.explore(column=field, cmap='viridis')

In [7]:
# Base ODC query shared by every training point; built from the analysis parameters
query = dict(
    time=time,
    measurements=measurements,
    resolution=resolution,
    output_crs=output_crs,
)

def _suffix_data_vars(ds, suffix):
    """Return `ds` with `suffix` appended to the name of every data variable."""
    return ds.rename({name: f'{name}{suffix}' for name in ds.data_vars})


def feature_layers(query):
    """
    Build paired Landsat 8 / Sentinel-2 red, NIR and NDVI layers for one query.

    Landsat 8 (`ls8_sr`) is loaded with `query`; Sentinel-2 (`s2_l2a`) is loaded
    onto the same grid for `query['time']`, then re-indexed onto the Landsat
    time steps (nearest match within 2 days). For every pixel the time step
    with the highest Sentinel-2 NDVI is selected from both sensors.

    Parameters
    ----------
    query : dict
        ODC query passed to `load_ard` for the Landsat load. Must contain
        'time'; 'output_crs' is used to label the result (defaults to
        'epsg:6933' if absent).

    Returns
    -------
    xarray.Dataset
        Landsat variables suffixed with '_ls8' and Sentinel-2 variables
        suffixed with '_s2'. If no paired time step can be selected, a
        dataset with the same variables but filled entirely with NaN is
        returned so that all outputs share the same layout.
    """
    #connect to the datacube
    dc = datacube.Datacube(app='feature_layers')

    #load landsat 8
    ds_ls = load_ard(dc=dc,
                     products=['ls8_sr'],
                     min_gooddata=0.99,
                     verbose=False,
                     **query,
                     )

    #remove outliers: values outside the open interval (0, 1) become NaN
    ds_ls = ds_ls.where(ds_ls>0)
    ds_ls = ds_ls.where(ds_ls<1)

    #ndvi
    ds_ls = calculate_indices(ds_ls, 'NDVI', collection='c2', drop=False)

    #load sentinel-2 onto the Landsat grid; the time range is taken from the
    #query rather than from a notebook-level global
    ds_s2 = load_ard(dc=dc,
                     products=['s2_l2a'],
                     like=ds_ls.geobox,
                     time=query['time'],
                     measurements=['red','nir_2'], #use nir narrow to match with LS8
                     min_gooddata=0.99,
                     verbose=False
                     )

    #remove SR outliers: values outside the open interval (0, 10000) become NaN
    ds_s2 = ds_s2.where(ds_s2>0)
    ds_s2 = ds_s2.where(ds_s2<10000)

    #rename nir2 to trick calculate_indices
    ds_s2 = ds_s2.rename({'nir_2':'nir'})
    ds_s2 = calculate_indices(ds_s2, 'NDVI', collection='s2', drop=False)

    #match s2 at LS times with 2D tolerance
    ds_s2 = ds_s2.reindex(time=ds_ls.time, method='nearest', tolerance='2D')

    #pick, per pixel, the time step where Sentinel-2 NDVI is highest and take
    #both sensors at that time. The selections go into new names so the
    #fallback below always starts from the untouched time series.
    try:
        t_max = ds_s2.NDVI.idxmax("time")
        ls_best = ds_ls.sel(time=t_max)
        s2_best = ds_s2.sel(time=t_max)
        no_valid_pair = False
    except Exception:
        # Selection failed (e.g. no valid Sentinel-2 NDVI). Collapse the time
        # axis so the output has the same layout as the successful case; the
        # values are blanked out below. `Exception` (not a bare except) keeps
        # KeyboardInterrupt/SystemExit propagating.
        ls_best = ds_ls.mean('time')
        s2_best = ds_s2.mean('time')
        no_valid_pair = True

    #rename bands to include sensor, then merge results into single dataset
    result = xr.merge(
        [_suffix_data_vars(ls_best, '_ls8'), _suffix_data_vars(s2_best, '_s2')],
        compat='override',
    )

    if no_valid_pair:
        #make everything NaN: no retained value can exceed this threshold
        result = result.where(result>100000)

    return assign_crs(result, crs=query.get('output_crs', 'epsg:6933'))

In [8]:
# Run feature_layers for every sampled point across `ncpus` workers.
# Returns the output column names and the extracted values.
column_names, model_input = collect_training_data(
    gdf=input_data,
    dc_query=query,
    ncpus=ncpus,
    field=field,
    feature_func=feature_layers,
)

Collecting training data in parallel mode


  0%|          | 0/10000 [00:00<?, ?it/s]

CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)


Percentage of possible fails after run 1 = 0.0 %
Removed 0 rows wth NaNs &/or Infs
Output shape:  (9930, 7)


In [9]:
# Wrap the extracted values in a DataFrame labelled with the returned column names
df = pd.DataFrame(model_input, columns=column_names)
df.head()

Unnamed: 0,Class,red_ls8,nir_ls8,NDVI_ls8,red_s2,nir_s2,NDVI_s2
0,0.0,0.084405,0.37574,0.633137,676.0,3470.0,0.673903
1,0.0,0.063505,0.348873,0.692006,706.0,3216.0,0.63998
2,0.0,0.04365,0.406155,0.805916,453.0,3860.0,0.789937
3,0.0,0.11438,0.246903,0.366811,1315.0,2610.0,0.329936
4,0.0,0.08259,0.318953,0.588636,462.0,1393.0,0.501887


In [14]:
# Save the paired Landsat 8 / Sentinel-2 samples to CSV.
# to_csv does not create missing folders, so make sure 'results/' exists first.
os.makedirs('results', exist_ok=True)
df.to_csv('results/ndvi_ls8_s2.csv')