In [1]:
import subprocess as sp
import warnings

import datacube
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
from datacube.utils import geometry
from datacube.utils.geometry import assign_crs
from deafrica_tools.bandindices import calculate_indices
from deafrica_tools.classification import collect_training_data, HiddenPrints
from deafrica_tools.datahandling import load_ard
from deafrica_tools.spatial import xr_rasterize
from deafrica_tools.temporal import temporal_statistics, xr_phenology
from odc.io.cgroups import get_cpu_quota
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings("ignore")

## Explore Input Data

In [2]:
# Relative path to the labelled crop-type polygons (GeoJSON)
input_data_file = "data/senegal_croptype.geojson"
# Read the polygons into a GeoDataFrame
input_data = gpd.read_file(input_data_file)

# Interactive map of the polygons, styled by the "Class" column
input_data.explore(column="Class")

In [3]:
# Order the polygons from largest to smallest area (hectares) into a new frame,
# leaving `input_data` untouched
input_data_sorted = input_data.sort_values("area_ha", ascending=False)

# Preview the largest polygons
input_data_sorted.head()

Unnamed: 0,area_ha,Class,geometry
232,0.99938,millet,"POLYGON ((-15.05446 13.53437, -15.05441 13.534..."
269,0.997653,millet,"POLYGON ((-14.51386 14.04193, -14.51387 14.041..."
2,0.99617,rice_lowland,"POLYGON ((-12.21719 12.55064, -12.21719 12.550..."
188,0.993469,rice_highland,"POLYGON ((-15.14480 13.39296, -15.14480 13.392..."
137,0.988836,rice_highland,"POLYGON ((-15.14581 13.39184, -15.14578 13.391..."


In [4]:
# Encode the crop-type names as integer class ids.
# On a re-run of this cell "Class" already holds integers, so fit from the
# preserved "Labels" column when it exists; this keeps the cell idempotent and
# guarantees `classes` always contains the crop names rather than integers.
label_source = (
    input_data_sorted["Labels"]
    if "Labels" in input_data_sorted.columns
    else input_data_sorted["Class"]
)

le = LabelEncoder()
input_data_sorted["Class"] = le.fit_transform(label_source)

# Class names in encoded order: classes[i] is the name for integer id i
classes = list(le.classes_)
# Keep a human-readable copy of the label next to the integer id
input_data_sorted["Labels"] = le.inverse_transform(input_data_sorted["Class"])

# reset index so rows are numbered 0..n-1 in the sorted (largest-first) order
input_data_sorted = input_data_sorted.reset_index(drop=True)

## Ideas for variables

1. Phenology
    * How does crop rotation work? How much data can you associate with a given field? e.g. a mean/median over a few months?
    * How much data do you need to calculate phenology? Possibly a year?
    * What other time-based measures could I look at?
2. Radar backscatter
    * Could distinguish rice paddocks from others?
    * Could distinguish flooded rice paddocks (might be more likely lowland) from unflooded (might be more likely highland)?
    * Can put in VV, VH and ratio
3. Optical
    * Standard bands
    * Indices like NDVI or EVI
4. Fractional Cover?
    * Proportion of PV to NPV to BS might be useful or interesting. Depends on how much BS can be seen through these crops, and whether the FC algorithm returns reliable/consistent results
5. Slope/Elevation
    * Might distinguish rice_highland and rice_lowland, but it is uncertain how these were classified by people, so we can't assume anything
6. Size of paddock
    * Uncertain whether there would be anything in this, unlikely to be consistent

In [5]:
# Open a datacube connection, identified by the app name "crop_type_ml"
dc = datacube.Datacube(app="crop_type_ml")

In [6]:
# General query
time = "2020"
measurements = ["red", "green", "blue", "nir"]
resolution = (-20, 20)
output_crs = "EPSG:6933"

query = {
    "time": time,
    "measurements": measurements,
    "resolution": resolution,
    "output_crs": output_crs,
}

### Phenology

In [7]:
def phenology_feature_layers(query):
    """Compute NDVI phenology statistics from Sentinel-2 data for a query.

    Loads the ``s2_l2a`` product with ``load_ard``, derives NDVI, smooths it
    in time (two-weekly median followed by a 4-step rolling mean) and hands
    the smoothed series to ``xr_phenology``.

    Parameters
    ----------
    query : dict
        Keyword arguments unpacked into ``load_ard``. Presumably must request
        the bands NDVI needs (e.g. red and nir) -- confirm against
        ``calculate_indices``.

    Returns
    -------
    The object returned by ``xr_phenology`` for the statistics SOS, vSOS,
    POS, vPOS, EOS, vEOS, Trough, LOS, AOS, ROG and ROS.
    """
    # Vegetation index used for the phenology curve
    index_name = "NDVI"

    # Phenology statistics requested from xr_phenology
    pheno_stats = [
        "SOS", "vSOS",
        "POS", "vPOS",
        "EOS", "vEOS",
        "Trough",
        "LOS",
        "AOS",
        "ROG",
        "ROS",
    ]

    # Connect to the datacube inside the function rather than relying on a
    # module-level handle (presumably so each parallel worker gets its own
    # connection -- confirm against collect_training_data)
    cube = datacube.Datacube(app="crop_type_ml")

    # Load Sentinel-2 data for the query
    s2 = load_ard(
        dc=cube,
        products=["s2_l2a"],
        mask_filters=[("opening", 3), ("dilation", 3)],
        verbose=False,
        **query,
    )

    # Add the vegetation index as a variable
    s2_indexed = calculate_indices(s2, index=index_name, collection="s2")

    # Smooth the index over time: two-weekly medians, then a rolling mean
    # over 4 steps (min_periods=1 lets the window start with a single value)
    fortnightly = s2_indexed[index_name].resample(time="2W").median()
    smoothed = fortnightly.rolling(time=4, min_periods=1).mean()

    # Derive the phenology statistics from the smoothed series
    return xr_phenology(
        smoothed,
        method_sos="first",
        method_eos="last",
        stats=pheno_stats,
        verbose=False,
    )

In [8]:
# set up our inputs to collect_training_data
zonal_stats = "mean"
field = "Class"

ncpus = 20  # round(get_cpu_quota())
# print('ncpus = '+str(ncpus))

column_names, model_input = collect_training_data(
    gdf=input_data_sorted[0:100],
    dc_query=query,
    ncpus=ncpus,
    field=field,
    zonal_stats=zonal_stats,
    feature_func=phenology_feature_layers,
)

Taking zonal statistic: mean
Collecting training data in parallel mode


  0%|          | 0/100 [00:00<?, ?it/s]

CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)
CPLReleaseMutex: Error = 1 (Operation not permitted)


Percentage of possible fails after run 1 = 0.0 %
Removed 0 rows wth NaNs &/or Infs
Output shape:  (100, 12)


In [9]:
# Show the names of the columns returned by collect_training_data
print(column_names)

['Class', 'SOS', 'vSOS', 'POS', 'vPOS', 'EOS', 'vEOS', 'Trough', 'LOS', 'AOS', 'ROG', 'ROS']


In [10]:
# Print the training array compactly: 2 decimal places, tiny values shown as 0
print(np.array_str(model_input, precision=2, suppress_small=True))

[[  3.   177.67   0.12 ...   0.59   0.01   0.  ]
 [  3.   167.17   0.15 ...   0.44   0.     0.  ]
 [  3.   186.46   0.12 ...   0.44   0.     0.  ]
 ...
 [  2.   170.2    0.11 ...   0.61   0.01  -0.01]
 [  4.   147.95   0.13 ...   0.55   0.     0.  ]
 [  4.   133.1    0.18 ...   0.5    0.     0.  ]]
