This notebook implements national land cover prediction using the pre-trained random forest model generated in the previous step.

### load packages

In [None]:
%matplotlib inline
import os
import datacube
import warnings
import time
import numpy as np
from scipy import stats
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from odc.io.cgroups import get_cpu_quota
from odc.algo import xr_geomedian
import xarray as xr
from joblib import load
from deafrica_tools.classification import predict_xr
from deafrica_tools.dask import create_local_dask_cluster
from deafrica_tools.datahandling import load_ard
from deafrica_tools.bandindices import calculate_indices
from deafrica_tools.plotting import display_map
from datacube.utils.cog import write_cog

### load data and set parameters

In [None]:
# ---- input files ----
country_tiles_shp = 'Data/Mozambique_tiles_smaller.shp'  # tiles covering the entire country
rf_model_path = 'Results/RF_model_Mozambique_2021.joblib'  # trained random forest model
# alternative model file - uncomment to use it instead of the one above
# rf_model_path= 'Results/RF_model_using_filtered_15ptc_td_Mozambique_2021.joblib'

# ---- output projection ----
crs = 'epsg:32736'  # output crs: WGS84/UTM Zone 36S

# ---- tile bounding boxes ----
# read the tile polygons and reproject them to epsg:4326, the crs in which
# the per-tile query extents are expressed further down
country_tiles = gpd.read_file(country_tiles_shp)
country_tiles = country_tiles.to_crs('epsg:4326')
tile_bboxes = country_tiles.bounds  # one row per tile: minx, miny, maxx, maxy
print('tile boundaries for Mozambique: \n',tile_bboxes)

# ---- trained classifier ----
rf_models = load(rf_model_path)
# keep the estimator itself single-threaded (n_jobs=1)
rf_models.set_params(n_jobs=1)
print('loaded random forest model:\n',rf_models)

### define feature layer function - same as extracting features

In [None]:
# define a function that builds the stacked feature layers for one query extent
def feature_layers(query):
    """Build the multi-temporal Sentinel-2 feature stack for a datacube query.

    Loads the ten Sentinel-2 bands listed in ``measurements`` from the
    ``s2_l2a`` product, appends NDVI, reduces every two-month ('2MS') window
    to a geomedian, and then flattens the time axis so that each
    band/time-step pair becomes its own variable named ``<band>_<k>``
    (``k`` is the zero-based index of the two-month window).

    Parameters
    ----------
    query : dict
        Keyword arguments forwarded unchanged to ``load_ard`` (spatial
        extent, time range, resolution, crs, dask chunking, ...).

    Returns
    -------
    xarray.Dataset
        One variable per band and time step, ordered band-major and
        time-minor: ``blue_0, blue_1, ..., NDVI_<n-1>``. The classifier
        relies on this order matching the training features, so it must not
        be changed.
    """
    measurements = ['blue','green','red','red_edge_1','red_edge_2', 'red_edge_3','nir_1','nir_2','swir_1','swir_2']

    # connect to the datacube
    dc = datacube.Datacube(app='feature_layers')

    # load data
    ds = load_ard(dc=dc,
                  products=['s2_l2a'],
                  measurements=measurements,
                  group_by='solar_day',
                  verbose=False,
                  #mask_filters=[("opening", 2)], # morphological opening by 2 pixels to remove small masked regions
                  **query)

    # calculate NDVI and keep the original bands alongside it (drop=False)
    ds_index = calculate_indices(ds,index=['NDVI'],drop=False,satellite_mission='s2')

    # calculate geomedians within each two-month interval
    ds_geomedian = ds_index.resample(time='2MS').map(xr_geomedian)

    # Dataset.sizes is the supported mapping of dimension name -> length;
    # mapping-style access through Dataset.dims is deprecated in xarray.
    n_time = ds_geomedian.sizes['time']

    # stack multi-temporal measurements and rename them: band-major, time-minor
    list_stack_measures = [
        ds_geomedian[band_name].isel(time=k).rename(band_name + '_' + str(k))
        for band_name in ds_geomedian.data_vars
        for k in range(n_time)
    ]

    # every slice carries a different scalar 'time' coordinate after isel();
    # compat='override' makes merge take the first one instead of raising
    ds_stacked = xr.merge(list_stack_measures,compat='override')
    return ds_stacked

### set up dask cluster for parallel processing

In [None]:
# Set up a local dask cluster for the per-tile loads and predictions below
# (the tile queries pass 'dask_chunks', so they are presumably evaluated lazily on it).
# NOTE(review): n_workers=1 presumably limits the cluster to a single worker
# process - confirm that the installed deafrica_tools version accepts this keyword.
create_local_dask_cluster(n_workers=1)

### run prediction for all tiles and export geotiffs

In [None]:
# Predict land cover one tile at a time and export each result as COG files.
# Each row of tile_bboxes holds (minx, miny, maxx, maxy) for one tile.
for i, (minx, miny, maxx, maxy) in enumerate(tile_bboxes.itertuples(index=False, name=None)):
    print('bounding box for tile ',i,': minx: ',minx,'miny: ',miny,'maxx: ',maxx,'maxy: ',maxy)

    # datacube query for this tile: extent given in epsg:4326, output in `crs`
    query = dict(
        x=(minx, maxx),
        y=(miny, maxy),
        time=('2021-01', '2021-12'),
        resolution=(-10, 10),
        crs='epsg:4326',
        output_crs=crs,
        dask_chunks={'x': 1000, 'y': 1000},  # change this based on your tile size and sandbox instance
    )

    # build the feature stack; feature order must match the training data
    all_data = feature_layers(query)
    print('stacked Sentinel-2 dataset:\n',all_data)

    # time the prediction; .compute() is what triggers the actual work, so the
    # measured time presumably includes evaluating the lazily-loaded features too
    start_time = time.time()
    predicted = predict_xr(rf_models, all_data, proba=True, persist=False, clean=True)
    predicted = predicted.compute()
    elapsed = time.time() - start_time
    print("%s seconds spent on predicting" % elapsed)

    # export the class map and the (integer-cast) probability layer as COGs
    print('writing cog file...')
    outname_prediction = f'Results/Land_cover_prediction_kmeans_filtered_td_Mozambique_tile_{i}.tif'
    outname_probability = f'Results/Land_cover_probability_kmeans_filtered_td_Mozambique_tile_{i}.tif'
    write_cog(predicted.Predictions, outname_prediction, overwrite=True)
    write_cog(predicted.Probabilities.astype(int), outname_probability, overwrite=True)