This notebook loads satellite data and predicts land cover using a pre-trained random forest model.

In [1]:
%matplotlib inline
import os
import datacube
import warnings
import time
import numpy as np
from scipy import stats
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from odc.io.cgroups import get_cpu_quota
from odc.algo import xr_geomedian
import xarray as xr
from joblib import load
from deafrica_tools.classification import predict_xr
from deafrica_tools.dask import create_local_dask_cluster
from deafrica_tools.datahandling import load_ard
from deafrica_tools.bandindices import calculate_indices
from deafrica_tools.plotting import display_map
from datacube.utils.cog import write_cog

# ---------------------------------------------------------------------------
# Input files
# ---------------------------------------------------------------------------
# Tiles covering the entire country.
# Alternative (randomly selected small regions):
#   rwanda_tiles_shp = 'Results/Rwanda_random_sampling_AOIs.geojson'
rwanda_tiles_shp = 'Data/Rwanda_tiles_epsg32736_smaller.shp'

# Trained random forest model.
# Alternative:
#   rf_model_path = 'Results/RF_model_Rwanda_2021_using_2015_scheme2_stratified_samples.joblib'
rf_model_path = 'Results/RF_model_Rwanda_2021_using_2015_scheme2_manual_random_samples.joblib'

# ---------------------------------------------------------------------------
# Attributes
# ---------------------------------------------------------------------------
class_name = 'LC_Class_I'    # class label in integer format
crs = 'epsg:4326'            # input crs: WGS84
output_crs = 'epsg:32735'    # output crs: WGS84/UTM Zone 35S
fill_nan_value = -999        # value to replace nans in query results

# ---------------------------------------------------------------------------
# Tile bounding boxes
# ---------------------------------------------------------------------------
# Read the tile polygons, reproject them to the query crs, then take the
# per-tile bounds (columns minx, miny, maxx, maxy).
rwanda_tiles = gpd.read_file(rwanda_tiles_shp)
rwanda_tiles = rwanda_tiles.to_crs(crs)
tile_bboxes = rwanda_tiles.bounds
print('tile boundaries for Rwanda: \n', tile_bboxes)

# ---------------------------------------------------------------------------
# Trained model
# ---------------------------------------------------------------------------
# NOTE(review): joblib.load unpickles the file, so only load models from a
# trusted source.
# n_jobs is forced to 1 on the loaded estimator -- presumably so parallelism is
# left to dask rather than the estimator; confirm.
rf_model = load(rf_model_path).set_params(n_jobs=1)
print('loaded random forest models:\n', rf_model)

# ---------------------------------------------------------------------------
# Band measurements for the query
# ---------------------------------------------------------------------------
measurements = [
    'blue', 'green', 'red',
    'red_edge_1', 'red_edge_2', 'red_edge_3',
    'nir_1', 'nir_2',
    'swir_1', 'swir_2',
]

# ---------------------------------------------------------------------------
# Dask cluster
# ---------------------------------------------------------------------------
create_local_dask_cluster(n_workers=1)

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


ncpus = 62
tile boundaries for Rwanda: 
          minx      miny       maxx      maxy
0   28.858947 -2.399463  29.128840 -2.127967
1   28.858086 -2.670073  29.128121 -2.398489
2   28.857133 -2.940681  29.127316 -2.669009
3   29.129841 -1.588116  29.399398 -1.316832
4   29.129293 -1.858813  29.398969 -1.587446
5   29.128660 -2.129509  29.398460 -1.858059
6   29.127941 -2.400203  29.397871 -2.128671
7   29.127136 -2.670897  29.397201 -2.399282
8   29.126244 -2.941589  29.396451 -2.669892
9   29.398789 -1.588571  29.668395 -1.317238
10  29.398280 -1.859345  29.667998 -1.587935
11  29.397691 -2.130118  29.667526 -1.858632
12  29.397021 -2.400891  29.666981 -2.129328
13  29.396271 -2.671662  29.666361 -2.400022
14  29.395441 -2.942431  29.665667 -2.670716
15  29.668215 -1.318144  29.937763 -1.046839
16  29.667818 -1.588990  29.937466 -1.317615
17  29.667347 -1.859836  29.937101 -1.588390
18  29.666801 -2.130681  29.936667 -1.859164
19  29.666181 -2.401525  29.936166 -2.129937
20  29.665487 

In [None]:
# define a function that builds the stacked feature layers for one query
def feature_layers(query):
    """Build the stacked Sentinel-2 feature layers for one datacube query.

    The function loads the ``s2_l2a`` product with ``load_ard``, adds NDVI
    with ``calculate_indices`` (``drop=False``), reduces the time series to
    one geomedian per ``'2MS'`` resampling bin, and then flattens the time
    dimension: every (measurement, time step) pair becomes its own variable
    named ``<measurement>_<k>`` where ``k`` is the positional time index.

    Parameters
    ----------
    query : dict
        Keyword arguments forwarded unchanged to ``load_ard``.

    Returns
    -------
    xarray.Dataset
        The per-time-step slices merged with ``compat='override'``. Variables
        are ordered measurement-major (all time steps of the first measurement,
        then the next measurement, and so on).
    """
    # connect to the datacube
    dc = datacube.Datacube(app='feature_layers')
    # query bands
    ds = load_ard(dc=dc,
                  products=['s2_l2a'],
                  group_by='solar_day',
                  verbose=False,
                  #mask_filters=[("opening", 2)], # morphological opening by 2 pixels to remove small masked regions
                  **query)
    # calculate NDVI
    ds_index = calculate_indices(ds, index=['NDVI'], drop=False, satellite_mission='s2')
    del ds  # drop the reference to the raw dataset once the index has been added
    # calculate geomedians within each two-month interval
    ds_geomedian = ds_index.resample(time='2MS').map(xr_geomedian)
    del ds_index
#     # rechunk to a single array along time dimension so that interpolate_na can be applied: note: this may consume more memory
#     ds_geomedian=ds_geomedian.chunk({'time':-1})
#     # interpolate nodata using mean of previous and next observation
#     ds_geomedian=ds_geomedian.interpolate_na(dim='time',method='linear',use_coordinate=False)
    # Number of time steps. ``sizes`` is used instead of ``dims[...]`` because
    # the mapping behaviour of ``Dataset.dims`` is deprecated in newer xarray.
    n_time = ds_geomedian.sizes['time']
    # stack multi-temporal measurements and rename them
    # NOTE(review): the measurement-major order is kept exactly as before;
    # presumably it must match the feature order the model was trained on.
    list_stack_measures = [
        ds_geomedian[name].isel(time=k).rename(name + '_' + str(k))
        for name in ds_geomedian.data_vars
        for k in range(n_time)
    ]
    # compat='override' skips the equality check between the slices --
    # presumably needed because each slice carries a different scalar `time`
    # coordinate; confirm.
    ds_stacked = xr.merge(list_stack_measures, compat='override')
    return ds_stacked

In [None]:
# Predict land cover tile by tile and write each prediction to its own COG.
# itertuples(index=False, name=None) yields plain (minx, miny, maxx, maxy)
# tuples in row order, so `i` is the positional tile number.
for i, (minx, miny, maxx, maxy) in enumerate(tile_bboxes.itertuples(index=False, name=None)):
    print('bounding box for tile ',i,': minx: ',minx,'miny: ',miny,'maxx: ',maxx,'maxy: ',maxy)

    # Sentinel-2 query for this tile.
    # Alternative chunking (a single chunk per tile): dask_chunks={'x': -1, 'y': -1}
    query = dict(
        x=(minx, maxx),
        y=(miny, maxy),
        time=('2021-01', '2021-12'),
        measurements=measurements,
        resolution=(-10, 10),
        crs=crs,
        output_crs=output_crs,
        dask_chunks=dict(x=1700, y=1700),
    )

    # build the feature layers for the tile
    all_data = feature_layers(query)
    print('stacked Sentinel-2 dataset:\n', all_data)

    # classify every pixel with the RF model and time how long it takes;
    # .compute() is inside the timed section
    start_time = time.time()
    predicted = predict_xr(rf_model, all_data, persist=False, clean=True).compute()
    print("%s seconds spent on predicting" % (time.time() - start_time))

    # write the final prediction as a COG file
    print('writing cog file...')
    # Alternative output names:
    #   'Results/Land_cover_prediction_Rwanda_2021_using_2015_sheme2_manual_random_samples_AOI_'+str(i)+'.tif'
    #   'Results/Land_cover_prediction_Rwanda_2021_using_2015_sheme2_stratified_samples_AOI_'+str(i)+'.tif'
    # NOTE(review): this name says "stratified_samples_balanced" while
    # rf_model_path points at the "manual_random_samples" model -- confirm the
    # output name matches the model actually loaded.
    outname_prediction = (
        'Results/Land_cover_prediction_Rwanda_2021_using_2015_sheme2_stratified_samples_balanced_smaller_tile_'
        + str(i)
        + '.tif'
    )
    write_cog(predicted.Predictions, outname_prediction, overwrite=True)

In [3]:
# Merge the per-tile prediction GeoTIFFs into a single mosaic GeoTIFF with gdal_merge.py.
#   -o  : path of the mosaic to write
#   -co : creation option passed to the output driver (Deflate compression here)
#   -ot : data type of the output bands (Byte here)
# The trailing glob picks up every file named '..._smaller_tile_<i>.tif', i.e. the
# names produced by the prediction loop.
# NOTE(review): -ot Byte assumes all class labels fit in 0-255 -- confirm.
# NOTE(review): the glob also matches tiles left in Results/ by earlier runs -- verify
# the folder only holds the tiles that belong in this mosaic.
# The commented-out commands are the equivalent merges for the other model/AOI variants.
# ! gdal_merge.py -o Results/Land_cover_prediction_Rwanda_2021_using_2015_sheme2_manual_random_samples_AOIs_mosaic.tif -co COMPRESS=Deflate -ot Byte Results/Land_cover_prediction_Rwanda_2021_using_2015_sheme2_manual_random_samples_AOI_*.tif
! gdal_merge.py -o Results/Land_cover_prediction_Rwanda_2021_using_2015_sheme2_stratified_samples_balanced_smaller_mosaic.tif -co COMPRESS=Deflate -ot Byte Results/Land_cover_prediction_Rwanda_2021_using_2015_sheme2_stratified_samples_balanced_smaller_tile_*.tif
# ! gdal_merge.py -o Results/Land_cover_prediction_Rwanda_2021_using_2015_sheme2_stratified_samples_balanced_AOIs_mosaic.tif -co COMPRESS=Deflate -ot Byte Results/Land_cover_prediction_Rwanda_2021_using_2015_sheme2_stratified_samples_balanced_AOI_*.tif

0...10...20...30...40...50...60...70...80...90...100 - done.
