This notebook fine-tunes the field extent and boundary probability thresholds used for crop field instance segmentation. It runs a grid search on a random subset of the validation dataset to find the threshold pair with the highest median IoU, then performs crop field instance segmentation on all validation chunks using the fine-tuned thresholds.

## Load packages and modules

In [1]:
import numpy as np
import imageio.v2 as imageio
from osgeo import gdal
import os
import higra as hg
import matplotlib.pyplot as plt
from glob import glob
from skimage import measure
import pandas as pd
import sys

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


In [2]:
# import functions from modules
from datasets import export_geotiff
from instance_segment import InstSegm
from evaluation import Calculate_IoUs,get_accuracy_scores

## Define parameters

In [3]:
# --- folders -----------------------------------------------------------------
# where the averaged extent/boundary probability predictions are read from
input_folder = 'results/averaged'

# where the groundtruth field extent chunks are read from
groundtruth_folder = '../0_Data_preparation/results/groundtruth'

# instance segmentation results are written next to the predictions
out_folder = input_folder

# --- dataset selection -------------------------------------------------------
# country and year tokens used to build the prediction file name patterns
country = 'Mozambique'
str_year = '2021'

# --- grid search settings ----------------------------------------------------
# candidate extent probability thresholds
t_exts = np.linspace(0.1, 0.6, num=6)
# candidate boundary probability thresholds
t_bounds = np.linspace(0.0, 0.3, num=4)

# fraction of the validation chunks randomly drawn for threshold fine-tuning
pct_samples = 0.8

# whether to write the grid search results table to disk (as a CSV file)
save_grd_search = True

## Identify predicted extent and boundary chunks and groundtruth field extents

In [4]:
# collect the averaged field extent probability rasters predicted for the selected country
extent_pattern='{}/{}*average_extent_prob*.tif'.format(input_folder, country)
files_extent_predictions=glob(extent_pattern)
n_extent_predictions=len(files_extent_predictions)
print('Found {} field exent probability images'.format(n_extent_predictions))

Found 63 field exent probability images


In [None]:
# Collect the groundtruth field extent chunks.
# glob returns matches in arbitrary, filesystem-dependent order; the validation
# subset is later drawn by list index, so the list is sorted to make an index
# always refer to the same chunk across runs and machines.
files_extent_true=sorted(glob(groundtruth_folder+'/*crop_field_extent*.tif'))
print('Found {} true field extent images'.format(len(files_extent_true)))

## Do instance segmentation for a proportion of samples and find thresholds with highest median IoU

- Select a proportion of validation chunks
- Grid search boundary and extent probability thresholds and calculate IoUs
- Find the thresholds with highest median IoU

In [8]:
# randomly samples from the validation dataset
n_samples=len(files_extent_true)
random_inds=np.random.choice(n_samples, int(pct_samples*n_samples), replace=False)

In [13]:
%%time
mIoUs = [] # median IoU
IoU_50s = [] # fraction of fields with >half overlap
for t_ext in t_exts:
    for t_bound in t_bounds:
        best_IoUs_all=[]
        # loop through all selected files
        for idx in random_inds:
            
            # groundtruth exent
            file_extent_true=files_extent_true[idx]
            
            # get chunk id
            chunk_id='_'.join(os.path.basename(file_extent_true)[:-4].split('_')[-2:])
            fn_prefix='_'.join([country,'average_extent_prob',str_year,'*',chunk_id])+'.tif'
            
            # find corresponding extent prediction chunk
            list_files=glob(input_folder+'/'+fn_prefix)
            
            if len(list_files)>0:
                file_exent_prediction=list_files[0]
                file_bound_prediction=file_exent_prediction.replace('extent','bound')

                # read in files
                extent_true=imageio.imread(file_extent_true)
                extent_prob=imageio.imread(file_exent_prediction)
                bound_prob=imageio.imread(file_bound_prediction)

                # do segmentation using current thresholds
                instances_predicted=InstSegm(extent_prob, bound_prob, t_ext=t_ext, t_bound=t_bound)
                # label connected regions, non-field (-1) will be labelled as 0
                instances_predicted= measure.label(instances_predicted, background=-1,return_num=False)

                # label groundtruth crop fields
                instances_true= measure.label(extent_true, background=-1,return_num=False)

                # calculate IoU
                best_IoUs, field_sizes=Calculate_IoUs(instances_true, instances_predicted, plot=False)
                best_IoUs_all.extend(best_IoUs)
        mIoUs.append(np.median(best_IoUs_all))
        IoU_50s.append(np.sum(np.array(best_IoUs_all) > 0.5) / len(best_IoUs_all))

hp_df = pd.DataFrame({
    't_ext': np.repeat(t_exts, len(t_bounds)),
    't_bound': np.tile(t_bounds, len(t_exts)),
    'mIoU': mIoUs,
    'IoU_50': IoU_50s
})
# save results as a pandas dataframe
if save_grd_search:
    hp_df.to_csv(os.path.join(out_folder,'grid_search_thresholds.csv'))
print('segmentation thresholds with highest mIoU:\n',hp_df.iloc[hp_df['mIoU'].idxmax()])

segmentation thresholds with highest mIoU:
 t_ext      0.300000
t_bound    0.100000
mIoU       0.350159
IoU_50     0.211664
Name: 9, dtype: float64
CPU times: user 5min 53s, sys: 11.2 s, total: 6min 5s
Wall time: 5min 15s


## Do instance segmentation for all samples, evaluate and export results

In [14]:
%%time
# use best thresholds
t_ext_best=hp_df.iloc[hp_df['mIoU'].idxmax()]['t_ext']
t_bnd_best=hp_df.iloc[hp_df['mIoU'].idxmax()]['t_bound']
# list of IoUs
best_IoUs_all=[]
# loop through all validation chunks
for file_extent_true in files_extent_true:
    # get chunk id
    chunk_id='_'.join(os.path.basename(file_extent_true)[:-4].split('_')[-2:])
    fn_prefix='_'.join([country,'average_extent_prob',str_year,'*',chunk_id])+'.tif'

    # find corresponding extent prediction chunk
    list_files=glob(input_folder+'/'+fn_prefix)

    if len(list_files)>0:
        file_exent_prediction=list_files[0]
        
        # extract geo information using gdal
        ds = gdal.Open(file_exent_prediction)
        geotrans=ds.GetGeoTransform()
        proj=ds.GetProjection()
        ds=None
        # corresponding boundary chunk file
        file_bound_prediction=file_exent_prediction.replace('extent','bound')

        # read in arrays
        extent_true=imageio.imread(file_extent_true)
        extent_prob=imageio.imread(file_exent_prediction)
        bound_prob=imageio.imread(file_bound_prediction)

        # do segmentation using selected thresholds
        instances_predicted=InstSegm(extent_prob, bound_prob, t_ext=t_ext_best, t_bound=t_bnd_best)
        # label connected regions, non-field (-1) will be labelled as 0
        instances_predicted= measure.label(instances_predicted, background=-1,return_num=False)

        # label groundtruth crop fields
        instances_true= measure.label(extent_true, background=-1,return_num=False)

        # calculate IoU
        best_IoUs, field_sizes=Calculate_IoUs(instances_true, instances_predicted, plot=False)
        best_IoUs_all.extend(best_IoUs)
        
        # export instances as geotiff
        outname=os.path.join(out_folder,os.path.basename(file_exent_prediction).replace('extent_prob','field_instance'))
        export_geotiff(outname,instances_predicted,geotrans,proj,gdal.GDT_Int16)

m_IoU=np.median(best_IoUs_all)
IoU_50=np.sum(np.array(best_IoUs_all) > 0.5) / len(best_IoUs_all)
print('median IoU using the best threholds: ',m_IoU)
print('IoU_50 using the best threholds: ',IoU_50)

median IoU using the best threholds:  0.3668639053254438
IoU_50 using the best threholds:  0.22844617632732586
CPU times: user 18.4 s, sys: 631 ms, total: 19 s
Wall time: 16.1 s
