# Stratified random sampling from NDWI mosaic

## Load packages

In [1]:
%matplotlib inline

import os
import xarray as xr
import numpy as np
import pandas as pd
import geopandas as gpd
from matplotlib import pyplot as plt

import sys
sys.path.append('../Scripts')
from deafrica_spatialtools import xr_rasterize


In [2]:
# Name of the area to process. It is interpolated into the input and output
# file names used by the cells below (lower-cased for the NDWI mosaic files).

area_name = 'Southern'

## Convert to one mosaic (only do it once)

In [3]:
# Build one compressed GeoTIFF mosaic from the per-tile NDWI GeoTIFFs.
# Nothing is done when the mosaic file already exists.
#
# The GDAL tools are started with subprocess.run(check=True, cwd=...) so that
# a failing step raises instead of being silently ignored, and so that the
# notebook's working directory is never changed (and thus never left changed
# after an error).
import glob
import subprocess

mosaic_dir = 'NDWI_composite'
mosaic_stem = f"{area_name.lower()}_NDWI_mosaic"

if not os.path.exists(f"NDWI_composite/{area_name.lower()}_NDWI_mosaic.tif"):
    # Expand the tile wildcard in Python (no shell involved). Names are kept
    # relative to mosaic_dir because both commands run with cwd=mosaic_dir.
    tile_names = sorted(
        os.path.basename(path)
        for path in glob.glob(f"{mosaic_dir}/{area_name.lower()}_NDWI_tile*.tif")
    )
    if not tile_names:
        raise FileNotFoundError(
            f"no NDWI tiles matching {area_name.lower()}_NDWI_tile*.tif in {mosaic_dir}")

    # Step 1: virtual mosaic referencing all tiles.
    subprocess.run(
        ["gdalbuildvrt", f"{mosaic_stem}.vrt", *tile_names],
        cwd=mosaic_dir, check=True)

    # Step 2: materialise the virtual mosaic as a tiled, DEFLATE-compressed BigTIFF.
    subprocess.run(
        ["gdal_translate",
         "-co", "BIGTIFF=YES",
         "-co", "COMPRESS=DEFLATE",
         "-co", "ZLEVEL=9",
         "-co", "PREDICTOR=1",
         "-co", "TILED=YES",
         "-co", "BLOCKXSIZE=1024",
         "-co", "BLOCKYSIZE=1024",
         f"{mosaic_stem}.vrt", f"{mosaic_stem}.tif"],
        cwd=mosaic_dir, check=True)

## Load NDWI mosaic and clip to AEZ (TODO: use AEZ-large_water_bodies)

In [4]:
# Create the per-area output folder. exist_ok=True makes the cell safe to
# re-run and avoids the race between checking for the folder and creating it.
os.makedirs(area_name, exist_ok=True)

In [5]:
# Open the NDWI mosaic GeoTIFF and drop length-1 dimensions with squeeze().
# NOTE(review): xr.open_rasterio is deprecated in recent xarray releases
# (rioxarray.open_rasterio is the suggested replacement) — confirm against the
# xarray version this notebook is pinned to.
ds = xr.open_rasterio(f"NDWI_composite/{area_name.lower()}_NDWI_mosaic.tif").squeeze()

In [6]:
# Load the AEZ polygon for this area (per the folder name, the variant that
# excludes large water bodies). The previous, simplified AEZ source is kept
# below for reference.
#gdf = gpd.read_file(f'../../shapes/simplified_AEZs/{area_name}.shp')
gdf = gpd.read_file(f'../../shapes/AEZs_ExcludeLargeWB/AEZs_ExcludeLargeWB_update_{area_name}.shp')

# Rasterize the shapefile so that it matches the dimensions of the NDWI mosaic.
mask = xr_rasterize(gdf=gdf,
                     da=ds)

# Keep only pixels where the rasterized mask is truthy; all others become NaN.
# The two where() calls are deliberately separate statements: rebinding `ds`
# in between lets the previous array be released before the next one is built.
ds = ds.where(mask)
# Also blank out exact zeros (presumably the mosaic's nodata value — TODO confirm).
ds = ds.where(ds!=0)

  projstring = _prepare_from_string(projparams)


Rasterizing to match xarray.DataArray dimensions (104514, 93681) and projection system/CRS (e.g. +init=epsg:6933)


In [7]:
# Wrap the masked NDWI array in a Dataset under the variable name 'ndwi',
# so that further variables (the class labels) can be added alongside it.
dataset = ds.to_dataset(name='ndwi')

In [8]:

#ds.plot.imshow();

In [9]:
# Drop the reference to the rasterized mask; it is not used by any cell below
# (presumably done to release memory — the mask has the mosaic's dimensions).
del mask

## Check NDWI distribution and determine thresholds

In [3]:
# 5 bins
# Water-detection frequency thresholds that separate the sampling classes.
freq_thresh = [0.1, 0.3, 0.6, 0.9]
n_class = len(freq_thresh)+1
# Fraction of the total sample assigned to each class, in class order (1..n_class).
frac_sample = [0.1, 0.1, 0.2, 0.3, 0.3]

# Fail fast on an inconsistent configuration: later cells look up the
# per-class sample count by class id, so exactly one fraction per class is
# required, and the fractions must add up to the whole sample.
if len(frac_sample) != n_class:
    raise ValueError(
        f"frac_sample has {len(frac_sample)} entries but there are {n_class} classes")
if abs(sum(frac_sample) - 1.0) > 1e-6:
    raise ValueError(f"frac_sample must sum to 1, got {sum(frac_sample)}")

In [None]:
# Cache the cumulative NDWI histogram as CSV so that it is computed only once per area.
ndwi_hist_csv = f'{area_name}/ndwi_{area_name}.csv'
if not os.path.exists(ndwi_hist_csv):
    # 100-bin cumulative, normalised histogram (the call also draws the plot).
    histy, histx, _patches = dataset.ndwi.plot.hist(bins=100, cumulative=True, density=True)
    # One row per bin: upper bin edge, cumulative fraction.
    cdf_table = np.column_stack((histx[1:], histy))
    np.savetxt(ndwi_hist_csv, cdf_table, fmt='%.3f', delimiter=',')

In [None]:
# use wofs
# Step 1: look up which cumulative fraction each frequency threshold corresponds
# to in the WOfS summary (first column is interpolated against freq_thresh, the
# second gives the value printed as percentile; the third column is not used here).
wofs_summary_csv = f'wofs_summary_aez/wofs_{area_name}.csv'
wofs_freq, wofs_cdf, _unused = np.loadtxt(wofs_summary_csv, delimiter=',', unpack=True)
perc = np.interp(freq_thresh, wofs_freq, wofs_cdf)
print('percentile for ephemeral and permanent water', perc)

# Step 2: map those fractions back to NDWI values through the cached cumulative histogram.
ndwi_hist_csv = f'{area_name}/ndwi_{area_name}.csv'
histx, histy = np.loadtxt(ndwi_hist_csv, delimiter=',', unpack=True)
thresh = np.interp(perc, histy, histx)
print('Thresholds', thresh)


* Southern Thresholds [-0.071488 -0.048836 -0.000376  0.033232]
* Eastern Thresholds [-0.0627   -0.043991 -0.035081  0.030288]
* Central Thresholds [-0.108416 -0.083291 -0.048377 -0.013617]

## Classify into bins of different water detection frequencies

In [11]:
# fix thresholds
thresh = [-0.07, -0.05, -0.03, 0.03]

# Fetch the NDWI values once instead of on every comparison.
ndwi_values = dataset.ndwi.values

# Pixels that fall in no interval (every comparison False, e.g. NaN) keep label 0.
label = np.zeros_like(ndwi_values, dtype=np.uint8)

# Class 1: below the lowest threshold.
label += (ndwi_values < thresh[0]).astype(np.uint8)

# Classes 2 .. n_class-1: half-open intervals [thresh[k-2], thresh[k-1]).
for k in range(2, n_class):
    label += ((ndwi_values >= thresh[k-2]) & (ndwi_values < thresh[k-1])).astype(np.uint8) * k

# Class n_class: at or above the highest threshold.
label += (ndwi_values >= thresh[-1]).astype(np.uint8) * n_class

# Do not keep an extra reference to the large NDWI array alive.
del ndwi_values

# Store the labels next to the NDWI variable, sharing its attributes.
dataset['label'] = ('y','x'), label
dataset['label'].attrs = dataset.ndwi.attrs

  
  
  
  # Remove the CWD from sys.path while we load stuff.


In [12]:
# save classes
# The label raster is written with datacube's write_cog into the area folder;
# the call is the last expression so that its return value is displayed.

from datacube.utils.cog import write_cog

label_tif = f'{area_name}/{area_name}_label.tif'
write_cog(dataset.label, label_tif)

PosixPath('Southern/Southern_label.tif')

## If the labels are already saved, read the labels

In [4]:
# Re-open the previously saved label raster instead of recomputing it.
# This rebinds `dataset` to a Dataset that holds only the 'label' variable.
data = xr.open_rasterio(f'{area_name}/{area_name}_label.tif').squeeze()
dataset = data.to_dataset(name='label')

## Sample from array

In [13]:
#del ds

In [None]:
# this will take a while, and we already know only class 1 (dry) is dominant

#class_sizes =[]
#for class_id in np.arange(1, n_class+1):
#    class_sizes.append((dataset.label==class_id).sum().values)

#class_sizes = np.array(class_sizes)
#print(class_sizes)
#print(class_sizes/class_sizes.sum())

In [5]:
# Total number of sample points: 500 for the four listed areas, 300 for any other.
n_sample = 500 if area_name in ('Western', 'Eastern', 'Southern', 'Central') else 300

# distribute points across classes
# Round to the nearest integer rather than truncating with astype(int) alone:
# a float product such as 100 * 0.29 evaluates to 28.999999999999996 and
# truncation would silently drop a point from that class.
# (An equal split across classes would be np.ceil(n_sample*1./ n_class).astype(int).)
n_sample_class = np.rint(n_sample * np.array(frac_sample)).astype(int)
print(n_sample_class)

[ 50  50 100 150 150]


In [15]:
def pick_random_common(labelarray, label, n_sample, min_dist=0, return_index=True, max_stall=1000):
    """
    Pick random samples from a prevalent class.

    Pixels are found by rejection sampling: batches of random (row, column)
    index pairs are drawn and only the pairs whose value equals `label` (and
    that respect `min_dist`) are kept. The array is never scanned as a whole,
    which is why this is suited to a class that covers a large share of it.

    Parameters
    ----------
    labelarray : object exposing `.values` (2-D array indexed [y, x]), `.y` and `.x`
        In this notebook an xarray.DataArray of class labels.
    label : int
        Class value to sample.
    n_sample : int
        Number of points to return.
    min_dist : float, optional
        Minimum distance, in pixel index units, between any two returned
        points. 0 disables the check. Within a batch, both members of a pair
        that is too close are discarded.
    return_index : bool, optional
        If True return integer (row, column) indices, otherwise the
        corresponding `labelarray.y` / `labelarray.x` coordinate values.
    max_stall : int, optional
        Number of consecutive batches that may yield no new point before
        giving up.

    Returns
    -------
    (y, x) : tuple of 1-D arrays of length `n_sample`.

    Raises
    ------
    RuntimeError
        If `max_stall` consecutive batches produce no acceptable point, e.g.
        because the class is absent or `min_dist` can no longer be satisfied.
    """
    values = labelarray.values
    n_rows, n_cols = len(labelarray.y), len(labelarray.x)

    picked_y = np.array([], dtype=int)
    picked_x = np.array([], dtype=int)
    stalled = 0
    while len(picked_y) < n_sample:
        n_to_pick = n_sample - len(picked_y)
        # over sample without knowing total number of points in this class;
        # capped by the axis lengths because rows/columns are drawn without replacement
        n_sample_over = min(5*n_to_pick, n_rows, n_cols)
        random_x = np.random.choice(n_cols, n_sample_over, replace=False)
        random_y = np.random.choice(n_rows, n_sample_over, replace=False)
        # keep points matching label
        match = values[random_y, random_x] == label
        random_y, random_x = random_y[match], random_x[match]
        # remove points too close to previously picked ones
        if min_dist > 0 and len(picked_y) > 0 and len(random_y) > 0:
            dist = np.sqrt((random_y-picked_y[:, None])**2 + (random_x-picked_x[:, None])**2)
            keep = dist.min(axis=0) >= min_dist
            random_y, random_x = random_y[keep], random_x[keep]
        # remove points too close to others in the same batch
        if min_dist > 0 and len(random_y) > 1:
            dist = np.sqrt((random_y-random_y[:, None])**2 + (random_x-random_x[:, None])**2)
            # a point's distance to itself must not disqualify it
            np.fill_diagonal(dist, min_dist)
            keep = dist.min(axis=0) >= min_dist
            random_y, random_x = random_y[keep], random_x[keep]
        # remove extra points
        if len(random_y) > n_to_pick:
            pick = np.random.choice(len(random_y), n_to_pick, replace=False)
            random_y, random_x = random_y[pick], random_x[pick]

        if len(random_y) == 0:
            # no progress in this batch; stop instead of looping forever
            stalled += 1
            if stalled >= max_stall:
                raise RuntimeError(
                    f"no acceptable point of class {label} found in {max_stall} consecutive "
                    f"batches ({len(picked_y)} of {n_sample} picked)")
            continue
        stalled = 0
        picked_y, picked_x = np.concatenate((picked_y, random_y)), np.concatenate((picked_x, random_x))

    if return_index: return np.array(picked_y), np.array(picked_x)
    else: return labelarray.y[np.array(picked_y)].values, labelarray.x[np.array(picked_x)].values


def pick_random_rare(labelarray, label, n_sample, min_dist=0, return_index=True, n_points_per_batch=1):
    """
    Pick random samples from a rare class.

    All pixels equal to `label` are located first and form the candidate pool.
    Points are then drawn from the pool in small batches; every accepted point
    is removed from the pool, together with all candidates closer than
    `min_dist` to it, so later batches cannot violate the distance constraint
    or return the same pixel twice.

    Parameters
    ----------
    labelarray : object exposing `.values` (2-D array indexed [y, x])
        In this notebook an xarray.DataArray of class labels; `.y` and `.x`
        are only needed when `return_index` is False.
    label : int
        Class value to sample.
    n_sample : int
        Number of points to return.
    min_dist : float, optional
        Minimum distance, in pixel index units, between any two returned
        points. 0 disables the check.
    return_index : bool, optional
        If True return integer (row, column) indices, otherwise the
        corresponding `labelarray.y` / `labelarray.x` coordinate values.
    n_points_per_batch : int, optional
        Number of candidates drawn per iteration (at least 1).

    Returns
    -------
    (y, x) : tuple of 1-D arrays of length `n_sample`.

    Raises
    ------
    ValueError
        If `n_points_per_batch` is smaller than 1, or if the candidate pool is
        exhausted before `n_sample` points have been picked.
    """
    if n_points_per_batch < 1:
        raise ValueError("n_points_per_batch must be at least 1")

    # points will be picked from flattened index arrays
    values = labelarray.values
    da_shape = values.shape
    # flatnonzero always yields a 1-D array, also for zero or one matching pixel
    index = np.flatnonzero(values == label)
    index_y, index_x = np.unravel_index(index, da_shape)

    # integer accumulators so the results stay usable as array indices
    picked_y = np.array([], dtype=int)
    picked_x = np.array([], dtype=int)
    # pick one or a few points at a time
    while len(picked_y) < n_sample:
        if len(index) == 0:
            raise ValueError(
                f"only {len(picked_y)} of {n_sample} points of class {label} can be "
                f"picked with min_dist={min_dist}")
        batch_size = min(n_points_per_batch, len(index))
        picked = np.random.choice(index, batch_size, replace=False)
        # convert back to x, y
        random_y, random_x = np.unravel_index(picked, da_shape)
        if batch_size > 1 and min_dist > 0:
            # remove points too close to others
            dist = np.sqrt((random_y-random_y[:, None])**2 + (random_x-random_x[:, None])**2)
            np.fill_diagonal(dist, min_dist)
            keep = dist.min(axis=0) >= min_dist
            if not keep.any():
                # every candidate had a close neighbour in the batch; a single
                # candidate is always valid because the pool already excludes
                # everything near earlier picks. Keeping one guarantees progress.
                keep[0] = True
            random_y, random_x = random_y[keep], random_x[keep]
        if min_dist > 0:
            # remove nearby points (including the accepted ones) from the index array
            dist = np.sqrt((index_y-random_y[:, None])**2 + (index_x-random_x[:, None])**2)
            keep = dist.min(axis=0) >= min_dist
        else:
            # no distance constraint: only drop the accepted pixels themselves
            keep = ~np.isin(index, picked)
        index, index_y, index_x = index[keep], index_y[keep], index_x[keep]
        picked_y, picked_x = np.concatenate((picked_y, random_y)), np.concatenate((picked_x, random_x))

    # the last batch may overshoot; trim at random
    if len(picked_y) > n_sample:
        pick = np.random.choice(len(picked_y), n_sample, replace=False)
        picked_y, picked_x = picked_y[pick], picked_x[pick]

    if return_index: return np.array(picked_y), np.array(picked_x)
    else: return labelarray.y[np.array(picked_y)].values, labelarray.x[np.array(picked_x)].values


In [6]:
%%time

from skimage.morphology import disk

# When True, points of different classes are also kept min_dist apart: after a
# class has been sampled, the neighbourhood of each picked point is blanked in
# a working copy of the labels before the next class is sampled.
min_dist_between_class = True


min_dist = 1000 # this is x, y index, so 30 km 


def _clear_buffer(values, ys, xs, offset_y, offset_x):
    """Set `values` to 0, in place, inside the buffer around every (y, x) point.

    `offset_y` / `offset_x` are the row / column offsets of the buffer pixels
    relative to its centre; offsets that fall outside the array are ignored.
    """
    n_rows, n_cols = values.shape
    for centre_y, centre_x in zip(ys, xs):
        buffer_y, buffer_x = int(centre_y)+offset_y, int(centre_x)+offset_x
        # within boundary
        mask_ind = (buffer_y>=0) & (buffer_x>=0) & (buffer_y<n_rows) & (buffer_x<n_cols)
        values[buffer_y[mask_ind], buffer_x[mask_ind]] = 0


if min_dist_between_class:
    # buffer around picked points: offsets of a disk of radius min_dist
    offset_y, offset_x = np.where(disk(min_dist)==1)
    offset_y, offset_x = offset_y-min_dist, offset_x-min_dist

    # work on a copy because picked neighbourhoods are blanked out
    labelarray = dataset.label.copy()
    da_shape = labelarray.values.shape
else:
    labelarray = dataset.label


label_picked = {}

for class_id in np.arange(1, n_class+1):
    if class_id == 1:
        # class 1 is treated as the prevalent class: rejection sampling
        y, x = pick_random_common(labelarray, class_id, n_sample_class[class_id-1], min_dist=min_dist,
                                  return_index=True)
    else:
        y, x = pick_random_rare(labelarray, class_id, n_sample_class[class_id-1], min_dist=min_dist,
                                return_index=True, n_points_per_batch=10)
    label_picked[class_id] = (y, x)

    # clear out areas next to picked points
    if min_dist_between_class:
        _clear_buffer(labelarray.values, y, x, offset_y, offset_x)


CPU times: user 1min 37s, sys: 32.4 s, total: 2min 9s
Wall time: 2min 9s


In [7]:
# Collect the picked points of every class into one table of coordinates.
# The per-class frames are gathered in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0.
class_frames = []
for class_id in np.arange(1, n_class+1):
    y, x = label_picked[class_id]
    # make sure the picked positions are integer indices before using them for lookup
    y, x = np.asarray(y, dtype=int), np.asarray(x, dtype=int)
    # translate the row / column indices into coordinate values
    df = pd.DataFrame({'y': dataset.y[y].values, 'x':dataset.x[x].values})
    #df = pd.read_csv(f'{area_name}/{area_name}_class_{class_id}.csv', header=None, names=['y','x'] )
    df['class']=class_id
    class_frames.append(df)

dfs = pd.concat(class_frames, ignore_index=True)

len(dfs)

500

In [8]:
# Turn the sampled coordinates into point geometries, using the CRS recorded on the label raster.
sample_points = gpd.points_from_xy(dfs.x, dfs.y)
gdf = gpd.GeoDataFrame(dfs, crs=dataset.label.crs, geometry=sample_points).reset_index()

# The coordinates are carried by the geometry column now; drop the plain columns.
gdf = gdf.drop(columns=['x', 'y'])

# The output name records whether the between-class distance constraint was applied.
if min_dist_between_class:
    samples_shp = f'{area_name}/{area_name}_samples_min_dist_between_class.shp'
else:
    samples_shp = f'{area_name}/{area_name}_samples.shp'
gdf.to_file(samples_shp)