# Stratified random sampling from NDWI mosaic

## Load packages

In [4]:
%matplotlib inline

import os
import xarray as xr
import numpy as np
import pandas as pd
import geopandas as gpd
from matplotlib import pyplot as plt

import sys
sys.path.append('../../Scripts')
from deafrica_spatialtools import xr_rasterize


In [5]:
# define area name
# Region processed by this notebook. The value is used verbatim for the output
# folder, the AEZ shapefile name and the per-area CSV names, and lower-cased
# for the NDWI tile / mosaic file names.
area_name = 'Central'

## Convert tiles to one mosaic (only needs to be done once)

In [6]:
# make tif
# Merge the per-tile NDWI GeoTIFFs into one compressed, tiled BigTIFF mosaic.
# Skipped when the mosaic already exists, so re-running the cell is cheap.

import glob
import subprocess

mosaic_dir = 'NDWI_composite'
mosaic_stem = f"{area_name.lower()}_NDWI_mosaic"

if not os.path.exists(os.path.join(mosaic_dir, f"{mosaic_stem}.tif")):
    # No shell is involved below, so the tile wildcard is expanded here. Names
    # are kept relative to mosaic_dir because the commands run with cwd=mosaic_dir.
    tile_pattern = os.path.join(mosaic_dir, f"{area_name.lower()}_NDWI_tile*.tif")
    tiles = sorted(os.path.basename(path) for path in glob.glob(tile_pattern))
    if not tiles:
        raise FileNotFoundError(f"no NDWI tiles match {tile_pattern}")

    # cwd= replaces the os.chdir() round trip: the notebook's working directory
    # is never changed, so a failure cannot leave later cells resolving their
    # relative paths from the wrong folder. check=True turns a non-zero GDAL
    # exit status into an exception instead of letting it pass unnoticed.
    subprocess.run(
        ["gdalbuildvrt", f"{mosaic_stem}.vrt", *tiles],
        cwd=mosaic_dir,
        check=True,
    )
    subprocess.run(
        [
            "gdal_translate",
            "-co", "BIGTIFF=YES",
            "-co", "COMPRESS=DEFLATE",
            "-co", "ZLEVEL=9",
            "-co", "PREDICTOR=1",
            "-co", "TILED=YES",
            "-co", "BLOCKXSIZE=1024",
            "-co", "BLOCKYSIZE=1024",
            f"{mosaic_stem}.vrt",
            f"{mosaic_stem}.tif",
        ],
        cwd=mosaic_dir,
        check=True,
    )

## Load NDWI mosaic and clip to AEZ (TODO: use AEZ-large_water_bodies)

In [None]:
# Output folder for this area. exist_ok=True makes the cell safe to re-run and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(area_name, exist_ok=True)

In [None]:
# Open the NDWI mosaic GeoTIFF and drop length-1 dimensions with squeeze().
# NOTE(review): xr.open_rasterio is deprecated in newer xarray releases in
# favour of rioxarray - confirm the pinned xarray version before upgrading.
ds = xr.open_rasterio(f"NDWI_composite/{area_name.lower()}_NDWI_mosaic.tif").squeeze()

In [None]:
# load the AEZ shapefile for this area
gdf = gpd.read_file(f'../../../../shapes/simplified_AEZs/{area_name}.shp')
# rasterize the shapefile with ds as the template array
# (xr_rasterize is the helper imported from deafrica_spatialtools)
mask = xr_rasterize(gdf=gdf,
                     da=ds)

# keep values where the rasterized mask is truthy; other cells are blanked
ds = ds.where(mask)
# also blank cells that are exactly 0
# NOTE(review): presumably 0 is the mosaic's nodata value - confirm
ds = ds.where(ds!=0)

In [None]:
# Wrap the masked array in a Dataset under the variable name 'ndwi'; the class
# raster is added to the same Dataset as 'label' further down.
dataset = ds.to_dataset(name='ndwi')

In [None]:

#ds.plot.imshow();

In [None]:
# Drop the name bound to the rasterized mask; no later cell shown here reads it.
del mask

## Check NDWI distribution and determine thresholds

In [None]:
# Cache the cumulative, normalised NDWI histogram (100 bins) as a two-column
# CSV: upper bin edge, cumulative value. Skipped when the CSV already exists.
hist_csv = f'{area_name}/ndwi_{area_name}.csv'
if not os.path.exists(hist_csv):
    cum_values, bin_edges, _patches = dataset.ndwi.plot.hist(bins=100, cumulative=True, density=True)
    # bin_edges has one more entry than cum_values; drop the lowest edge so
    # every cumulative value is paired with the upper edge of its bin.
    np.savetxt(hist_csv, np.column_stack((bin_edges[1:], cum_values)), fmt='%.3f', delimiter=',')

In [None]:
# use wofs
# Look up the WOfS curve at the ephemeral / permanent cut-offs, then find the
# NDWI values at which the cached cumulative NDWI histogram reaches the same
# levels.
ephem, perm = 0.1, 0.9

wofs_x, wofs_y = np.loadtxt(f'wofs_summary_aez/wofs_{area_name}.csv', delimiter=',', unpack=True)
perc = np.interp([ephem, perm], wofs_x, wofs_y)
print('percentile for ephemeral and permanent water', perc)

# columns of the NDWI CSV: NDWI bin edge, cumulative value
histx, histy = np.loadtxt(f'{area_name}/ndwi_{area_name}.csv', delimiter=',', unpack=True)
# inverse lookup: cumulative value -> NDWI
thresh = np.interp(perc, histy, histx)
print('Thresholds', thresh)

## Classify into dry, ephemeral and permanent

In [None]:
# Earlier hand-picked thresholds, kept for reference:
#threshold = {'Eastern': (-0.06, 0.03),
#             'Western': (-0.14, -0.035)
##            }
#low, high = threshold[area_name]

low, high = thresh[0], thresh[1]

# Class codes: 1 = below low, 2 = between low and high, 3 = at or above high.
# Cells that match none of the comparisons (e.g. NaN) keep the initial 0.
ndwi_values = dataset.ndwi.values
label = np.zeros(ndwi_values.shape, dtype=np.uint8)
label[ndwi_values < low] = 1
label[(ndwi_values >= low) & (ndwi_values < high)] = 2
label[ndwi_values >= high] = 3

dataset['label'] = (('y', 'x'), label)
# reuse the NDWI attributes for the class raster
dataset['label'].attrs = dataset.ndwi.attrs

In [1]:
# save classes
# Write the class raster to <area>/<area>_label.tif with datacube's write_cog.

from datacube.utils.cog import write_cog

label_tif = f'{area_name}/{area_name}_label.tif'
write_cog(dataset.label, label_tif)

NameError: name 'dataset' is not defined

## Plot classified NDWI

In [None]:
#dataset.label.plot.imshow(figsize=(10,10));
#plt.savefig(f'{area_name}_ndwi_classes.png')

## Sample from array

In [None]:
# Drop the name bound to the masked NDWI array; the cells below only use `dataset`.
del ds

In [None]:
# Count the pixels in each class (1..n_class) and print the counts followed by
# each class's share of all classified pixels.
n_class = 3
class_sizes = np.array(
    [(dataset.label == class_id).sum().values for class_id in np.arange(1, n_class + 1)]
)

print(class_sizes)
print(class_sizes / class_sizes.sum())

In [None]:
# Total number of samples wanted, split evenly across the classes and rounded
# up so the per-class count is a whole number.
n_sample = 500
n_sample_class = np.ceil(n_sample / n_class).astype(int)
print(n_sample_class)

In [None]:
# Randomly pick up to n_sample_class pixels from every class without
# replacement. label_picked maps class id -> (row indices, column indices);
# the matching y/x coordinate values are written to one CSV per class.
label_values = dataset.label.values
label_picked = {}
for class_id in np.arange(1, n_class+1):
    # flatnonzero always yields a 1-D array of flat indices. The previous
    # argwhere(...).squeeze() collapsed to a 0-d array for a single-pixel
    # class, which np.random.choice then interprets as an integer population.
    index = np.flatnonzero(label_values == class_id)
    # A class smaller than the requested sample would make choice() raise with
    # replace=False, so take what is available and report the shortfall.
    n_pick = min(n_sample_class, index.size)
    if n_pick < n_sample_class:
        print(f'class {class_id}: only {index.size} pixels available, sampling {n_pick}')
    picked = np.random.choice(index, n_pick, replace=False)
    # convert flat indices back to (row, column) positions
    y, x = np.unravel_index(picked, label_values.shape)
    label_picked[class_id] = (y, x)
    # NOTE(review): fmt='%d' stores whole numbers only - fine for a metric CRS,
    # lossy if the coordinates are in degrees; confirm.
    np.savetxt(f'{area_name}/{area_name}_class_{class_id}.csv', np.vstack((dataset.y[y].values, dataset.x[x].values)).transpose(),fmt='%d', delimiter=',')

In [None]:
# Stack the picked positions of all classes into one table with columns
# 'y', 'x' (array positions taken from label_picked) and 'class'.
# DataFrame.append was removed in pandas 2.0, so the per-class frames are
# collected in a list and concatenated once.
class_frames = []
for class_id in np.arange(1, n_class+1):
    y, x = label_picked[class_id]
    df = pd.DataFrame({'y':y, 'x':x})
    df['class'] = class_id
    class_frames.append(df)

dfs = pd.concat(class_frames, ignore_index=True)

In [None]:
# Export the samples as a point shapefile.
# dfs.x / dfs.y hold array positions (column / row), not map coordinates, so
# they are translated through the dataset's x / y coordinate arrays before the
# points are built. Using the raw positions would place every point at its
# pixel index while labelling it with the raster CRS, and would disagree with
# the coordinates written to the per-class CSV files.
sample_x = dataset.x.values[dfs.x.to_numpy()]
sample_y = dataset.y.values[dfs.y.to_numpy()]

gdf = gpd.GeoDataFrame(
        dfs,
        crs=dataset.label.crs,
        geometry=gpd.points_from_xy(sample_x, sample_y)).reset_index()

# the array positions are no longer needed once the geometry exists
gdf = gdf.drop(['x', 'y'],axis=1)

gdf.to_file(f'{area_name}/{area_name}_samples.shp')