## COG

## Initialise COG

### Load packages

In [79]:
%load_ext autoreload

import os
import sys
import html
import requests
import gdal
import rasterio
import geopandas as gpd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt

from datetime import datetime
from lxml import etree
from tempfile import NamedTemporaryFile

sys.path.append('../../../Scripts')
from dea_dask import create_local_dask_cluster

sys.path.append('../../shared')
import satfetcher

### Set up a dask cluster

In [80]:
# start a local dask cluster using the helper imported from dea_dask
create_local_dask_cluster()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 38067 instead
  http_address["port"], self.http_server.port


0,1
Client  Scheduler: tcp://127.0.0.1:39203  Dashboard: /user/lewis/proxy/38067/status,Cluster  Workers: 1  Cores: 2  Memory: 13.11 GB


## Get study area polygon

### Load study area geometry as geojson

In [81]:
# read study area
gdf = gpd.read_file('../../data/cog/yandisa.geojson')

# the stac bbox is expressed in lon/lat (wgs84), so reproject a copy when the
# file carries a crs; gdf itself is left exactly as read
gdf_wgs84 = gdf.to_crs(epsg=4326) if gdf.crs is not None else gdf

# overall [minx, miny, maxx, maxy] across every feature, not just the first row
gdf_bounds = gdf_wgs84.total_bounds.tolist()

## Set STAC Search parameters

In [82]:
# dea landsat collections to search. todo get from user in arcgis, sentinel 2
collections = ['ga_ls5t_ard_3', 'ga_ls7e_ard_3', 'ga_ls8c_ard_3']

# bands required by the workflow
bands = [
    'oa_fmask',
    'nbart_blue',
    'nbart_green',
    'nbart_red',
    'nbart_nir',
    'nbart_swir_1',
    'nbart_swir_2',
]

# date range of interest, joined into a stac interval below. todo get from user in arcgis
start_dt = '1990-01-01'
end_dt = '1995-12-31'

# assemble the stac search body (key order kept: collections, datetime, bbox, query, limit)
query = dict(
    collections=collections,
    datetime='{0}/{1}'.format(start_dt, end_dt),
    bbox=gdf_bounds,
    query={'eo:cloud_cover': {'lt': 5}},  # this doesnt work
    limit=1000,
)

## Fetch DEA Public Data via STAC Search

In [83]:
# set stac endpoint
search_endpoint = 'https://explorer.sandbox.dea.ga.gov.au/stac/search'

# post the search query as json; the timeout stops the cell hanging forever
# when the endpoint is unresponsive
stac_response = requests.post(search_endpoint, json=query, timeout=120)

# an error status means the search failed: report it together with the http code
if not stac_response.ok:
    raise ValueError(
        'Could not connect to DEA STAC SEARCH endpoint (HTTP {0}).'.format(
            stac_response.status_code))

# decode the body; from here on stac_response is the parsed json document
# NOTE(review): only this single response is read - confirm whether the
# endpoint paginates results beyond the requested limit
stac_response = stac_response.json()
num_items = len(stac_response.get('features', []))
print('Found {0} satellite scenes in total.'.format(num_items))

Found 146 satellite scenes in total.


## Iterate STAC response and remove cloud cover

In [84]:
# set max cloud cover (0 - 100)
max_cloud = 50

# get all stac scenes and their count
all_feats = stac_response.get('features', [])
num_all_items = len(all_feats)

# keep scenes strictly below the threshold; a scene without a cloud cover
# value cannot be checked, so it is dropped instead of crashing float()
feat_list = []
for feat in all_feats:
    cloud_cover = (feat.get('properties') or {}).get('eo:cloud_cover')
    if cloud_cover is not None and float(cloud_cover) < max_cloud:
        feat_list.append(feat)

# count cloud less scenes and compare (reported even when nothing remains)
num_clean_items = len(feat_list)
print('Removed {0} satellite scenes due to clouds.'.format(num_all_items - num_clean_items))
print('Total of {0} satellite scenes remaining.'.format(num_clean_items))

Removed 10 satellite scenes due to clouds.
Total of 136 satellite scenes remaining.


## Build VRTs for each scene in STAC response

In [94]:
%autoreload

# meta, check
def get_dea_landsat_vrt_dict(feat_list):
    """
    Build the per-band vrt lists for every scene returned by a stac query.

    Each required landsat band is passed, together with the full feature
    list, to satfetcher.build_vrt_list and the result is stored under the
    band name.

    Parameters
    ----------
    feat_list : list
        Non-empty list of scenes (features) from the stac response.

    Returns
    -------
    dict
        Band name -> value returned by satfetcher.build_vrt_list.

    Raises
    ------
    TypeError
        If feat_list is not a list.
    ValueError
        If feat_list is empty.
    """

    # notify
    print('Getting landsat vrts for each relevant bands.')

    # check features type, length
    if not isinstance(feat_list, list):
        raise TypeError('Features must be a list of xml objects.')
    elif not len(feat_list) > 0:
        raise ValueError('No features provided.')

    # required dea landsat ard band names
    # NOTE(review): the stac query cell lists the mask band as 'oa_fmask';
    # confirm which spelling satfetcher.build_vrt_list expects
    bands = [
        'nbart_blue',
        'nbart_green',
        'nbart_red',
        'nbart_nir',
        'nbart_swir_1',
        'nbart_swir_2',
        'oa_mask'
    ]

    # iter each band name and build vrt list
    band_vrts_dict = {}
    for band in bands:
        print('Building landsat vrt list for band: {}'.format(band))

        # get list of vrts for band and add to dict
        band_vrts_dict[band] = satfetcher.build_vrt_list(feat_list, band=band)

    # notify and return (list() so the band names print, not the bound method)
    print('Got landsat vrt lists for bands: {}'.format(list(band_vrts_dict)))
    return band_vrts_dict
    
# get dict of band names and associated vrt lists
# (keys are the band names, values come from satfetcher.build_vrt_list)
band_vrt_dict = get_dea_landsat_vrt_dict(feat_list)

Getting landsat vrts for each relevant bands.
Building landsat vrt list for band: nbart_blue
[30.0, 0.0, 565785.0, 0.0, -30.0, -2451285.0]
| 30.00, 0.00, 565785.00|
| 0.00,-30.00,-2451285.00|
| 0.00, 0.00, 1.00|
565785.0, 30.0, 0.0, -2451285.0, 0.0, -30.0


RuntimeError: No active exception to reraise

In [101]:
# scratch check: per the error shown below, CoordinateTransformation takes two
# spatial reference objects (plus optional options), so a single float is rejected
from osgeo import osr
osr.CoordinateTransformation(565785.0)

NotImplementedError: Wrong number or type of arguments for overloaded function 'new_CoordinateTransformation'.
  Possible C/C++ prototypes are:
    OSRCoordinateTransformationShadow::OSRCoordinateTransformationShadow(OSRSpatialReferenceShadow *,OSRSpatialReferenceShadow *)
    OSRCoordinateTransformationShadow::OSRCoordinateTransformationShadow(OSRSpatialReferenceShadow *,OSRSpatialReferenceShadow *,OGRCoordinateTransformationOptions *)


In [None]:
# check if bands in list
# NOTE(review): allowed_bands is not defined anywhere in the visible notebook;
# it must be defined before this cell can run


# ensure requested bands allowed (stops at the first unsupported band)
for b in bands:
    if b not in allowed_bands:
        raise ValueError('Requested an unsupported band.')
    
    

In [None]:
# meta, checks
def build_dea_ard_vrts(platform=None, band=None):
    """
    Stub: intended to take a specific platform and band names for dea
    public data. Not implemented yet - the body is only this docstring,
    so calling it returns None.
    """
    
    # checks
        
    
    




# NOTE(review): build_vrts is not defined in the visible notebook - TODO confirm which helper is intended
build_vrts(feat_list, band=['nbart_blue', 'nbart_red'])[0]

In [None]:
# create list of bands needed (swir_2 was previously missing: swir_1 was listed twice)
wanted_bands = [
    'nbart_blue',
    'nbart_green',
    'nbart_red',
    'nbart_nir',
    'nbart_swir_1',
    'nbart_swir_2',
    'oa_mask'
]

# build the vrt list for every wanted band in one pass, in list order
band_vrts = {
    band: generate_vrt(feat_list=feat_list, band=band)
    for band in wanted_bands
}

# keep the per-band names that the following cells refer to
vrt_blue = band_vrts['nbart_blue']
vrt_green = band_vrts['nbart_green']
vrt_red = band_vrts['nbart_red']
vrt_nir = band_vrts['nbart_nir']
vrt_swir_1 = band_vrts['nbart_swir_1']
vrt_swir_2 = band_vrts['nbart_swir_2']
vrt_mask = band_vrts['oa_mask']

## Build a complete in-memory VRT file

In [None]:
# checks, meta
def create_vrt_file(vrt_files):
    """
    Build a single vrt from the given sources and return its xml as text.

    The vrt is written by gdal to a named temporary file (sources stacked
    as separate bands), read back, decoded as utf-8 and returned; the
    temporary file is removed when the function exits.

    Parameters
    ----------
    vrt_files : sequence
        Sources handed to gdal.BuildVRT. Must not be empty.

    Returns
    -------
    str
        The xml content of the generated vrt.

    Raises
    ------
    ValueError
        If no sources are provided.
    RuntimeError
        If gdal.BuildVRT does not return a dataset.
    """

    # checks
    if vrt_files is None or len(vrt_files) == 0:
        raise ValueError('No files provided to build a VRT.')

    # load up a temp named file and create vrt
    with NamedTemporaryFile() as tmp:

        # set vrt options: each source becomes its own band, everything
        # else (bounds, resampling, resolution, srs) is left at gdal defaults
        vrt_opts = gdal.BuildVRTOptions(separate=True)

        # build vrt; no dataset back means gdal could not build it
        vrt_out = gdal.BuildVRT(tmp.name, vrt_files, options=vrt_opts)
        if vrt_out is None:
            raise RuntimeError('GDAL could not build a VRT from the provided files.')

        # drop the dataset reference so gdal closes it and writes the file
        vrt_out = None

        # warp and translate funcs
        # todo: MAY NEED

        # read it in to memory from the start of the file and decode it
        tmp.seek(0)
        return tmp.read().decode("utf-8")

In [None]:
# small subset of raster in utm 50N
#bb = [683100.0, -2542470.0, 686070.0, -2539500.0]

# todo improve this code

# build one vrt document per band, in the order blue -> mask
(vrt_blue_out,
 vrt_green_out,
 vrt_red_out,
 vrt_nir_out,
 vrt_swir_1_out,
 vrt_swir_2_out,
 vrt_mask_out) = [
    create_vrt_file(vrt_files=band_files)
    for band_files in (vrt_blue, vrt_green, vrt_red, vrt_nir,
                       vrt_swir_1, vrt_swir_2, vrt_mask)
]

## Parse datetime strings into map

In [None]:
def parse_datetimes(vrt_string):
    """
    Map each Description element in a vrt document to its 1-based position.

    Parameters
    ----------
    vrt_string : str
        The vrt xml as text.

    Returns
    -------
    dict
        Position (starting at 1, in document order) -> text of the
        Description element found at that position.
    """

    # convert html entities back to plain characters
    # (uses the argument; previously the global vrt_blue_out was read instead)
    clean_elem = html.unescape(vrt_string)

    # convert string to etree elements
    root = etree.fromstring(clean_elem)

    # pull every description element at any depth, numbered from 1
    return {
        idx: elem.text
        for idx, elem in enumerate(root.findall('.//Description'), start=1)
    }

In [None]:
# parse the datetime map out of every band's vrt document, blue -> mask
(dt_blue,
 dt_green,
 dt_red,
 dt_nir,
 dt_swir_1,
 dt_swir_2,
 dt_mask) = [
    parse_datetimes(vrt_string=vrt_doc)
    for vrt_doc in (vrt_blue_out, vrt_green_out, vrt_red_out, vrt_nir_out,
                    vrt_swir_1_out, vrt_swir_2_out, vrt_mask_out)
]

# check if lengths are all same


## Convert to chunked dataset

In [None]:
def build_xr_dataset(vrt_file, band_name,
                     x_slice=slice(4000, 5000), y_slice=slice(3000, 4000)):
    """
    Open a vrt lazily and return it as a single-variable dataset.

    Parameters
    ----------
    vrt_file : str
        Source handed to xr.open_rasterio.
    band_name : str
        Name given to the data variable in the returned dataset.
    x_slice, y_slice : slice or None
        Positional (pixel index) window applied along x and y. The defaults
        reproduce the previously hard-coded subset; pass None to keep the
        full extent of that axis.

    Returns
    -------
    The dataset produced from the opened array, with the 'band' dimension
    renamed to 'time' and the window applied.
    """

    # setup chunks: one band per chunk, x/y chunk sizes chosen automatically
    chunks = {'band': 1, 'x': 'auto', 'y': 'auto'}

    # load xr as data array
    da = xr.open_rasterio(vrt_file, chunks=chunks)

    # rename default band label to time
    da = da.rename({'band': 'time'})

    # convert to dataset
    ds = da.to_dataset(name=band_name, promote_attrs=True)

    # subset by pixel position; slice(None) keeps the whole axis
    window = {
        'x': slice(None) if x_slice is None else x_slice,
        'y': slice(None) if y_slice is None else y_slice,
    }
    return ds.isel(**window)

In [None]:
# create datasets (only the blue band so far); the vrt xml text returned by
# create_vrt_file is passed straight in as the raster source
ds_blue = build_xr_dataset(vrt_file=vrt_blue_out, band_name='nbart_blue')

In [None]:
# replace datetime
def replace_datetimes(ds, dt):
    """
    Swap the positional 'time' labels of ds for real datetimes and sort.

    Parameters
    ----------
    ds : object with a 'time' coordinate
        Its current 'time' values must be keys of dt.
    dt : dict
        Maps each current 'time' value to a datetime string; a trailing
        'Z' marker is stripped before conversion.

    Returns
    -------
    ds sorted by its new 'time' values. Note that 'time' is assigned on the
    object passed in.

    Raises
    ------
    KeyError
        If a 'time' value of ds has no entry in dt.
    """

    # replace timezone and convert numpy
    # (uses the dt argument; previously the global dt_blue was read instead)
    dt_lookup = {
        key: np.datetime64(value.replace('Z', ''))
        for key, value in dt.items()
    }

    # remap
    ds['time'] = [dt_lookup[label] for label in ds['time'].values.tolist()]
    return ds.sortby('time')
    
# NOTE(review): rebinds ds_blue, so re-running this cell feeds the already
# converted times back in - presumably the lookup then fails; TODO confirm
ds_blue = replace_datetimes(ds_blue, dt_blue)

In [None]:
# compute
%time ds_blue = ds_blue.compute()
ds_blue

In [None]:
# combine all dask datasets into one
#xr.merge([ds_blue, ds_green, ds_red])

## Test download times

### Try raw, without dask

In [None]:
# speed testing with no scheduler argument (whatever scheduler is the current default)
# NOTE(review): a cluster is created near the top of the notebook, so this may still be distributed - TODO confirm
%time ds = ds.compute()

### Try raw, without dask but with threading

In [None]:
# speed testing with the threaded scheduler requested explicitly
%time ds = ds.compute(scheduler='threads')

### Try raw, without dask but with processes

In [None]:
# speed testing with the process-based scheduler requested explicitly
%time ds = ds.compute(scheduler='processes')

### Try dask, with distributed scheduler

In [None]:
# start a dask.distributed client with processes=True and display it
# NOTE(review): create_local_dask_cluster() was already called near the top
# of the notebook - confirm a second client/cluster is intended here
import dask
from dask.distributed import Client
client = Client(processes=True)
client

In [None]:
# about 47 secs with processes=false, 21 secs when True
%time ds = ds.compute()

### Try splitting the dataset per time step and computing the pieces with futures

In [None]:
import concurrent.futures 

# helper so executor.map can trigger a compute on each item
def compute_da(da):
    """Call compute() on the given object and hand back whatever it returns."""
    computed = da.compute()
    return computed

In [None]:
# split ds into separate pieces, one per value along 'time'
da_list = []
for dt in ds['time']:
    da = ds.sel(time=dt)
    da_list.append(da)
    
# compute the per-time pieces concurrently on a thread pool (timed);
# executor.map keeps the results in the same order as da_list
num_cores = 2
with concurrent.futures.ThreadPoolExecutor(num_cores) as executor:
    %time da_list = list(executor.map(compute_da, da_list))
    
# join the computed pieces back together along 'time'
ds = xr.concat(da_list, dim='time')

## Working

## Use this to auto gen vrt to test

In [None]:
# really good test env: four ls5t scenes, nbart bands 1-3 of each
scene_url = (
    '/vsicurl/https://data.dea.ga.gov.au/baseline/ga_ls5t_ard_3/'
    '{path}/{row}/{ymd}/ga_ls5t_nbart_3-0-0_{path}{row}_{date}_final_band{band:02d}.tif'
)

# (path, row, acquisition date) of every test scene
test_scenes = [
    ('112', '076', '1990-02-09'),
    ('112', '076', '1990-03-13'),
    ('111', '076', '1990-03-22'),
    ('112', '076', '1990-03-29'),
]


def _build_test_vrt(vrt_path, sources, **vrt_kwargs):
    """Write a vrt to vrt_path, failing loudly when gdal returns no dataset."""
    vrt_ds = gdal.BuildVRT(vrt_path, sources, **vrt_kwargs)
    if vrt_ds is None:
        raise RuntimeError('Could not build VRT: {0}'.format(vrt_path))

    # flush to disk; the dataset reference is dropped when the function returns
    vrt_ds.FlushCache()


# one vrt per scene (vrt_1.vrt ... vrt_4.vrt), its three bands kept separate
vrt_list = []
for scene_num, (path, row, date) in enumerate(test_scenes, start=1):
    band_urls = [
        scene_url.format(path=path, row=row, ymd=date.replace('-', '/'),
                         date=date, band=band_num)
        for band_num in (1, 2, 3)
    ]
    vrt_path = 'vrt_{0}.vrt'.format(scene_num)
    _build_test_vrt(vrt_path, band_urls, separate=True)
    vrt_list.append(vrt_path)

# combine band 1 of every scene vrt into a single vrt
_build_test_vrt('vrt_all.vrt', vrt_list, separate=False, bandList=[1])

# setup chunks
chunks = {'band': 1, 'x': 'auto', 'y': 'auto'}
ds = xr.open_rasterio('vrt_all.vrt', chunks=chunks)
ds = ds.isel(x=slice(2500, 3000), y=slice(2500, 3000))
ds.compute()