# COG

## Initialise COG

### Load packages

In [1]:
%load_ext autoreload

import os
import sys
import html
import requests
import gdal
import rasterio
import geopandas as gpd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt

from datetime import datetime
from lxml import etree


sys.path.append('../../../Scripts')
from dea_dask import create_local_dask_cluster

sys.path.append('../../shared')
import satfetcher

  shapely_geos_version, geos_capi_version_string


### Set up a dask cluster

In [2]:
# initialise the cluster using the dea_dask helper imported in the first cell;
# the call's return value is not stored in a variable here
create_local_dask_cluster()

0,1
Client  Scheduler: tcp://127.0.0.1:45905  Dashboard: /user/lewis/proxy/8787/status,Cluster  Workers: 1  Cores: 2  Memory: 13.11 GB


## Get study area polygon

### Load study area geometry as geojson

In [3]:
# read the study area geometry from geojson
gdf = gpd.read_file('../../data/cog/yandisa.geojson')

# fail early with a clear message if the file holds no features
if gdf.empty:
    raise ValueError('Study area geojson contains no features.')

# bounding box [minx, miny, maxx, maxy] as a plain list of floats. total_bounds
# spans every feature in the file, so a multi-feature study area is not cut
# down to the extent of its first feature only.
# NOTE(review): the stac bbox is presumably expected in lon/lat (WGS84) -
# confirm the geojson is stored in that crs
gdf_bounds = gdf.total_bounds.tolist()

## Set STAC Search parameters

In [4]:
# satellite collections to search on dea. todo get from user in arcgis, sentinel 2
collections = ['ga_ls5t_ard_3', 'ga_ls7e_ard_3', 'ga_ls8c_ard_3']

# bands required from each scene
bands = [
    'nbart_blue', 'nbart_green', 'nbart_red', 'nbart_nir',
    'nbart_swir_1', 'nbart_swir_2', 'oa_fmask',
]

# date range of interest. todo get from user in arcgis
start_dt = '1990-01-01'
end_dt = '1995-12-31'

# stac expresses the range as a single 'start/end' interval string
date_range = '{0}/{1}'.format(start_dt, end_dt)

# server-side cloud cover filter. this doesnt work (per the original author),
# which is why scenes are filtered again client-side in a later cell
cloud_filter = {'eo:cloud_cover': {'lt': 5}}

# bring it all together for a query
query = {
    'collections': collections,
    'datetime': date_range,
    'bbox': gdf_bounds,
    'query': cloud_filter,
    'limit': 1000
}

## Fetch DEA Public Data via STAC Search

In [5]:
# set stac endpoint
search_endpoint = 'https://explorer.sandbox.dea.ga.gov.au/stac/search'

# post the search body as json. the timeout stops the cell from hanging
# forever if the server never answers; connection problems surface as the
# usual requests exceptions
stac_http_response = requests.post(search_endpoint, json=query, timeout=60)

# a non-ok status means the server was reached but rejected or failed the
# search, so report the status instead of claiming the connection failed
if not stac_http_response.ok:
    raise ValueError(
        'DEA STAC SEARCH endpoint returned HTTP status {0}.'.format(
            stac_http_response.status_code
        )
    )

# decode the json body; later cells read the scenes from its 'features' list
stac_response = stac_http_response.json()

# tolerate a missing/null 'features' entry rather than failing in len()
num_items = len(stac_response.get('features') or [])
print('Found {0} satellite scenes in total.'.format(num_items))

# NOTE(review): the query asks for at most query['limit'] scenes and no paging
# is done here - if num_items equals the limit, scenes may be missing; confirm

Found 146 satellite scenes in total.


## Iterate STAC response and remove cloud cover

In [6]:
# set max cloud cover (0 - 100)
max_cloud = 25

# all scenes returned by the stac search (empty if the key is missing/null)
all_feats = stac_response.get('features') or []
num_all_items = len(all_feats)

# keep scenes whose reported cloud cover is strictly below the threshold.
# scenes without an eo:cloud_cover value cannot be assessed, so they are
# skipped and counted separately instead of crashing float() on None
feat_list = []
num_unknown_items = 0
for feat in all_feats:
    cloud_cover = (feat.get('properties') or {}).get('eo:cloud_cover')
    if cloud_cover is None:
        num_unknown_items += 1
    elif float(cloud_cover) < max_cloud:
        feat_list.append(feat)

# count cloud less scenes and compare; reported even when nothing remains so
# an empty result is visible rather than silent
num_clean_items = len(feat_list)
num_cloudy_items = num_all_items - num_clean_items - num_unknown_items
print('Removed {0} satellite scenes due to clouds.'.format(num_cloudy_items))
if num_unknown_items:
    print('Skipped {0} satellite scenes with no cloud cover value.'.format(num_unknown_items))
print('Total of {0} satellite scenes remaining.'.format(num_clean_items))

Removed 23 satellite scenes due to clouds.
Total of 123 satellite scenes remaining.


## Build VRTs for each band of each scene in STAC response

In [7]:
# get dict of band names and associated vrt lists, built by the satfetcher
# helper from the cloud-filtered scene list (feat_list) of the previous cell
band_vrt_dict = satfetcher.get_dea_landsat_vrt_dict(feat_list)

Getting landsat vrts for each relevant bands.
Building landsat vrt list for band: nbart_blue.
Building landsat vrt list for band: nbart_green.
Building landsat vrt list for band: nbart_red.
Building landsat vrt list for band: nbart_nir.
Building landsat vrt list for band: nbart_swir_1.
Building landsat vrt list for band: nbart_swir_2.
Building landsat vrt list for band: oa_fmask.
Got 7 landsat vrt band lists successfully.


## Combine VRTs into file for each band set

In [8]:
# loop each band and combine vrts into one per band
# NOTE(review): the helper logs "temp. file" per band, so the values are
# presumably temporary vrt files keyed by band name - confirm in satfetcher
vrt_file_dict = satfetcher.combine_vrts_per_band(band_vrt_dict)

Combining VRTs into single VRTs per band.
Combining VRTs into temp. file for band: nbart_blue.
Combining VRTs into temp. file for band: nbart_green.
Combining VRTs into temp. file for band: nbart_red.
Combining VRTs into temp. file for band: nbart_nir.
Combining VRTs into temp. file for band: nbart_swir_1.
Combining VRTs into temp. file for band: nbart_swir_2.
Combining VRTs into temp. file for band: oa_fmask.
Combined 7 band vrt lists successfully.


## Extract datetimes from vrt file for each band

In [9]:
# loop each band and extract datetimes for each, reading them from the
# combined per-band vrt files produced in the previous cell
vrt_dt_dict = satfetcher.get_vrt_file_datetimes(vrt_file_dict)

Extracting datetimes for VRTs per band.
Extracting datetimes from VRTs for band: nbart_blue.
Extracting datetimes from VRTs for band: nbart_green.
Extracting datetimes from VRTs for band: nbart_red.
Extracting datetimes from VRTs for band: nbart_nir.
Extracting datetimes from VRTs for band: nbart_swir_1.
Extracting datetimes from VRTs for band: nbart_swir_2.
Extracting datetimes from VRTs for band: oa_fmask.
Extracted 7 band vrt datetimes successfully.


## Combine VRT files and datetimes into a single dict

In [10]:
# combine vrt files and datetimes for each band into final dict; this is the
# input handed to satfetcher.build_xr_datasets further down
vrt_dict = satfetcher.prepare_full_vrt_dicts(vrt_file_dict, vrt_dt_dict)

Combining vrt files and datetimes per band.
Combined vrt files and datetimes per band successfully.


In [11]:
# NOTE(review): a bare `raise` with no active exception fails with
# "RuntimeError: No active exception to reraise" - presumably left here on
# purpose to halt "Run All" before the slow cells below; confirm
raise

RuntimeError: No active exception to reraise

## Build xarray datasets

In [35]:
# build an xarray dataset from the per-band vrt files and datetimes in vrt_dict
# NOTE(review): later cells call ds.compute(), so the data is presumably still
# lazy here rather than loaded into memory - confirm in satfetcher
ds = satfetcher.build_xr_datasets(vrt_dict=vrt_dict)

# show ds
#ds

Building an xarray dataset from vrt files and datetimes.
Working on dataset for band: nbart_blue
Working on dataset for band: nbart_green
Working on dataset for band: nbart_red
Working on dataset for band: nbart_nir
Working on dataset for band: nbart_swir_1
Working on dataset for band: nbart_swir_2
Working on dataset for band: oa_fmask
Built an xarray dataset successfully.


## Subset size of dataset to study area bounds

In [36]:
# fixed pixel window along each axis (positional indices, not coordinates)
x_window = slice(4000, 5000)
y_window = slice(3000, 4000)

# positional subset of the dataset to that window
ds = ds.isel(x=x_window, y=y_window)

# show ds
#ds

In [37]:
# compute the subset and rebind ds to the computed result (the saved output
# of this cell shows the run was stopped with a KeyboardInterrupt)
ds = ds.compute()

KeyboardInterrupt: 



## Test download times

### Try raw, without dask

In [None]:
# speed testing without dask distributed
# NOTE(review): ds is rebound to the computed result, so any timing cell run
# after this one presumably starts from already-loaded data - confirm each
# timing is taken from a freshly built, lazy ds
%time ds = ds.compute()

### Try without dask distributed, using the threaded scheduler

In [None]:
# speed testing with scheduler='threads' instead of dask distributed
# NOTE(review): ds is rebound to the computed result - confirm this is timed
# against a freshly built, lazy ds
%time ds = ds.compute(scheduler='threads')

### Try without dask distributed, using the process-based scheduler

In [None]:
# speed testing with scheduler='processes' instead of dask distributed
# NOTE(review): ds is rebound to the computed result - confirm this is timed
# against a freshly built, lazy ds
%time ds = ds.compute(scheduler='processes')

### Try dask, with distributed scheduler

In [None]:
# NOTE(review): these imports sit mid-notebook rather than in the first cell,
# and the `dask` name itself is not referenced in the cells visible here
import dask
from dask.distributed import Client
# start a dask distributed client with processes=True
# NOTE(review): create_local_dask_cluster() was already called near the top of
# the notebook - confirm a second client is intended
client = Client(processes=True)
# bare expression so the client summary is shown as the cell output
client

In [None]:
# timing with the distributed client created in the previous cell:
# about 47 secs with processes=false, 21 secs when True
# NOTE(review): ds is rebound to the computed result - confirm this is timed
# against a freshly built, lazy ds
%time ds = ds.compute()

### Try splitting the dataset by time step and computing the pieces with concurrent futures

In [None]:
import concurrent.futures 

# create compute func
def compute_da(da):
    """Compute `da` and hand back the result.

    Thin wrapper around `da.compute()` so the call can be passed to
    `executor.map` as a plain one-argument callable.
    """
    computed = da.compute()
    return computed

In [None]:
# split ds into seperate das
da_list = []
for dt in ds['time']:
    da = ds.sel(time=dt)
    da_list.append(da)
    
# try parallel load of all bands
num_cores = 2
with concurrent.futures.ThreadPoolExecutor(num_cores) as executor:
    %time da_list = list(executor.map(compute_da, da_list))
    
ds = xr.concat(da_list, dim='time')

## Working

## Auto-generate VRTs for testing

In [None]:
# really good test env: build one 3-band vrt per landsat 5 scene, mosaic
# band 1 of all scenes into a single vrt, then open a small window of it.
# maps each per-scene vrt file to the remote band tifs it stacks; one loop
# replaces four copy-pasted stanzas whose vrt1..vrt4 variables only ever held
# None (the return value of FlushCache()).
scene_band_urls = {
    'vrt_1.vrt': [
        '/vsicurl/https://data.dea.ga.gov.au/baseline/ga_ls5t_ard_3/112/076/1990/02/09/ga_ls5t_nbart_3-0-0_112076_1990-02-09_final_band01.tif',
        '/vsicurl/https://data.dea.ga.gov.au/baseline/ga_ls5t_ard_3/112/076/1990/02/09/ga_ls5t_nbart_3-0-0_112076_1990-02-09_final_band02.tif',
        '/vsicurl/https://data.dea.ga.gov.au/baseline/ga_ls5t_ard_3/112/076/1990/02/09/ga_ls5t_nbart_3-0-0_112076_1990-02-09_final_band03.tif',
    ],
    'vrt_2.vrt': [
        '/vsicurl/https://data.dea.ga.gov.au/baseline/ga_ls5t_ard_3/112/076/1990/03/13/ga_ls5t_nbart_3-0-0_112076_1990-03-13_final_band01.tif',
        '/vsicurl/https://data.dea.ga.gov.au/baseline/ga_ls5t_ard_3/112/076/1990/03/13/ga_ls5t_nbart_3-0-0_112076_1990-03-13_final_band02.tif',
        '/vsicurl/https://data.dea.ga.gov.au/baseline/ga_ls5t_ard_3/112/076/1990/03/13/ga_ls5t_nbart_3-0-0_112076_1990-03-13_final_band03.tif',
    ],
    'vrt_3.vrt': [
        '/vsicurl/https://data.dea.ga.gov.au/baseline/ga_ls5t_ard_3/111/076/1990/03/22/ga_ls5t_nbart_3-0-0_111076_1990-03-22_final_band01.tif',
        '/vsicurl/https://data.dea.ga.gov.au/baseline/ga_ls5t_ard_3/111/076/1990/03/22/ga_ls5t_nbart_3-0-0_111076_1990-03-22_final_band02.tif',
        '/vsicurl/https://data.dea.ga.gov.au/baseline/ga_ls5t_ard_3/111/076/1990/03/22/ga_ls5t_nbart_3-0-0_111076_1990-03-22_final_band03.tif',
    ],
    'vrt_4.vrt': [
        '/vsicurl/https://data.dea.ga.gov.au/baseline/ga_ls5t_ard_3/112/076/1990/03/29/ga_ls5t_nbart_3-0-0_112076_1990-03-29_final_band01.tif',
        '/vsicurl/https://data.dea.ga.gov.au/baseline/ga_ls5t_ard_3/112/076/1990/03/29/ga_ls5t_nbart_3-0-0_112076_1990-03-29_final_band02.tif',
        '/vsicurl/https://data.dea.ga.gov.au/baseline/ga_ls5t_ard_3/112/076/1990/03/29/ga_ls5t_nbart_3-0-0_112076_1990-03-29_final_band03.tif',
    ],
}

# build each per-scene vrt with its inputs stacked as separate bands. the
# dataset returned by BuildVRT is flushed straight away and never bound to a
# name, as in the original calls
for vrt_path, band_urls in scene_band_urls.items():
    gdal.BuildVRT(vrt_path, band_urls, separate=True).FlushCache()

# add to list (dict keys keep insertion order: vrt_1 .. vrt_4), then combine
# band 1 of every per-scene vrt into one vrt
vrt_list = list(scene_band_urls)
gdal.BuildVRT('vrt_all.vrt', vrt_list, separate=False, bandList=[1]).FlushCache()

# setup chunks
chunks = {'band': 1, 'x': 'auto', 'y': 'auto'}

# open the combined vrt with the chunking above and cut a 500 x 500 pixel window
# NOTE(review): xr.open_rasterio is deprecated in newer xarray releases -
# confirm the pinned xarray version still provides it
ds = xr.open_rasterio('vrt_all.vrt', chunks=chunks)
ds = ds.isel(x=slice(2500, 3000), y=slice(2500, 3000))

# bare expression so the computed result is shown as the cell output
ds.compute()