# Defra Earth Observation Data Service API

# Introduction 
This notebook has been created to introduce and test the Defra Earth Observation Data Service API.  Follow the steps below to import the libraries and configuration file, set query parameters to filter the available data, and create a parameterised URL using the api/layers endpoint with query parameters for date, geometry and satellite.  The remaining steps request the layer list, rank it by cloud cover and save the results.

## 1) Import libraries and configuration file
Here we *import* the libraries required to run this notebook.
If there are any errors when running this cell (all cells can be run using *'Shift-Return'*) then check that the libraries were correctly installed through **Conda** or the package manager that you use.

Before continuing with this notebook, ensure that the **USERNAME** and **ACCESS_TOKEN** parameters in the _config.py_ file are set to the credentials of the account that will be used for the following commands.

Check all other _config.py_ parameters are as you would expect them, especially the URL and GEOM parameters.

In [None]:
# Import general and geospatial data handling libraries
import urllib3, requests, os, importlib, json
import logging
import pandas as pd
from pandas.io.json import json_normalize
from IPython.display import Image
from datetime import datetime
import time
import os
import pathlib
from pathlib import Path
# Import your configuration file
import config
# configure specific library calls
importlib.reload(config)

In [None]:
# initialise notebook
# silence the warning urllib3 raises for HTTPS requests made without certificate verification
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# allow up to 100 characters per cell when dataframes are displayed
pd.options.display.max_colwidth = 100

# timestamp patterns (strftime syntax): a compact form and a human-readable form
output_fmt='%Y%m%dT%H%M%S'
pretty_fmt='%Y-%m-%d %H:%M:%S'

# create the 'data' output directory next to the notebook; exist_ok makes this
# safe to re-run and avoids the check-then-create race of a separate is_dir() test
output_dir = Path.cwd() / 'data'
output_dir.mkdir(exist_ok=True)

## 2) Configure query parameters
The next cell sets a series of query parameters for searching through the data on the system. You can change these and re-run the cell to investigate different scenarios. 

In [None]:
# configure variables
DATE_RANGE_START = '2019-01-01'
DATE_RANGE_END = '2019-06-01'
SATELLITE_ID = 2
TITLE = 'S2A'
TYPE = 'raster'
# NOTE(review): the '+' between coordinates presumably stands for a URL-encoded
# space, because GEOM is pasted unescaped into the query string - confirm
GEOM = "POLYGON((-1.6+51.36,0.28+51.36,0.28+51.69,-1.6+51.69,-1.6+51.36))"
# APPLY_MIN_CLOUD_FUNCTION = True is only valid together with APPLY_FILTERS = True
APPLY_FILTERS = True
APPLY_MIN_CLOUD_FUNCTION = True

# search locations
# England-wide footprint
# GEOM = "POLYGON((-3.70583842728179391 55.44991031769518486, -2.13623226289823265 55.44991031769518486, -0.53426308481603968 55.43372881084587078, -0.58280760536398546 54.54374593413353978, -0.63135212591193124 53.65376305742121588, 0.84116499737574024 53.58903703002395247, 2.31368212066341172 53.65376305742121588, 2.3622266412113575 52.76378018070888487, 2.39458965490998743 51.84143429029792571, 2.39458965490998743 50.90290689303765248, 2.3622266412113575 50.07765004372258488, 0.9867985590195758 50.04528703002395673, 0.53371636723875326 49.9967425094760074, -0.82553020810371081 50.04528703002395673, -2.23332130399412243 50.06146853687327081, -3.60874938618590413 50.04528703002395673, -5.42107815330919074 49.9967425094760074, -6.82886924919960148 50.04528703002395673, -5.50198568755576645 50.17473908481847644, -5.40489664645987578 50.87054387933902433, -3.6249308930352182 50.91908839988696656, -3.70583842728179391 55.44991031769518486))"
# small London footprint
# GEOM = "POLYGON((-0.09799 51.49697,-0.08799 51.49697,-0.08799 51.48697,-0.09799 51.48697,-0.09799 51.49697))"


def validate_query_config(apply_filters, apply_min_cloud_function, satellite_id):
    """Reject combinations of query settings that the notebook cannot process.

    The minimum-cloud selection is only run for Sentinel-2 layers
    (SATELLITE_ID == 2) and only makes sense on a filtered query, so both
    conditions are enforced here before any request is made.

    Args:
        apply_filters: value of APPLY_FILTERS.
        apply_min_cloud_function: value of APPLY_MIN_CLOUD_FUNCTION.
        satellite_id: value of SATELLITE_ID.

    Raises:
        ValueError: if the minimum-cloud function is requested without
            filters, or for a satellite other than Sentinel-2. Explicit
            raises are used instead of ``assert`` so the checks still run
            when Python is started with ``-O``.
    """
    if apply_min_cloud_function and not apply_filters:
        logging.error("\n\n # CONFIG ERROR! \n If you want to use APPLY_MIN_CLOUD_FUNCTION, then set APPLY_FILTERS to 'True'")
        raise ValueError("APPLY_MIN_CLOUD_FUNCTION requires APPLY_FILTERS to be True")

    # any id other than 2 is invalid here, not just 1
    if apply_min_cloud_function and satellite_id != 2:
        logging.error('\n # CONFIG ERROR \n You are trying to apply the "APPLY_MIN_CLOUD_FUNCTION" but have not set "SATELLITE_ID" to "2"')
        raise ValueError("APPLY_MIN_CLOUD_FUNCTION requires SATELLITE_ID to be 2")


# do config parameter validation
validate_query_config(APPLY_FILTERS, APPLY_MIN_CLOUD_FUNCTION, SATELLITE_ID)

    

#****NOTES****
#
#APPLY_FILTERS = False >> You download all layers from the EO service
#APPLY_FILTERS = True >> The above filter parameters will be applied to the API query
#
#APPLY_MIN_CLOUD_FUNCTION = False >> Applies to S2 data only, will download all S2 data as per filter parameter spec
#APPLY_MIN_CLOUD_FUNCTION = True >> Applies to S2 data only, will only download S2 granules that meet 
#    the lowest cloud cover criteria for each unique granule reference per orbit


##  3) Create parameterised URL
The following cell creates a parameterised URL using the API 'layers' endpoint and a set of query parameters. Copy the output and paste it into a browser address bar. The resultant page provides an indication of the information returned to this notebook (based on the input parameters) from the server. 

In [None]:
# Build the request settings from the details in your config file (config.py)

# WPS 'gs:Download' Execute request template; '@@@' sits where the layer name goes
# NOTE(review): presumably '@@@' is substituted with a layer title before posting - confirm
gsdownload_template_payload = '<?xml version="1.0" encoding="UTF-8"?><wps:Execute version="1.0.0" service="WPS" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.opengis.net/wps/1.0.0" xmlns:wfs="http://www.opengis.net/wfs" xmlns:wps="http://www.opengis.net/wps/1.0.0" xmlns:ows="http://www.opengis.net/ows/1.1" xmlns:gml="http://www.opengis.net/gml" xmlns:ogc="http://www.opengis.net/ogc" xmlns:wcs="http://www.opengis.net/wcs/1.1.1" xmlns:xlink="http://www.w3.org/1999/xlink" xsi:schemaLocation="http://www.opengis.net/wps/1.0.0 http://schemas.opengis.net/wps/1.0.0/wpsAll.xsd"><ows:Identifier>gs:Download</ows:Identifier><wps:DataInputs><wps:Input><ows:Identifier>layerName</ows:Identifier><wps:Data><wps:LiteralData>@@@</wps:LiteralData></wps:Data></wps:Input><wps:Input><ows:Identifier>outputFormat</ows:Identifier><wps:Data><wps:LiteralData>image/tiff</wps:LiteralData></wps:Data></wps:Input></wps:DataInputs><wps:ResponseForm><wps:ResponseDocument storeExecuteResponse="true" status="true"><wps:Output asReference="true" mimeType="application/zip"><ows:Identifier>result</ows:Identifier></wps:Output></wps:ResponseDocument></wps:ResponseForm></wps:Execute>'

# HTTP headers sent with the requests
headers = {
    'Content-type': 'application/xml',
    'User-Agent': 'curl/7.65.1',
}

# WPS endpoint, authenticated with the access token
wps_server = config.URL + config.WPS_SUF + '?access_token=' + config.ACCESS_TOKEN

# Every query starts with the layers endpoint, the credentials and the paging options
layers_url_parts = [
    config.URL,
    '/api/layers/?username=', config.USERNAME,
    '&api_key=', config.ACCESS_TOKEN,
    '&limit=20000&offset=0',
]

# With APPLY_FILTERS switched on, the search criteria are appended as well
if APPLY_FILTERS:
    layers_url_parts += [
        '&keywords__slug__in=sentinel-', str(SATELLITE_ID),
        '&geometry=', GEOM,
        # '%20' is a URL-encoded space between the date and the time of day
        '&date__range=', DATE_RANGE_START, '%2000:00,', DATE_RANGE_END, '%2023:59',
        '&title__icontains=', TITLE,
        '&type__in=', TYPE,
    ]

the_url = ''.join(layers_url_parts)

print(the_url)

##  4) Make a GET request using the URL above and parse the output to a dataframe

In [None]:
# make the request to the API and parse the output to a pandas dataframe
try:
    # query the geonode api layers endpoint
    payload = requests.get(url=the_url,headers=headers)

    # fail on HTTP error statuses (4xx/5xx) rather than trying to parse an error page
    payload.raise_for_status()

    # create a json object of the api payload content
    the_json = json.loads(payload.content)

    # load the records under the 'objects' key into a pandas dataframe
    df_full_layer_list = json_normalize(the_json, 'objects')
except Exception as err:
    # 'except Exception' (not a bare except) lets Ctrl-C / kernel interrupts through,
    # and 'from err' keeps the underlying cause visible in the traceback
    raise Exception('ERROR. The server response is not correctly formed. Please check your authentication token and internet connection') from err

if len(df_full_layer_list) == 0:
    raise Exception('ERROR. The API response was empty, no data to process or download, check the filters match data in the EO Portal')

# add granule reference and extract ARCSI_CLOUD_COVER from supplement information value
if SATELLITE_ID == 2:
    # 4th '_'-separated field of the layer title
    df_full_layer_list['granule-ref'] = df_full_layer_list['title'].str.split('_',n=4).str[3]
    # second-to-last piece after splitting the title on at most five '_'
    # NOTE(review): presumably the relative orbit field of the title - confirm against real titles
    df_full_layer_list['orbit-ref'] = df_full_layer_list['title'].str.split('_',n=5).str[-2]
    # 6th whitespace-separated token of the supplemental information text (kept as a string)
    df_full_layer_list['ARCSI_CLOUD_COVER'] = df_full_layer_list['supplemental_information'].str.split(n=6).str[5]

# show first five rows of the dataframe
df_full_layer_list.head(5)

##  5) Sort the dataframe by % cloud cover and group by granule name and orbit 
The following cell sorts the dataframe by % cloud cover ascending and groups the results to show the lowest cloud cover per granule, and per granule per relative orbit.  This is important because the imagery with the lowest cloud cover may be from an orbit where the granule lies on the edge of the swath and therefore does not provide complete coverage of the granule footprint.

In [None]:
if APPLY_MIN_CLOUD_FUNCTION and SATELLITE_ID == 2:
    # ARCSI_CLOUD_COVER is produced by a string split, so sorting the raw column is
    # lexicographic ('10.0' sorts before '9.0'). Rank on a numeric copy instead; values
    # that cannot be parsed become NaN and sort last, with the original text as tie-break.
    cloud_cover_rank = pd.to_numeric(df_full_layer_list["ARCSI_CLOUD_COVER"], errors="coerce")
    df_sorted_by_cloud = (
        df_full_layer_list.assign(_cloud_cover_rank=cloud_cover_rank)
        .sort_values(["_cloud_cover_rank", "ARCSI_CLOUD_COVER"])
        .drop(columns="_cloud_cover_rank")
    )

    # first row of each group after the ascending sort = lowest cloud cover
    # NOTE: GroupBy.first() takes the first non-null value per column
    df_min_cloud_per_granule = df_sorted_by_cloud.groupby(["granule-ref"], as_index=False).first()
    df_min_cloud_per_granule_per_orbit = df_sorted_by_cloud.groupby(["granule-ref",'orbit-ref'], as_index=False).first()

##  6) View and download the results 

In [None]:
# list the column headers of the 'min cloud per granule' data frame
# (the data frame is only created when APPLY_MIN_CLOUD_FUNCTION is True and SATELLITE_ID is 2)
df_min_cloud_per_granule.columns

In [None]:
# display only the key columns of the 'min cloud per granule' dataframe
summary_columns = ["granule-ref", "orbit-ref", "date", "ARCSI_CLOUD_COVER"]
df_min_cloud_per_granule[summary_columns]

In [None]:
# write both minimum-cloud tables as CSV files into the 'data' directory
per_granule_csv = output_dir / 'min_cloud_per_granule.csv'
per_granule_per_orbit_csv = output_dir / 'min-cloud-per-granule-per-orbit.csv'
df_min_cloud_per_granule.to_csv(per_granule_csv)
df_min_cloud_per_granule_per_orbit.to_csv(per_granule_per_orbit_csv)

# Now open the 'data' directory in your Scripts folder and inspect the two CSV files

In [None]:
# collect the layer titles from the 'minimum cloud per granule' table and display them
list_to_download = df_min_cloud_per_granule['title'].tolist()
list_to_download

In [None]:
# collect the layer titles from the 'minimum cloud per granule per orbit' table and display them
list_to_download_orbit = df_min_cloud_per_granule_per_orbit['title'].tolist()
list_to_download_orbit