## Create Custom Container From Dockerfile

Create a Dockerfile that defines the environment the Workflow runs in.   
Once the image is built, push it to the GEOAnalytics Container Registry so the Workflow can use it.

In [34]:
! pip install --user docker

Collecting docker
  Obtaining dependency information for docker from https://files.pythonhosted.org/packages/db/be/3032490fa33b36ddc8c4b1da3252c6f974e7133f1a50de00c6b85cca203a/docker-6.1.3-py3-none-any.whl.metadata
  Downloading docker-6.1.3-py3-none-any.whl.metadata (3.5 kB)
Downloading docker-6.1.3-py3-none-any.whl (148 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.1/148.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: docker
Successfully installed docker-6.1.3


In [22]:
import os
import io
import getpass
import docker

In [23]:
# The DinD (Docker-in-Docker) sidecar is accessible over localhost.
# from_env() configures the client from the Docker settings in the environment.
docker_client = docker.from_env()
docker_client

<docker.client.DockerClient at 0x7fe1f6ec1450>

In [24]:
# Dockerfile for the Workflow's runtime image, kept as an in-memory string.
# This is a regular (non-raw) string literal, so Python consumes each
# backslash-newline pair: every multi-line RUN below reaches Docker as one line.
# NOTE(review): `requests` is imported by generate_polygon_artifact but is not
# listed here; presumably it arrives as a transitive dependency - confirm.
dockerfile = """
FROM python:3.11-slim
RUN apt update --no-install-recommends \
    >>/dev/null
    
RUN pip install \
        pandas==2.0.3 \
        geopandas==0.13.2 \
        numpy==1.24.4 \
        pystac_client==0.7.2 \
        planetary-computer==1.0.0 \
        odc-stac==0.3.6 \
        azure-storage-blob==12.17.0 \
        xarray==2023.7.0 \
        rioxarray==0.15.0 \
        pyproj==3.6.0 \
        shapely==2.0.1
"""
# Wrap the encoded text in a file-like object that is later passed to the image build.
dockerfile_file = io.BytesIO(dockerfile.encode())

In [25]:
# Attach a name attribute to the in-memory file object.
# NOTE(review): presumably for consumers that expect a named file - confirm the build needs it.
dockerfile_file.name = 'dockerfile'

In [26]:
# Target repository in the container registry and the version to publish.
registry_url = 'registry.eo4ph.geoanalytics.ca/tutorial'
image_version_tag = '0.1.1'
# Full image reference: <registry_url>/snow-cover:<image_version_tag>.
# The @script templates further down use this same reference as their image.
docker_image_tag = f'{registry_url}/snow-cover:{image_version_tag}'

In [27]:
# Build the image from the in-memory Dockerfile and tag it with the full registry reference.
# The call returns the built image together with an iterator over the build output.
docker_client.images.build(fileobj=dockerfile_file, tag=docker_image_tag)

(<Image: 'registry.eo4ph.geoanalytics.ca/tutorial/snow-cover:0.1.1'>,
 <itertools._tee at 0x7fe2473de340>)

In [None]:
# Authenticate against the registry as the JupyterHub user.
# The password is prompted for interactively so it is never stored in the notebook.
# NOTE(review): `registry_url` includes the `/tutorial` path segment - confirm the
# registry accepts that form for login.
docker_client.login(
    username=os.getenv('JUPYTERHUB_USER'),
    password=getpass.getpass(),
    registry=registry_url
)

In [None]:
# Push the tagged image to the registry so the Workflow can use it.
docker_client.images.push(docker_image_tag)

## Workflow Setup

In [39]:
# Import the required libraries
import os

os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd

from typing import List, Dict, Tuple, Any

from shapely.geometry import Polygon, MultiPolygon

from hera.shared import global_config
from hera.workflows import Artifact, Container, Steps, Workflow, Task, script, DAG, Resources, Parameter
from hera.workflows.models import ValueFrom

In [40]:
# Configure Hera globally; the workflow host is read from the WORKFLOW_HOST
# environment variable (None if it is not set).
# NOTE(review): Argo Workflows resources normally use `argoproj.io/v1alpha1` -
# confirm that `argoproj.io/v1` is intended here.
global_config.api_version = "argoproj.io/v1"
global_config.host = os.getenv("WORKFLOW_HOST")

In [41]:
@script(
    image=docker_image_tag,
    outputs=Artifact(name='geojson_out', path='/tmp/aoi.geojson'),
    resources=Resources(cpu_request='100m', memory_request='512Mi')
)
def generate_polygon_artifact(
        url: str,
        target_province: str
    ):
    """Download a boundary archive and write one province's outline as GeoJSON.

    The zip archive at ``url`` is extracted under /tmp/canvec, the first
    shapefile matching ``*geo_pol*_2.shp`` is read, and every polygon whose
    ``juri_en`` column equals ``target_province`` is merged into a single
    MultiPolygon written to /tmp/aoi.geojson (the ``geojson_out`` artifact).

    Raises an HTTP error if the download fails, FileNotFoundError if the
    archive holds no matching shapefile, and ValueError if no polygon
    matches ``target_province``.
    """
    import io
    import os
    import zipfile

    import requests
    import geopandas as gpd

    from glob import glob
    from shapely.geometry import Polygon, MultiPolygon

    extract_dir = '/tmp/canvec'
    # exist_ok keeps the step re-runnable when the directory is already there.
    os.makedirs(extract_dir, exist_ok=True)

    res = requests.get(url, timeout=60)
    # Fail here with a clear HTTP error instead of later with an unreadable zip.
    res.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(res.content), 'r') as zf:
        zf.extractall(path=extract_dir)

    # recursive=True lets '**' match any directory depth inside the archive.
    shp_matches = sorted(glob(f'{extract_dir}/**/*geo_pol*_2.shp', recursive=True))
    if not shp_matches:
        raise FileNotFoundError(
            f'No shapefile matching *geo_pol*_2.shp found under {extract_dir}'
        )
    gdf = gpd.read_file(shp_matches[0])

    # Pull out Administrative Boundaries
    tgt_col = 'juri_en'
    province_rows = gdf[gdf[tgt_col] == target_province]

    # Collect every polygon, flattening multi-part rows so none are dropped.
    polygons = []
    for geom in province_rows.geometry.values.tolist():
        if isinstance(geom, Polygon):
            polygons.append(geom)
        elif isinstance(geom, MultiPolygon):
            polygons.extend(geom.geoms)
    if not polygons:
        raise ValueError(f'No polygons found where {tgt_col} == {target_province!r}')

    # NOTE(review): the output frame carries no CRS, as before - confirm the
    # coordinates are in the system the downstream step expects.
    aoi = gpd.GeoDataFrame(data={'geom': [MultiPolygon(polygons)]}, geometry='geom')
    aoi.to_file('/tmp/aoi.geojson', driver='GeoJSON')
    

In [42]:
@script(
    image=docker_image_tag,
    resources=Resources(cpu_request='100m', memory_request='128Mi')
)
def fan_out_temporal_range(
        catalog_url: str,
        collection: str,
        assets: List[str],
        years: List[int],
        months: List[int],
        epsg: int,
        resolution: int
    ):
    """Validate the requested years/months and emit one payload per (year, month).

    Every year must be between 2000 and the current year, and every month
    between 1 and 12; otherwise ValueError is raised naming the offending value.
    The payloads (one dict per year/month combination, each carrying the shared
    query settings) are written to stdout as a JSON list.
    """
    import json
    import sys
    from datetime import datetime

    current_year = datetime.now().year
    for year in years:
        if year < 2000:
            raise ValueError(f'MODIS collection starts at 02/24/2000: Error {year}')
        if year > current_year:
            raise ValueError(f'Cannot query future: Error {year}')

    for month in months:
        if month < 1 or month > 12:
            raise ValueError(f'Month range is from 1 to 12, inclusive: Error: {month}')

    temporal_range = []
    for year in years:
        for month in months:
            # Single-digit months become zero-padded strings ('01'); 10-12 stay
            # integers, matching the payload format produced previously.
            month_label = f'0{month}' if month < 10 else month

            temporal_range.append({
                'catalog_url': catalog_url,
                'collection': collection,
                'assets': assets,
                'year': year,
                'month': month_label,
                'epsg': epsg,
                'resolution': resolution
            })
    json.dump(temporal_range, sys.stdout)

In [43]:
@script(
    image=docker_image_tag,
    resources=Resources(cpu_request=2, memory_request="8Gi"),
    node_selector={'ga.nodepool': 'a8mv2'},
    inputs=Artifact(name='geojson_in', path='/tmp/aoi.geojson'),
    outputs=Artifact(name='processed-tile', path='/tmp/processed-month.tif')
)
def process_month(payload: Dict[str, Any]):
    """Compute the median snow cover for one month, clipped to the AOI.

    ``payload`` is one entry produced by fan_out_temporal_range; the keys
    'collection', 'assets', 'year', 'month', 'epsg' and 'resolution' are read,
    plus 'catalog_url' when present. The AOI geometry is read from
    /tmp/aoi.geojson (the ``geojson_in`` artifact) and the result is written
    to /tmp/processed-month.tif (the ``processed-tile`` artifact).
    """
    import pyproj
    import odc.stac
    import rioxarray  # imported for its side effect: registers the `.rio` accessor
    import pystac_client
    import planetary_computer

    import numpy as np
    import geopandas as gpd

    from shapely.ops import transform

    aoi_frame = gpd.read_file('/tmp/aoi.geojson')
    aoi_geom = aoi_frame.geometry.values[0]

    # Honour the catalog passed in the payload; fall back to the previous default.
    catalog = pystac_client.Client.open(
        payload.get('catalog_url', "https://planetarycomputer.microsoft.com/api/stac/v1"),
        modifier=planetary_computer.sign_inplace,
    )

    query_time_range = f"{payload['year']}-{payload['month']}"
    query = catalog.search(
        collections=payload['collection'],
        intersects=aoi_geom.envelope,
        datetime=query_time_range
    )

    data = odc.stac.load(
        query.items(),
        crs=f"EPSG:{payload['epsg']}",
        bbox=aoi_geom.bounds,
        bands=payload['assets'],
        resolution=int(payload['resolution']),
    )
    snow_data = data['NDSI_Snow_Cover']
    # Values above 100 are flag codes (missing, cloud, water, fill, ...), not snow
    # cover. Build the mask from the band itself rather than the whole Dataset so
    # the result stays a DataArray.
    snow_data = snow_data.where(snow_data <= 100, other=np.nan)

    snow_median_data = snow_data.median(
        dim='time',
        skipna=True,
        keep_attrs=True
    )
    snow_median_data = snow_median_data.chunk('auto')

    # Reproject the AOI into the raster's CRS before clipping; the AOI is
    # treated as EPSG:4326 longitude/latitude.
    source_crs = pyproj.CRS('EPSG:4326')
    target_crs = pyproj.CRS(f"EPSG:{payload['epsg']}")
    projection_op = pyproj.Transformer.from_crs(
        source_crs, target_crs, always_xy=True
    ).transform
    projected_aoi = transform(projection_op, aoi_geom)

    # rio.clip expects an iterable of geometries, so wrap the single geometry.
    snow_median_data_clipped = snow_median_data.rio.clip(
        [projected_aoi],
        all_touched=True,
        drop=True
    )

    snow_median_data_clipped.name = query_time_range
    snow_median_data_clipped.attrs['long_name'] = query_time_range
    snow_median_data_clipped.rio.to_raster('/tmp/processed-month.tif')
    
    

    

In [44]:
@script(
    image=docker_image_tag,
    inputs=Artifact(name='processed-aoi', path='/tmp/processed-aoi.tif')
)
def write_raster_to_cloud_storage():
    """Placeholder step: declares the 'processed-aoi' input artifact but does nothing yet.

    No upload logic is implemented; the body is intentionally empty.
    """
    pass

### NDSI_Snow_Cover Value Map

- 0–100: NDSI snow cover
- 200: missing data
- 201: no decision
- 211: night
- 237: inland water
- 239: ocean
- 250: cloud
- 254: detector saturated
- 255: fill

## Main Workflow

In [45]:
# Workflow Configuration Section

# Zipped shapefile archive of administrative boundaries, and the value of its
# jurisdiction column ('juri_en') that selects the area of interest.
canada_boundaries_endpoint = 'https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/canvec/shp/Admin/canvec_15M_CA_Admin_shp.zip'
target_province = 'British Columbia'

# STAC catalog, collection and band(s) to load.
catalog_url = 'https://planetarycomputer.microsoft.com/api/stac/v1'
collection = 'modis-10A1-061'
assets = ['NDSI_Snow_Cover']
# One task is generated for every (year, month) combination of these two lists.
years = [2018, 2019, 2020]
months = [10, 11, 12, 1, 2, 3, 4]
# Output CRS and pixel size passed to the loader.
# NOTE(review): EPSG:3005 is presumably NAD83 / BC Albers, with resolution in metres - confirm.
epsg = 3005
resolution = 500

In [46]:
# Workflow 
# Assemble the Workflow as a single DAG: build the AOI artifact and the list of
# (year, month) payloads, then fan out one process_month task per payload.
with Workflow(
    name="british-columbia-monthly-snow-average",
    namespace=os.getenv("WORKFLOW_NS"),
    entrypoint="snowflow",
    parallelism=100,
) as wf1:
    with DAG(name='snowflow'):
        # Step a: province outline written as the 'geojson_out' artifact.
        a = generate_polygon_artifact(
            arguments=dict(
                url=canada_boundaries_endpoint,
                target_province=target_province,
            )
        )
        # Step b: JSON list of per-month payloads on stdout.
        b = fan_out_temporal_range(
            arguments=dict(
                catalog_url=catalog_url,
                collection=collection,
                assets=assets,
                years=years,
                months=months,
                epsg=epsg,
                resolution=resolution,
            )
        )
        # Step c: one task per payload from b, each receiving a's artifact
        # under the input name 'geojson_in'.
        c = process_month(
            arguments=a.get_artifact('geojson_out').as_name('geojson_in'),
            with_param=b.result
        )
        # Upload step, not wired in yet:
        # d = write_raster_to_cloud_storage(
        #     arguments=c.get_artifact('processed-tile').as_name('processed-aoi')
        # )

        # c runs only after both a and b have finished.
        [a, b] >> c

In [47]:
# Submit the Workflow to the configured host; the created Workflow object is returned.
wf1.create()

Workflow(api_version=None, kind=None, metadata=ObjectMeta(annotations=None, cluster_name=None, creation_timestamp=Time(__root__=datetime.datetime(2023, 8, 22, 19, 0, 42, tzinfo=datetime.timezone.utc)), deletion_grace_period_seconds=None, deletion_timestamp=None, finalizers=None, generate_name=None, generation=1, labels={'workflows.argoproj.io/creator': 'system-serviceaccount-pipeline-pipeline-argo-workflows-server'}, managed_fields=[ManagedFieldsEntry(api_version='argoproj.io/v1alpha1', fields_type='FieldsV1', fields_v1=FieldsV1(), manager='argo', operation='Update', subresource=None, time=Time(__root__=datetime.datetime(2023, 8, 22, 19, 0, 42, tzinfo=datetime.timezone.utc)))], name='british-columbia-monthly-snow-average', namespace='pipeline', owner_references=None, resource_version='285616914', self_link=None, uid='c8faf740-3646-4064-af61-c9dd915f84d6'), spec=WorkflowSpec(active_deadline_seconds=None, affinity=None, archive_logs=None, arguments=Arguments(artifacts=None, parameters=No