## Create Custom Container From Dockerfile

Create a Dockerfile that defines the environment the Workflow will run in.   
Once the image is built, push it to the GEOAnalytics Container Registry so the Workflow can use it.

In [None]:
# Install the Docker SDK for Python (the `docker` package imported below) into the user site.
! pip install --user docker

In [1]:
import os
import io
import getpass
import docker

In [2]:
# The DinD (Docker-in-Docker) sidecar is accessible over localhost, so the
# client is configured from the environment.
docker_client = docker.from_env()
# Display the client object to confirm it was created.
docker_client

<docker.client.DockerClient at 0x7f2f01eb6560>

In [3]:
# Dockerfile for the Workflow's runtime image, kept in memory as a string.
# NOTE: this is a regular (non-raw) string, so each backslash immediately
# followed by a newline is a Python line continuation: it is removed from the
# resulting text and every multi-line RUN below collapses onto a single line.
dockerfile = """
FROM python:3.11-slim
RUN apt update --no-install-recommends \
    >>/dev/null
    
RUN pip install \
        pandas \
        geopandas \
        pystac_client \
        planetary-computer \
        azure-storage-blob
"""
# Alternative kept for reference: write the Dockerfile to disk instead of
# building from memory.
# with open('/tmp/dockerfile', 'w') as f:
#     f.write(dockerfile)
# Wrap the encoded text in a binary file-like object so it can be passed as
# `fileobj` when building the image.
dockerfile_file = io.BytesIO(dockerfile.encode())

In [4]:
# Label the in-memory buffer with a file name.
# NOTE(review): presumably so the SDK treats it like a Dockerfile on disk —
# confirm whether `images.build` actually needs this.
dockerfile_file.name = 'dockerfile'

In [5]:
# Repository prefix in the GEOAnalytics Container Registry; the image tags
# used for build and push are derived from it.
registry_url = 'registry.eo4ph.geoanalytics.ca/tutorial'

In [6]:
# Build the image from the in-memory Dockerfile and tag it for the registry.
docker_client.images.build(fileobj=dockerfile_file, tag=f'{registry_url}/snow-cover:0.1.0')

(<Image: 'registry.eo4ph.geoanalytics.ca/tutorial/snow-cover:0.1.0'>,
 <itertools._tee at 0x7f2f001e6ac0>)

In [12]:
# Alternative build path: build the same image from a Dockerfile on disk.
# The file is (re)written from `dockerfile` first so this cell also works on a
# fresh kernel instead of failing with FileNotFoundError.
dockerfile_path = '/tmp/dockerfile'
with open(dockerfile_path, 'w', encoding='utf-8') as f:
    f.write(dockerfile)

with open(dockerfile_path, 'rb') as d:
    docker_client.images.build(fileobj=d, tag=f'{registry_url}/snow-cover:0.1.0')

In [None]:
# Authenticate against the registry before pushing.
# The username is read from the JUPYTERHUB_USER environment variable; the
# password is prompted for interactively rather than written in the notebook.
docker_client.login(
    username=os.getenv('JUPYTERHUB_USER'),
    password=getpass.getpass(),
    registry=registry_url
)

In [None]:
# Push the tagged image to the registry so it can be used in the Workflow.
docker_client.images.push(f'{registry_url}/snow-cover:0.1.0')

## Workflow Setup

In [31]:
# Import the required libraries
import os

os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd

from typing import List, Dict, Tuple, Any

from shapely.geometry import Polygon, MultiPolygon

from hera.shared import global_config
from hera.workflows import Artifact, Container, Steps, Workflow, Task, script, DAG, Resources, Parameter
from hera.workflows.models import ValueFrom

In [32]:
# Configure Hera's global defaults: the API version it uses and the server
# host, which is read from the WORKFLOW_HOST environment variable.
global_config.api_version = "argoproj.io/v1"
global_config.host = os.getenv("WORKFLOW_HOST")

In [68]:
@script(
    image='registry.eo4ph.geoanalytics.ca/tutorial/snow-cover:0.1.0',
    outputs=Artifact(name='geojson_out', path='/tmp/aoi.geojson')
)
def generate_polygon_artifact(
        url: str,
        target_province: str
    ):
    """Build the area-of-interest GeoJSON for one province.

    Downloads the zipped shapefile archive from ``url``, selects the rows
    whose ``juri_en`` column equals ``target_province``, merges their polygons
    into a single MultiPolygon and writes it to ``/tmp/aoi.geojson``, the path
    exported as the ``geojson_out`` artifact.

    Args:
        url: Location of the zipped shapefile archive.
        target_province: Value to match in the ``juri_en`` column.

    Raises:
        requests.HTTPError: If the download returns an error status.
        FileNotFoundError: If the archive contains no matching shapefile.
        ValueError: If no polygon is found for ``target_province``.
    """
    import io
    import os
    import zipfile
    from glob import glob

    import geopandas as gpd
    import requests
    from shapely.geometry import MultiPolygon, Polygon

    extract_dir = '/tmp/canvec'
    # exist_ok keeps a re-run in the same filesystem from failing here.
    os.makedirs(extract_dir, exist_ok=True)

    res = requests.get(url)
    # Fail on a bad download instead of handing an error page to ZipFile.
    res.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(res.content), 'r') as zf:
        zf.extractall(path=extract_dir)

    # NOTE: without recursive=True, '**' behaves like '*', so the shapefile is
    # expected exactly one directory below the extraction root.
    matches = glob(f'{extract_dir}/**/*geo_pol*_2.shp')
    if not matches:
        raise FileNotFoundError(
            f'No shapefile matching *geo_pol*_2.shp found under {extract_dir}'
        )
    gdf = gpd.read_file(matches[0])

    # Pull out Administrative Boundaries
    tgt_col = 'juri_en'
    province_rows = gdf[gdf[tgt_col] == target_province]

    # MultiPolygon() only accepts Polygons, so multi-part geometries are
    # flattened into their parts instead of being silently dropped.
    polygons = []
    for geom in province_rows.geometry:
        if isinstance(geom, Polygon):
            polygons.append(geom)
        elif isinstance(geom, MultiPolygon):
            polygons.extend(geom.geoms)
    if not polygons:
        raise ValueError(
            f'No polygons found for {tgt_col} == {target_province!r}'
        )

    aoi = gpd.GeoDataFrame(data={'geom': [MultiPolygon(polygons)]})
    aoi = aoi.set_geometry('geom')
    aoi.to_file('/tmp/aoi.geojson', driver='GeoJSON')
    

In [69]:
@script(
    image='registry.eo4ph.geoanalytics.ca/tutorial/snow-cover:0.1.0'
)
def fan_out_temporal_range(
        catalog_url: str,
        collection: str,
        assets: List[str],
        start_year: int,
        end_year: int,
        start_month: int,
        end_month: int,
        # geom: Polygon | MultiPolygon
    ):
    """Emit one query payload per (year, month) in the requested window.

    Both the year and the month bounds are inclusive, so ``end_year`` and
    ``end_month`` are part of the output (this makes December and the current
    year reachable, which the validation below already allows). The payloads
    are written to stdout as a JSON list.

    Args:
        catalog_url: STAC catalog URL copied into every payload.
        collection: Collection identifier copied into every payload.
        assets: Asset names copied into every payload.
        start_year: First year to include (2000 or later).
        end_year: Last year to include (not after the current year).
        start_month: First month of each year to include (1-12).
        end_month: Last month of each year to include (1-12).

    Raises:
        ValueError: If a bound is out of range or a start exceeds its end.
    """
    import json
    import sys
    from datetime import datetime

    if start_year < 2000:
        raise ValueError(f'MODIS collection starts at 02/24/2000: Error {start_year}')
    if end_year > datetime.now().year:
        raise ValueError(f'Cannot query future: Error {end_year}')
    if end_year < start_year:
        raise ValueError(
            f'end_year must not precede start_year: Error {start_year} > {end_year}'
        )
    # Validate both bounds and report both, not only start_month.
    if not (1 <= start_month <= 12 and 1 <= end_month <= 12):
        raise ValueError(
            f'Month range is from 1 to 12, inclusive: Error: {start_month}-{end_month}'
        )
    if end_month < start_month:
        raise ValueError(
            f'end_month must not precede start_month: Error {start_month} > {end_month}'
        )

    temporal_range = [
        {
            'catalog_url': catalog_url,
            'collection': collection,
            'assets': assets,
            'year': year,
            # Always a zero-padded string, so the type does not switch
            # between str (months 1-9) and int (months 10-12).
            'month': f'{month:02d}',
        }
        for year in range(start_year, end_year + 1)
        for month in range(start_month, end_month + 1)
    ]
    json.dump(temporal_range, sys.stdout)

In [86]:
@script(
    image='registry.eo4ph.geoanalytics.ca/tutorial/snow-cover:0.1.0',
    inputs=Artifact(name='geojson_in', path='/tmp/aoi.geojson')
    # outputs=Artifact(name='processed-tile', path='/tmp/processed-month.tif')
)
def process_month(payload: Dict[str, Any]):
    """Attach the area-of-interest geometry to a payload and print it.

    The ``geojson_in`` artifact is read from ``/tmp/aoi.geojson``; its first
    geometry is stored in ``payload`` under the ``'geom'`` key.

    Args:
        payload: Query description to which the geometry is added.
    """
    from pathlib import Path
    import geopandas as gpd

    # TODO: search the STAC catalog (pystac_client) for this payload; the
    # query itself is not implemented yet.

    aoi_frame = gpd.read_file('/tmp/aoi.geojson')
    aoi_geometry = aoi_frame.geometry.values[0]
    payload['geom'] = aoi_geometry
    print(f'Payload: {payload}')
    

In [87]:
@script(
    image='registry.eo4ph.geoanalytics.ca/tutorial/snow-cover:0.1.0',
    inputs=Artifact(name='processed-aoi', path='/tmp/processed-aoi.tif')
)
def write_raster_to_cloud_storage():
    """Placeholder step: declares the ``processed-aoi`` input artifact only.

    The body is not implemented yet.
    NOTE(review): presumably meant to upload ``/tmp/processed-aoi.tif`` to
    cloud storage — confirm the intended destination.
    """
    pass

In [88]:
# Zipped CanVec administrative-boundaries shapefile, passed as `url` to generate_polygon_artifact.
canada_boundaries = 'https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/canvec/shp/Admin/canvec_15M_CA_Admin_shp.zip'
# Planetary Computer STAC API endpoint, passed as `catalog_url` to fan_out_temporal_range.
catalog_url = "https://planetarycomputer.microsoft.com/api/stac/v1"

In [89]:
# Workflow: build the British Columbia area of interest and the list of
# (year, month) payloads, then run `process-month` over those payloads.
with Workflow(
    name="british-columbia-monthly-snow-average",
    namespace=os.getenv("WORKFLOW_NS"),
    entrypoint="snowflow",
    parallelism=100,
)as wf1:    
    with DAG(name='snowflow'):
        # a: download the boundaries and export the province polygon as the
        # `geojson_out` artifact.
        a = generate_polygon_artifact(
                arguments={
                    'url': canada_boundaries,
                    'target_province': 'British Columbia'
                }
        )
        # b: print the JSON list of per-month payloads to stdout.
        b = fan_out_temporal_range(
                arguments={
                    'catalog_url': catalog_url,
                    'collection': 'modis-10A1-061',
                    'assets': ['NDSI'],                               
                    'start_year': 2015, 
                    'end_year': 2020,
                    'start_month': 4,
                    'end_month': 7,
                }
            )
        # c: receives a's `geojson_out` artifact under the name `geojson_in`;
        # `with_param` feeds it b's result, presumably running one task per
        # list item — confirm against Hera's `with_param` documentation.
        c = process_month(
            name='process-month',
            arguments=a.get_artifact('geojson_out').as_name('geojson_in'),
            with_param=b.result
        )
        
        # c runs only after both a and b.
        [a, b] >> c

In [90]:
# Submit the workflow to the configured host; the created Workflow object is
# displayed as the cell output.
wf1.create()

Workflow(api_version=None, kind=None, metadata=ObjectMeta(annotations=None, cluster_name=None, creation_timestamp=Time(__root__=datetime.datetime(2023, 8, 21, 22, 8, 9, tzinfo=datetime.timezone.utc)), deletion_grace_period_seconds=None, deletion_timestamp=None, finalizers=None, generate_name=None, generation=1, labels={'workflows.argoproj.io/creator': 'system-serviceaccount-pipeline-pipeline-argo-workflows-server'}, managed_fields=[ManagedFieldsEntry(api_version='argoproj.io/v1alpha1', fields_type='FieldsV1', fields_v1=FieldsV1(), manager='argo', operation='Update', subresource=None, time=Time(__root__=datetime.datetime(2023, 8, 21, 22, 8, 9, tzinfo=datetime.timezone.utc)))], name='british-columbia-monthly-snow-average', namespace='pipeline', owner_references=None, resource_version='284882310', self_link=None, uid='6c9934a4-b80c-425c-b99c-24b87918fc11'), spec=WorkflowSpec(active_deadline_seconds=None, affinity=None, archive_logs=None, arguments=Arguments(artifacts=None, parameters=None