Merge pull request #55 from getindata/release-0.4.0
Release 0.4.0
marrrcin committed Apr 28, 2023
2 parents c220ac3 + 78c54ff commit 2e5836b
Showing 31 changed files with 912 additions and 215 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.3.6
current_version = 0.4.0

[bumpversion:file:pyproject.toml]

2 changes: 1 addition & 1 deletion .copier-answers.yml
@@ -7,7 +7,7 @@ description: Kedro plugin with Azure ML Pipelines support
docs_url: https://kedro-azureml.readthedocs.io/
full_name: Kedro Azure ML Pipelines plugin
github_url: https://github.com/getindata/kedro-azureml
initial_version: 0.3.6
initial_version: 0.4.0
keywords:
- kedro
- mlops
15 changes: 10 additions & 5 deletions .github/workflows/tests_and_publish.yml
@@ -103,6 +103,9 @@ jobs:
if: (github.event.pull_request.head.repo.full_name == github.repository || github.event.pull_request == null)
needs: [unit_tests, sonarcloud]
runs-on: ubuntu-latest
strategy:
matrix:
e2e_config: ["e2e", "e2e_pipeline_data_passing"]
steps:
- uses: actions/checkout@v2

@@ -123,7 +126,7 @@
- name: Initialize starter project
run: |
pip install $(find "./dist" -name "*.tar.gz")
kedro new --starter spaceflights --config tests/conf/e2e/starter-config.yml --verbose
kedro new --starter spaceflights --config tests/conf/${{ matrix.e2e_config }}/starter-config.yml --verbose
- name: Install starter requirements
working-directory: ./spaceflights
@@ -143,9 +146,10 @@
cat Dockerfile
echo "!data/01_raw" >> .dockerignore
rm conf/base/catalog.yml
cp ../tests/conf/e2e/catalog.yml conf/base/catalog.yml
cp ../tests/conf/e2e/azureml.yml conf/base/azureml.yml
cp ../tests/conf/${{ matrix.e2e_config }}/catalog.yml conf/base/catalog.yml
cp ../tests/conf/${{ matrix.e2e_config }}/azureml.yml conf/base/azureml.yml
sed -i 's/{container_registry}/${{ secrets.REGISTRY_LOGIN_SERVER }}/g' conf/base/azureml.yml
sed -i 's/{image_tag}/${{ matrix.e2e_config }}/g' conf/base/azureml.yml
cat conf/base/azureml.yml
- name: Login via Azure CLI
@@ -163,8 +167,9 @@
- name: Build and push docker image
working-directory: ./spaceflights
run: |
docker build --build-arg BASE_IMAGE=python:3.10-buster -t ${{ secrets.REGISTRY_LOGIN_SERVER }}/kedro-azureml-e2e:latest .
docker push ${{ secrets.REGISTRY_LOGIN_SERVER }}/kedro-azureml-e2e:latest
docker pull ${{ secrets.REGISTRY_LOGIN_SERVER }}/kedro-azureml-e2e:${{ matrix.e2e_config }} || true
docker build --build-arg BASE_IMAGE=python:3.10-buster -t ${{ secrets.REGISTRY_LOGIN_SERVER }}/kedro-azureml-e2e:${{ matrix.e2e_config }} --cache-from=${{ secrets.REGISTRY_LOGIN_SERVER }}/kedro-azureml-e2e:${{ matrix.e2e_config }} .
docker push ${{ secrets.REGISTRY_LOGIN_SERVER }}/kedro-azureml-e2e:${{ matrix.e2e_config }}
- name: Run on Azure ML Pipelines
working-directory: ./spaceflights
12 changes: 11 additions & 1 deletion CHANGELOG.md
@@ -2,6 +2,14 @@

## [Unreleased]

## [0.4.0] - 2023-04-28

- [🧑‍🔬 Experimental ] Added support for pipeline-native data passing (allows to preview intermediate data in AzureML Studio UI) by [@tomasvanpottelbergh](https://github.com/tomasvanpottelbergh)
- New `AzureMLFileDataSet` and `AzureMLPandasDataSet` by [@asafalinadsg](https://github.com/asafalinadsg) & [@eliorc](https://github.com/eliorc)
- E2E tests for `AzureMLPandasDataSet` dataset
- Bumped minimal required Kedro version to `0.18.5`
- Added support for `OmegaConfigLoader`

## [0.3.6] - 2023-03-08

## [0.3.5] - 2023-02-20
@@ -48,7 +56,9 @@

- Initial plugin release

[Unreleased]: https://github.com/getindata/kedro-azureml/compare/0.3.6...HEAD
[Unreleased]: https://github.com/getindata/kedro-azureml/compare/0.4.0...HEAD

[0.4.0]: https://github.com/getindata/kedro-azureml/compare/0.3.6...0.4.0

[0.3.6]: https://github.com/getindata/kedro-azureml/compare/0.3.5...0.3.6

8 changes: 8 additions & 0 deletions docs/source/03_quickstart.rst
@@ -17,6 +17,8 @@ created in Azure and have their **names** ready to input to the plugin:
- Azure Resource Group
- Azure ML workspace
- Azure ML Compute Cluster

Depending on the type of flow you want to use, you will also need:
- Azure Storage Account and Storage Container
- Azure Storage Key (will be used to execute the pipeline)
- Azure Container Registry
@@ -66,6 +68,12 @@ created in Azure and have their **names** ready to input to the plugin:
# STORAGE_CONTAINER ENVIRONMENT_NAME
kedro azureml init <subscription-id> <resource-group-name> <workspace-name> <experiment-name> <compute-cluster-name> <storage-account-name> <storage-container-name> <environment-name>
If you want to pass data between nodes using the built-in Azure ML
pipeline data passing, you can use dummy values for the storage account
and container names. In this case, adjust the ``conf/base/azureml.yml``
to enable pipeline data passing. See :doc:`04_data_assets` for more
information about this.

8. Adjust the Data Catalog - the default one stores all data locally,
whereas the plugin will automatically use Azure Blob Storage. Only
input data is required to be read locally. Final
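The paragraph added above says to adjust `conf/base/azureml.yml` when relying on pipeline-native data passing. A minimal sketch of the relevant fragment, pieced together from the configuration template changed later in this commit (all other keys omitted; the `~` placeholders stand in for the unused storage settings):

```yaml
azure:
  # ... subscription, workspace, experiment and compute settings unchanged ...
  pipeline_data_passing:
    enabled: true      # use Azure ML pipeline data passing instead of temporary Blob storage
  temporary_storage:
    account_name: ~    # dummy values are acceptable once data passing is enabled
    container: ~
```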
12 changes: 11 additions & 1 deletion docs/source/04_data_assets.rst
@@ -6,7 +6,12 @@ and the ``AzureMLPandasDataSet`` which translate to `File/Folder dataset`_ and `
Azure Machine Learning. Both fully support the Azure versioning mechanism and can be used in the same way as any
other dataset in Kedro.

Both of these can be found under the `kedro_azureml.datasets`_ module.
Apart from these, ``kedro-azureml`` also adds the ``AzureMLPipelineDataSet`` which is used to pass data between
pipeline nodes when the pipeline is run on Azure ML and the `pipeline_data_passing` feature is enabled.
By default, data is then saved and loaded using the ``PickleDataSet`` as the underlying dataset.
Any other underlying dataset can be used instead by adding an ``AzureMLPipelineDataSet`` entry to the catalog.

All of these can be found under the `kedro_azureml.datasets`_ module.

For details on usage, see the :ref:`API Reference` below

@@ -26,3 +31,8 @@ API Reference

.. autoclass:: kedro_azureml.datasets.AzureMLFileDataSet
:members:

-----------------

.. autoclass:: kedro_azureml.datasets.AzureMLPipelineDataSet
:members:
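To make the wrapper described above concrete: a hypothetical `catalog.yml` entry that replaces the default pickle serialization with a Parquet-backed underlying dataset might look as follows. The `dataset` key and the `pandas.ParquetDataSet` arguments are assumptions for illustration; check the `AzureMLPipelineDataSet` API reference above for the actual parameters.

```yaml
model_input_table:
  type: kedro_azureml.datasets.AzureMLPipelineDataSet
  dataset:
    type: pandas.ParquetDataSet
    filepath: model_input_table.parquet
```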
2 changes: 1 addition & 1 deletion kedro_azureml/__init__.py
@@ -1,4 +1,4 @@
__version__ = "0.3.6"
__version__ = "0.4.0"

import warnings

64 changes: 50 additions & 14 deletions kedro_azureml/cli.py
@@ -2,7 +2,7 @@
import logging
import os
from pathlib import Path
from typing import Optional, Tuple
from typing import List, Optional, Tuple

import click
from kedro.framework.startup import ProjectMetadata
@@ -21,8 +21,9 @@
KEDRO_AZURE_BLOB_TEMP_DIR_NAME,
)
from kedro_azureml.distributed.utils import is_distributed_master_node
from kedro_azureml.manager import KedroContextManager
from kedro_azureml.runner import AzurePipelinesRunner
from kedro_azureml.utils import CliContext, KedroContextManager
from kedro_azureml.utils import CliContext

logger = logging.getLogger(__name__)

@@ -56,9 +57,10 @@ def azureml_group(ctx, metadata: ProjectMetadata, env):
@click.argument("workspace_name")
@click.argument("experiment_name")
@click.argument("cluster_name")
@click.argument("storage_account_name")
@click.argument("storage_container")
@click.argument("environment_name")
@click.option("-a", "--storage_account_name")
@click.option("-c", "--storage_container")
@click.option("--use-pipeline-data-passing", is_flag=True, default=False)
@click.pass_obj
def init(
ctx: CliContext,
@@ -67,13 +69,24 @@ def init(
workspace_name,
experiment_name,
cluster_name,
environment_name,
storage_account_name,
storage_container,
environment_name,
use_pipeline_data_passing: bool,
):
"""
Creates basic configuration for Kedro AzureML plugin
"""

if (
not (storage_account_name and storage_container)
and not use_pipeline_data_passing
):
raise click.UsageError(
"You need to specify storage account (-a) and container name (-c) "
"or enable pipeline data passing (--use-pipeline-data-passing)"
)

target_path = Path.cwd().joinpath("conf/base/azureml.yml")
cfg = CONFIG_TEMPLATE_YAML.format(
**{
@@ -82,9 +95,10 @@
"workspace_name": workspace_name,
"experiment_name": experiment_name,
"cluster_name": cluster_name,
"storage_account_name": storage_account_name,
"storage_container": storage_container,
"storage_account_name": storage_account_name or "~",
"storage_container": storage_container or "~",
"environment_name": environment_name,
"pipeline_data_passing": use_pipeline_data_passing,
}
)
target_path.write_text(cfg)
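Based on the reworked arguments and options in this hunk, the command might now be invoked along these lines (all values are placeholders; the leading positional arguments are defined above the visible part of the diff):

```bash
# Blob-storage flow: pass the storage account and container via the new options
kedro azureml init <subscription-id> <resource-group> <workspace-name> <experiment-name> \
  <compute-cluster-name> <environment-name> \
  -a <storage-account-name> -c <storage-container-name>

# Pipeline-native data passing: no storage needed, enable the new flag instead
kedro azureml init <subscription-id> <resource-group> <workspace-name> <experiment-name> \
  <compute-cluster-name> <environment-name> --use-pipeline-data-passing
```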
@@ -289,29 +303,51 @@ def compile(
type=str,
help="Parameters override in form of `key=value`",
)
@click.option(
"--az-input",
"azure_inputs",
type=(str, click.Path(exists=True, file_okay=False, dir_okay=True)),
multiple=True,
help="Name and path of Azure ML Pipeline input",
)
@click.option(
"--az-output",
"azure_outputs",
type=str,
type=(str, click.Path(exists=True, file_okay=False, dir_okay=True)),
multiple=True,
help="Paths of Azure ML Pipeline outputs to save dummy data into",
help="Name and path of Azure ML Pipeline output",
)
@click.pass_obj
def execute(
ctx: CliContext, pipeline: str, node: str, params: str, azure_outputs: Tuple[str]
ctx: CliContext,
pipeline: str,
node: str,
params: str,
azure_inputs: List[Tuple[str, str]],
azure_outputs: List[Tuple[str, str]],
):
# 1. Run kedro
parameters = parse_extra_params(params)
azure_inputs = {ds_name: data_path for ds_name, data_path in azure_inputs}
azure_outputs = {ds_name: data_path for ds_name, data_path in azure_outputs}
data_paths = {**azure_inputs, **azure_outputs}

with KedroContextManager(
ctx.metadata.package_name, env=ctx.env, extra_params=parameters
) as mgr:
runner = AzurePipelinesRunner()
pipeline_data_passing = (
mgr.plugin_config.azure.pipeline_data_passing is not None
and mgr.plugin_config.azure.pipeline_data_passing.enabled
)
runner = AzurePipelinesRunner(
data_paths=data_paths, pipeline_data_passing=pipeline_data_passing
)
mgr.session.run(pipeline, node_names=[node], runner=runner)

# 2. Save dummy outputs
# In distributed computing, it will only happen on nodes with rank 0
if is_distributed_master_node():
for dummy_output in azure_outputs:
(Path(dummy_output) / "output.txt").write_text("#getindata")
if not pipeline_data_passing and is_distributed_master_node():
for data_path in azure_outputs.values():
(Path(data_path) / "output.txt").write_text("#getindata")
else:
logger.info("Skipping saving Azure outputs on non-master distributed nodes")
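The new `--az-input`/`--az-output` options each pair a dataset name with a directory path (the path must already exist, per `click.Path(exists=True)`). A hypothetical invocation, roughly as Azure ML would issue it inside a pipeline step — dataset names and mount paths are illustrative, and the `--pipeline`/`--node` flag spellings are assumed since they are declared above the visible hunk:

```bash
kedro azureml execute --pipeline __default__ --node train_model_node \
  --az-input model_input_table /mnt/azureml/inputs/model_input_table \
  --az-output regressor /mnt/azureml/outputs/regressor
```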
9 changes: 7 additions & 2 deletions kedro_azureml/cli_functions.py
@@ -9,7 +9,8 @@
import click

from kedro_azureml.generator import AzureMLPipelineGenerator
from kedro_azureml.utils import CliContext, KedroContextManager
from kedro_azureml.manager import KedroContextManager
from kedro_azureml.utils import CliContext

logger = logging.getLogger()

@@ -27,7 +28,11 @@ def get_context_and_pipeline(
ctx.metadata.package_name, ctx.env, parse_extra_params(params, True)
) as mgr:
storage_account_key = os.getenv("AZURE_STORAGE_ACCOUNT_KEY", "")
if not storage_account_key:
pipeline_data_passing = (
mgr.plugin_config.azure.pipeline_data_passing is not None
and mgr.plugin_config.azure.pipeline_data_passing.enabled
)
if not pipeline_data_passing and not storage_account_key:
click.echo(
click.style(
"Environment variable AZURE_STORAGE_ACCOUNT_KEY not set, falling back to CLI prompt",
29 changes: 23 additions & 6 deletions kedro_azureml/config.py
@@ -4,6 +4,8 @@
import yaml
from pydantic import BaseModel, validator

from kedro_azureml.utils import update_dict


class DefaultConfigDict(defaultdict):
def __getitem__(self, key):
@@ -13,8 +15,8 @@ def __getitem__(self, key):


class AzureTempStorageConfig(BaseModel):
account_name: str
container: str
account_name: Optional[str] = None
container: Optional[str] = None


class ComputeConfig(BaseModel):
@@ -25,6 +27,10 @@ class DockerConfig(BaseModel):
image: Optional[str] = None


class PipelineDataPassingConfig(BaseModel):
enabled: bool = False


class AzureMLConfig(BaseModel):
@staticmethod
def _create_default_dict_with(
@@ -44,10 +50,11 @@ def _validate_compute(cls, value):
workspace_name: str
experiment_name: str
compute: Optional[Dict[str, ComputeConfig]]
temporary_storage: AzureTempStorageConfig
temporary_storage: Optional[AzureTempStorageConfig]
environment_name: Optional[str]
code_directory: Optional[str]
working_directory: Optional[str]
pipeline_data_passing: Optional[PipelineDataPassingConfig] = None


class KedroAzureMLConfig(BaseModel):
@@ -79,6 +86,9 @@ class KedroAzureRunnerConfig(BaseModel):
# Path to the directory in the Docker image to run the code from
# Ignored when code_directory is set
working_directory: /home/kedro_docker
# Use Azure ML pipeline data passing instead of temporary storage
pipeline_data_passing:
enabled: {pipeline_data_passing} # disabled by default
# Temporary storage settings - this is used to pass some data between steps
# if the data is not specified in the catalog directly
@@ -87,9 +97,9 @@
# It's recommended to set Lifecycle management rule for storage container, to avoid costs of long-term storage
# of the temporary data. Temporary data will be stored under abfs://<containter>/kedro-azureml-temp path
# See https://docs.microsoft.com/en-us/azure/storage/blobs/lifecycle-management-policy-configure?tabs=azure-portal
account_name: "{storage_account_name}"
account_name: {storage_account_name}
# Name of the storage container
container: "{storage_container}"
container: {storage_container}
compute:
# Azure compute used for running kedro jobs.
# Additional compute cluster can be defined here. Individual nodes can reference specific compute clusters by adding
@@ -108,4 +118,11 @@
""".strip()

# This auto-validates the template above during import
_CONFIG_TEMPLATE = KedroAzureMLConfig.parse_obj(yaml.safe_load(CONFIG_TEMPLATE_YAML))
_CONFIG_TEMPLATE = KedroAzureMLConfig.parse_obj(
update_dict(
yaml.safe_load(CONFIG_TEMPLATE_YAML),
("azure.pipeline_data_passing.enabled", False),
("azure.temporary_storage.container", ""),
("azure.temporary_storage.account_name", ""),
)
)
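The template validation above now routes the parsed YAML through an `update_dict` helper imported from `kedro_azureml.utils`, whose body is not shown in this diff. A minimal sketch consistent with the call sites here — dotted-path/value overrides applied to a nested dict — could look like this; the real helper may differ:

```python
from copy import deepcopy
from typing import Any, Dict, Tuple


def update_dict(data: Dict[str, Any], *overrides: Tuple[str, Any]) -> Dict[str, Any]:
    """Return a copy of ``data`` with each ("dotted.key.path", value) override applied."""
    result = deepcopy(data)
    for path, value in overrides:
        node = result
        *parents, leaf = path.split(".")
        for key in parents:
            node = node.setdefault(key, {})  # create intermediate mappings as needed
        node[leaf] = value
    return result
```

With that shape, the placeholders in `CONFIG_TEMPLATE_YAML` are replaced by concrete defaults before the Pydantic model validates the template at import time.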
2 changes: 2 additions & 0 deletions kedro_azureml/datasets/__init__.py
@@ -1,12 +1,14 @@
from kedro_azureml.datasets.file_dataset import AzureMLFileDataSet
from kedro_azureml.datasets.pandas_dataset import AzureMLPandasDataSet
from kedro_azureml.datasets.pipeline_dataset import AzureMLPipelineDataSet
from kedro_azureml.datasets.runner_dataset import (
KedroAzureRunnerDataset,
KedroAzureRunnerDistributedDataset,
)

__all__ = [
"AzureMLFileDataSet",
"AzureMLPipelineDataSet",
"AzureMLPandasDataSet",
"KedroAzureRunnerDataset",
"KedroAzureRunnerDistributedDataset",
2 changes: 1 addition & 1 deletion kedro_azureml/datasets/file_dataset.py
@@ -58,7 +58,7 @@ class AzureMLFileDataSet(PartitionedDataSet):
Example
-------
Example of a catalog.yml enry:
Example of a catalog.yml entry:
.. code-block:: yaml
