In [None]:
from cf_pipelines.ml import MLPipeline

from datetime import datetime, timedelta
import pandas as pd

# Parametrising pipelines

Sometimes we want to expose parameters that customise the behaviour of our code without having to modify the code itself. To achieve this, there are two possible options:

## Option 1: Be explicit!

Define your arguments beforehand in the pipeline declaration using the `extra_args` keyword:

In [None]:
# Declare the pipeline and supply the date range up front through `extra_args`.
# The values are ISO-formatted date strings; `get_data` below receives them as the
# keyword arguments `start_date` and `end_date` and parses them itself.
parametrised_pipeline = MLPipeline(
    "Parametrised",
    location="outputs",
    extra_args={
        "start_date": "2020-01-01",  # Or get from an environment variable
        "end_date": "2021-01-01",
    },
)

Then use said values in your functions as usual:

In [None]:
@parametrised_pipeline.data_ingestion
def get_data(*, start_date, end_date):
    """Generate one row per day in the half-open range [start_date, end_date).

    Args:
        start_date: First day of the range, as an ISO-formatted string.
        end_date: Day after the last generated day, as an ISO-formatted string.

    Returns:
        A dict with the single key ``"input_data.csv"`` mapped to a DataFrame
        with a ``date`` column (the day) and a ``days`` column (the offset of
        that day from ``start_date``, starting at 0). The frame has no rows
        when ``end_date`` is not after ``start_date``.
    """
    first_day = datetime.fromisoformat(start_date)
    last_day = datetime.fromisoformat(end_date)

    # Only whole days count; `range` below therefore never reaches `last_day`.
    n_days = (last_day - first_day).days

    rows = []
    for offset in range(n_days):
        rows.append((first_day + timedelta(days=offset), offset))

    frame = pd.DataFrame(rows, columns=["date", "days"])
    return {"input_data.csv": frame}


@parametrised_pipeline.data_ingestion
def transform(*, input_data):
    """Add a ``days_times_2`` column holding twice the ``days`` column.

    Args:
        input_data: Frame with a ``days`` column. It is modified in place.

    Returns:
        A dict with the single key ``"transformed_data.csv"`` mapped to the
        same (now extended) ``input_data`` object.
    """
    doubled = input_data["days"] * 2
    input_data["days_times_2"] = doubled

    return {"transformed_data.csv": input_data}

In [None]:
# Call the decorated step directly, passing the dates explicitly as keyword arguments.
results = get_data(start_date="2021-01-01", end_date="2022-01-01")
# NOTE(review): the undecorated get_data returns the key "input_data.csv", while this
# looks up "input_data" — presumably the decorator strips the extension; confirm
# against the cf_pipelines implementation.
results["input_data"]

In [None]:
# Run the pipeline declared above; presumably this executes the decorated steps
# using the `extra_args` values — confirm against the cf_pipelines documentation.
parametrised_pipeline.run()

## Option 2: Environment variables

Define the values as environment variables using the `CF_` prefix:

In [None]:
import os

# Provide the parameters through the process environment instead of `extra_args`.
# Each variable carries the `CF_` prefix; the names here mirror the `start_date`
# and `end_date` arguments of `get_data` in upper case (presumably how the
# pipeline maps them — confirm against the cf_pipelines documentation).
os.environ["CF_START_DATE"] = "2021-01-01"
os.environ["CF_END_DATE"] = "2022-01-01"

In [None]:
# Re-declare the pipeline under a new name and output location, this time without
# `extra_args`: the dates are expected to come from the CF_* environment variables.
parametrised_pipeline = MLPipeline(
    "Parametrised 2",
    location="outputs_2",
)

Then use said values in your functions as usual:

In [None]:
@parametrised_pipeline.data_ingestion
def get_data(*, start_date, end_date):
    """Generate one row per day in the half-open range [start_date, end_date).

    Args:
        start_date: First day of the range, as an ISO-formatted string.
        end_date: Day after the last generated day, as an ISO-formatted string.

    Returns:
        A dict with the single key ``"input_data.csv"`` mapped to a DataFrame
        with a ``date`` column (the day) and a ``days`` column (the offset of
        that day from ``start_date``, starting at 0). The frame has no rows
        when ``end_date`` is not after ``start_date``.
    """
    range_start = datetime.fromisoformat(start_date)
    range_end = datetime.fromisoformat(end_date)

    # Number of whole days covered; the end date itself is excluded.
    day_count = (range_end - range_start).days

    records = []
    for day_index in range(day_count):
        current_day = range_start + timedelta(days=day_index)
        records.append((current_day, day_index))

    generated = pd.DataFrame(records, columns=["date", "days"])
    return {"input_data.csv": generated}


@parametrised_pipeline.data_ingestion
def transform(*, input_data):
    """Add a ``days_times_2`` column holding twice the ``days`` column.

    Args:
        input_data: Frame with a ``days`` column. It is modified in place.

    Returns:
        A dict with the single key ``"transformed_data.csv"`` mapped to the
        same (now extended) ``input_data`` object.
    """
    twice_the_days = input_data["days"] * 2
    input_data["days_times_2"] = twice_the_days

    return {"transformed_data.csv": input_data}

In [None]:
# Run the second pipeline; no `extra_args` were given, so the dates are expected
# to be picked up from the CF_START_DATE / CF_END_DATE environment variables.
parametrised_pipeline.run()

 > 🚨 Note that environment variables take precedence over arguments passed in the constructor.