# Engine ETL Python scaffold

Our engine **must** be coded in this notebook. The platform provides some cool automations that ensure the correct engine life cycle, which is why we ask you to use this notebook. Don't panic! You can find some useful guidelines in the sections below.

## Engine development

First, we import the Enma SDK and the other packages required for engine development.

In [None]:
!pip install enma-sdk-aws

In [None]:
import pandas as pd
import numpy as np
from enma import Dataflow, logs, metrics, parameters, task, condition, ifelse

In [None]:
# Turn on pandas' Copy-on-Write mode for the whole session, before any
# DataFrame is created by the tasks below.
pd.options.mode.copy_on_write = True

In [None]:
@task(name="create_df")
def task_create_df():
    """Build the small in-memory sample DataFrame used as the flow's input.

    Returns:
        A DataFrame with the columns ``name``, ``age``, ``city`` and ``date``.
        One ``city`` entry is NaN and ``date`` holds ``YYYY-MM-DD`` strings.
    """
    names = ['John', 'Anna', 'Peter', 'Linda', 'Enma', 'Mark']
    ages = [28, 24, 35, 32, 15, 41]
    # Peter's city is deliberately missing (NaN).
    cities = ['London', 'Madrid', np.nan, 'Madrid', 'Madrid', 'Madrid']
    dates = [
        '2024-04-11',
        '2022-05-25',
        '2022-09-16',
        '2024-12-11',
        '2022-01-05',
        '2022-07-07',
    ]
    return pd.DataFrame(
        {'name': names, 'age': ages, 'city': cities, 'date': dates}
    )

@task(name="remove_nan")
def task_remove_nan(data):
    """Drop every row of ``data`` that contains a missing value.

    Also records one task-level log entry and one task-level metric.

    Args:
        data: Object exposing ``dropna()`` (a pandas DataFrame in this flow).

    Returns:
        The result of ``data.dropna()``.
    """
    cleaned = data.dropna()
    # Task-level log and metric, emitted after the rows have been dropped.
    logs.add("log_task_remove_nan")
    metrics.add("metric_task_remove_nan", 2)
    return cleaned

@task(name="filter_age")
def task_filter_age(data,age):
    """Keep only the rows whose ``age`` column is strictly greater than ``age``.

    Also records one task-level log entry and one task-level metric.

    Args:
        data: Object exposing ``query()`` (a pandas DataFrame in this flow).
        age: Threshold; its ``str()`` form is embedded in the query expression.

    Returns:
        The rows of ``data`` selected by the ``age > <age>`` query.
    """
    # ``!s`` forces str(age), matching plain string concatenation exactly.
    expression = f"age > {age!s}"
    older_rows = data.query(expression)
    logs.add("log_task_filter_age")
    metrics.add("metric_task_filter_age", 3)
    return older_rows

@task(name="transform_date")
def task_transform_date(data):
    """Parse the ``date`` column and add ``day``, ``month`` and ``year`` columns.

    The input frame is left untouched: the result is built with
    ``DataFrame.assign``, which returns a new object, instead of assigning
    columns on the argument. The frame produced by the upstream task is
    therefore not modified as a side effect of running this task.

    Args:
        data: DataFrame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns:
        A new DataFrame in which ``date`` holds the parsed datetimes, with
        ``day``, ``month`` and ``year`` holding the extracted components.
    """
    # Convert once and reuse the parsed series for every derived column.
    parsed = pd.to_datetime(data['date'])
    return data.assign(
        date=parsed,
        day=parsed.dt.day,
        month=parsed.dt.month,
        year=parsed.dt.year,
    )

@condition()
def task_choice(choice):
    """Tell whether ``choice`` equals 0.

    Args:
        choice: Value compared against 0.

    Returns:
        The result of ``choice == 0``.
    """
    is_zero = (choice == 0)
    return is_zero

@task(name="groupby_year_city")
def task_groupby_year_city(data):
    """Count the rows of ``data`` for each (``year``, ``city``) combination.

    Args:
        data: Object exposing ``groupby()`` with ``year`` and ``city`` keys
            (a pandas DataFrame in this flow).

    Returns:
        The group sizes produced by ``groupby(["year", "city"]).size()``.
    """
    grouping_keys = ["year", "city"]
    grouped = data.groupby(grouping_keys)
    return grouped.size()

@task(name="groupby_year")
def task_groupby_year(data):
    """Count the rows of ``data`` for each ``year`` value.

    Args:
        data: Object exposing ``groupby()`` with a ``year`` key
            (a pandas DataFrame in this flow).

    Returns:
        The group sizes produced by ``groupby(["year"]).size()``.
    """
    grouping_keys = ["year"]
    grouped = data.groupby(grouping_keys)
    return grouped.size()


# Declare the "my-dummy-etl" dataflow: parameters, the chain of tasks, one
# conditional branch, and dataflow-level metrics and logs.
with Dataflow("my-dummy-etl") as flow:
    # Define parameters as key, value pairs.
    age = parameters.add("age", 18)
    choice = parameters.add("choice", 0)
    
    # Define the tasks within the dataflow.
    out_1 = task_create_df()
    
    # Metrics can be defined at dataflow level (as here) or
    # at task level (as in task_remove_nan or task_filter_age).
    metrics.add("metric_create_df", 1)
    
    out_2 = task_remove_nan(out_1)
    
    out_3 = task_filter_age(out_2, age)
    
    out_4 = task_transform_date(out_3)
    
    # The ifelse construct evaluates the condition defined by task_choice.
    # If the condition evaluates to True it executes `task_groupby_year_city`;
    # otherwise it executes `task_groupby_year`.
    out_5 = ifelse(
        condition=task_choice(choice),
        true_condition=task_groupby_year_city(out_4),
        false_condition=task_groupby_year(out_4)
    )
    
    # Logs can be defined at dataflow level (as here) or
    # at task level (as in task_remove_nan or task_filter_age).
    logs.add("log_dataflow")

In [None]:
# Display the dataflow defined above.
# NOTE(review): the exact output format comes from the Enma SDK — confirm there.
flow.pprint()

In [None]:
# Run the dataflow defined above.
# NOTE(review): presumably executes the tasks from this notebook session — confirm in the Enma SDK docs.
flow.run()

In [None]:
# Register the dataflow defined above.
# NOTE(review): presumably publishes it to the platform — confirm in the Enma SDK docs.
flow.register()