In [1]:
import itertools
import os
import uuid
from datetime import datetime
from typing import Tuple, Any, List, Dict

from pyshacl import validate

from common import *

# Pipeline generation algorithm

In [2]:
# Shared ontology graph obtained from the project helper; the functions and
# cells in this notebook receive it as their `ontology` argument.
ontology = get_ontology_graph()

### 1. Obtain Intent Information functions

In [3]:
def get_intent_iri(intent_graph):
    """Return the IRI of the only ``dtbox:Intent`` instance in ``intent_graph``.

    Raises ``AssertionError`` unless the query yields exactly one row.
    """
    intent_iri_query = f"""
PREFIX dtbox: <{dtbox}>
SELECT ?iri
WHERE {{
    ?iri a dtbox:Intent .
}}
"""
    matches = intent_graph.query(intent_iri_query).bindings
    assert len(matches) == 1
    only_match = matches[0]
    return only_match['iri']


def get_intent_dataset_problem(intent_graph, intent_iri):
    """Look up the dataset and the problem attached to an intent.

    Returns a ``(dataset, problem)`` pair read from the first result row; an
    ``IndexError`` is raised when the query yields no rows.
    """
    intent_ref = intent_iri.n3()
    dataset_problem_query = f"""
    PREFIX dtbox: <{dtbox}>
    SELECT ?dataset ?problem
    WHERE {{
        {intent_ref} a dtbox:Intent .
        {intent_ref} dtbox:overData ?dataset .
        {intent_ref} dtbox:tackles ?problem .
    }}
"""
    first_row = intent_graph.query(dataset_problem_query).bindings[0]
    return first_row['dataset'], first_row['problem']


def get_intent_params(intent_graph, intent_iri):
    """Return the parameter/value bindings requested by an intent.

    Each returned binding exposes ``param`` (the parameter) and ``value`` (the
    value the intent asks for), taken from the intent's
    ``dtbox:usingParameter`` nodes. The list is empty when the intent
    specifies no parameters.

    The intent is matched as a ``dtbox:Intent``, the same class used by
    ``get_intent_iri`` and ``get_intent_dataset_problem``. The previous
    ``dtbox:UserIntent`` pattern never matched intents typed as
    ``dtbox:Intent``, so their parameters were silently dropped.
    """
    params_query = f"""
    PREFIX dtbox: <{dtbox}>
    SELECT ?param ?value
    WHERE {{
        {intent_iri.n3()} a dtbox:Intent .
        {intent_iri.n3()} dtbox:usingParameter ?param_value .
        ?param_value dtbox:forParameter ?param .
        ?param_value dtbox:has_value ?value .
    }}
"""
    result = intent_graph.query(params_query).bindings
    return result


def get_intent_info(intent_graph, intent_iri=None) -> Tuple[Any, Any, List[Any], Any]:
    """Collect everything the generator needs to know about an intent.

    When ``intent_iri`` is falsy it is discovered with ``get_intent_iri``.
    Returns ``(dataset, problem, params, intent_iri)``.
    """
    resolved_iri = intent_iri if intent_iri else get_intent_iri(intent_graph)
    dataset, problem = get_intent_dataset_problem(intent_graph, resolved_iri)
    return dataset, problem, get_intent_params(intent_graph, resolved_iri), resolved_iri

### 2. Obtain Loader functions

### 3. Obtain Main component dataset

In [4]:
def get_implementation_input_specs(ontology, implementation):
    """Return the input shapes of ``implementation``, one flattened list per spec.

    Specs are read in ascending ``dtbox:has_position`` order and every tag is
    expanded with ``flatten_shape``.
    """
    input_spec_query = f"""
        PREFIX dtbox: <{dtbox}>
        SELECT ?shape
        WHERE {{
            {implementation.n3()} dtbox:specifiesInput ?spec .
            ?spec a dtbox:IOSpec ;
                dtbox:hasTag ?shape ;
                dtbox:has_position ?position .
            ?shape a dtbox:DataTag .
        }}
        ORDER BY ?position
    """
    spec_rows = ontology.query(input_spec_query).bindings
    return [flatten_shape(ontology, row['shape']) for row in spec_rows]


def get_implementation_output_specs(ontology, implementation):
    """Return the output shapes of ``implementation``, one flattened list per spec.

    Specs are read in ascending ``dtbox:has_position`` order and every tag is
    expanded with ``flatten_shape``.
    """
    output_spec_query = f"""
        PREFIX dtbox: <{dtbox}>
        SELECT ?shape
        WHERE {{
            {implementation.n3()} dtbox:specifiesOutput ?spec .
            ?spec a dtbox:IOSpec ;
                dtbox:hasTag ?shape ;
                dtbox:has_position ?position .
            ?shape a dtbox:DataTag .
        }}
        ORDER BY ?position
    """
    spec_rows = ontology.query(output_spec_query).bindings
    return [flatten_shape(ontology, row['shape']) for row in spec_rows]


def flatten_shape(graph, shape):
    """Expand a shape into the list of shapes it is composed of.

    A shape without an ``sh:and`` triple is returned as a one-element list.
    Otherwise every member of its ``sh:and`` list is flattened recursively and
    the results are concatenated.
    """
    if (shape, SH['and'], None) not in graph:
        return [shape]

    subshapes_query = f"""
            PREFIX sh: <{SH}>
            PREFIX rdf: <{RDF}>

            SELECT ?subshape
            WHERE {{
                {shape.n3()} sh:and ?andNode .
                ?andNode rdf:rest*/rdf:first ?subshape .
            }}
        """
    flattened = []
    for row in graph.query(subshapes_query).bindings:
        flattened.extend(flatten_shape(graph, row['subshape']))
    return flattened


def get_potential_implementations(ontology, problem_iri, intent_parameters=None) -> List[Tuple[Any, List[Any]]]:
    """Find implementations able to solve ``problem_iri`` or one of its sub-problems.

    Every IRI in ``intent_parameters`` is added to the query as a required
    ``dtbox:hasParameter`` of the implementation, and implementations typed
    ``dtbox:ApplierImplementation`` are excluded. Returns a list of
    ``(implementation, input_specs)`` pairs, where ``input_specs`` comes from
    ``get_implementation_input_specs``.
    """
    if intent_parameters is None:
        intent_parameters = []
    intent_params_separator = '            \n'
    required_params = intent_params_separator.join(
        [f'dtbox:hasParameter {param.n3()} ;' for param in intent_parameters]
    )
    main_implementation_query = f"""
    PREFIX dtbox: <{dtbox}>
    SELECT ?implementation
    WHERE {{
        ?implementation a dtbox:Implementation ;
            {required_params}
            dtbox:implements ?algorithm .
        ?algorithm a dtbox:Algorithm ;
            dtbox:solves ?problem .
        ?problem dtbox:subProblemOf* {problem_iri.n3()} .
        FILTER NOT EXISTS{{
            ?implementation a dtbox:ApplierImplementation.
        }}
    }}
"""
    implementations = [row['implementation'] for row in ontology.query(main_implementation_query).bindings]

    implementations_with_shapes = []
    for implementation in implementations:
        input_specs = get_implementation_input_specs(ontology, implementation)
        implementations_with_shapes.append((implementation, input_specs))
    return implementations_with_shapes


def get_component_implementation(ontology, component):
    """Return the single implementation linked to ``component``.

    Raises ``AssertionError`` unless exactly one ``dtbox:hasImplementation``
    value is found.
    """
    implementation_query = f"""
        PREFIX dtbox: <{dtbox}>
        SELECT ?implementation
        WHERE {{
            {component.n3()} dtbox:hasImplementation ?implementation .
        }}
    """
    rows = ontology.query(implementation_query).bindings
    assert len(rows) == 1
    only_row = rows[0]
    return only_row['implementation']


def get_implementation_components(ontology, implementation) -> List[Any]:
    """List every component whose ``dtbox:hasImplementation`` is ``implementation``."""
    components_query = f"""
        PREFIX dtbox: <{dtbox}>
        SELECT ?component
        WHERE {{
            ?component dtbox:hasImplementation {implementation.n3()} .
        }}
    """
    components = []
    for row in ontology.query(components_query).bindings:
        components.append(row['component'])
    return components

In [5]:
def find_components_to_satisfy_shape(ontology, shape, only_learners=True):
    """Return the components whose implementation declares ``shape`` as an output tag.

    With ``only_learners`` set, only ``dtbox:LearnerImplementation`` instances
    are considered; otherwise any ``dtbox:Implementation`` qualifies.
    """
    implementation_class = 'Learner' if only_learners else ''
    implementation_query = f"""
        PREFIX dtbox: <{dtbox}>
        SELECT ?implementation
        WHERE {{
            ?implementation a dtbox:{implementation_class}Implementation ;
                dtbox:specifiesOutput ?spec .
            ?spec dtbox:hasTag {shape.n3()} .
        }}
    """
    implementations = [row['implementation'] for row in ontology.query(implementation_query).bindings]

    components = []
    for implementation in implementations:
        components.extend(get_implementation_components(ontology, implementation))
    return components

In [6]:
def identify_data_io(ontology: Graph, ios: List[Any], return_index=False) -> Any:
    """Find the first IO whose shapes include one targeting ``dmop:TabularDataset``.

    Returns the IO's position when ``return_index`` is true, otherwise its list
    of shapes. Returns ``None`` when no IO matches.
    """
    for position, io_shapes in enumerate(ios):
        targets_dataset = any(
            (io_shape, SH.targetClass, dmop.TabularDataset) in ontology
            for io_shape in io_shapes
        )
        if targets_dataset:
            return position if return_index else io_shapes
    return None


def identify_model_io(ontology: Graph, ios: List[Any], return_index=False) -> Any:
    """Find the first IO that carries a model.

    A shape counts as a model shape when its ``sh:targetClass`` is
    ``ddata:Model`` or one of its subclasses, or when the shape itself is.
    Returns the IO's position when ``return_index`` is true, otherwise its list
    of shapes. Returns ``None`` when no IO matches.
    """

    def describes_model(io_shape):
        ask_query = f'''
    PREFIX sh: <{SH}>
    PREFIX rdfs: <{RDFS}>
    PREFIX ddata: <{dd}>

    ASK {{
      {{
        {io_shape.n3()} sh:targetClass ?targetClass .
        ?targetClass rdfs:subClassOf* ddata:Model .
      }}
      UNION
      {{
        {io_shape.n3()} rdfs:subClassOf* ddata:Model .
      }}
    }}
'''
        return ontology.query(ask_query).askAnswer

    for position, io_shapes in enumerate(ios):
        if any(describes_model(io_shape) for io_shape in io_shapes):
            return position if return_index else io_shapes
    return None

In [7]:
# Sanity check on one known component: resolve its implementation, list the
# flattened output specs, and pick the spec identified as the model output.
comp = dabox.term('component-decimal_scaling')
print(f'Component: {comp}')
impl = get_component_implementation(ontology, comp)
print(f'Implementation: {impl}')
specs = get_implementation_output_specs(ontology, impl)
print(f'Specs: {specs}')
# No return_index here, so this yields the list of shapes rather than a position.
model_spec = identify_model_io(ontology, specs)
print(f'Model spec: {model_spec}')

Component: https://diviloper.dev/ontology/ABOX#component-decimal_scaling
Implementation: https://diviloper.dev/ontology/ABOX#implementation-normalizer_(pmml)
Specs: [[rdflib.term.URIRef('https://diviloper.dev/ontology/shapes#NormalizedTabularDatasetShape')], [rdflib.term.URIRef('https://diviloper.dev/ontology/shapes#NormalizerModel')]]
Model spec: [rdflib.term.URIRef('https://diviloper.dev/ontology/shapes#NormalizerModel')]


In [8]:
# Print every triple whose subject is the first shape of the second output
# spec computed in the previous cell.
for s, p, o in ontology.triples((specs[1][0], None, None)):
    print(f'{s} {p} {o}')

https://diviloper.dev/ontology/shapes#NormalizerModel http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://www.w3.org/ns/shacl#NodeShape
https://diviloper.dev/ontology/shapes#NormalizerModel http://www.w3.org/1999/02/22-rdf-syntax-ns#type https://diviloper.dev/ontology#DataTag
https://diviloper.dev/ontology/shapes#NormalizerModel http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://www.w3.org/2002/07/owl#Thing
https://diviloper.dev/ontology/shapes#NormalizerModel http://www.w3.org/ns/shacl#targetClass https://diviloper.dev/ontology/Data#NormalizerModel
https://diviloper.dev/ontology/shapes#NormalizerModel http://www.w3.org/2002/07/owl#sameAs https://diviloper.dev/ontology/shapes#NormalizerModel


In [9]:
def satisfies_shape(data_graph, shacl_graph, shape, focus):
    """Tell whether ``focus`` conforms to ``shape`` according to pyshacl.

    Only the ``conforms`` flag of the validation result is returned; the
    report graph and report text are discarded.
    """
    conforms, _, _ = validate(
        data_graph,
        shacl_graph=shacl_graph,
        validate_shapes=[shape],
        focus=focus,
    )
    return conforms


def get_shape_target_class(ontology, shape):
    """Return the ``sh:targetClass`` of ``shape`` from the first result row.

    Raises ``IndexError`` when the shape declares no target class.
    """
    target_class_query = f"""
        PREFIX sh: <{SH}>
        SELECT ?targetClass
        WHERE {{
            <{shape}> sh:targetClass ?targetClass .
        }}
    """
    first_row = ontology.query(target_class_query).bindings[0]
    return first_row['targetClass']

In [10]:
def get_implementation_parameters(ontology, implementation) -> Dict[URIRef, Tuple[Literal, Literal, Literal]]:
    """Map each parameter of ``implementation`` to ``(default value, position, condition)``.

    Only parameters that declare a default value, a condition, and a position
    are matched. Rows are processed in ascending position order.
    """
    parameters_query = f"""
        PREFIX dtbox: <{dtbox}>
        SELECT ?parameter ?value ?order ?condition
        WHERE {{
            <{implementation}> dtbox:hasParameter ?parameter .
            ?parameter dtbox:hasDefaultValue ?value ;
                       dtbox:has_condition ?condition ;
                       dtbox:has_position ?order .
        }}
        ORDER BY ?order
    """
    parameters = {}
    for row in ontology.query(parameters_query).bindings:
        parameters[row['parameter']] = (row['value'], row['order'], row['condition'])
    return parameters


def get_component_overriden_parameters(ontology, component) -> Dict[URIRef, Tuple[Literal, Literal, Literal]]:
    """Map each parameter overridden by ``component`` to ``(value, position, condition)``.

    The value comes from the component's ``dtbox:overridesParameter`` node;
    position and condition come from the parameter itself.
    """
    parameters_query = f"""
        PREFIX dtbox: <{dtbox}>
        SELECT ?parameter ?value ?position ?condition
        WHERE {{
            {component.n3()} dtbox:overridesParameter ?parameterValue .
            ?parameterValue dtbox:forParameter ?parameter ;
                       dtbox:has_value ?value .
            ?parameter dtbox:has_position ?position ;
                       dtbox:has_condition ?condition .
        }}
    """
    overrides = {}
    for row in ontology.query(parameters_query).bindings:
        overrides[row['parameter']] = (row['value'], row['position'], row['condition'])
    return overrides


def get_component_parameters(ontology, component) -> Dict[URIRef, Tuple[Literal, Literal, Literal]]:
    """Return the effective parameters of ``component``.

    The defaults of the component's implementation are taken first, then the
    component's own overrides replace entries for the same parameter.
    """
    defaults = get_implementation_parameters(
        ontology,
        get_component_implementation(ontology, component),
    )
    overrides = get_component_overriden_parameters(ontology, component)
    return {**defaults, **overrides}

def perform_param_substitution(graph, parameters: Dict[URIRef, Tuple[Literal, Literal, Literal]], inputs):
    """Resolve conditions and ``$$...$$`` placeholders in a parameter mapping.

    For every parameter ``(value, order, condition)``:

    * If ``condition`` is ``$$INTEGER_COLUMN$$``, ``$$STRING_COLUMN$$`` or
      ``$$FLOAT_COLUMN$$`` and the input dataset has no feature column of that
      type, the parameter is removed.
    * If ``value`` is a string, ``$$LABEL$$``, ``$$NUMERIC_COLUMNS$$`` and
      ``$$CSV_PATH$$`` are replaced with the label column name, the
      comma-joined numeric column names, and the dataset path respectively.

    Substitutions are applied cumulatively. Previously each replacement
    started again from the original value, so a value holding several
    placeholders kept only the last substitution.

    ``parameters`` is modified in place and also returned.
    """
    condition_types = {
        '$$INTEGER_COLUMN$$': int,
        '$$STRING_COLUMN$$': str,
        '$$FLOAT_COLUMN$$': float,
    }
    placeholder_resolvers = (
        ('$$LABEL$$', get_inputs_label_name),
        ('$$NUMERIC_COLUMNS$$', get_inputs_numeric_columns),
        ('$$CSV_PATH$$', get_csv_path),
    )
    # Queried at most once, and only if some parameter actually has a condition.
    feature_types = None

    # Iterate over a snapshot of the keys because entries may be popped.
    for param in list(parameters.keys()):
        value, order, condition = parameters[param]

        if condition.value is not None and condition.value != '':
            if feature_types is None:
                feature_types = get_inputs_feature_types(graph, inputs)
            required_type = condition_types.get(condition.value)
            if required_type is not None and required_type not in feature_types:
                parameters.pop(param)
                continue

        if not isinstance(value.value, str):
            continue

        text = str(value)
        substituted = False
        for placeholder, resolve in placeholder_resolvers:
            if placeholder in text:
                text = text.replace(placeholder, f'{resolve(graph, inputs)}')
                substituted = True
        if substituted:
            parameters[param] = (Literal(text), order, condition)

    return parameters


In [11]:
def add_step(graph, pipeline, task_name, component, parameters, order, previous_task=None, inputs=None,
             outputs=None):
    """Add a step running ``component`` to ``pipeline`` and return the step node.

    The step is created as ``dw:<task_name>`` with position ``order``. Each
    input and output becomes a blank ``dtbox:IO`` node holding the data node
    and its index. Each parameter becomes a blank node with
    ``dtbox:forParameter`` / ``dtbox:has_value``. ``previous_task`` may be a
    single step or a list of steps; each one gets a ``dtbox:followedBy`` link
    to the new step.
    """
    inputs = [] if inputs is None else inputs
    outputs = [] if outputs is None else outputs

    step = dw.term(task_name)

    def attach_ios(data_nodes, link_predicate):
        # One blank IO node per data node, remembering its index in the list.
        for position, data_node in enumerate(data_nodes):
            io_node = BNode()
            graph.add((io_node, RDF.type, dtbox.IO))
            graph.add((io_node, dtbox.hasData, data_node))
            graph.add((io_node, dtbox.has_position, Literal(position)))
            graph.add((step, link_predicate, io_node))

    graph.add((pipeline, dtbox.hasStep, step))
    graph.add((step, RDF.type, dtbox.Step))
    graph.add((step, dtbox.runs, component))
    graph.add((step, dtbox.has_position, Literal(order)))

    attach_ios(inputs, dtbox.hasInput)
    attach_ios(outputs, dtbox.hasOutput)

    for parameter, (value, _, _) in parameters.items():
        value_node = BNode()
        graph.add((step, dtbox.hasParameterValue, value_node))
        graph.add((value_node, dtbox.forParameter, parameter))
        graph.add((value_node, dtbox.has_value, value))

    if previous_task:
        predecessors = previous_task if isinstance(previous_task, list) else [previous_task]
        for predecessor in predecessors:
            graph.add((predecessor, dtbox.followedBy, step))

    return step

In [12]:
def get_component_transformations(ontology, component) -> List:
    """Return the transformations listed in ``component``'s ``dtbox:hasTransformation`` RDF list.

    The ``rdf:`` prefix is declared explicitly, as in ``flatten_shape``, so the
    query does not depend on the namespace bindings of the queried graph.
    """
    transformation_query = f'''
        PREFIX dtbox: <{dtbox}>
        PREFIX rdf: <{RDF}>
        SELECT ?transformation
        WHERE {{
            <{component}> dtbox:hasTransformation ?transformation_list .
            ?transformation_list rdf:rest*/rdf:first ?transformation .
        }}
    '''
    transformations = ontology.query(transformation_query).bindings
    return [x['transformation'] for x in transformations]

In [13]:
def get_inputs_label_name(graph: Graph, inputs: List[URIRef]) -> str:
    """Return the column name of the label column of the first tabular input.

    The first element of ``inputs`` typed ``dmop:TabularDataset`` is used
    (``StopIteration`` if there is none); ``IndexError`` is raised when it has
    no column flagged ``dmop:isLabel``.
    """
    data_input = next(node for node in inputs if (node, RDF.type, dmop.TabularDataset) in graph)
    label_query = f"""
        PREFIX rdfs: <{RDFS}>
        PREFIX dmop: <{dmop}>

        SELECT ?label
        WHERE {{
            {data_input.n3()} dmop:hasColumn ?column .
            ?column dmop:isLabel true ;
                    dmop:hasColumnName ?label .

        }}
    """
    first_row = graph.query(label_query).bindings[0]
    return first_row['label'].value

def get_inputs_numeric_columns(graph: Graph, inputs: List[URIRef]) -> str:
    """Return the comma-joined names of the numeric feature columns of the first tabular input.

    A column qualifies when it is flagged ``dmop:isFeature`` and its primitive
    type is one of the numeric ``dmop`` types listed in the query filter.
    """
    data_input = next(node for node in inputs if (node, RDF.type, dmop.TabularDataset) in graph)
    columns_query = f"""
        PREFIX rdfs: <{RDFS}>
        PREFIX dmop: <{dmop}>

        SELECT ?label
        WHERE {{
            {data_input.n3()} dmop:hasColumn ?column .
            ?column dmop:isFeature true ;
                    dmop:hasDataPrimitiveTypeColumn ?type ;
                    dmop:hasColumnName ?label .
            FILTER(?type IN (dmop:Float, dmop:Int, dmop:Number, dmop:Double, dmop:Long, dmop:Short, dmop:Integer))
        }}
    """
    column_names = []
    for row in graph.query(columns_query).bindings:
        column_names.append(row['label'].value)
    return ','.join(column_names)

def get_csv_path(graph: Graph, inputs: List[URIRef]) -> str:
    """Return the ``dmop:path`` value of the first tabular input.

    Raises ``StopIteration`` when ``inputs`` holds no ``dmop:TabularDataset``
    and ``ValueError`` when that dataset has no ``dmop:path``.
    """
    data_input = next(i for i in inputs if (i, RDF.type, dmop.TabularDataset) in graph)
    # `True` is the `unique` flag of Graph.objects, as in the other lookups in
    # this notebook. It used to be passed as the default of next(), which made
    # a missing path fail later with "'bool' object has no attribute 'value'".
    path = next(graph.objects(data_input, dmop.path, True), None)
    if path is None:
        raise ValueError(f'Dataset {data_input} has no dmop:path value')
    return path.value


def get_inputs_feature_types(graph: Graph, inputs: List[URIRef]) -> set:
    """Return the set of Python types (``int``, ``float``, ``str``) of the feature columns.

    The first element of ``inputs`` typed ``dmop:TabularDataset`` is inspected.
    Every ``dmop`` primitive type that ``get_inputs_numeric_columns`` treats as
    numeric is mapped here as well; ``Integer``, ``Long`` and ``Short`` used to
    be missing and raised ``KeyError``. Column types with no known Python
    equivalent are ignored.
    """
    data_input = next(i for i in inputs if (i, RDF.type, dmop.TabularDataset) in graph)
    columns_query = f"""
        PREFIX rdfs: <{RDFS}>
        PREFIX dmop: <{dmop}>

        SELECT ?type
        WHERE {{
            {data_input.n3()} dmop:hasColumn ?column .
            ?column dmop:isFeature true ;
                    dmop:hasDataPrimitiveTypeColumn ?type .
        }}
    """
    columns = graph.query(columns_query).bindings
    mapping = {
        dmop.Float: float,
        dmop.Double: float,
        dmop.Number: float,
        dmop.Int: int,
        dmop.Integer: int,
        dmop.Long: int,
        dmop.Short: int,
        dmop.String: str,
    }
    return {mapping[x['type']] for x in columns if x['type'] in mapping}


In [14]:
def copy_subgraph(source_graph: Graph, source_node: URIRef, destination_graph: Graph, destination_node: URIRef,
                  replace_nodes: bool = True):
    """Copy the triples reachable from ``source_node`` into ``destination_graph``.

    ``source_node`` is rewritten to ``destination_node``. ``owl:sameAs``
    triples are skipped. With ``replace_nodes`` enabled, identified-node
    objects are followed and replaced by fresh blank nodes, except objects of
    ``rdf:type`` and nodes inside the ``dmop`` namespace, which are kept as-is
    and not followed. With ``replace_nodes`` disabled only the triples of
    ``source_node`` itself are copied, with their objects unchanged.
    """
    seen = set()
    pending = [source_node]
    node_map = {source_node: destination_node}

    while pending:
        node = pending.pop()
        seen.add(node)
        for pred, obj in source_graph.predicate_objects(node):
            if pred == OWL.sameAs:
                continue

            if not (replace_nodes and isinstance(obj, IdentifiedNode)):
                # Literals (or everything, when not replacing) are copied verbatim.
                destination_graph.add((node_map[node], pred, obj))
                continue

            if pred == RDF.type or obj in dmop:
                # Classes and dmop vocabulary terms are shared, not cloned.
                node_map[obj] = obj
            else:
                if obj not in seen:
                    pending.append(obj)
                if obj not in node_map:
                    node_map[obj] = BNode()
            destination_graph.add((node_map[node], pred, node_map[obj]))


def annotate_io_with_spec(ontology: Graph, workflow_graph: Graph, io: URIRef, io_spec: List[URIRef]):
    """Type ``io`` in ``workflow_graph`` with the target class of each shape in ``io_spec``.

    Shapes without an ``sh:targetClass`` are skipped, as are classes the node
    is already typed with.
    """
    for shape in io_spec:
        target_class = next(ontology.objects(shape, SH.targetClass, True), None)
        needs_typing = target_class is not None and (io, RDF.type, target_class) not in workflow_graph
        if needs_typing:
            workflow_graph.add((io, RDF.type, target_class))


def annotate_ios_with_specs(ontology: Graph, workflow_graph: Graph, io: List[URIRef], specs: List[List[URIRef]]):
    """Annotate each IO node with its positionally matching spec.

    Raises ``AssertionError`` when ``io`` and ``specs`` differ in length.
    """
    assert len(io) == len(specs), 'Number of IOs and specs must be the same'
    for position in range(len(specs)):
        annotate_io_with_spec(ontology, workflow_graph, io[position], specs[position])


def run_copy_transformation(ontology: Graph, workflow_graph: Graph, transformation, inputs, outputs):
    """Copy the description of one input node onto one output node.

    The positions are read from the transformation's ``dtbox:copy_input`` and
    ``dtbox:copy_output`` values and are decremented by one before indexing
    ``inputs`` and ``outputs``.
    """
    source_position = next(ontology.objects(transformation, dtbox.copy_input, True)).value
    target_position = next(ontology.objects(transformation, dtbox.copy_output, True)).value
    source_node = inputs[source_position - 1]
    target_node = outputs[target_position - 1]

    copy_subgraph(workflow_graph, source_node, workflow_graph, target_node)


def _replace_placeholder(query: str, placeholder: str, replacement: str) -> str:
    """Replace ``placeholder`` in ``query`` unless it is directly followed by a digit.

    The digit check keeps ``$input1`` from rewriting the first part of
    ``$input10``.
    """
    import re

    pattern = re.escape(placeholder) + r'(?!\d)'
    # A callable replacement is inserted literally, so backslashes inside the
    # N3 text are not interpreted as regex escapes.
    return re.sub(pattern, lambda _match: replacement, query)


def run_component_transformation(ontology: Graph, workflow_graph: Graph, component, inputs, outputs,
                                 parameters: dict):
    """Apply the transformations of ``component`` to ``workflow_graph``.

    * ``dtbox:CopyTransformation``: handled by ``run_copy_transformation``.
    * ``dtbox:LoaderTransformation``: skipped.
    * anything else: its ``dtbox:transformation_query`` is prefixed with the
      common namespaces, the ``$inputN`` / ``$outputN`` / ``$paramN`` /
      ``$parameterN`` placeholders (1-based) are replaced with the N3 form of
      the matching node or parameter value, and the result is executed as a
      SPARQL update.

    ``parameters`` maps each parameter to ``(value, order, condition)``; the
    placeholder number is ``order + 1``.
    """
    transformations = get_component_transformations(ontology, component)
    for transformation in transformations:
        if (transformation, RDF.type, dtbox.CopyTransformation) in ontology:
            run_copy_transformation(ontology, workflow_graph, transformation, inputs, outputs)
        elif (transformation, RDF.type, dtbox.LoaderTransformation) in ontology:
            continue
        else:
            prefixes = f'''
PREFIX dtbox: <{dtbox}>
PREFIX da: <{da}>
PREFIX rdf: <{RDF}>
PREFIX rdfs: <{RDFS}>
PREFIX owl: <{OWL}>
PREFIX xsd: <{XSD}>
PREFIX dmop: <{dmop}>
'''
            query = next(ontology.objects(transformation, dtbox.transformation_query, True)).value
            query = prefixes + query
            for i in range(len(inputs)):
                query = _replace_placeholder(query, f'$input{i + 1}', f'{inputs[i].n3()}')
            for i in range(len(outputs)):
                query = _replace_placeholder(query, f'$output{i + 1}', f'{outputs[i].n3()}')
            for param, (value, order, _) in parameters.items():
                query = _replace_placeholder(query, f'$param{order + 1}', f'{value.n3()}')
                query = _replace_placeholder(query, f'$parameter{order + 1}', f'{value.n3()}')
            workflow_graph.update(query)

In [15]:
def step_name(workflow_name, task_order, implementation):
    """Build the name of a workflow step.

    The result is ``<workflow_name>-step_<task_order>_<fragment>``, where
    ``<fragment>`` is ``implementation.fragment`` with dashes turned into
    underscores.
    """
    safe_fragment = implementation.fragment.replace("-", "_")
    return f'{workflow_name}-step_{task_order}_{safe_fragment}'

In [16]:
def add_loader_step(ontology, workflow_graph, workflow, dataset_node, workflow_name=None):
    """Add the CSV loader step (position 0) that outputs ``dataset_node``.

    ``workflow_name`` is used to name the step. When omitted it is recovered
    from ``workflow`` by removing the ``dw`` namespace prefix. The function
    used to read a notebook-global ``workflow_name``, so it only worked after
    the generation cell had assigned that variable.

    Raises ``ValueError`` when ``workflow_name`` is omitted and ``workflow`` is
    not inside the ``dw`` namespace. Returns the created step node.
    """
    if workflow_name is None:
        namespace_prefix = str(dw)
        workflow_iri = str(workflow)
        if not workflow_iri.startswith(namespace_prefix):
            raise ValueError(
                f'Cannot derive the workflow name from {workflow_iri}; pass workflow_name explicitly')
        workflow_name = workflow_iri[len(namespace_prefix):]

    loader_component = dabox.term('component-csv_local_reader')
    loader_step_name = step_name(workflow_name, 0, loader_component)
    loader_parameters = get_component_parameters(ontology, loader_component)
    loader_parameters = perform_param_substitution(workflow_graph, loader_parameters, [dataset_node])
    return add_step(workflow_graph, workflow, loader_step_name, loader_component, loader_parameters, 0, None, None, [dataset_node])

In [17]:
def build_workflow_train_test(workflow_name, ontology, dataset, main_component, split_component, transformations):
    """Build a train/test workflow graph for ``main_component``.

    The workflow consists of:

    1. a loader step (position 0) producing a copy of ``dataset``;
    2. a split step running ``split_component`` with a train and a test output;
    3. for each component in ``transformations`` followed by
       ``main_component``, a train step and a test step. The test step runs
       the component's ``dtbox:hasApplier`` when there is one, otherwise the
       component itself.

    The train branch feeds each step with the dataset output of the previous
    train step, and the test branch does the same for test steps. A test step
    also receives the model output of its train step when the train step
    produces a model and the test component accepts one. Components without a
    model or dataset IO on the test side no longer fail with ``TypeError``:
    the model wiring is skipped and the test dataset is left unchanged,
    mirroring the existing guard on the train side.

    Returns ``(workflow_graph, workflow)``.
    """
    workflow_graph = get_graph()
    workflow = dw.term(workflow_name)
    workflow_graph.add((workflow, RDF.type, dtbox.Workflow))
    task_order = 0

    dataset_node = dw.term(f'{workflow_name}-original_dataset')

    copy_subgraph(ontology, dataset, workflow_graph, dataset_node)

    loader_step = add_loader_step(ontology, workflow_graph, workflow, dataset_node)
    task_order += 1

    split_step_name = step_name(workflow_name, task_order, split_component)
    split_outputs = [dw[f'{split_step_name}-output_train'], dw[f'{split_step_name}-output_test']]
    split_parameters = get_component_parameters(ontology, split_component)
    split_step = add_step(workflow_graph, workflow,
                          split_step_name,
                          split_component,
                          split_parameters,
                          task_order,
                          loader_step,
                          [dataset_node],
                          split_outputs)
    run_component_transformation(ontology, workflow_graph, split_component,
                                 [dataset_node], split_outputs,
                                 split_parameters)

    task_order += 1

    train_dataset_node = split_outputs[0]
    test_dataset_node = split_outputs[1]

    previous_train_step = split_step
    previous_test_step = split_step

    for train_component in [*transformations, main_component]:
        # Without an applier the same component is run on the test branch too.
        test_component = next(ontology.objects(train_component, dtbox.hasApplier, True), train_component)
        same = train_component == test_component

        # The test step is registered after task_order is incremented below,
        # hence the "+ 1" in its name.
        train_step_name = step_name(workflow_name, task_order, train_component)
        test_step_name = step_name(workflow_name, task_order + 1, test_component)

        train_implementation = get_component_implementation(ontology, train_component)
        test_implementation = get_component_implementation(ontology, test_component)

        # ---- train step -------------------------------------------------
        train_input_specs = get_implementation_input_specs(ontology, train_implementation)
        train_input_data_index = identify_data_io(ontology, train_input_specs, return_index=True)
        train_transformation_inputs = [dw[f'{train_step_name}-input_{i}'] for i in range(len(train_input_specs))]
        train_transformation_inputs[train_input_data_index] = train_dataset_node
        annotate_ios_with_specs(ontology, workflow_graph, train_transformation_inputs,
                                train_input_specs)

        train_output_specs = get_implementation_output_specs(ontology, train_implementation)
        train_output_model_index = identify_model_io(ontology, train_output_specs, return_index=True)
        train_output_data_index = identify_data_io(ontology, train_output_specs, return_index=True)
        train_transformation_outputs = [dw[f'{train_step_name}-output_{i}'] for i in range(len(train_output_specs))]
        annotate_ios_with_specs(ontology, workflow_graph, train_transformation_outputs,
                                train_output_specs)

        train_parameters = get_component_parameters(ontology, train_component)
        train_parameters = perform_param_substitution(workflow_graph, train_parameters, train_transformation_inputs)
        train_step = add_step(workflow_graph, workflow,
                              train_step_name,
                              train_component, train_parameters, task_order, previous_train_step,
                              train_transformation_inputs,
                              train_transformation_outputs)

        previous_train_step = train_step

        run_component_transformation(ontology, workflow_graph, train_component, train_transformation_inputs,
                                     train_transformation_outputs, train_parameters)

        if train_output_data_index is not None:
            train_dataset_node = train_transformation_outputs[train_output_data_index]

        task_order += 1

        # ---- test step --------------------------------------------------
        test_input_specs = get_implementation_input_specs(ontology, test_implementation)
        test_input_data_index = identify_data_io(ontology, test_input_specs, return_index=True)
        test_input_model_index = identify_model_io(ontology, test_input_specs, return_index=True)
        test_transformation_inputs = [dw[f'{test_step_name}-input_{i}'] for i in range(len(test_input_specs))]
        test_transformation_inputs[test_input_data_index] = test_dataset_node
        # Wire the trained model only when both ends of the connection exist.
        if test_input_model_index is not None and train_output_model_index is not None:
            test_transformation_inputs[test_input_model_index] = train_transformation_outputs[train_output_model_index]
        annotate_ios_with_specs(ontology, workflow_graph, test_transformation_inputs,
                                test_input_specs)

        test_output_specs = get_implementation_output_specs(ontology, test_implementation)
        test_output_data_index = identify_data_io(ontology, test_output_specs, return_index=True)
        test_transformation_outputs = [dw[f'{test_step_name}-output_{i}'] for i in range(len(test_output_specs))]
        annotate_ios_with_specs(ontology, workflow_graph, test_transformation_outputs,
                                test_output_specs)

        # An applier also depends on the train step that produced its model.
        previous_test_steps = [previous_test_step, train_step] if not same else [previous_test_step]
        test_parameters = get_component_parameters(ontology, test_component)
        test_parameters = perform_param_substitution(workflow_graph, test_parameters, test_transformation_inputs)
        test_step = add_step(workflow_graph, workflow,
                             test_step_name,
                             test_component, test_parameters, task_order, previous_test_steps,
                             test_transformation_inputs,
                             test_transformation_outputs)

        run_component_transformation(ontology, workflow_graph, test_component, test_transformation_inputs,
                                     test_transformation_outputs, test_parameters)

        # Keep the current test dataset when the step emits no dataset.
        if test_output_data_index is not None:
            test_dataset_node = test_transformation_outputs[test_output_data_index]
        previous_test_step = test_step
        task_order += 1

    return workflow_graph, workflow

## Algorithm

In [18]:
# Build a minimal intent graph by hand: a single dtbox:Intent over the
# penguins.csv dataset that tackles the Description problem.
intent_graph = get_graph()
ins = Namespace('https://diviloper.dev/intent#')
intent_graph.add((ins.DescriptionIntent, RDF.type, dtbox.Intent))
intent_graph.add((ins.DescriptionIntent, dtbox.overData, dd.term('penguins.csv')))
intent_graph.add((ins.DescriptionIntent, dtbox.tackles, dabox.Description))

<Graph identifier=Ne78ebfd90ed34a19b95c38898782e7f2 (<class 'rdflib.graph.Graph'>)>

In [19]:
# Verbosity switch: when true, the generation cell prints its progress.
log = True

In [20]:
# Generate one workflow file per (main component, split component,
# transformation choice) combination for the intent defined above.
dataset, problem, intent_params, intent_iri = get_intent_info(intent_graph)

# Each run writes into its own timestamped folder.
folder = f'./workflows/{datetime.now().strftime("%Y-%m-%d %H-%M-%S")}/'
os.makedirs(folder, exist_ok=True)

if log:
    print(f'Dataset: {dataset.fragment}')
    print(f'Problem: {problem.fragment}')
    print(f'Intent params: {intent_params}')
    print('-------------------------------------------------')

# Candidate implementations for the problem, expanded to the components that use them.
comps = get_potential_implementations(ontology, problem, [x['param'] for x in intent_params])
components = [
    (c, impl, inputs)
    for impl, inputs in comps
    for c in get_implementation_components(ontology, impl)
]
if log:
    for component, implementation, inputs in components:
        print(f'Component: {component.fragment} ({implementation.fragment})')
        for im_input in inputs:
            print(f'\tInput: {[x.fragment for x in im_input]}')
    print('-------------------------------------------------')

workflow_order = 0

split_components = [
    dabox.term('component-random_absolute_train_test_split'),
    dabox.term('component-random_relative_train_test_split'),
    dabox.term('component-top_k_absolute_train_test_split'),
    dabox.term('component-top_k_relative_train_test_split'),
]

for component, implementation, inputs in components:
    if log:
        print(f'Component: {component.fragment} ({implementation.fragment})')
    shapes_to_satisfy = identify_data_io(ontology, inputs)
    assert shapes_to_satisfy is not None and len(shapes_to_satisfy) > 0
    if log:
        print(f'\tData input: {[x.fragment for x in shapes_to_satisfy]}')

    # Shapes the raw dataset does not conform to need a transformation first.
    unsatisfied_shapes = [shape for shape in shapes_to_satisfy if
                          not satisfies_shape(ontology, ontology, shape, dataset)]

    available_transformations = {
        shape: find_components_to_satisfy_shape(ontology, shape, only_learners=True)
        for shape in unsatisfied_shapes
    }

    if log:
        print('\tUnsatisfied shapes: ')
        # A distinct loop name keeps the module-level `comps` list intact.
        for shape, shape_components in available_transformations.items():
            print(f'\t\t{shape.fragment}: {[x.fragment for x in shape_components]}')

    # One split component combined with one transformation per unsatisfied shape.
    transformation_combinations = list(itertools.product(split_components, *available_transformations.values()))
    # TODO - check if the combination is valid and whether further transformations are needed

    if log:
        print(f'\tTotal combinations: {len(transformation_combinations)}')

    for i, transformation_combination in enumerate(transformation_combinations):
        if log:
            print(
                f'\t\tCombination {i + 1} / {len(transformation_combinations)}: {[x.fragment for x in transformation_combination]}')

        workflow_name = f'workflow_{workflow_order}_{intent_iri.fragment}_{uuid.uuid4()}'.replace('-', '_')
        # First element is the split component, the rest are the transformations.
        wg, w = build_workflow_train_test(workflow_name, ontology, dataset, component, transformation_combination[0],
                                          transformation_combination[1:])

        wg.add((w, dtbox.createdFor, intent_iri))
        wg.add((intent_iri, RDF.type, dtbox.Intent))

        if log:
            print(f'\t\tWorkflow {workflow_order}: {w.fragment}')
        wg.serialize(f'{folder}{workflow_name}.ttl', format='turtle')
        workflow_order += 1

Dataset: penguins.csv
Problem: Description
Intent params: []
-------------------------------------------------
Component: component-decision_tree_learner (implementation-decision_tree_learner)
	Input: ['LabeledTabularDatasetShape']
Component: component-hypertangent_svm_learner (implementation-svm_learner)
	Input: ['NormalizedTabularDatasetShape', 'NonNullTabularDatasetShape', 'LabeledTabularDatasetShape', 'NormalizedTabularDatasetShape', 'NonNullTabularDatasetShape']
Component: component-polynomial_svm_learner (implementation-svm_learner)
	Input: ['NormalizedTabularDatasetShape', 'NonNullTabularDatasetShape', 'LabeledTabularDatasetShape', 'NormalizedTabularDatasetShape', 'NonNullTabularDatasetShape']
Component: component-rbf_svm_learner (implementation-svm_learner)
	Input: ['NormalizedTabularDatasetShape', 'NonNullTabularDatasetShape', 'LabeledTabularDatasetShape', 'NormalizedTabularDatasetShape', 'NonNullTabularDatasetShape']
-------------------------------------------------
Component