# Parallel

In [1]:
from flypipe import node
from flypipe.schema import Schema, Column
from flypipe.schema.types import Float, Integer

import pandas as pd
from sklearn import datasets

@node(type="pandas")
def node_0():
    return pd.DataFrame(data = {'col1': ['val1']})


@node(type="pandas",
     dependencies=[node_0])
def node_1(node_0):
    return node_0
    
@node(type="pandas",
     dependencies=[node_0])
def node_2(node_0):
    return node_0

@node(type="pandas",
     dependencies=[node_0])
def node_3(node_0):
    return node_0


@node(type="pandas",
     dependencies=[
         node_1,
         node_2,
         node_3
     ])
def node_4(node_1, node_2, node_3):
    return node_1
    
node_4.run()

html_width=600
html_height=300
displayHTML(node_4.html(width=html_width, height=html_height))

In [2]:
df = spark.createDataFrame(data=[("LEMON", "Yellow",), ("LIME", "Green",)], schema=["_FRUIT", "Color"])
display(df)

df.createOrReplaceTempView("table")

                                                                                

_FRUIT,Color
LEMON,Yellow
LIME,Green


In [3]:
from flypipe import node
from flypipe import node_function
from flypipe.datasource.spark import Spark
from flypipe.schema import Schema, Column
from flypipe.schema.types import String
import pyspark.sql.functions as F

renames = {
    "_FRUIT": "fruit",
    "Color": "color"
}

@node_function(requested_columns=True)
def rename(requested_columns):
    
    raw_columns_queried = [raw_col for raw_col, new_col in renames.items() if new_col in requested_columns]
    print(f"raw columns to be queried: {raw_columns_queried}")
    
    @node(
        type="pyspark",
        dependencies=[
            Spark("table").select(raw_columns_queried)
        ],
        output=Schema([
         Column(col, String(), col) for col in requested_columns   
        ])
    )
    def rename(table):
        for raw_col, new_col in renames.items():
            if new_col in requested_columns:
                print(f"renaming column `{raw_col}` to `{new_col}`")
                table = table.withColumnRenamed(raw_col, new_col)
        
        return table
    
    return rename

@node_function(requested_columns=True)
def lower(requested_columns):
    last_transformation = rename
    
    for requested_column in requested_columns:
        @node(
            type="pyspark",
            dependencies=[
                last_transformation.select(requested_columns).alias("df")
            ],
            output=Schema([
             Column(col, String(), col) for col in requested_columns   
            ])
        )
        def transformation(df):
            return df.withColumn(requested_column, F.lower(requested_column))
        print(f"lower_{requested_column}")
        transformation.function.__name__ = f"lower_{requested_column}"
        last_transformation = transformation
        
    
    return last_transformation




@node(
    type="pyspark",
    dependencies=[lower.select("fruit", "color")]
)
def t1(lower):
    return lower

df = t1.run(spark)
display(df)   

           
html_width=800
html_height=300
displayHTML(t1.html(width=html_width, height=html_height))
        

lower_color
lower_fruit
raw columns to be queried: ['_FRUIT', 'Color']
renaming column `_FRUIT` to `fruit`
renaming column `Color` to `color`


color,fruit
Yellow,lemon
Green,lime


lower_color
lower_fruit
raw columns to be queried: ['_FRUIT', 'Color']
