# Databricks

Below is a very simple transformation pipeline that shows how Flypipe might be used. Given the names of various fruits, we do some minor cleaning of the data and add two columns: color and category.

## Install flypipe

In [None]:
%pip install flypipe

## Create a temporary view representing a table

In [None]:
# Build a one-column DataFrame holding the raw, upper-case fruit names.
df = spark.createDataFrame(
    data=[(name,) for name in ("ORANGE", "WATERMELON", "LEMON")],
    schema=("_fruit",),
)

# Register the DataFrame as a temporary view named "table".
df.createOrReplaceTempView("table")

# Show the raw input rows.
display(df)

## Create a graph

In [None]:
from flypipe import node
from flypipe.datasource.spark import Spark
from flypipe.schema import Schema, Column
from flypipe.schema.types import String

import pyspark.sql.functions as F
@node(
    type="pyspark",
    dependencies=[Spark("table").select("_fruit").alias("df")],
    output=Schema(
        Column("fruit", String(), "fruit name"),
    ),
)
def clean(df):
    """Rename the raw ``_fruit`` column to ``fruit`` and lower-case its values.

    ``df`` is the ``_fruit`` selection of the ``table`` datasource declared in
    the node's dependencies.
    """
    renamed = df.withColumnRenamed("_fruit", "fruit")
    return renamed.withColumn("fruit", F.lower(F.col("fruit")))



@node(
    type="pyspark",
    dependencies=[clean.select("fruit").alias("df")],
    output=Schema(
        Column("fruit", String(), "fruit name"),
        Column("color", String(), "color of the fruit"),
    ),
)
def color(df):
    """Add a ``color`` column derived from each row's ``fruit`` value.

    The column starts as a copy of ``fruit``; every fruit name that has an
    entry in the lookup table is then swapped for its color. Names without an
    entry are not replaced, so ``color`` keeps the fruit name for them.
    """
    color_by_fruit = {
        "blackberry": "black",
        "strawberry": "red",
        "orange": "orange",
        "watermelon": "red",
        "lemon": "yellow",
        "plum": "purple",
    }

    with_color = df.withColumn("color", F.col("fruit"))
    # A dict passed as `to_replace` maps each key to its replacement value;
    # `subset` restricts the substitution to the new column.
    return with_color.replace(color_by_fruit, subset="color")



@node(
    type="pyspark",
    dependencies=[clean.select("fruit").alias("df")],
    output=Schema(
        Column("fruit", String(), "fruit name"),
        Column("category", String(), "category of the fruit"),
    ),
)
def category(df):
    """Add a ``category`` column derived from each row's ``fruit`` value.

    The column starts as a copy of ``fruit``; every fruit name that has an
    entry in the lookup table is then swapped for its category. Names without
    an entry are not replaced, so ``category`` keeps the fruit name for them.
    """
    category_by_fruit = {
        "blackberry": "berry",
        "strawberry": "berry",
        "orange": "citrus",
        "watermelon": "misc",
        "lemon": "citrus",
        "plum": "stonefruit",
    }

    with_category = df.withColumn("category", F.col("fruit"))
    # A dict passed as `to_replace` maps each key to its replacement value;
    # `subset` restricts the substitution to the new column.
    return with_category.replace(category_by_fruit, subset="category")




@node(
    type="pyspark",
    dependencies=[
        color.select("fruit", "color"),
        category.select("fruit", "category"),
    ],
    output=Schema(
        Column("fruit", String(), "fruit description"),
        Column("color", String(), "color of the fruit"),
        Column("category", String(), "category of the fruit"),
    ),
)
def fruits(color, category):
    """Combine the ``color`` and ``category`` node outputs into one DataFrame.

    Performs a left join on ``fruit`` with ``color`` as the left side, so
    every row of ``color`` is kept.
    """
    combined = color.join(category, on="fruit", how="left")
    return combined


## Execution Graph

In [None]:
# Pass the HTML returned by `fruits.html()` to `displayHTML` for rendering.
displayHTML(fruits.html())

## Running a pipeline

In [None]:
# Run the `fruits` node, handing it the notebook's `spark` object, then show
# the result. Note that this rebinds `df`, which earlier held the raw input.
df = fruits.run(spark)
display(df)