In [1]:
import pyspark.sql.functions as sf

In [2]:
from pyspark.sql import SparkSession

# Reuse a session that already exists in this kernel; otherwise start a local
# one that uses all available cores and 4 GB of driver memory.
if 'spark' not in locals():
    spark = (
        SparkSession.builder
        .master("local[*]")
        .config("spark.driver.memory", "4G")
        .getOrCreate()
    )

# Switch off adaptive query execution and automatic broadcast joins
# (presumably so the execution plans shown later stay simple and predictable).
spark.conf.set("spark.sql.adaptive.enabled", False)
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

# Display the session summary as the cell output.
spark

### Set Checkpoint directory

First we need to specify a checkpoint directory on a reliable shared file system.

In [19]:
# Register the directory that Spark uses to store checkpointed data.
# NOTE(review): "/tmp/checkpoints" is a plain local path. That is fine for the
# local[*] session created above, but on a real cluster this should point to a
# shared file system (e.g. HDFS) -- confirm before reusing this notebook there.
spark.sparkContext.setCheckpointDir("/tmp/checkpoints")

# 1 Fictional Sales Data

In this example we use a fictional data set of company revenues. The special property of this data set is that a company can have a different company as its parent company. Eventually a business expert wants to see the whole revenue of a company including all child companies. This requires that we build up an additional table containing all children (direct and indirect) for every company, such that we can join the revenues against this table and then aggregate over all direct and indirect children for each parent.

Let's start by loading and inspecting the data.

In [20]:
# S3 location of the training data. It is overridden by the assignment below,
# so as written this value is never used.
basedir = "s3://dimajix-training/data"
# NOTE(review): this second assignment wins. It is an absolute path on the
# author's machine and must be adjusted (or removed to fall back to the S3
# location above) before running the notebook anywhere else.
basedir = "file:///home/kaya/Jupyter/Dimajix/pyspark-advanced/data"

In [21]:
# Load the sales CSV: the first line holds the column names and the column
# types are inferred from the file contents.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(f"{basedir}/global-sales.csv")
)

# Show the inferred schema.
data.printSchema()

root
 |-- company: integer (nullable = true)
 |-- parent_company: integer (nullable = true)
 |-- company_name: string (nullable = true)
 |-- revenue: integer (nullable = true)



In [22]:
# Collect the DataFrame to the driver and render it as a pandas table.
data.toPandas()

Unnamed: 0,company,parent_company,company_name,revenue
0,1,,Global Earth Inc,10000
1,2,1.0,European Markets,0
2,3,2.0,Germany Sales GmbH,2000
3,4,2.0,Spain Products,123000
4,5,2.0,Swiss Made,213000
5,6,2.0,France Superstars,241000
6,7,3.0,Berlin Store,287000
7,8,3.0,Hamburg Store,312000
8,9,3.0,Hessian Store Group,10000
9,10,9.0,Frankfurt Shop Central,287000


# 2 Single Step of transitive parent-child relations

In the next step we want to build the helper table containing all children for every company. We will calculate this table using an iterative algorithm which adds the next level of children in every iteration. We first implement a single iteration, which will add the next level of children to each parent company.

In [23]:
# Keep only rows that actually have a parent company and reduce them to the
# two columns the algorithm works on.
cleaned_df = (
    data
    .filter(sf.col("parent_company").isNotNull())
    .select("company", "parent_company")
)

In [24]:
def iterate_parent_child(df):
    """Extend a parent/child relation by one more level of indirection.

    Args:
        df: DataFrame with the columns ``company`` and ``parent_company``.

    Returns:
        DataFrame with the columns ``parent_company`` and ``company`` (in that
        order) holding every relation of ``df`` plus, for each pair of
        relations (a -> b) and (b -> c), the derived relation (a -> c).
        Duplicate rows are removed.
    """
    # A self-join needs two differently named views of the same table,
    # otherwise the join condition would be ambiguous.
    parents = df.alias("parent")
    children = df.alias("child")

    # Match every relation with the relations of its child company ...
    is_child_of_child = sf.col("parent.company") == sf.col("child.parent_company")
    # ... and connect the upper parent directly with the lower child.
    grandchildren = parents.join(children, is_child_of_child, "inner").select(
        sf.col("parent.parent_company"),
        sf.col("child.company"),
    )

    # Carry the already known relations along, otherwise they would be lost.
    known = parents.select(parents["parent_company"], parents["company"])

    combined = grandchildren.union(known)
    return combined.distinct()

### Perform one iteration

Now let us perform a single iteration and inspect the result.

In [25]:
# Run a single iteration. The result is deliberately NOT called `next`, since
# that would shadow Python's built-in next() for the rest of the kernel session.
next_level_df = iterate_parent_child(cleaned_df)

# Sort for a stable display and render as a pandas table.
next_level_df.orderBy("parent_company", "company").toPandas()

Unnamed: 0,parent_company,company
0,1,2
1,1,3
2,1,4
3,1,5
4,1,6
5,1,13
6,1,14
7,1,15
8,1,16
9,1,17


# 3 Iterative Algorithm

Now that we can add one level of indirection to our table of parent-child relations, we simply need to repeat this step for as long as new records are created. At the end we also add a reflexive relation of each company to itself, so that when the table is used for aggregating over all children, the revenue of each company itself is added up in addition to the revenue of its children.

In [26]:
def calc_transitive_children(df, checkpoint=False):
    """Compute all direct and indirect children of every company.

    Args:
        df: DataFrame with at least the columns ``company`` and
            ``parent_company``. Rows with a null ``parent_company`` contribute
            no parent/child relation but still get their self relation.
        checkpoint: If true, the result of every iteration is materialized
            with ``DataFrame.checkpoint()`` so the execution plan does not
            grow with each iteration. Requires that a checkpoint directory
            has been set via ``SparkContext.setCheckpointDir``. Defaults to
            ``False``.

    Returns:
        DataFrame with the columns ``parent_company`` and ``company``
        containing one row per (ancestor, descendant) pair plus one
        (company, company) row for every company in ``df``.
    """
    # Remove records without a parent. This works on the passed-in `df`, not
    # on a notebook-global frame, so the function can be used with any input.
    # The columns are selected in (parent_company, company) order because the
    # final union() matches columns by position; this keeps the result correct
    # even if the loop below terminates without a single successful iteration.
    # distinct() makes the row count a reliable convergence criterion.
    cleaned_df = (
        df.filter(df["parent_company"].isNotNull())
        .select(df["parent_company"], df["company"])
        .distinct()
    )

    # Iterate as long as new records are created
    cur_df = cleaned_df
    cur_count = cur_df.count()

    while True:
        next_df = iterate_parent_child(cur_df)
        if checkpoint:
            # Cut the lineage; otherwise every iteration nests the complete
            # plan of the previous one inside a new self-join.
            next_df = next_df.checkpoint()
        next_count = next_df.count()
        # The relation can only grow, so an unchanged count means that no new
        # records were created and we are finished.
        if next_count == cur_count:
            break
        cur_df = next_df
        cur_count = next_count

    # Create additional reflexive relation of each company to itself
    self_df = df.select(sf.col("company").alias("parent_company"), sf.col("company"))

    return self_df.union(cur_df).distinct()

### Run Algorithm

Now let us run the whole algorithm on the original data set and inspect the result.

In [27]:
# Build the helper table of all (parent, child) pairs -- direct, indirect and
# each company paired with itself.
relations = calc_transitive_children(data)

# Sort for a stable display and render as a pandas table.
relations.orderBy("parent_company","company").toPandas()

Unnamed: 0,parent_company,company
0,1,1
1,1,2
2,1,3
3,1,4
4,1,5
...,...,...
58,16,16
59,17,17
60,18,18
61,19,19


### Inspect execution plan

In [28]:
# Print the physical execution plan Spark would use to compute `relations`.
relations.explain()

== Physical Plan ==
*(4) HashAggregate(keys=[parent_company#706, company#551], functions=[])
+- Exchange hashpartitioning(parent_company#706, company#551, 200), true, [id=#6643]
   +- *(3) HashAggregate(keys=[parent_company#706, company#551], functions=[])
      +- Union
         :- *(1) Project [company#551 AS parent_company#706, company#551]
         :  +- FileScan csv [company#551] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/home/kaya/Jupyter/Dimajix/pyspark-advanced/data/global-sales.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<company:int>
         +- *(2) Scan ExistingRDD[parent_company#552,company#645]




# 4 Perform Aggregation

Now let us perform the final aggregation, such that we can calculate the revenue of each company including each direct and indirect child. This can be performed by joining the `relations` data frame to the original `data` data frame and then grouping on the `parent_company` column of the `relations` data frame and adding up the revenue.

In [33]:
# Attach each company's revenue to every relation in which it appears as the
# child, then add the revenues up per parent. The grouping column is taken
# explicitly from `relations`, because `data` has a `parent_company` column too.
hierarchical_revenue = (
    relations
    .join(data, ["company"])
    .groupby(relations["parent_company"])
    .agg(sf.sum("revenue").alias("total_revenue"))
)

In [34]:
# Collect the aggregated revenues to the driver and render them as a pandas table.
hierarchical_revenue.toPandas()

Unnamed: 0,parent_company,total_revenue
0,12,90000
1,1,9492820
2,13,2231000
3,16,2131000
4,6,241000
5,3,1109000
6,20,198000
7,5,213000
8,19,2179820
9,15,197000


### Check Totals

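Just to verify the result, let us compare the total revenue of company 1 ("Global Earth Inc") with a simple sum over all revenues. Company 1 has no parent company, so if every other company is one of its direct or indirect children, both numbers must be identical.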

In [35]:
# Plain sum over the revenue of every company, without any hierarchy.
totals = data.select(sf.sum("revenue"))

# Render the single-row result as a pandas table.
totals.toPandas()

Unnamed: 0,sum(revenue)
0,9492820
