### Library Imports

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from decimal import Decimal

Create a `SparkSession`. There is no need to create a `SparkContext` separately: the session already carries one, available as `spark.sparkContext`.

In [3]:
# Get an existing session or create one, running against the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Exploring Joins")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# The SparkContext is exposed by the session; keep a short alias for it.
sc = spark.sparkContext

In [9]:
# Three sample sales rows. The "shipping" row has no order_id (None),
# and every amount is an exact Decimal rather than a float.
sales_df = spark.createDataFrame(
    data=[
        (1, 1, 1, "order", Decimal("1")),
        (2, 1, 2, "refund", Decimal("1")),
        (3, 2, None, "shipping", Decimal("1")),
    ],
    schema=["id", "shop_id", "order_id", "type", "amount"],
)

# Tiny frame, so pulling it to pandas for display is cheap.
sales_df.toPandas()

Unnamed: 0,id,shop_id,order_id,type,amount
0,1,1,1.0,order,1.0
1,2,1,2.0,refund,1.0
2,3,2,,shipping,1.0


In [10]:
# Three sample orders; price and tax are exact Decimal values.
orders_df = spark.createDataFrame(
    data=[
        (1, 1, Decimal("1.00"), Decimal("1.13")),
        (2, 1, Decimal("2.00"), Decimal("1.13")),
        (3, 2, Decimal("3.00"), Decimal("1.13")),
    ],
    schema=["order_id", "shop_id", "price", "tax"],
)

# Tiny frame, so pulling it to pandas for display is cheap.
orders_df.toPandas()

Unnamed: 0,order_id,shop_id,price,tax
0,1,1,1.0,1.13
1,2,1,2.0,1.13
2,3,2,3.0,1.13


In [12]:
# Join sales to orders on order_id, derive total_price = amount * tax,
# keep the four columns of interest, and print the physical plan
# (explain() prints the plan; it does not return the rows).
(
    sales_df
    .join(orders_df, on='order_id')
    .withColumn('total_price', F.col("amount") * F.col("tax"))
    .select('order_id', 'total_price', 'amount', 'tax')
    .explain()
)

== Physical Plan ==
*(5) Project [order_id#26L, CheckOverflow((promote_precision(amount#28) * promote_precision(tax#37)), DecimalType(38,6)) AS total_price#68, amount#28, tax#37]
+- *(5) SortMergeJoin [order_id#26L], [order_id#34L], Inner
   :- *(2) Sort [order_id#26L ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(order_id#26L, 200)
   :     +- *(1) Project [order_id#26L, amount#28]
   :        +- *(1) Filter isnotnull(order_id#26L)
   :           +- Scan ExistingRDD[id#24L,shop_id#25L,order_id#26L,type#27,amount#28]
   +- *(4) Sort [order_id#34L ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(order_id#34L, 200)
         +- *(3) Project [order_id#34L, tax#37]
            +- *(3) Filter isnotnull(order_id#34L)
               +- Scan ExistingRDD[order_id#34L,shop_id#35L,price#36,tax#37]
