# Chapter 8 - Joins

In [1]:
from pyspark.sql import SparkSession

# Local Spark session for this notebook; the app name matches the chapter (Chapter 8 - Joins).
spark = SparkSession.builder.master("local").appName("chapter8").getOrCreate()

In [2]:
# Sample data used throughout the chapter. Column names are passed directly as
# the schema; column types are inferred from the rows.

# People, the id of the graduate program they attended, and their Spark status ids.
person = spark.createDataFrame(
    [
        (0, 'Bill Chambers', 0, [100]),
        (1, 'Matei Zaharia', 1, [500, 250, 100]),
        (2, 'Michael Armbrust', 1, [250, 10]),
    ],
    ['id', 'name', 'graduate_program', 'spark_status'],
)

# Graduate programs, keyed by id.
graduateProgram = spark.createDataFrame(
    [
        (0, 'Masters', 'SOI', 'UC Berkeley'),
        (2, 'Masters', 'EECS', 'UC Berkeley'),
        (1, 'PhD', 'EECS', 'UC Berkeley'),
    ],
    ['id', 'degree', 'department', 'school'],
)

# Lookup table from Spark status id to a readable status name.
sparkStatus = spark.createDataFrame(
    [
        (500, 'Vice President'),
        (250, 'PMC Member'),
        (100, 'Contributor'),
    ],
    ['id', 'status'],
)

In [3]:
# Preview the person DataFrame.
person.show()

+---+----------------+----------------+---------------+
| id|            name|graduate_program|   spark_status|
+---+----------------+----------------+---------------+
|  0|   Bill Chambers|               0|          [100]|
|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  2|Michael Armbrust|               1|      [250, 10]|
+---+----------------+----------------+---------------+



In [4]:
# Preview the graduateProgram DataFrame.
graduateProgram.show()

+---+-------+----------+-----------+
| id| degree|department|     school|
+---+-------+----------+-----------+
|  0|Masters|       SOI|UC Berkeley|
|  2|Masters|      EECS|UC Berkeley|
|  1|    PhD|      EECS|UC Berkeley|
+---+-------+----------+-----------+



In [5]:
# Preview the sparkStatus lookup DataFrame.
sparkStatus.show()

+---+--------------+
| id|        status|
+---+--------------+
|500|Vice President|
|250|    PMC Member|
|100|   Contributor|
+---+--------------+



## Inner joins

In [6]:
# Match each person's graduate_program value against the id of a graduate program.
joinExpression = person.graduate_program == graduateProgram["id"]

In [7]:
# No join type given: only rows that satisfy the join expression are returned.
person.join(graduateProgram, on=joinExpression).show()

+---+----------------+----------------+---------------+---+-------+----------+-----------+
| id|            name|graduate_program|   spark_status| id| degree|department|     school|
+---+----------------+----------------+---------------+---+-------+----------+-----------+
|  0|   Bill Chambers|               0|          [100]|  0|Masters|       SOI|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  1|    PhD|      EECS|UC Berkeley|
|  2|Michael Armbrust|               1|      [250, 10]|  1|    PhD|      EECS|UC Berkeley|
+---+----------------+----------------+---------------+---+-------+----------+-----------+



In [8]:
# This expression is valid but never matches: person names and school names
# share no values, so a join on it returns no rows.
wrongJoinExpression = person['name'] == graduateProgram['school']

In [9]:
# Joining on an expression that never matches yields an empty result.
person.join(graduateProgram, on=wrongJoinExpression).show()

+---+----+----------------+------------+---+------+----------+------+
| id|name|graduate_program|spark_status| id|degree|department|school|
+---+----+----------------+------------+---+------+----------+------+
+---+----+----------------+------------+---+------+----------+------+



In [10]:
# The join type is the third argument of join() (keyword: how).
# "inner" gives the same result as the join above that omitted it.
joinType = "inner"
person.join(graduateProgram, on=joinExpression, how=joinType).show()

+---+----------------+----------------+---------------+---+-------+----------+-----------+
| id|            name|graduate_program|   spark_status| id| degree|department|     school|
+---+----------------+----------------+---------------+---+-------+----------+-----------+
|  0|   Bill Chambers|               0|          [100]|  0|Masters|       SOI|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  1|    PhD|      EECS|UC Berkeley|
|  2|Michael Armbrust|               1|      [250, 10]|  1|    PhD|      EECS|UC Berkeley|
+---+----------------+----------------+---------------+---+-------+----------+-----------+



## Outer joins

In [11]:
# Full outer join: rows from either side are kept, with nulls where there is no match.
joinType = "outer"
person.join(graduateProgram, on=joinExpression, how=joinType).show()

+----+----------------+----------------+---------------+---+-------+----------+-----------+
|  id|            name|graduate_program|   spark_status| id| degree|department|     school|
+----+----------------+----------------+---------------+---+-------+----------+-----------+
|   0|   Bill Chambers|               0|          [100]|  0|Masters|       SOI|UC Berkeley|
|   1|   Matei Zaharia|               1|[500, 250, 100]|  1|    PhD|      EECS|UC Berkeley|
|   2|Michael Armbrust|               1|      [250, 10]|  1|    PhD|      EECS|UC Berkeley|
|null|            null|            null|           null|  2|Masters|      EECS|UC Berkeley|
+----+----------------+----------------+---------------+---+-------+----------+-----------+



## Left Outer joins

In [12]:
# Left outer join: keep every row of the left DataFrame (graduateProgram) and
# fill the person columns with null where no person matches.
joinType = "left_outer"
graduateProgram.join(person, joinExpression, joinType).show()

+---+-------+----------+-----------+----+----------------+----------------+---------------+
| id| degree|department|     school|  id|            name|graduate_program|   spark_status|
+---+-------+----------+-----------+----+----------------+----------------+---------------+
|  0|Masters|       SOI|UC Berkeley|   0|   Bill Chambers|               0|          [100]|
|  1|    PhD|      EECS|UC Berkeley|   1|   Matei Zaharia|               1|[500, 250, 100]|
|  1|    PhD|      EECS|UC Berkeley|   2|Michael Armbrust|               1|      [250, 10]|
|  2|Masters|      EECS|UC Berkeley|null|            null|            null|           null|
+---+-------+----------+-----------+----+----------------+----------------+---------------+



## Right Outer joins

In [13]:
# Right outer join: keep every row of the right DataFrame (graduateProgram),
# with nulls in the person columns where no person matches.
joinType = "right_outer"
person.join(graduateProgram, on=joinExpression, how=joinType).show()

+----+----------------+----------------+---------------+---+-------+----------+-----------+
|  id|            name|graduate_program|   spark_status| id| degree|department|     school|
+----+----------------+----------------+---------------+---+-------+----------+-----------+
|   0|   Bill Chambers|               0|          [100]|  0|Masters|       SOI|UC Berkeley|
|   1|   Matei Zaharia|               1|[500, 250, 100]|  1|    PhD|      EECS|UC Berkeley|
|   2|Michael Armbrust|               1|      [250, 10]|  1|    PhD|      EECS|UC Berkeley|
|null|            null|            null|           null|  2|Masters|      EECS|UC Berkeley|
+----+----------------+----------------+---------------+---+-------+----------+-----------+



## Left Semi joins
A left semi join keeps only the rows of the left DataFrame that have a match in the right DataFrame, and it does not include any columns from the right DataFrame. It is better understood as a filter than as a join.

In [14]:
# Left semi join: only graduateProgram columns are returned, restricted to
# programs that have at least one matching person.
joinType = "left_semi"
graduateProgram.join(person, on=joinExpression, how=joinType).show()

+---+-------+----------+-----------+
| id| degree|department|     school|
+---+-------+----------+-----------+
|  0|Masters|       SOI|UC Berkeley|
|  1|    PhD|      EECS|UC Berkeley|
+---+-------+----------+-----------+



In [15]:
# Append a second row with id 0 to show that a semi join keeps duplicate keys
# on the left side. joinType is expected to still be "left_semi" from the previous cell.
gradProgram2 = graduateProgram.union(
    spark.createDataFrame(
        [(0, "Masters", "Duplicated Row", "Duplicated Schoold")]
    )
)
gradProgram2.join(person, on=joinExpression, how=joinType).show()

+---+-------+--------------+------------------+
| id| degree|    department|            school|
+---+-------+--------------+------------------+
|  0|Masters|           SOI|       UC Berkeley|
|  0|Masters|Duplicated Row|Duplicated Schoold|
|  1|    PhD|          EECS|       UC Berkeley|
+---+-------+--------------+------------------+



## Left Anti joins
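The opposite of left semi joins: they keep only the rows of the left DataFrame that have no match in the right DataFrame. Like semi joins, they do not include any columns from the right DataFrame, so they act as a filter that excludes matching rows.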

In [16]:
# Left anti join: only graduateProgram columns are returned, restricted to
# programs that have no matching person.
joinType = "left_anti"
graduateProgram.join(person, on=joinExpression, how=joinType).show()

+---+-------+----------+-----------+
| id| degree|department|     school|
+---+-------+----------+-----------+
|  2|Masters|      EECS|UC Berkeley|
+---+-------+----------+-----------+



## Cross (Cartesian) joins

Careful! This will cause an absolute explosion in the number of rows contained in the resulting DataFrame.

In [17]:
# Passing "cross" together with a join expression does not produce a Cartesian
# product: the expression is still applied, so the output matches an inner join.
# A true cross join has to be requested explicitly with crossJoin().
joinType = "cross"
graduateProgram.join(person, joinExpression, joinType).show()

+---+-------+----------+-----------+---+----------------+----------------+---------------+
| id| degree|department|     school| id|            name|graduate_program|   spark_status|
+---+-------+----------+-----------+---+----------------+----------------+---------------+
|  0|Masters|       SOI|UC Berkeley|  0|   Bill Chambers|               0|          [100]|
|  1|    PhD|      EECS|UC Berkeley|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  1|    PhD|      EECS|UC Berkeley|  2|Michael Armbrust|               1|      [250, 10]|
+---+-------+----------+-----------+---+----------------+----------------+---------------+



In [18]:
# Explicit Cartesian product: every person row is paired with every graduate program row.
person.crossJoin(graduateProgram).show()

+---+----------------+----------------+---------------+---+-------+----------+-----------+
| id|            name|graduate_program|   spark_status| id| degree|department|     school|
+---+----------------+----------------+---------------+---+-------+----------+-----------+
|  0|   Bill Chambers|               0|          [100]|  0|Masters|       SOI|UC Berkeley|
|  0|   Bill Chambers|               0|          [100]|  2|Masters|      EECS|UC Berkeley|
|  0|   Bill Chambers|               0|          [100]|  1|    PhD|      EECS|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  0|Masters|       SOI|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  2|Masters|      EECS|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  1|    PhD|      EECS|UC Berkeley|
|  2|Michael Armbrust|               1|      [250, 10]|  0|Masters|       SOI|UC Berkeley|
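|  2|Michael Armbrust|               1|      [250, 10]|  2|Masters|      EECS|UC Berkeley|
|  2|Michael Armbrust|               1|      [250, 10]|  1|    PhD|      EECS|UC Berkeley|
+---+----------------+----------------+---------------+---+-------+----------+-----------+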

## Joins on complex types

Any expression is a valid join expression, assuming it returns a boolean

In [19]:
from pyspark.sql.functions import expr

# Rename person.id first so that "id" in the SQL expression refers only to sparkStatus.id,
# then keep each (person, status) pair whose status id appears in the spark_status array.
(
    person
    .withColumnRenamed("id", "personId")
    .join(sparkStatus, on=expr("array_contains(spark_status, id)"))
    .show()
)

+--------+----------------+----------------+---------------+---+--------------+
|personId|            name|graduate_program|   spark_status| id|        status|
+--------+----------------+----------------+---------------+---+--------------+
|       0|   Bill Chambers|               0|          [100]|100|   Contributor|
|       1|   Matei Zaharia|               1|[500, 250, 100]|500|Vice President|
|       1|   Matei Zaharia|               1|[500, 250, 100]|250|    PMC Member|
|       1|   Matei Zaharia|               1|[500, 250, 100]|100|   Contributor|
|       2|Michael Armbrust|               1|      [250, 10]|250|    PMC Member|
+--------+----------------+----------------+---------------+---+--------------+



### Handling duplicate column names


In [20]:
# Rename graduateProgram.id so that both DataFrames have a "graduate_program" column.
# Joining on an expression keeps both, so the result has two columns with the same name!
gradProgramDupe = graduateProgram.withColumnRenamed("id", "graduate_program")
joinExpression = gradProgramDupe.graduate_program == person.graduate_program
person.join(gradProgramDupe, on=joinExpression).show()

+---+----------------+----------------+---------------+----------------+-------+----------+-----------+
| id|            name|graduate_program|   spark_status|graduate_program| degree|department|     school|
+---+----------------+----------------+---------------+----------------+-------+----------+-----------+
|  0|   Bill Chambers|               0|          [100]|               0|Masters|       SOI|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|               1|    PhD|      EECS|UC Berkeley|
|  2|Michael Armbrust|               1|      [250, 10]|               1|    PhD|      EECS|UC Berkeley|
+---+----------------+----------------+---------------+----------------+-------+----------+-----------+



In [21]:
# Approach 1: join on the shared column name instead of a boolean expression;
# the result then contains a single graduate_program column.
person.join(gradProgramDupe, on="graduate_program").show()

+----------------+---+----------------+---------------+-------+----------+-----------+
|graduate_program| id|            name|   spark_status| degree|department|     school|
+----------------+---+----------------+---------------+-------+----------+-----------+
|               0|  0|   Bill Chambers|          [100]|Masters|       SOI|UC Berkeley|
|               1|  1|   Matei Zaharia|[500, 250, 100]|    PhD|      EECS|UC Berkeley|
|               1|  2|Michael Armbrust|      [250, 10]|    PhD|      EECS|UC Berkeley|
+----------------+---+----------------+---------------+-------+----------+-----------+



In [22]:
# Approach 2: join on the expression, then drop person's copy of the duplicated column.
person.join(gradProgramDupe, on=joinExpression).drop(person["graduate_program"]).show()

+---+----------------+---------------+----------------+-------+----------+-----------+
| id|            name|   spark_status|graduate_program| degree|department|     school|
+---+----------------+---------------+----------------+-------+----------+-----------+
|  0|   Bill Chambers|          [100]|               0|Masters|       SOI|UC Berkeley|
|  1|   Matei Zaharia|[500, 250, 100]|               1|    PhD|      EECS|UC Berkeley|
|  2|Michael Armbrust|      [250, 10]|               1|    PhD|      EECS|UC Berkeley|
+---+----------------+---------------+----------------+-------+----------+-----------+



In [23]:
# Approach 3: give the key column a unique name before joining, so no names collide.
gradProgram3 = graduateProgram.withColumnRenamed("id", "grad_id")
joinExpression = person["graduate_program"] == gradProgram3["grad_id"]
person.join(gradProgram3, on=joinExpression).show()

+---+----------------+----------------+---------------+-------+-------+----------+-----------+
| id|            name|graduate_program|   spark_status|grad_id| degree|department|     school|
+---+----------------+----------------+---------------+-------+-------+----------+-----------+
|  0|   Bill Chambers|               0|          [100]|      0|Masters|       SOI|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|      1|    PhD|      EECS|UC Berkeley|
|  2|Michael Armbrust|               1|      [250, 10]|      1|    PhD|      EECS|UC Berkeley|
+---+----------------+----------------+---------------+-------+-------+----------+-----------+

