<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark/examples/04-joins.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Joins
- inner join
- left/right join
- full join
- left anti join
- cartesian product
- union/unionAll
- minus
- intersect

# Setting up PySpark

In [None]:
%pip install pyspark



In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the notebook's Spark session: local master, app name
# 'Spark Course', and the Spark UI port pinned to 4050.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Spark Course')
    .config('spark.ui.port', '4050')
    .getOrCreate()
)
# SparkContext handle; the data-setup cell uses it to parallelize Python lists.
sc = spark.sparkContext

In [3]:
# Column layouts of the two demo tables.
employee_columns = ["id", "name", "dpto"]
dpto_columns = ["dpto", "deptname"]

# (id, name, dpto) rows. Employee 107 references department 5, which has no
# row in dpto_data, so it only shows up in left/full/anti join results.
# NOTE(review): dpto is an int here but a string in dpto_data; the recorded
# join outputs still match the keys, presumably via Spark's implicit cast.
employee_data = [
    ("101", "Chloe", 3),
    ("102", "Paul", 1),
    ("103", "John", 1),
    ("104", "Lisa", 2),
    ("105", "Evan", 3),
    ("106", "Amy", 3),
    ("107", "Jimmy", 5),
]

# (dpto, deptname) rows. Department 4 has no employee pointing at it.
dpto_data = [
    ("1", "Engineering"),
    ("2", "Sales"),
    ("3", "Marketing"),
    ("4", "Finance"),
]

# Turn each Python list into an RDD and then into a named-column dataframe.
employee = sc.parallelize(employee_data).toDF(employee_columns)
dpto = sc.parallelize(dpto_data).toDF(dpto_columns)

In [None]:
# Employee dataframe: one row per employee (id, name, dpto)
employee.show()

+---+-----+----+
| id| name|dpto|
+---+-----+----+
|101|Chloe|   3|
|102| Paul|   1|
|103| John|   1|
|104| Lisa|   2|
|105| Evan|   3|
|106|  Amy|   3|
|107|Jimmy|   5|
+---+-----+----+



In [None]:
# Department dataframe: one row per department (dpto, deptname)
dpto.show()

+----+-----------+
|dpto|   deptname|
+----+-----------+
|   1|Engineering|
|   2|      Sales|
|   3|  Marketing|
|   4|    Finance|
+----+-----------+



# Joins

In [None]:
# Inner join - keeps only the rows whose key exists on both sides.
# Variant 1: join on the shared column name; the result has one `dpto` column.
inner_by_name = employee.join(dpto, on=["dpto"])
inner_by_name.show()
# Variant 2: join on an explicit condition; same rows, but the `dpto` column
# of each dataframe is kept, so it appears twice in the result.
inner_by_condition = employee.join(dpto, on=employee["dpto"] == dpto["dpto"], how="inner")
inner_by_condition.show()

+----+---+-----+-----------+
|dpto| id| name|   deptname|
+----+---+-----+-----------+
|   1|102| Paul|Engineering|
|   1|103| John|Engineering|
|   2|104| Lisa|      Sales|
|   3|101|Chloe|  Marketing|
|   3|105| Evan|  Marketing|
|   3|106|  Amy|  Marketing|
+----+---+-----+-----------+

+---+-----+----+----+-----------+
| id| name|dpto|dpto|   deptname|
+---+-----+----+----+-----------+
|102| Paul|   1|   1|Engineering|
|103| John|   1|   1|Engineering|
|104| Lisa|   2|   2|      Sales|
|101|Chloe|   3|   3|  Marketing|
|105| Evan|   3|   3|  Marketing|
|106|  Amy|   3|   3|  Marketing|
+---+-----+----+----+-----------+



In [None]:
# Left join - every row from the left side, plus the matching right-side
# columns (NULL where the right side has no match).
left_joined = employee.join(dpto, on=employee["dpto"] == dpto["dpto"], how="left")
left_joined.show()

+---+-----+----+----+-----------+
| id| name|dpto|dpto|   deptname|
+---+-----+----+----+-----------+
|107|Jimmy|   5|NULL|       NULL|
|102| Paul|   1|   1|Engineering|
|103| John|   1|   1|Engineering|
|101|Chloe|   3|   3|  Marketing|
|105| Evan|   3|   3|  Marketing|
|106|  Amy|   3|   3|  Marketing|
|104| Lisa|   2|   2|      Sales|
+---+-----+----+----+-----------+



In [None]:
# Right join - every row from the right side, plus the matching left-side
# columns (NULL where the left side has no match).
right_joined = employee.join(dpto, on=employee["dpto"] == dpto["dpto"], how="right")
right_joined.show()

+----+-----+----+----+-----------+
|  id| name|dpto|dpto|   deptname|
+----+-----+----+----+-----------+
| 103| John|   1|   1|Engineering|
| 102| Paul|   1|   1|Engineering|
| 106|  Amy|   3|   3|  Marketing|
| 105| Evan|   3|   3|  Marketing|
| 101|Chloe|   3|   3|  Marketing|
| 104| Lisa|   2|   2|      Sales|
|NULL| NULL|NULL|   4|    Finance|
+----+-----+----+----+-----------+



In [None]:
# Full join - every row from both sides; columns of the side without a match
# are filled with NULL.
full_joined = employee.join(dpto, on=employee["dpto"] == dpto["dpto"], how="full")
full_joined.show()

+----+-----+----+----+-----------+
|  id| name|dpto|dpto|   deptname|
+----+-----+----+----+-----------+
| 102| Paul|   1|   1|Engineering|
| 103| John|   1|   1|Engineering|
| 104| Lisa|   2|   2|      Sales|
| 101|Chloe|   3|   3|  Marketing|
| 105| Evan|   3|   3|  Marketing|
| 106|  Amy|   3|   3|  Marketing|
|NULL| NULL|NULL|   4|    Finance|
| 107|Jimmy|   5|NULL|       NULL|
+----+-----+----+----+-----------+



In [None]:
# Left anti join - rows from the left side that have no match on the right
employees_without_dept = employee.join(dpto, on=employee["dpto"] == dpto["dpto"], how="left_anti")
employees_without_dept.show()

# "Right" anti join - rows from the right side that have no match on the left,
# obtained by swapping the two dataframes and using left_anti again
depts_without_employees = dpto.join(employee, on=employee["dpto"] == dpto["dpto"], how="left_anti")
depts_without_employees.show()

+---+-----+----+
| id| name|dpto|
+---+-----+----+
|107|Jimmy|   5|
+---+-----+----+

+----+--------+
|dpto|deptname|
+----+--------+
|   4| Finance|
+----+--------+



### Using SQL

In [None]:
# Register both dataframes as temp views so they can be queried by name in SQL.
for view_name, frame in (("employee", employee), ("dpto", dpto)):
    frame.createOrReplaceTempView(view_name)

# Bring all the employees, with or without a department.
left_join_sql = "select * from employee left join dpto using (dpto)"
spark.sql(left_join_sql).show()

+----+---+-----+-----------+
|dpto| id| name|   deptname|
+----+---+-----+-----------+
|   5|107|Jimmy|       NULL|
|   1|102| Paul|Engineering|
|   1|103| John|Engineering|
|   3|101|Chloe|  Marketing|
|   3|105| Evan|  Marketing|
|   3|106|  Amy|  Marketing|
|   2|104| Lisa|      Sales|
+----+---+-----+-----------+



# Union / Minus / Intersect

In [8]:
# The "HR" dataset is the employee dataframe built earlier in the notebook.
employee_hr = employee

# The "ERP" dataset: Chloe and John repeat HR rows, Amy appears with a
# different dpto (1 instead of 3), and George, Anna and Carl are new.
# Note that ids are ints here, whereas the HR rows were created with string ids.
data = [
    (200, "George", 5),
    (201, "Anna", 5),
    (202, "Carl", 3),
    (101, "Chloe", 3),
    (103, "John", 1),
    (106, "Amy", 1),
]
employee_erp = spark.createDataFrame(data, schema=["id", "name", "dpto"])

# Print a title followed by the contents of each dataset.
for title, frame in (("HR database", employee_hr), ("ERP database", employee_erp)):
    print(title)
    frame.show()

HR database
+---+-----+----+
| id| name|dpto|
+---+-----+----+
|101|Chloe|   3|
|102| Paul|   1|
|103| John|   1|
|104| Lisa|   2|
|105| Evan|   3|
|106|  Amy|   3|
|107|Jimmy|   5|
+---+-----+----+

ERP database
+---+------+----+
| id|  name|dpto|
+---+------+----+
|200|George|   5|
|201|  Anna|   5|
|202|  Carl|   3|
|101| Chloe|   3|
|103|  John|   1|
|106|   Amy|   1|
+---+------+----+



In [18]:
# union / unionByName
# Stack the rows of both dataframes; rows present in both are kept twice.
# (union / unionAll are the alternatives to unionByName.)
print("Combine both dataframes")
combined = employee_hr.unionByName(employee_erp)
combined.show()

# intersect - only the rows that appear in both dataframes.
print("Get values that are common in both dataframes")
in_both = employee_hr.intersect(employee_erp)
in_both.show()

# exceptAll - rows of the first dataframe that are not in the second one.
print("Get only the difference - does not exist on the second dataframe")
only_in_hr = employee_hr.exceptAll(employee_erp)
only_in_hr.show()

Combine both dataframes
+---+------+----+
| id|  name|dpto|
+---+------+----+
|101| Chloe|   3|
|102|  Paul|   1|
|103|  John|   1|
|104|  Lisa|   2|
|105|  Evan|   3|
|106|   Amy|   3|
|107| Jimmy|   5|
|200|George|   5|
|201|  Anna|   5|
|202|  Carl|   3|
|101| Chloe|   3|
|103|  John|   1|
|106|   Amy|   1|
+---+------+----+

Get values that are common in both dataframes
+---+-----+----+
| id| name|dpto|
+---+-----+----+
|101|Chloe|   3|
|103| John|   1|
+---+-----+----+

Get only the difference - does not exist on the second dataframe
+---+-----+----+
| id| name|dpto|
+---+-----+----+
|106|  Amy|   3|
|105| Evan|   3|
|104| Lisa|   2|
|102| Paul|   1|
|107|Jimmy|   5|
+---+-----+----+



# Questions

In [None]:
# Q1
# Implement a Cartesian product (cross join) using both the DataFrame API and SQL
# Use the employee and dpto dataframes

In [None]:
# Q2
# Implement a "left anti join" using SQL
# Use the employee and dpto dataframes