In [4]:
import pyspark
import random
# Reuse the already-running SparkContext when this cell is executed again
# (a second SparkContext(...) call in the same kernel raises ValueError
# because only one context may be active); otherwise create a new one
# with the same application name as before.
# NOTE(review): the app name "Pi" looks like a leftover from another example - confirm before renaming.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("Pi"))

In [5]:
# Employee records stored as (key, value) pairs:
#   key   -> DepartmentId
#   value -> (EmployeeId, EmployeeName), where EmployeeId is the "primary key"
listEmployees = [
    (1, (101, 'John')),
    (2, (106, 'Amy')),
    (3, (103, 'Paul')),
    (4, (104, 'Evan')),
    (5, (105, 'Chloe')),
]

In [6]:
# Department records stored as (key, value) pairs:
#   key   -> DepartmentId (the "primary key")
#   value -> DepartmentName
listDepartments = [
    (100, 'Other'),
    (3, 'Engineering'),
    (2, 'Sales'),
    (1, 'Marketing'),
]

In [7]:
# Turn the local employee list into an RDD of
# (DepartmentId, (EmployeeId, EmployeeName)) pairs.
employeesRDD = sc.parallelize(listEmployees)

In [8]:
# Turn the local department list into an RDD of
# (DepartmentId, DepartmentName) pairs.
departmentsRDD = sc.parallelize(listDepartments)

In [9]:
# Inner join / natural join on the key (DepartmentId).
# Only keys present in BOTH RDDs are kept; each result element has the form
# (key, (left_value, right_value)), i.e.
# (DepartmentId, ((EmployeeId, EmployeeName), DepartmentName)).
resRDD = employeesRDD.join(departmentsRDD)

# Bring the (small) result back to the driver so the notebook displays it.
resRDD.collect()

[(1, ((101, 'John'), 'Marketing')),
 (2, ((106, 'Amy'), 'Sales')),
 (3, ((103, 'Paul'), 'Engineering'))]

In [10]:
# Left outer join on the key (DepartmentId).
# Every employee pair is kept; when no department has the same key,
# the right-hand component of the value is None.
resRDD = employeesRDD.leftOuterJoin(departmentsRDD)

# Bring the (small) result back to the driver so the notebook displays it.
resRDD.collect()

[(1, ((101, 'John'), 'Marketing')),
 (2, ((106, 'Amy'), 'Sales')),
 (3, ((103, 'Paul'), 'Engineering')),
 (4, ((104, 'Evan'), None)),
 (5, ((105, 'Chloe'), None))]

In [7]:
# Right outer join on the key (DepartmentId).
# Every department pair is kept; when no employee has the same key,
# the left-hand component of the value is None.
resRDD = employeesRDD.rightOuterJoin(departmentsRDD)

# Bring the (small) result back to the driver so the notebook displays it.
resRDD.collect()

[(1, ((101, 'John'), 'Marketing')),
 (2, ((106, 'Amy'), 'Sales')),
 (3, ((103, 'Paul'), 'Engineering')),
 (100, (None, 'Other'))]

In [8]:
# Full outer join on the key (DepartmentId).
# Keys from either RDD are kept; the side that has no matching key
# contributes None to the value pair.
resRDD = employeesRDD.fullOuterJoin(departmentsRDD)

# Bring the (small) result back to the driver so the notebook displays it.
resRDD.collect()

[(1, ((101, 'John'), 'Marketing')),
 (2, ((106, 'Amy'), 'Sales')),
 (3, ((103, 'Paul'), 'Engineering')),
 (4, ((104, 'Evan'), None)),
 (100, (None, 'Other')),
 (5, ((105, 'Chloe'), None))]

In [9]:
# NOT IN implemented with map + left outer join + filter:
# select the employees whose DepartmentId does not occur in departmentsRDD.

# Step 1: replace every department value with a constant, non-None marker,
# so a None on the right side of the join can only mean "key not found".
removeSetRDD = departmentsRDD.map(lambda dept: (dept[0], "RightTable"))

# Step 2: keep every employee and attach the marker where the key matches.
leftOuterResRDD = employeesRDD.leftOuterJoin(removeSetRDD)

# Step 3: retain the unmatched rows and strip the (None) right-hand component,
# leaving the original (DepartmentId, (EmployeeId, EmployeeName)) pairs.
resRDD = (
    leftOuterResRDD
    .filter(lambda kv: kv[1][1] is None)
    .mapValues(lambda joined: joined[0])
)

resRDD.collect()

[(4, (104, 'Evan')), (5, (105, 'Chloe'))]

In [10]:
# NOT IN implemented with subtractByKey:
# keep each employee pair whose key (DepartmentId) has no matching key
# in departmentsRDD. Only keys are compared; values play no role.
# This implementation always returns the correct/expected result
# NOTE(review): presumably in contrast to a left outer join + `is None`
# filter, which needs the right-side values replaced by a non-None marker
# first to be reliable - confirm the intended comparison.
resRDD = employeesRDD.subtractByKey(departmentsRDD)

# Bring the (small) result back to the driver so the notebook displays it.
resRDD.collect()

[(4, (104, 'Evan')), (5, (105, 'Chloe'))]