In [1]:
from pyspark.sql import SparkSession

# Configure the session step by step, then create it (or reuse one that is
# already running in this kernel).
session_builder = SparkSession.builder
session_builder = session_builder.appName("Reading from sockets").master("local[*]")
session_builder = session_builder.config("spark.executor.memory", "512M")
spark = session_builder.getOrCreate()

# Leave the session as the cell's last expression so the notebook renders it.
spark

In [3]:
# Read the employee CSV: the first line is the header and the column types
# are inferred from the data.
df = (
    spark.read
    .format("csv")
    .options(inferSchema=True, header=True)
    .load("/home/jovyan/data/employee_records.csv")
)

In [7]:
'''
    I want to transform department_id into a department name.
        1. Option 1 is to join df with a department dataframe, but the join will trigger a shuffle.
        2. Option 2 is to keep a map of ids to names in a variable and apply it with a map or udf function.
                    Option 2 has to serialize and deserialize the data in order to execute outside of Spark.

'''

# Option 3 is to use a broadcast variable.
# Broadcast variables are sent to each executor and can then be used inside Spark code.

# Lookup table from department id (1 through 10) to its display name,
# e.g. 3 -> "Dept 3".
dep_names = {dept_id: f"Dept {dept_id}" for dept_id in range(1, 11)}

# Wrap the lookup dictionary in a broadcast variable created from the
# session's SparkContext.
spark_context = spark.sparkContext
broad_dep_names = spark_context.broadcast(dep_names)

In [12]:
# Show the dictionary carried by the broadcast handle, then the type of the
# handle itself (rendered as the cell's result).
broadcast_payload = broad_dep_names.value
display(broadcast_payload)
type(broad_dep_names)

{1: 'Dept 1',
 2: 'Dept 2',
 3: 'Dept 3',
 4: 'Dept 4',
 5: 'Dept 5',
 6: 'Dept 6',
 7: 'Dept 7',
 8: 'Dept 8',
 9: 'Dept 9',
 10: 'Dept 10'}

pyspark.broadcast.Broadcast

In [17]:
from pyspark.sql.functions import udf,col

# Broadcast variables can be read from inside a UDF through `.value`.

@udf
def get_dep_names(dept_id):
    """Look up the department name for ``dept_id`` in the broadcast dictionary.

    Returns ``None`` when ``dept_id`` is not one of the dictionary's keys.
    No return type is passed to ``udf``, so its default applies.
    """
    names_by_id = broad_dep_names.value
    return names_by_id.get(dept_id)


# Add the looked-up name as a new column and print a sample of the result.
(
    df
    .withColumn("dept_name", get_dep_names(col("department_id")))
    .show()
)

+----------+----------+--------------------+-------------------+--------------------+--------------------+------+-------------+---------+
|first_name| last_name|           job_title|                dob|               email|               phone|salary|department_id|dept_name|
+----------+----------+--------------------+-------------------+--------------------+--------------------+------+-------------+---------+
|   Richard|  Morrison|Public relations ...|1973-05-05 00:00:00|melissagarcia@exa...|       (699)525-4827|512653|            8|   Dept 8|
|     Bobby|  Mccarthy|   Barrister's clerk|1974-04-25 00:00:00|   llara@example.net|  (750)846-1602x7458|999836|            7|   Dept 7|
|    Dennis|    Norman|Land/geomatics su...|1990-06-24 00:00:00| jturner@example.net|    873.820.0518x825|131900|           10|  Dept 10|
|      John|    Monroe|        Retail buyer|1968-06-16 00:00:00|  erik33@example.net|    820-813-0557x624|485506|            1|   Dept 1|
|  Michelle|   Elliott|      Air c

In [20]:
'''
    Problem: Calculate the total salary of all employees in department 6.
             For this problem each executor calculates the sum over its own data, and the partial results must then be sent to one executor to compute the total.

'''

# With an accumulator, each executor submits its partial result to the accumulator instead, so we avoid the shuffle

# Import the functions module under an alias instead of `import sum`:
# importing the bare name would shadow Python's built-in `sum` for every
# cell that runs afterwards in this kernel.
from pyspark.sql import functions as F

# Baseline solution: a grouped aggregation. The per-executor partial sums
# have to be shuffled so they can be merged into the final total.
(
    df
    .where("department_id == 6")
    .groupBy("department_id")
    .agg(F.sum("salary").cast("long"))
    .show()
)

+-------------+---------------------------+
|department_id|CAST(sum(salary) AS BIGINT)|
+-------------+---------------------------+
|            6|                50294510721|
+-------------+---------------------------+



In [23]:
# using accumulators

# Create the accumulator in this cell so that every run of the cell starts
# again from 0 instead of adding on top of a previous run's total.
dep_salary_accum = spark.sparkContext.accumulator(0)


def calculate_dep_salary(department, salary):
    """Add ``salary`` to ``dep_salary_accum`` when the row is in department 6.

    Args:
        department: value of the row's ``department_id`` column.
        salary: value of the row's ``salary`` column; may be ``None``.

    Rows with a missing salary are skipped. Adding ``None`` to the integer
    accumulator would fail the task, and skipping keeps the total in line
    with the ``sum("salary")`` aggregation, which ignores nulls.
    """
    if department == 6 and salary is not None:
        dep_salary_accum.add(salary)


# Run the function once per row on the executors. `foreach` is an action,
# so the accumulator updates are applied when this line executes.
df.foreach(lambda row: calculate_dep_salary(row.department_id, row.salary))

# Read the merged total back on the driver (rendered as the cell's result).
dep_salary_accum.value

50294510721