In [4]:
import pyspark
from pyspark.sql import SparkSession
# Obtain a SparkSession via the builder's getOrCreate().
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Keep a handle on the session's SparkContext; it is used to create broadcast variables.
sc = spark.sparkContext

# Last expression of the cell: display the session object.
spark

In [5]:
# Lookup table: one-letter status code -> descriptive label.
status_map = dict(M="Married", S="Single")

# Sample records laid out as (id, name, status code, marks).
data = [
    (1, "Kim", "M", 74),
    (2, "Jan", "S", 93),
    (3, "Dawn", "S", 60),
    (4, "Lee", "M", 46),
    (5, "Peter", "M", 60),
    (6, "George", "S", 80),
]

In [7]:
# Wrap the status lookup table in a Spark broadcast variable; the dictionary
# is later read back through `broadcast_status.value`.
broadcast_status = sc.broadcast(status_map)

# Build the sample DataFrame. Only column names are supplied, so Spark infers
# the column types from the Python tuples.
df = spark.createDataFrame(
    data,
    schema=["id", "name", "status", "marks"],
)

# Inspect the inferred schema and the rows.
df.printSchema()
df.show()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- status: string (nullable = true)
 |-- marks: long (nullable = true)

+---+------+------+-----+
| id|  name|status|marks|
+---+------+------+-----+
|  1|   Kim|     M|   74|
|  2|   Jan|     S|   93|
|  3|  Dawn|     S|   60|
|  4|   Lee|     M|   46|
|  5| Peter|     M|   60|
|  6|George|     S|   80|
+---+------+------+-----+



In [8]:
# Sanity check: look up the code "M" in the dictionary held by the broadcast variable.
broadcast_status.value["M"]

'Married'

In [9]:
from pyspark.sql.functions import col, udf
 

def convert_status(char):
    """Translate a one-letter status code into its descriptive label.

    The lookup table is read from the ``broadcast_status`` broadcast variable.

    Args:
        char: Status code to translate (e.g. ``"M"``). May be ``None`` when
            the source cell is null.

    Returns:
        The label mapped to ``char``, or ``None`` when the code is null or
        has no entry in the lookup table.
    """
    # Use .get() rather than [] so a null or unmapped code yields None
    # instead of raising KeyError inside the UDF.
    return broadcast_status.value.get(char)
 
# Wrap the Python function as a Spark UDF. From here on the name
# `convert_status` refers to the UDF, not the plain function.
convert_status = udf(convert_status)

# Add a `status_info` column computed by applying the UDF to `status`.
df_2 = df.withColumn(
    "status_info",
    convert_status(col("status")),
)
df_2.show()

+---+------+------+-----+-----------+
| id|  name|status|marks|status_info|
+---+------+------+-----+-----------+
|  1|   Kim|     M|   74|    Married|
|  2|   Jan|     S|   93|     Single|
|  3|  Dawn|     S|   60|     Single|
|  4|   Lee|     M|   46|    Married|
|  5| Peter|     M|   60|    Married|
|  6|George|     S|   80|     Single|
+---+------+------+-----+-----------+

