# NOTEBOOK 3.4 Spark File I/O and User-Defined Functions (UDFs)

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Obtain the notebook's Spark session: getOrCreate() hands back an existing
# session when there is one and builds a new one otherwise.
spark = (
    SparkSession.builder
    .getOrCreate()
)

25/06/12 09:45:50 WARN Utils: Your hostname, PC25. resolves to a loopback address: 127.0.1.1; using 192.168.76.195 instead (on interface eth0)
25/06/12 09:45:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/12 09:45:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Load the tab-separated sales file; the first row supplies the column names.
sales_df = spark.read.csv("data/sales.csv", sep="\t", header=True)

# Preview the first three rows
sales_df.show(3)

+----+-----------+----------+--------+
|code|description|unit_price|quantity|
+----+-----------+----------+--------+
|1005|        pen|       2.5|       4|
|1007|     pencil|       1.0|      10|
|1001|   notebook|       5.0|       2|
+----+-----------+----------+--------+
only showing top 3 rows



# 1. Spark File I/O
- The Hadoop installation in the WSL distro has configured HDFS as the default file system.
- To access files in WSL's local file system, the file path must start with **"file://"**.
- If the files do not contain a header row, you can omit the `option("header", "true")` setting.

## 1.1 Write Data in Different File Formats to HDFS

### Instruction:
Before running the statements below, create a directory named "temp" in /user/student.

In [3]:
# Every write below uses mode("overwrite") so this cell can be re-run safely:
# with the default save mode Spark raises an error when the target path
# already exists.

# Writing as a CSV file
# NOTE: .csv() is required here. A bare .save() writes with Spark's default
# data source (spark.sql.sources.default, parquet unless reconfigured), so the
# output would not be CSV and the "header" option would have no effect.
sales_df.write.mode("overwrite").option("header", "true").csv("temp/sales.csv")

# Writing as a parquet file
sales_df.write.mode("overwrite").parquet('temp/sales.parquet')

# Writing as a JSON file
sales_df.write.mode("overwrite").json("temp/sales.json")

# Writing as an ORC file
sales_df.write.mode("overwrite").orc('temp/sales.orc')

## 1.2 Read Data with Different File Formats from HDFS

In [4]:
# Load the parquet output back through the generic format/load API and display it
spark.read.format("parquet").load('temp/sales.parquet').show()

+----+-----------+----------+--------+
|code|description|unit_price|quantity|
+----+-----------+----------+--------+
|1005|        pen|       2.5|       4|
|1007|     pencil|       1.0|      10|
|1001|   notebook|       5.0|       2|
|1003|      ruler|       1.0|       1|
|1002| calculator|      55.0|       1|
+----+-----------+----------+--------+



In [5]:
# Load the JSON output back through the generic format/load API and display it
spark.read.format("json").load('temp/sales.json').show()

+----+-----------+--------+----------+
|code|description|quantity|unit_price|
+----+-----------+--------+----------+
|1005|        pen|       4|       2.5|
|1007|     pencil|      10|       1.0|
|1001|   notebook|       2|       5.0|
|1003|      ruler|       1|       1.0|
|1002| calculator|       1|      55.0|
+----+-----------+--------+----------+



In [6]:
# Load the ORC output back through the generic format/load API and display it
spark.read.format("orc").load('temp/sales.orc').show()

+----+-----------+----------+--------+
|code|description|unit_price|quantity|
+----+-----------+----------+--------+
|1005|        pen|       2.5|       4|
|1007|     pencil|       1.0|      10|
|1001|   notebook|       5.0|       2|
|1003|      ruler|       1.0|       1|
|1002| calculator|      55.0|       1|
+----+-----------+----------+--------+



# 2. Spark User-Defined Functions (UDFs)

In [7]:
# Inspect the column types Spark assigned to the sales data read from CSV
sales_df.printSchema()

root
 |-- code: string (nullable = true)
 |-- description: string (nullable = true)
 |-- unit_price: string (nullable = true)
 |-- quantity: string (nullable = true)



In [8]:
# Cast columns to appropriate types
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType, IntegerType

# Convert the numeric columns from strings to a double and an integer
sales_df = (
    sales_df
    .withColumn('unit_price', col('unit_price').cast("double"))
    .withColumn('quantity', col('quantity').cast("int"))
)

# Confirm the new column types
sales_df.printSchema()

root
 |-- code: string (nullable = true)
 |-- description: string (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- quantity: integer (nullable = true)



In [9]:
# Load the tab-separated discount file; the first row supplies the column names
discount_df = spark.read.csv("data/discounts.csv", sep="\t", header=True)

# Expose the data to Spark SQL as a temporary view, then re-select it with
# discount_perc converted to a double
discount_df.createOrReplaceTempView('DiscountData')
discount_df = spark.sql("SELECT item_code, DOUBLE(discount_perc) from DiscountData")

# Show the resulting schema and rows
discount_df.printSchema()
discount_df.show()

root
 |-- item_code: string (nullable = true)
 |-- discount_perc: double (nullable = true)

+---------+-------------+
|item_code|discount_perc|
+---------+-------------+
|     1005|         20.0|
|     1007|         10.0|
|     1001|         50.0|
|     1003|         15.0|
|     1002|         10.0|
+---------+-------------+



In [10]:
# Inner-join each sale with its discount by matching code against item_code
sales_df = sales_df.join(
    discount_df,
    on=sales_df["code"] == discount_df["item_code"],
    how="inner",
)
sales_df.show()

+----+-----------+----------+--------+---------+-------------+
|code|description|unit_price|quantity|item_code|discount_perc|
+----+-----------+----------+--------+---------+-------------+
|1005|        pen|       2.5|       4|     1005|         20.0|
|1007|     pencil|       1.0|      10|     1007|         10.0|
|1001|   notebook|       5.0|       2|     1001|         50.0|
|1003|      ruler|       1.0|       1|     1003|         15.0|
|1002| calculator|      55.0|       1|     1002|         10.0|
+----+-----------+----------+--------+---------+-------------+



In [11]:
# Drop item_code column: the join matched it against code, so it only
# duplicates the values already held in code
sales_df = sales_df.drop('item_code')
sales_df.show()

+----+-----------+----------+--------+-------------+
|code|description|unit_price|quantity|discount_perc|
+----+-----------+----------+--------+-------------+
|1005|        pen|       2.5|       4|         20.0|
|1007|     pencil|       1.0|      10|         10.0|
|1001|   notebook|       5.0|       2|         50.0|
|1003|      ruler|       1.0|       1|         15.0|
|1002| calculator|      55.0|       1|         10.0|
+----+-----------+----------+--------+-------------+



In [12]:
# Give the code column the more descriptive name product_code
sales_df = sales_df.withColumnRenamed(existing='code', new='product_code')
sales_df.show()

+------------+-----------+----------+--------+-------------+
|product_code|description|unit_price|quantity|discount_perc|
+------------+-----------+----------+--------+-------------+
|        1005|        pen|       2.5|       4|         20.0|
|        1007|     pencil|       1.0|      10|         10.0|
|        1001|   notebook|       5.0|       2|         50.0|
|        1003|      ruler|       1.0|       1|         15.0|
|        1002| calculator|      55.0|       1|         10.0|
+------------+-----------+----------+--------+-------------+



## 2.0 Using a PySpark built-in function

In [13]:
from pyspark.sql.functions import upper

# Display (without saving) a copy of the data with an upper-cased description,
# produced by the built-in upper() function
sales_df.withColumn(
    'uppercased_description',
    upper(sales_df['description']),
).show()

+------------+-----------+----------+--------+-------------+----------------------+
|product_code|description|unit_price|quantity|discount_perc|uppercased_description|
+------------+-----------+----------+--------+-------------+----------------------+
|        1005|        pen|       2.5|       4|         20.0|                   PEN|
|        1007|     pencil|       1.0|      10|         10.0|                PENCIL|
|        1001|   notebook|       5.0|       2|         50.0|              NOTEBOOK|
|        1003|      ruler|       1.0|       1|         15.0|                 RULER|
|        1002| calculator|      55.0|       1|         10.0|            CALCULATOR|
+------------+-----------+----------+--------+-------------+----------------------+



## 2.1 UDF: From a Lambda Expression

In [14]:
# Define a UDF that upper-cases a string.
# A SQL NULL reaches the Python function as None, and None.upper() would raise
# AttributeError and fail the job, so None is passed through unchanged.
capitalize_udf = udf(lambda x: x.upper() if x is not None else None, StringType())

In [15]:
# Run the UDF over the description column and store the result in a new column
sales_df = sales_df.withColumn(
    "capitalized_name",
    capitalize_udf(sales_df["description"]),
)

# Show the resulting DataFrame
sales_df.show()

+------------+-----------+----------+--------+-------------+----------------+
|product_code|description|unit_price|quantity|discount_perc|capitalized_name|
+------------+-----------+----------+--------+-------------+----------------+
|        1005|        pen|       2.5|       4|         20.0|             PEN|
|        1007|     pencil|       1.0|      10|         10.0|          PENCIL|
|        1001|   notebook|       5.0|       2|         50.0|        NOTEBOOK|
|        1003|      ruler|       1.0|       1|         15.0|           RULER|
|        1002| calculator|      55.0|       1|         10.0|      CALCULATOR|
+------------+-----------+----------+--------+-------------+----------------+



## 2.2 UDF: Registering an existing function as a UDF

In [16]:
from pyspark.sql.functions import udf

def calculate_price(unit_price, quantity):
    """Return the total price for a line item (unit_price * quantity).

    Args:
        unit_price: Price of a single unit, or None.
        quantity: Number of units, or None.

    Returns:
        The product as a float, or None when either argument is None.
    """
    # When this runs as a UDF, SQL NULLs arrive as None; multiplying None
    # would raise TypeError, so a missing operand yields a missing result.
    if unit_price is None or quantity is None:
        return None
    # The notebook registers this function with a DoubleType return type, so
    # always hand back a float, even when both inputs are integers.
    return float(unit_price * quantity)

In [17]:
# Wrap calculate_price as a Spark UDF whose result column is a double
calculate_price_udf = udf(f=calculate_price, returnType=DoubleType())

In [18]:
# Total price before discount: the UDF multiplies unit_price by quantity
sales_df = sales_df.withColumn(
    "original_total",
    calculate_price_udf(sales_df["unit_price"], sales_df["quantity"]),
)
sales_df.show()

+------------+-----------+----------+--------+-------------+----------------+--------------+
|product_code|description|unit_price|quantity|discount_perc|capitalized_name|original_total|
+------------+-----------+----------+--------+-------------+----------------+--------------+
|        1005|        pen|       2.5|       4|         20.0|             PEN|          10.0|
|        1007|     pencil|       1.0|      10|         10.0|          PENCIL|          10.0|
|        1001|   notebook|       5.0|       2|         50.0|        NOTEBOOK|          10.0|
|        1003|      ruler|       1.0|       1|         15.0|           RULER|           1.0|
|        1002| calculator|      55.0|       1|         10.0|      CALCULATOR|          55.0|
+------------+-----------+----------+--------+-------------+----------------+--------------+



## 2.3 UDF: Using a UDF created using annotations

### IMPORTANT: Ensure that the de_classes subfolder exists in your project folder.
- To use this approach, the methods must be static methods (as indicated by the **@staticmethod** decorator).
- Refer to the **SalesProcessor** class in sales_processor.py: this class contains 2 UDFs that were created using the **@udf** decorator.



In [19]:
# Add a file to be downloaded with the Spark job on every node.
# The path is relative, so de_classes/sales_processor.py must exist under the
# notebook's working directory.
sc = spark.sparkContext
sc.addFile("de_classes/sales_processor.py")

# Import the SalesProcessor class
# NOTE(review): this import is placed after addFile, presumably because addFile
# is what makes sales_processor importable as a top-level module here -- confirm
# before reordering these statements.
from sales_processor import SalesProcessor

In [20]:
# Derive the per-unit price after discount with the SalesProcessor UDF,
# passing it the unit_price and discount_perc columns
sales_df = sales_df.withColumn(
    "discounted_unit_price",
    SalesProcessor.calculate_discounted_price('unit_price', 'discount_perc'),
)
sales_df.show()

+------------+-----------+----------+--------+-------------+----------------+--------------+---------------------+
|product_code|description|unit_price|quantity|discount_perc|capitalized_name|original_total|discounted_unit_price|
+------------+-----------+----------+--------+-------------+----------------+--------------+---------------------+
|        1005|        pen|       2.5|       4|         20.0|             PEN|          10.0|                  2.0|
|        1007|     pencil|       1.0|      10|         10.0|          PENCIL|          10.0|                  0.9|
|        1001|   notebook|       5.0|       2|         50.0|        NOTEBOOK|          10.0|                  2.5|
|        1003|      ruler|       1.0|       1|         15.0|           RULER|           1.0|                 0.85|
|        1002| calculator|      55.0|       1|         10.0|      CALCULATOR|          55.0|                 49.5|
+------------+-----------+----------+--------+-------------+----------------+--------------+---------------------+

In [21]:
# Total price after discount: reuse the price UDF with the discounted unit price
sales_df = sales_df.withColumn(
    "discounted_total",
    calculate_price_udf(sales_df["discounted_unit_price"], sales_df["quantity"]),
)
sales_df.show()

+------------+-----------+----------+--------+-------------+----------------+--------------+---------------------+----------------+
|product_code|description|unit_price|quantity|discount_perc|capitalized_name|original_total|discounted_unit_price|discounted_total|
+------------+-----------+----------+--------+-------------+----------------+--------------+---------------------+----------------+
|        1005|        pen|       2.5|       4|         20.0|             PEN|          10.0|                  2.0|             8.0|
|        1007|     pencil|       1.0|      10|         10.0|          PENCIL|          10.0|                  0.9|             9.0|
|        1001|   notebook|       5.0|       2|         50.0|        NOTEBOOK|          10.0|                  2.5|             5.0|
|        1003|      ruler|       1.0|       1|         15.0|           RULER|           1.0|                 0.85|            0.85|
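|        1002| calculator|      55.0|       1|         10.0|      CALCULATOR|          55.0|                 49.5|            49.5|
+------------+-----------+----------+--------+-------------+----------------+--------------+---------------------+----------------+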

In [22]:
# Keep only the columns needed for the summary view
sales_summary_df = sales_df.select(
    'product_code',
    'description',
    'unit_price',
    'quantity',
    'discount_perc',
    'discounted_total',
)
sales_summary_df.show()

+------------+-----------+----------+--------+-------------+----------------+
|product_code|description|unit_price|quantity|discount_perc|discounted_total|
+------------+-----------+----------+--------+-------------+----------------+
|        1005|        pen|       2.5|       4|         20.0|             8.0|
|        1007|     pencil|       1.0|      10|         10.0|             9.0|
|        1001|   notebook|       5.0|       2|         50.0|             5.0|
|        1003|      ruler|       1.0|       1|         15.0|            0.85|
|        1002| calculator|      55.0|       1|         10.0|            49.5|
+------------+-----------+----------+--------+-------------+----------------+



In [23]:
# Replace both price columns with their formatted versions, produced by the
# SalesProcessor.format_price UDF
sales_summary_df = (
    sales_summary_df
    .withColumn('unit_price', SalesProcessor.format_price('unit_price'))
    .withColumn('discounted_total', SalesProcessor.format_price('discounted_total'))
)
sales_summary_df.show()

+------------+-----------+----------+--------+-------------+----------------+
|product_code|description|unit_price|quantity|discount_perc|discounted_total|
+------------+-----------+----------+--------+-------------+----------------+
|        1005|        pen|    RM2.50|       4|         20.0|          RM8.00|
|        1007|     pencil|    RM1.00|      10|         10.0|          RM9.00|
|        1001|   notebook|    RM5.00|       2|         50.0|          RM5.00|
|        1003|      ruler|    RM1.00|       1|         15.0|          RM0.85|
|        1002| calculator|   RM55.00|       1|         10.0|         RM49.50|
+------------+-----------+----------+--------+-------------+----------------+



In [24]:
# Stop the Spark session now that the notebook has finished
spark.stop()