# Chapter 5 - User Defined Functions
Christoph Windheuser    
May, 2022   
Python examples for Chapter 5 of the book *Learning Spark*


In [1]:
# Import required python spark libraries
import findspark
import pyspark

from pyspark.conf import SparkConf
from pyspark.context import SparkContext

from pyspark.sql.types import *
from pyspark.sql.functions import col, expr, when, concat, lit, avg, desc
from pyspark.sql import SparkSession
from pyspark.sql import Row


In [2]:
# Connect Jupyter Notebook with the Spark application and create Spark Context.
# findspark.init() makes the local Spark installation importable from this kernel.
findspark.init()
# getOrCreate() reuses an already running SparkContext instead of raising
# "ValueError: Cannot run multiple SparkContexts at once" when this cell is
# executed a second time in the same kernel.
sc = SparkContext.getOrCreate(conf=SparkConf().setAppName("chapter_5"))


In [3]:
# Create (or fetch the already existing) SparkSession with Hive support enabled.
# The surrounding parentheses already allow the chain to span several lines,
# so no backslash continuations are needed.
spark = (
    SparkSession.builder
    .appName("Chapter_5_Examples")
    .config("spark.sql.catalogImplementation", "hive")
    .enableHiveSupport()
    .getOrCreate()
)


# User-Defined Functions
Page 114 ff.

## Spark SQL UDFs

In [None]:
# Define cubed function:
def cubed(s):
    """Return the cube of ``s`` (``s * s * s``).

    The function is null-aware: when it is used as a Spark SQL UDF, a SQL
    NULL is handed to Python as ``None``. In that case ``None`` is returned
    instead of raising ``TypeError`` on ``None * None``.

    Args:
        s: A number, or ``None``.

    Returns:
        ``s`` multiplied by itself three times, or ``None`` if ``s`` is ``None``.
    """
    if s is None:
        return None
    return s * s * s


In [None]:
# Register the Python function `cubed` as a Spark SQL UDF callable as cubed(...),
# declaring its return type as LongType.
spark.udf.register(name="cubed", f=cubed, returnType=LongType())


In [None]:
# Build a one-column DataFrame of ids and expose it to SQL as the
# temporary view "udf_test" (replacing any view of the same name).
(
    spark.range(start=1, end=9)
    .createOrReplaceTempView("udf_test")
)

In [None]:
# Display at most 10 rows of the temporary view.
spark.sql("SELECT * FROM udf_test").show(n=10)

In [None]:
# Call the SQL UDF cubed() on every id of the view and display the result.
(
    spark.sql("SELECT id, cubed(id) AS id_cubed FROM udf_test")
    .show()
)

## Pandas UDFs (`pandas_udf`)
Page 115 ff.

In [4]:
import pandas as pd
from pyspark.sql.functions import pandas_udf


In [5]:
# Define a pandas function cubed
def cubed(a: pd.Series) -> pd.Series:
    """Cube every element of a pandas Series.

    Args:
        a: Series of numeric values.

    Returns:
        A Series holding ``a * a * a`` computed element-wise.
    """
    squared = a * a
    return squared * a


In [6]:
# Wrap the pandas function `cubed` as a pandas UDF whose result column is LongType.
cubed_udf = pandas_udf(f=cubed, returnType=LongType())


In [7]:
# Build a small pandas Series for trying out the function locally (without Spark)
x = pd.Series(data=[0, 1, 2, 3])
print(x)

0    0
1    1
2    2
3    3
dtype: int64


In [8]:
# Apply cubed directly to the pandas Series (plain pandas, no Spark involved)
print(cubed(x))


0     0
1     1
2     8
3    27
dtype: int64


In [11]:
# Create a Spark DataFrame with a single `id` column:
df = spark.range(start=0, end=4)

In [12]:
# Print the rows of df as a text table
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
+---+



In [13]:
# Execute the cubed function as a vectorized pandas UDF on the `id` column
# and display it next to the original ids.
(
    df
    .select("id", cubed_udf(col("id")))
    .show()
)


+---+---------+
| id|cubed(id)|
+---+---------+
|  0|        0|
|  1|        1|
|  2|        8|
|  3|       27|
+---+---------+

