# 06 User Defined Functions (UDFs)

- Define a UDF
- Apply the UDF to the values of a column of a dataframe

In [1]:
# 1. Create a spark session

from pyspark.sql import SparkSession

# Obtain the session through the builder: an existing session is returned
# if there is one, otherwise a new one is created.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [2]:
# 2. Create a dataframe from a python list and a schema

from datetime import date

# Column names and their Spark SQL types, written as a DDL-style string.
students_schema = 'id long, name string, enrolment_date date, gpa double'

# One tuple per student, with values in the same order as the schema fields.
students_data = [
    (1, "Nikos Zikos", date(2025, 9, 1), 7.7),
    (2, "Maria Pappa", date(2025, 9, 2), 6.7),
    (3, "Petros Kokkinos", date(2025, 9, 1), 4.6),
]

students_df = spark.createDataFrame(students_data, schema=students_schema)
students_df.show()

+---+---------------+--------------+---+
| id|           name|enrolment_date|gpa|
+---+---------------+--------------+---+
|  1|    Nikos Zikos|    2025-09-01|7.7|
|  2|    Maria Pappa|    2025-09-02|6.7|
|  3|Petros Kokkinos|    2025-09-01|4.6|
+---+---------------+--------------+---+



In [None]:
# 3. Create a UDF that, when applied to a float value, returns PASS if the value is >= 5 and FAIL otherwise

from pyspark.sql.functions import udf

@udf(returnType="string")
def pass_or_fail_udf(status: float):
    """Classify a numeric grade as 'PASS' or 'FAIL'.

    Args:
        status: The grade to classify. May be None when the source column
            holds a null for that row.

    Returns:
        'PASS' when the grade is greater than or equal to 5, 'FAIL' when it
        is lower, and None (a null in the resulting column) when the input
        is None.
    """
    # A null column value arrives here as None; comparing None with a number
    # raises TypeError, so propagate the null instead of failing the job.
    if status is None:
        return None
    return 'PASS' if status >= 5 else 'FAIL'

# Add the PASS/FAIL label computed from the gpa column, then display the
# result without the raw gpa values.
result_df = students_df.withColumn(
    colName="gpa_status",
    col=pass_or_fail_udf(students_df["gpa"]),
)
result_df.drop("gpa").show()

+---+---------------+--------------+----------+
| id|           name|enrolment_date|gpa_status|
+---+---------------+--------------+----------+
|  1|    Nikos Zikos|    2025-09-01|      PASS|
|  2|    Maria Pappa|    2025-09-02|      PASS|
|  3|Petros Kokkinos|    2025-09-01|      FAIL|
+---+---------------+--------------+----------+

