In [1]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook (or reuse one that already
# exists), under the application name "udf".
spark = (
    SparkSession.builder
    .appName("udf")
    .getOrCreate()
)

In [2]:
# Sample rows, one tuple per product, laid out as (product, date, price).
# Dates are kept as "YYYY-MM-DD" strings and prices as plain integers.
datas = [
    ("A", "2022-04-16", 31200),
    ("B", "2022-04-17", 41200),
    ("C", "2022-04-11", 31500),
    ("D", "2022-04-12", 21500),
    ("E", "2022-04-13", 51000),
]

In [3]:
# Column names, listed in the same order as the fields of each row tuple.
columns = [
    "product",
    "date",
    "price",
]

In [4]:
# Build the DataFrame from the sample rows, naming the columns via `columns`.
df = spark.createDataFrame(datas, schema=columns)

# Print the rows as a text table to confirm the data loaded as expected.
df.show()

+-------+----------+-----+
|product|      date|price|
+-------+----------+-----+
|      A|2022-04-16|31200|
|      B|2022-04-17|41200|
|      C|2022-04-11|31500|
|      D|2022-04-12|21500|
|      E|2022-04-13|51000|
+-------+----------+-----+



In [5]:
# Expose the DataFrame to Spark SQL as the temp view "product" so the
# queries below can refer to it by name.
df.createOrReplaceTempView("product")

In [6]:
# Sanity check: read every row back through SQL from the "product" view.
spark.sql("select * from product").show()

+-------+----------+-----+
|product|      date|price|
+-------+----------+-----+
|      A|2022-04-16|31200|
|      B|2022-04-17|41200|
|      C|2022-04-11|31500|
|      D|2022-04-12|21500|
|      E|2022-04-13|51000|
+-------+----------+-----+



# UDF
- User Defined function : 사용자 정의 함수
- 쿼리에서 사용할 함수를 개발자가 직접 만들어 줄 수 있는 기능
- 등록할 때 리턴 타입을 지정하지 않으면 기본값인 `StringType`(문자열)으로 결과가 리턴됨 (다른 타입이 필요하면 `spark.udf.register`의 세 번째 인자로 지정)

In [7]:
from pyspark.sql.types import LongType

# Plain Python function; it is registered as a Spark SQL UDF right after.
def squared(n):
    """Return ``n`` multiplied by itself.

    Args:
        n: A number, or ``None`` (how a SQL NULL reaches a Python UDF).

    Returns:
        ``n * n``, or ``None`` when ``n`` is ``None``.
    """
    # NULL in, NULL out: without this guard ``None * None`` raises TypeError
    # and a single NULL row would fail the whole query.
    if n is None:
        return None
    return n * n

# Register the function under the SQL name "squared" with an explicit
# LongType return type. The squares of this data's prices
# (e.g. 51000 * 51000 = 2601000000) do not fit in a 32-bit integer.
spark.udf.register("squared", squared, LongType())

<function __main__.squared(n)>

In [8]:
# Call the registered UDF from SQL, showing each price next to its square.
spark.sql("SELECT price, squared(price) FROM product").show()

+-----+--------------+
|price|squared(price)|
+-----+--------------+
|31200|     973440000|
|41200|    1697440000|
|31500|     992250000|
|21500|     462250000|
|51000|    2601000000|
+-----+--------------+



In [9]:
def read_number(n):
    """Spell out a non-negative integer using Sino-Korean numerals.

    Every non-zero digit is written as its numeral followed by its place
    unit (so 1000 reads "일천", 10 reads "일십"). Digits are processed in
    groups of four; each non-empty group gets the large unit for its
    position (만, 억, 조, 경).

    Args:
        n: Integer to spell out, or ``None`` (how a SQL NULL reaches a
            Python UDF).

    Returns:
        The spelled-out number, e.g. ``33000`` -> ``"삼만삼천"``;
        ``"영"`` for zero; ``None`` when ``n`` is ``None``; an empty string
        for negative values (there is no reading defined for them here).

    Raises:
        ValueError: If ``n`` is too large for the supported units
            (``n >= 10**20``).
    """
    # NULL in, NULL out, so one NULL row does not fail the whole query.
    if n is None:
        return None
    if n == 0:
        return "영"

    digit_names = "일이삼사오육칠팔구"
    place_units = ["", "십", "백", "천"]        # units inside a 4-digit group
    group_units = ["", "만", "억", "조", "경"]  # one unit per 4-digit group

    if n >= 10000 ** len(group_units):
        raise ValueError("read_number supports values below 10**20")

    parts = []  # spelled groups, least significant first
    group_index = 0
    while n > 0:
        n, group = divmod(n, 10000)
        if group > 0:
            group_parts = []  # spelled digits of this group, lowest first
            place = 0
            while group > 0:
                group, digit = divmod(group, 10)
                if digit > 0:
                    group_parts.append(digit_names[digit - 1] + place_units[place])
                place += 1
            parts.append("".join(reversed(group_parts)) + group_units[group_index])
        group_index += 1
    return "".join(reversed(parts))

# Quick local check before using the function from SQL:
# 33000 should read as 삼만삼천.
print(read_number(33000))

삼만삼천


In [10]:
# Register under the SQL name "read_number". No return type is passed, so the
# UDF result falls back to the default string type.
spark.udf.register("read_number", read_number)

<function __main__.read_number(n)>

In [11]:
# Show each price next to its spelled-out Korean reading.
spark.sql("select price, read_number(price) from product").show()

+-----+------------------+
|price|read_number(price)|
+-----+------------------+
|31200|      삼만일천이백|
|41200|      사만일천이백|
|31500|      삼만일천오백|
|21500|      이만일천오백|
|51000|          오만일천|
+-----+------------------+



In [12]:
def get_weekday(date):
    """Return the weekday name (e.g. ``"Saturday"``) for a date value.

    Args:
        date: Object exposing ``weekday()`` with Monday == 0, such as
            ``datetime.date``; or ``None`` (how a SQL NULL reaches a Python
            UDF, e.g. when an upstream expression yields NULL).

    Returns:
        The matching entry of ``calendar.day_name`` (its text follows the
        current locale), or ``None`` when ``date`` is ``None``.
    """
    # Kept as a function-local import so the function carries its own
    # dependency wherever it runs.
    import calendar

    # NULL in, NULL out: ``None.weekday()`` would raise AttributeError and a
    # single NULL row would fail the whole query.
    if date is None:
        return None
    return calendar.day_name[date.weekday()]

# Register under the SQL name "get_weekday"; no return type is passed, so the
# result falls back to the default string type.
spark.udf.register("get_weekday", get_weekday)

<function __main__.get_weekday(date)>

In [13]:
# Combine both UDFs in one query. `date` is stored as a string, so it is
# converted with to_date() before being handed to get_weekday, which calls
# .weekday() on its argument.
query = """
select
    product,
    date,
    get_weekday(to_date(date)),
    read_number(price)
from product
"""

spark.sql(query).show()

+-------+----------+--------------------------+------------------+
|product|      date|get_weekday(to_date(date))|read_number(price)|
+-------+----------+--------------------------+------------------+
|      A|2022-04-16|                  Saturday|      삼만일천이백|
|      B|2022-04-17|                    Sunday|      사만일천이백|
|      C|2022-04-11|                    Monday|      삼만일천오백|
|      D|2022-04-12|                   Tuesday|      이만일천오백|
|      E|2022-04-13|                 Wednesday|          오만일천|
+-------+----------+--------------------------+------------------+



In [14]:
# Shut down the Spark session now that the demo is finished.
spark.stop()