In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the Spark session named "udf_study"; the cells below use it as `spark`.
spark = SparkSession.builder.appName("udf_study").getOrCreate()

In [3]:
# Column names, listed in the same order as the fields of each tuple below.
schema = ["name", "datetime", "price", "currency"]

# Sample rows: (menu name, timestamp as text, price, currency code).
transactions = [
    ("찹쌀탕수육+짜장2", "2021-11-07 13:20:00", 22000, "KRW"),
    ("등심탕수육+크립새우+짜장면", "2021-10-24 11:19:00", 21500, "KRW"),
    ("월남 쌈 2인 세트", "2021-07-25 11:12:40", 42000, "KRW"),
    ("콩국수+열무비빔국수", "2021-07-10 08:20:00", 21250, "KRW"),
    ("장어소금+고추장구이", "2021-07-01 05:36:00", 68700, "KRW"),
    ("족발", "2020-08-19 19:04:00", 32000, "KRW"),
]

In [4]:
# Build the DataFrame from the sample rows; `schema` supplies only column names,
# so the column types are inferred from the Python values.
df = spark.createDataFrame(data=transactions, schema=schema)

In [8]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- datetime: string (nullable = true)
 |-- price: long (nullable = true)
 |-- currency: string (nullable = true)



In [7]:
df.show()

+--------------------------+-------------------+-----+--------+
|                      name|           datetime|price|currency|
+--------------------------+-------------------+-----+--------+
|          찹쌀탕수육+짜장2|2021-11-07 13:20:00|22000|     KRW|
|등심탕수육+크립새우+짜장면|2021-10-24 11:19:00|21500|     KRW|
|          월남 쌈 2인 세트|2021-07-25 11:12:40|42000|     KRW|
|       콩국수+열무비빔국수|2021-07-10 08:20:00|21250|     KRW|
|       장어소금+고추장구이|2021-07-01 05:36:00|68700|     KRW|
|                      족발|2020-08-19 19:04:00|32000|     KRW|
+--------------------------+-------------------+-----+--------+



In [11]:
df.createOrReplaceTempView("bills")

In [17]:
spark.sql("select * from bills").show()

+--------------------------+-------------------+-----+--------+
|                      name|           datetime|price|currency|
+--------------------------+-------------------+-----+--------+
|          찹쌀탕수육+짜장2|2021-11-07 13:20:00|22000|     KRW|
|등심탕수육+크립새우+짜장면|2021-10-24 11:19:00|21500|     KRW|
|          월남 쌈 2인 세트|2021-07-25 11:12:40|42000|     KRW|
|       콩국수+열무비빔국수|2021-07-10 08:20:00|21250|     KRW|
|       장어소금+고추장구이|2021-07-01 05:36:00|68700|     KRW|
|                      족발|2020-08-19 19:04:00|32000|     KRW|
+--------------------------+-------------------+-----+--------+



In [18]:
from pyspark.sql.types import LongType
def squared(n):
    """Return ``n`` multiplied by itself."""
    product = n * n
    return product

In [20]:
from pyspark.sql.types import LongType
def squared(n):
    """Square ``n``; this is the function registered as the ``squared`` SQL UDF."""
    result = n * n
    return result
# Expose squared() to Spark SQL under the name "squared", declaring a LongType result column.
spark.udf.register("squared", squared, returnType=LongType())

<function __main__.squared(n)>

In [23]:
spark.sql("select name, squared(price) from bills").printSchema()

root
 |-- name: string (nullable = true)
 |-- squared(price): long (nullable = true)



In [68]:
# Express a number in Korean (Hangul) numerals
def read_number(n):
    """Spell out a non-negative whole number with Sino-Korean numerals.

    Digits are read in groups of four (ones/십/백/천), and each non-empty
    group is followed by its large unit (만, 억, 조, 경). Every non-zero digit
    is written explicitly, so 10 reads as '일십' rather than '십'.

    Args:
        n: A non-negative integer, or any value whose ``str()`` consists of
            decimal digits only.

    Returns:
        str: The Korean reading, e.g. ``12312 -> '일만이천삼백일십이'``.
            Zero reads as ``'영'``; an empty string input yields ``''``.

    Raises:
        ValueError: If ``str(n)`` contains a non-digit character (sign,
            decimal point, ...) or the number needs a unit beyond 경.
    """
    digit_units = ["", "십", "백", "천"]
    group_units = ["", "만", "억", "조", "경"]
    ko = "일이삼사오육칠팔구"

    # Walk the digits least-significant first so positions map directly to units.
    reversed_digits = str(n)[::-1]

    groups = []
    for group_index, start in enumerate(range(0, len(reversed_digits), 4)):
        words = []
        for pos, ch in enumerate(reversed_digits[start:start + 4]):
            digit = int(ch)  # raises ValueError for '-', '.', spaces, ...
            if digit > 0:
                words.append(ko[digit - 1] + digit_units[pos])

        # A group of all zeros contributes nothing, not even its large unit.
        if not words:
            continue
        if group_index >= len(group_units):
            raise ValueError("number is too large to read: %r" % (n,))
        groups.append("".join(reversed(words)) + group_units[group_index])

    if not groups:
        # Only zeros were seen -> '영'; no digits at all -> keep the empty result.
        return "영" if reversed_digits else ""
    return "".join(reversed(groups))

In [69]:
read_number(12312)

'일만이천삼백일십이'

In [84]:
def read_number(n):
    """Spell out an integer amount in Sino-Korean numerals, suffixed with '원'.

    Digits are read in groups of four (ones/십/백/천), and each non-empty
    group is followed by its large unit (만, 억, 조, 경). Every non-zero digit
    is written explicitly, so 10 reads as '일십' rather than '십'.

    Args:
        n: An integer amount, or None (how a SQL NULL reaches a Python UDF).

    Returns:
        str or None: The reading followed by '원', e.g.
            ``44000 -> '사만사천원'``. Zero reads as ``'영원'``, negative
            amounts are prefixed with '마이너스', and None is passed through.

    Raises:
        ValueError: If the magnitude needs a unit beyond 경.
    """
    # Propagate missing values instead of failing on the comparison below.
    if n is None:
        return None

    digit_units = ["", "십", "백", "천"]
    group_units = ["", "만", "억", "조", "경"]
    ko = "일이삼사오육칠팔구"

    if n == 0:
        return "영원"

    sign = "마이너스" if n < 0 else ""
    remaining = abs(n)

    groups = []
    group_index = 0
    while remaining > 0:
        # Peel off the lowest four digits; they share one large unit.
        remaining, chunk = divmod(remaining, 10000)

        words = []
        pos = 0
        while chunk > 0:
            chunk, r = divmod(chunk, 10)
            if r > 0:
                words.append(ko[r - 1] + digit_units[pos])
            pos += 1

        # A group of all zeros contributes nothing, not even its large unit.
        if words:
            if group_index >= len(group_units):
                raise ValueError("number is too large to read: %r" % (n,))
            groups.append("".join(reversed(words)) + group_units[group_index])
        group_index += 1

    return sign + "".join(reversed(groups)) + "원"

In [85]:
divmod(4403, 10)

(440, 3)

In [86]:
read_number(44000)

'사만사천원'

In [87]:
# Make read_number callable from Spark SQL; no returnType is given, so the result column is a string.
spark.udf.register("read_number",read_number)

<function __main__.read_number(n)>

In [89]:
spark.sql("select name, datetime, read_number(price) from bills").show()

+--------------------------+-------------------+------------------+
|                      name|           datetime|read_number(price)|
+--------------------------+-------------------+------------------+
|          찹쌀탕수육+짜장2|2021-11-07 13:20:00|        이만이천원|
|등심탕수육+크립새우+짜장면|2021-10-24 11:19:00|    이만일천오백원|
|          월남 쌈 2인 세트|2021-07-25 11:12:40|        사만이천원|
|       콩국수+열무비빔국수|2021-07-10 08:20:00|이만일천이백오십원|
|       장어소금+고추장구이|2021-07-01 05:36:00|    육만팔천칠백원|
|                      족발|2020-08-19 19:04:00|        삼만이천원|
+--------------------------+-------------------+------------------+



In [90]:
def get_weekday(date):
    """Return the weekday name for a date.

    Args:
        date: An object with a ``weekday()`` method (``datetime.date`` or
            ``datetime.datetime``), or None. A SQL NULL, such as the result
            of ``TO_DATE`` on text it cannot parse, arrives as None.

    Returns:
        str or None: The matching entry of ``calendar.day_name``
            (e.g. ``'Sunday'``; names follow the current locale, English by
            default), or None when ``date`` is None.
    """
    # Imported inside the function, as in the original, so the UDF stays self-contained.
    import calendar

    # Propagate missing dates instead of raising AttributeError on None.
    if date is None:
        return None
    return calendar.day_name[date.weekday()]

In [91]:
# Make get_weekday callable from Spark SQL; no returnType is given, so the result column is a string.
spark.udf.register("get_weekday", get_weekday)

<function __main__.get_weekday(date)>

In [93]:
spark.sql("select name, get_weekday(TO_DATE(datetime)) as week, read_number(price) from bills").show()

+--------------------------+---------+------------------+
|                      name|     week|read_number(price)|
+--------------------------+---------+------------------+
|          찹쌀탕수육+짜장2|   Sunday|        이만이천원|
|등심탕수육+크립새우+짜장면|   Sunday|    이만일천오백원|
|          월남 쌈 2인 세트|   Sunday|        사만이천원|
|       콩국수+열무비빔국수| Saturday|이만일천이백오십원|
|       장어소금+고추장구이| Thursday|    육만팔천칠백원|
|                      족발|Wednesday|        삼만이천원|
+--------------------------+---------+------------------+



# 문자열 제거 UDF

In [28]:
# Column names for the rows below; here "price" holds free-form text and the
# numeric amounts sit under "code".
schema = ["price", "datetime", "code", "currency"]

# Sample rows whose price text is intentionally messy: separators, a currency
# suffix, letters, an empty string and a lone space.
transactions = [
    ("100", "2021-11-07 13:20:00", 22000, "KRW"),
    ("10,000원", "2021-10-24 11:19:00", 21500, "KRW"),
    ("100..0원", "2021-07-25 11:12:40", 42000, "KRW"),
    ("", "2021-07-10 08:20:00", 21250, "KRW"),
    ("1023", "2021-07-01 05:36:00", 68700, "KRW"),
    ("hihi", "2020-08-19 19:04:00", 32000, "KRW"),
    ("0", "2020-08-19 19:05:00", 55000, "KRW"),
    (" ", "2020-08-19 10:23:00", 11000, "KRW"),
]

In [29]:
df = spark.createDataFrame(data=transactions, schema=schema)

In [30]:
df.printSchema()

root
 |-- price: string (nullable = true)
 |-- datetime: string (nullable = true)
 |-- code: long (nullable = true)
 |-- currency: string (nullable = true)



In [31]:
type(df)

pyspark.sql.dataframe.DataFrame

In [32]:
df.select("price").show()

+--------+
|   price|
+--------+
|     100|
|10,000원|
|100..0원|
|        |
|    1023|
|    hihi|
|       0|
|        |
+--------+



In [33]:
df.createOrReplaceTempView("bills")

In [34]:
spark.sql("select * from bills").show()

+--------+-------------------+-----+--------+
|   price|           datetime| code|currency|
+--------+-------------------+-----+--------+
|     100|2021-11-07 13:20:00|22000|     KRW|
|10,000원|2021-10-24 11:19:00|21500|     KRW|
|100..0원|2021-07-25 11:12:40|42000|     KRW|
|        |2021-07-10 08:20:00|21250|     KRW|
|    1023|2021-07-01 05:36:00|68700|     KRW|
|    hihi|2020-08-19 19:04:00|32000|     KRW|
|       0|2020-08-19 19:05:00|55000|     KRW|
|        |2020-08-19 10:23:00|11000|     KRW|
+--------+-------------------+-----+--------+



In [35]:
spark.sql("select * from bills where price != ' ' ").show()

+--------+-------------------+-----+--------+
|   price|           datetime| code|currency|
+--------+-------------------+-----+--------+
|     100|2021-11-07 13:20:00|22000|     KRW|
|10,000원|2021-10-24 11:19:00|21500|     KRW|
|100..0원|2021-07-25 11:12:40|42000|     KRW|
|        |2021-07-10 08:20:00|21250|     KRW|
|    1023|2021-07-01 05:36:00|68700|     KRW|
|    hihi|2020-08-19 19:04:00|32000|     KRW|
|       0|2020-08-19 19:05:00|55000|     KRW|
+--------+-------------------+-----+--------+



In [16]:
# UDF helper: extract only the digits from a string.
def remove_str(str):
    """Strip every non-digit character and return what is left as an integer.

    All characters outside ``0-9`` are dropped, including separators, decimal
    points and signs, so ``'100..0원'`` becomes ``1000``.

    Args:
        str: The text to clean, or None (how a SQL NULL reaches a Python UDF).
            The parameter name shadows the built-in ``str`` inside this
            function; it is kept so existing keyword callers still work.

    Returns:
        int or None: The digits read as one integer, ``0`` when the text
            contains no digits, or None when the input is None.
    """
    import re

    # Propagate missing values instead of letting re.sub raise TypeError.
    if str is None:
        return None

    digits = re.sub(r'[^0-9]', '', str)
    return int(digits) if digits else 0

In [22]:
type(remove_str('  '))

int

In [23]:
from pyspark.sql.types import IntegerType

In [40]:
from pyspark.sql.functions import udf

In [44]:
from pyspark.sql.functions import col

In [43]:
# Wrap remove_str directly as a column-level UDF returning IntegerType;
# a lambda that only forwards its argument adds nothing.
convert_udf = udf(remove_str, returnType=IntegerType())

In [46]:
df.withColumn("price_int", convert_udf(col("price"))).show()

+--------+-------------------+-----+--------+---------+
|   price|           datetime| code|currency|price_int|
+--------+-------------------+-----+--------+---------+
|     100|2021-11-07 13:20:00|22000|     KRW|      100|
|10,000원|2021-10-24 11:19:00|21500|     KRW|    10000|
|100..0원|2021-07-25 11:12:40|42000|     KRW|     1000|
|        |2021-07-10 08:20:00|21250|     KRW|        0|
|    1023|2021-07-01 05:36:00|68700|     KRW|     1023|
|    hihi|2020-08-19 19:04:00|32000|     KRW|        0|
|       0|2020-08-19 19:05:00|55000|     KRW|        0|
|        |2020-08-19 10:23:00|11000|     KRW|        0|
+--------+-------------------+-----+--------+---------+



In [47]:
# Register for Spark SQL with an explicit integer return type; when returnType
# is omitted the result column defaults to string.
spark.udf.register("remove_str", remove_str, returnType=IntegerType())

<function __main__.remove_str(str)>

In [49]:
df.show()

+--------+-------------------+-----+--------+
|   price|           datetime| code|currency|
+--------+-------------------+-----+--------+
|     100|2021-11-07 13:20:00|22000|     KRW|
|10,000원|2021-10-24 11:19:00|21500|     KRW|
|100..0원|2021-07-25 11:12:40|42000|     KRW|
|        |2021-07-10 08:20:00|21250|     KRW|
|    1023|2021-07-01 05:36:00|68700|     KRW|
|    hihi|2020-08-19 19:04:00|32000|     KRW|
|       0|2020-08-19 19:05:00|55000|     KRW|
|        |2020-08-19 10:23:00|11000|     KRW|
+--------+-------------------+-----+--------+



In [50]:
# SQL counterpart of the withColumn example: apply the registered remove_str
# UDF to the price column of the "bills" view.
query = """
SELECT
    price,
    remove_str(price) as price_int 
FROM
    bills
"""

In [51]:
spark.sql(query).show()

+--------+---------+
|   price|price_int|
+--------+---------+
|     100|      100|
|10,000원|    10000|
|100..0원|     1000|
|        |        0|
|    1023|     1023|
|    hihi|        0|
|       0|        0|
|        |        0|
+--------+---------+

