# Creating a new session

In [1]:
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession
# Build a SparkSession (or reuse one that already exists in this process),
# using the "local" master and a default SparkConf.
spark = (
    SparkSession.builder
    .master("local")
    .config(conf=SparkConf())
    .getOrCreate()
)

## Loading data 

In [6]:
# Load the orders data as raw text (one record per line in a single `value`
# column) and project it into typed, named columns.
#
# The location can be overridden through the RETAIL_DB_ORDERS_PATH environment
# variable; when it is not set, the original local path is used.
path_text_orders = os.environ.get(
    "RETAIL_DB_ORDERS_PATH",
    "file:///D://data-master/retail_db/orders",
)

orders_text = spark.read.format("text").load(path_text_orders)

# Each record is comma-separated. Field order in the public retail_db `orders`
# data set is: order_id, order_date, order_customer_id, order_status.
# NOTE(review): confirm this layout against your copy of the data.
orders_table = orders_text.selectExpr(
    "cast(split(value, ',')[0] as int) order_id",
    "cast(split(value, ',')[1] as date) order_date",
    "cast(split(value, ',')[2] as int) order_customer_id",
    "cast(split(value, ',')[3] as string) order_status",
)


## UDF for Spark 1.6 and above

In [12]:
# import the udf functions and the sql types

from pyspark.sql.functions import udf
from pyspark.sql.types import *

#define the function 

def lwSTR(a):
    """Return the lower-cased form of the string ``a``.

    ``None`` is passed through unchanged, so a null column value produces a
    null result instead of raising ``AttributeError`` inside the UDF.
    """
    if a is None:
        return None
    return a.lower()

# Wrap the plain Python function as a Spark UDF whose return type is string.
# udf() only wraps the function for use in DataFrame expressions; it does not
# register it under a name for SQL queries.
udf_dict = udf(lwSTR, returnType=StringType())

# Original values of the column, for comparison.
orders_table.select("order_status").show(n=2)

# The same column after passing through the UDF.
orders_table.select(
    udf_dict("order_status").alias("UDF column")
).show(n=2)

+---------------+
|   order_status|
+---------------+
|         CLOSED|
|PENDING_PAYMENT|
+---------------+
only showing top 2 rows

+---------------+
|     UDF column|
+---------------+
|         closed|
|pending_payment|
+---------------+
only showing top 2 rows



## UDF in Spark 2.3 and above

In [15]:
# UDF in Python, built from a lambda function.
#
# spark.udf.register makes the function callable from SQL under the given name
# and returns a UDF object that can be used in DataFrame expressions.
# The lambda passes None through so that null column values do not raise
# AttributeError inside the UDF.
#
# NOTE(review): the SQL name "stringLengthString" does not describe what the
# function does (it lower-cases its input); it is kept unchanged in case other
# queries refer to it.
# NOTE(review): this rebinds `lwSTR`, which an earlier cell defines as a plain
# Python function.
lwSTR = spark.udf.register(
    "stringLengthString",
    lambda x: None if x is None else x.lower(),
)

# Original values of the column, for comparison.
orders_table.select("order_status").show(2)

# The same column after passing through the registered UDF.
orders_table.select(lwSTR("order_status").alias("UDF column")).show(2)

+---------------+
|   order_status|
+---------------+
|         CLOSED|
|PENDING_PAYMENT|
+---------------+
only showing top 2 rows

+---------------+
|     UDF column|
+---------------+
|         closed|
|pending_payment|
+---------------+
only showing top 2 rows

