In [1]:
# create entry points to spark
try:
    sc.stop()
except:
    pass
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
sc=SparkContext()
spark = SparkSession(sparkContext=sc)

The ***pyspark.sql.functions.udf()*** function is a very important function. It allows us to transfer a user defined function to a pyspark.sql.functions function which can act on columns of a DataFrame. It makes data framsformation much more flexible.

In [2]:
from pyspark.sql.types import *
from pyspark.sql.functions import udf

In [4]:
mtcars=spark.read.csv('mtcars.csv',inferSchema=True,header=True)
mtcars.show(5)

+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|            model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|        Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|
|    Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|
|       Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|
|   Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|
|Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 5 rows



In [5]:
def disp_by_hp(disp,hp):
    return(disp/hp)

In [7]:
disp_by_hp_udf=udf(disp_by_hp,returnType=FloatType())

In [8]:
all_original_cols = [eval('mtcars.' + x) for x in mtcars.columns]
all_original_cols

[Column<b'model'>,
 Column<b'mpg'>,
 Column<b'cyl'>,
 Column<b'disp'>,
 Column<b'hp'>,
 Column<b'drat'>,
 Column<b'wt'>,
 Column<b'qsec'>,
 Column<b'vs'>,
 Column<b'am'>,
 Column<b'gear'>,
 Column<b'carb'>]

In [9]:
disp_by_hp_col=disp_by_hp_udf(mtcars.disp,mtcars.hp)
disp_by_hp_col

Column<b'disp_by_hp(disp, hp)'>

In [10]:
cols=all_original_cols+[disp_by_hp_col]
cols

[Column<b'model'>,
 Column<b'mpg'>,
 Column<b'cyl'>,
 Column<b'disp'>,
 Column<b'hp'>,
 Column<b'drat'>,
 Column<b'wt'>,
 Column<b'qsec'>,
 Column<b'vs'>,
 Column<b'am'>,
 Column<b'gear'>,
 Column<b'carb'>,
 Column<b'disp_by_hp(disp, hp)'>]

In [12]:
mtcars.select(cols).show(5)

+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+--------------------+
|            model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|disp_by_hp(disp, hp)|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+--------------------+
|        Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|           1.4545455|
|    Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|           1.4545455|
|       Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|           1.1612903|
|   Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|           2.3454545|
|Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|            2.057143|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+--------------------+
only showing top 5 rows



### Array column that contain disp,hp

In [13]:
def array_disp_hp(disp,hp):
    return [disp,hp]

In [14]:
array_disp_hp_udf=udf(array_disp_hp,returnType=ArrayType(FloatType()))

In [15]:
array_disp_hp_col=array_disp_hp_udf(mtcars.disp,mtcars.hp)
array_disp_hp_col

Column<b'array_disp_hp(disp, hp)'>

In [16]:
cols=all_original_cols+[array_disp_hp_col]
cols

[Column<b'model'>,
 Column<b'mpg'>,
 Column<b'cyl'>,
 Column<b'disp'>,
 Column<b'hp'>,
 Column<b'drat'>,
 Column<b'wt'>,
 Column<b'qsec'>,
 Column<b'vs'>,
 Column<b'am'>,
 Column<b'gear'>,
 Column<b'carb'>,
 Column<b'array_disp_hp(disp, hp)'>]

In [17]:
mtcars.select(cols).show(8)

+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+-----------------------+
|            model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|array_disp_hp(disp, hp)|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+-----------------------+
|        Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|               [160.0,]|
|    Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|               [160.0,]|
|       Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|               [108.0,]|
|   Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|               [258.0,]|
|Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|               [360.0,]|
|          Valiant|18.1|  6|225.0|105|2.76| 3.46|20.22|  1|  0|   3|   1|               [225.0,]|
|       Duster 360|14.3|  8|360.0|245|3.21| 3.57|15.84|  0|  0|   3|   4|               [360.0,]|
|        Merc 240D|2

**We need to use StructType() if we put different type values in the list.**


**The most important thing in *udf* is correctly using the return type in the function**