- Title: Column Functions and Operators in PySpark
- Slug: spark-col-functions-operators
- Date: 2019-12-18 11:08:55
- Category: Computer Science
- Tags: programming, Scala, Spark, PySpark, DataFrame, column, functions, operators, Python
- Author: Ben Du

In [4]:
# Setup cell: locate a local Spark install, then create the session used by the PySpark cells.
from pathlib import Path
import pandas as pd
import findspark
# Initialise findspark with the first path under the home directory matching "spark-3*".
# next() raises StopIteration if nothing matches.
findspark.init(str(next(Path.home().glob("spark-3*"))))
# NOTE(review): the pyspark imports are placed after findspark.init, presumably because
# findspark is what makes pyspark importable here - confirm for your environment.
from pyspark.sql import SparkSession, DataFrame
# Wildcard import: this is where col() used in the cells below comes from.
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, StringType, StructType
# Create (or reuse) a session with Hive support enabled.
spark = SparkSession.builder.appName("PySpark UDF").enableHiveSupport().getOrCreate()

## Operators

### Modulus (%)

In [8]:
# Build a four-row pandas DataFrame with an int, two string, a float and a bool column,
# then convert it to a Spark DataFrame and print it.
df = spark.createDataFrame(pd.DataFrame(data=(
    (1, "a", "foo", 3.0, True),
    (2, "b", "bar", 4.0, True),
    (3, "c", "foo", 5.0, False),
    (4, "d", "bar", 7.0, False)
), columns=("col1", "col2", "col3", "col4", "col5")))
df.show()

+----+----+----+----+-----+
|col1|col2|col3|col4| col5|
+----+----+----+----+-----+
|   1|   a| foo| 3.0| true|
|   2|   b| bar| 4.0| true|
|   3|   c| foo| 5.0|false|
|   4|   d| bar| 7.0|false|
+----+----+----+----+-----+



In [9]:
# Apply % to a column: the result is the remainder of col1 divided by 2 for each row.
df.select(col("col1") % 2).show()

+----------+
|(col1 % 2)|
+----------+
|         1|
|         0|
|         1|
|         0|
+----------+



In [11]:
# A boolean column can be passed to filter as the condition; only rows where col5 is true remain.
df.filter(col("col5")).show()

+----+----+----+----+----+
|col1|col2|col3|col4|col5|
+----+----+----+----+----+
|   1|   a| foo| 3.0|true|
|   2|   b| bar| 4.0|true|
+----+----+----+----+----+



## Not (`~`) for Column Expressions

Use `~` to negate a boolean column expression.
Note that Python's `not` keyword cannot be used on a column expression; use `~` instead.

In [13]:
# Negate the boolean column with ~ so that only rows where col5 is false remain.
df.filter(~ col("col5")).show()

+----+----+----+----+-----+
|col1|col2|col3|col4| col5|
+----+----+----+----+-----+
|   3|   c| foo| 5.0|false|
|   4|   d| bar| 7.0|false|
+----+----+----+----+-----+



## lit

In [4]:
// Wrap the literal 1 with lit so it can be used as a column expression.
val x = lit(1)

In [5]:
x

1

## when

1. A `null` in a `when` condition is treated as `false`.

In [1]:
import org.apache.spark.sql.functions._

// Load the sample people data (columns: age, name); one row has a null age.
val df = spark.read.json("../data/people.json")
df.show

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



df = [age: bigint, name: string]


[age: bigint, name: string]

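A `null` in a `when` condition is treated as `false`: in the two examples below, the row whose `age` is `null` falls through to `otherwise` for both `age > 20` and `age <= 20`.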

In [3]:
// 1 where age > 20, otherwise 0; the row with a null age also ends up as 0.
df.select(when($"age" > 20, 1).otherwise(0).alias("gt20")).show

+----+
|gt20|
+----+
|   0|
|   1|
|   0|
+----+



In [5]:
// 1 where age <= 20, otherwise 0; the null age again gets 0, so it matches neither comparison.
df.select(when($"age" <= 20, 1).otherwise(0).alias("le20")).show

+----+
|le20|
+----+
|   0|
|   0|
|   1|
+----+



In [6]:
// Chain several when branches: handle the null age explicitly with isNull,
// then the numeric comparison, and finally a default via otherwise.
df.select(when($"age".isNull, 0).when($"age" > 20 , 100).otherwise(10).alias("age")).show

+---+
|age|
+---+
|  0|
|100|
| 10|
+---+



In [7]:
// Without otherwise, rows that match no when branch come out as null.
df.select(when($"age".isNull, 0).alias("age")).show

+----+
| age|
+----+
|   0|
|null|
|null|
+----+



## Arithmetic

In [12]:
// Single-column DataFrame of doubles; with no names passed to toDF the column is called "value".
val df = Seq(0.1, 0.2, 0.3, 0.4, 0.5).toDF
df.show

+-----+
|value|
+-----+
|  0.1|
|  0.2|
|  0.3|
|  0.4|
|  0.5|
+-----+



null

In [13]:
// NOTE: this cell fails - the recorded output is a console error rather than a table.
// Presumably the plain Int on the left has no `-` that accepts a Column; the next cell
// shows the working form using lit(1).
df.withColumn("new_col",
    1 - $"value"             
).show

<console>: 103

In [14]:
// Wrapping the literal in lit makes the left operand a column expression, so the subtraction works.
df.withColumn("new_col",
    lit(1) - $"value"             
).show

+-----+-------+
|value|new_col|
+-----+-------+
|  0.1|    0.9|
|  0.2|    0.8|
|  0.3|    0.7|
|  0.4|    0.6|
|  0.5|    0.5|
+-----+-------+



In [15]:
// Alternative that keeps the column on the left: negate it, then add the literal 1.
df.withColumn("new_col",
    -$"value" + 1       
).show

+-----+-------+
|value|new_col|
+-----+-------+
|  0.1|    0.9|
|  0.2|    0.8|
|  0.3|    0.7|
|  0.4|    0.6|
|  0.5|    0.5|
+-----+-------+



In [16]:
// Natural logarithm of (1 - value), written with the column on the left of the arithmetic.
df.withColumn("new_col",
    log(-$"value" + 1)    
).show

+-----+--------------------+
|value|             new_col|
+-----+--------------------+
|  0.1|-0.10536051565782628|
|  0.2| -0.2231435513142097|
|  0.3|-0.35667494393873245|
|  0.4| -0.5108256237659907|
|  0.5| -0.6931471805599453|
+-----+--------------------+



## element_at

In [5]:
// DataFrame with an integer-array column (col1) and a string column (col2).
val df = Seq(
    (Array(1, 2), "how"),
    (Array(2, 3), "are"),
    (Array(3, 4), "you")
).toDF("col1", "col2")
df.show

+------+----+
|  col1|col2|
+------+----+
|[1, 2]| how|
|[2, 3]| are|
|[3, 4]| you|
+------+----+



null

In [6]:
// element_at is 1-based: index 1 returns the first element of each array in col1.
// The alias "word" is only a label; the values are the integers from col1.
df.select(
    element_at($"col1", 1).alias("word")
).show

+----+
|word|
+----+
|   1|
|   2|
|   3|
+----+



In [23]:
// UDF that returns a tuple (x, 1). As the printed type shows, Spark represents the
// result as a struct with integer fields _1 and _2.
val myUDF1 = udf((x: Int) => {
    (x, 1)
})

UserDefinedFunction(<function1>,StructType(StructField(_1,IntegerType,false), StructField(_2,IntegerType,false)),Some(List(IntegerType)))

In [10]:
// Two-column DataFrame (int col1, double col2) used as input for the UDF examples below.
val df = Seq(
    (1, 3.0),
    (2, 4.0),
    (3, 5.0)
).toDF("col1", "col2")
df.show

+----+----+
|col1|col2|
+----+----+
|   1| 3.0|
|   2| 4.0|
|   3| 5.0|
+----+----+



null

In [24]:
// Apply the tuple-returning UDF to col1; the result is a single struct column named f1.
val df1 = df.select(myUDF1($"col1").alias("f1"))
df1.show

+------+
|    f1|
+------+
|[1, 1]|
|[2, 1]|
|[3, 1]|
+------+



null

In [18]:
// "f1.*" expands every field of the struct column into its own column (_1 and _2).
df1.select("f1.*").show

+---+---+
| _1| _2|
+---+---+
|  1|  1|
|  2|  1|
|  3|  1|
+---+---+



In [39]:
// Select the struct fields individually with dot notation and give them new names.
df1.select(
    $"f1._1".alias("v1"),
    $"f1._2".alias("v2")
).show

+---+---+
| v1| v2|
+---+---+
|  1|  1|
|  2|  1|
|  3|  1|
+---+---+



In [25]:
// UDF that returns Array(x, 1). As the printed type shows, Spark represents the
// result as an array of integers rather than a struct.
val myUDF2 = udf((x: Int) => {
    Array(x, 1)
})

UserDefinedFunction(<function1>,ArrayType(IntegerType,false),Some(List(IntegerType)))

In [26]:
// Apply the array-returning UDF to col1; show prints it the same way as the struct column of df1.
val df2 = df.select(myUDF2($"col1").alias("f1"))
df2.show

+------+
|    f1|
+------+
|[1, 1]|
|[2, 1]|
|[3, 1]|
+------+



null

In [27]:
// Inspect the schema to see that f1 is an ArrayType column, not a struct.
df2.schema

[[StructField(f1,ArrayType(IntegerType,false),true)]]

In [35]:
// Extract the array elements by position with element_at (1-based) and name them.
df2.select(
    element_at($"f1", 1).alias("v1"),
    element_at($"f1", 2).alias("v2")
).show

+---+---+
| v1| v2|
+---+---+
|  1|  1|
|  2|  1|
|  3|  1|
+---+---+



## References

https://spark.apache.org/docs/latest/api/java/index.html?org/apache/spark/sql/Dataset.html

https://spark.apache.org/docs/latest/api/java/index.html?org/apache/spark/sql/functions.html

https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/Row.html