- Author: Ben Du
- Date: 2020-09-05 14:56:47
- Title: Arithmetic Functions and Operations in Spark
- Slug: spark-dataframe-func-arithmetic
- Category: Computer Science
- Tags: programming, Spark, DataFrame, arithmetic, Spark SQL, functions, modulus

In [2]:
import pandas as pd

In [3]:
from pathlib import Path
import findspark
findspark.init(str(next(Path("/opt").glob("spark-3*"))))

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import StructType
spark = SparkSession.builder.appName("PySpark_Str_Func") \
    .enableHiveSupport().getOrCreate()

In [4]:
df = spark.createDataFrame(pd.DataFrame(data=[
    (5.0, 11),
    (8.0, 20),
    (1.0, -33)
], columns=["x", "y"]))
df.show()

+---+---+
|  x|  y|
+---+---+
|5.0| 11|
|8.0| 20|
|1.0|-33|
+---+---+



## Arithmatic

In [5]:
df.withColumn("z",
    col("y") + 1             
).show()

+---+---+---+
|  x|  y|  z|
+---+---+---+
|5.0| 11| 12|
|8.0| 20| 21|
|1.0|-33|-32|
+---+---+---+



In [6]:
df.withColumn("z",
    col("y") - 1             
).show()

+---+---+---+
|  x|  y|  z|
+---+---+---+
|5.0| 11| 10|
|8.0| 20| 19|
|1.0|-33|-34|
+---+---+---+



In [7]:
df.withColumn("z",
    col("y") * 2             
).show()

+---+---+---+
|  x|  y|  z|
+---+---+---+
|5.0| 11| 22|
|8.0| 20| 40|
|1.0|-33|-66|
+---+---+---+



In [8]:
df.withColumn("z",
    col("y") / 2             
).show()

+---+---+-----+
|  x|  y|    z|
+---+---+-----+
|5.0| 11|  5.5|
|8.0| 20| 10.0|
|1.0|-33|-16.5|
+---+---+-----+



In [9]:
df.withColumn("z",
    col("y") % 2             
).show()

+---+---+---+
|  x|  y|  z|
+---+---+---+
|5.0| 11|  1|
|8.0| 20|  0|
|1.0|-33| -1|
+---+---+---+



In [10]:
df.withColumn("z",
    pmod("y", 2)             
).show()

NameError: name 'pmod' is not defined

In [13]:
df.withColumn("new_col",
    1 - $"value"             
).show

<console>: 103

In [15]:
df.withColumn("new_col",
    -$"value" + 1       
).show

+-----+-------+
|value|new_col|
+-----+-------+
|  0.1|    0.9|
|  0.2|    0.8|
|  0.3|    0.7|
|  0.4|    0.6|
|  0.5|    0.5|
+-----+-------+



In [16]:
df.withColumn("new_col",
    log(-$"value" + 1)    
).show

+-----+--------------------+
|value|             new_col|
+-----+--------------------+
|  0.1|-0.10536051565782628|
|  0.2| -0.2231435513142097|
|  0.3|-0.35667494393873245|
|  0.4| -0.5108256237659907|
|  0.5| -0.6931471805599453|
+-----+--------------------+



## Operators

### Modulus (%)

In [3]:
val df = Seq(
    (1L, "a", "foo", 3.0),
    (2L, "b", "bar", 4.0),
    (3L, "c", "foo", 5.0),
    (4L, "d", "bar", 7.0)
).toDF("col1", "col2", "col3", "col4")
df.show

+----+----+----+----+
|col1|col2|col3|col4|
+----+----+----+----+
|   1|   a| foo| 3.0|
|   2|   b| bar| 4.0|
|   3|   c| foo| 5.0|
|   4|   d| bar| 7.0|
+----+----+----+----+



null

In [5]:
df.select($"col1" % 2).show

+----------+
|(col1 % 2)|
+----------+
|         1|
|         0|
|         1|
|         0|
+----------+



## References 

https://obstkel.com/spark-sql-functions

https://spark.apache.org/docs/latest/api/java/index.html?org/apache/spark/sql/functions.html