### Spark SQL functions

The `pyspark.sql.functions` module contains a collection of built-in functions which are handy for operating on columns.
These functions typically take a column name (string) or a `Column` object as an argument - e.g. `to_date("my_date_column_name")`

- API docs: http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions

In [2]:
# Read the sample people records from JSON into the DataFrame used by every cell below.
names_path = "../data/people/names.json"
df = spark.read.json(names_path)
df.show()

+----+------+------+-------+
| AGE|gender|height|   name|
+----+------+------+-------+
|  36|female|   180|    Zoe|
|  23|female|   165|  Alice|
|  30|  male|   175|   Andy|
|  25|female|   170|   Jane|
|null|  male|   165|Michael|
|  19|  male|   180| Justin|
+----+------+------+-------+



In [3]:
from pyspark.sql.functions import count

# count() only tallies non-null values, so the row whose AGE is null is
# left out of the first result while every row contributes to the second.
age_count = df.select(count('AGE'))
height_count = df.select(count('height'))

age_count.show()
height_count.show()

+----------+
|count(AGE)|
+----------+
|         5|
+----------+

+-------------+
|count(height)|
+-------------+
|            6|
+-------------+



In [4]:
from pyspark.sql.functions import countDistinct

# Some heights occur more than once, so only 4 unique height values remain.
distinct_height_count = df.select(countDistinct('height'))
distinct_height_count.show()

+----------------------+
|count(DISTINCT height)|
+----------------------+
|                     4|
+----------------------+



In [5]:
from pyspark.sql import functions as F

# Reach the Spark functions through the `F` namespace rather than importing
# `max` / `min` by name: a bare `from ... import max, min` rebinds Python's
# built-in max() and min() for every cell that runs afterwards.
df.select(F.max('height')).show()   # column referenced by its name
df.select(F.max(df.height)).show()  # same column referenced as a Column object - 2 ways of doing the same thing

+-----------+
|max(height)|
+-----------+
|        180|
+-----------+

+-----------+
|max(height)|
+-----------+
|        180|
+-----------+



In [6]:
from pyspark.sql.functions import upper

# Upper-case every value in the `name` column.
name_column = df['name']
df.select(upper(name_column)).show()

+-----------+
|upper(name)|
+-----------+
|        ZOE|
|      ALICE|
|       ANDY|
|       JANE|
|    MICHAEL|
|     JUSTIN|
+-----------+



In [7]:
from pyspark.sql.functions import split

# Break each gender string apart wherever "al" occurs; the second argument
# is treated as a regular-expression pattern, and the result is an array column.
gender_parts = split('gender', 'al')
df.select(gender_parts).show()

+-----------------+
|split(gender, al)|
+-----------------+
|         [fem, e]|
|         [fem, e]|
|           [m, e]|
|         [fem, e]|
|           [m, e]|
|           [m, e]|
+-----------------+



### Built-in aggregate functions

In [10]:
from pyspark.sql.functions import avg

# The schema spells this column `AGE`; the lowercase name still resolves
# because Spark matches column names case-insensitively by default.
mean_age = df.select(avg('age'))
mean_age.show()

# this produces the same output:
# df.select('age').groupBy().avg().show()

+--------+
|avg(age)|
+--------+
|    26.6|
+--------+



In [11]:
# Build one average expression per column, then evaluate them in a single select.
average_columns = [avg(column_name) for column_name in ('age', 'height')]
df.select(*average_columns).show()

# this produces the same averages (it aggregates every numeric column):
# df.groupBy().avg().show()

+--------+-----------+
|avg(age)|avg(height)|
+--------+-----------+
|    26.6|      172.5|
+--------+-----------+

