# Spark SQL

In [1]:
%%spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
2,,pyspark,idle,,,✔


SparkSession available as 'spark'.


In [2]:
import pandas as pd

In [3]:
# Build a tiny pandas frame first (a: 0..5, b: letters in pairs), then hand it
# to the Spark session so the rest of the notebook has a DataFrame to query.
pandas_frame = pd.DataFrame({'a': range(6), 'b': list('aabbcc')})
df = spark.createDataFrame(pandas_frame)

In [4]:
# Print the Spark DataFrame as a text table to confirm the six rows arrived intact.
df.show()

+---+---+
|  a|  b|
+---+---+
|  0|  a|
|  1|  a|
|  2|  b|
|  3|  b|
|  4|  c|
|  5|  c|
+---+---+

In [5]:
# Expose the DataFrame to Spark SQL under the view name used by the queries below.
df.createOrReplaceTempView(name='df_table')

## Standard SQL Queries

In [24]:
# Simplest possible query: read every row and column back from the temp view.
select_all_sql = '''
SELECT * FROM df_table
'''
spark.sql(select_all_sql).show()

+---+---+
|  a|  b|
+---+---+
|  0|  a|
|  1|  a|
|  2|  b|
|  3|  b|
|  4|  c|
|  5|  c|
+---+---+

In [93]:
# Row filter: keep only the rows whose b column equals the literal 'b'.
rows_where_b = spark.sql('''
SELECT * 
FROM df_table
WHERE b='b'
''')
rows_where_b.show()

+---+---+
|  a|  b|
+---+---+
|  2|  b|
|  3|  b|
+---+---+

In [91]:
# Per-group aggregates: mean and standard deviation of a for each value of b,
# sorted by b so the printed order is stable.
group_stats_sql = '''
SELECT b, mean(a) AS avg, std(a) AS std
FROM df_table
GROUP BY b
ORDER BY b
'''
group_stats = spark.sql(group_stats_sql)
group_stats.show()

+---+---+------------------+
|  b|avg|               std|
+---+---+------------------+
|  a|0.5|0.7071067811865476|
|  b|2.5|0.7071067811865476|
|  c|4.5|0.7071067811865476|
+---+---+------------------+

## Functions

In [87]:
# List the SQL functions whose names match the pattern "c*";
# only the first 10 rows of the listing are printed.
functions_matching_c = spark.sql('''
SHOW FUNCTIONS "c*"
''')
functions_matching_c.show(10)

+----------------+
|        function|
+----------------+
|     cardinality|
|            cast|
|            cbrt|
|            ceil|
|         ceiling|
|            char|
|     char_length|
|character_length|
|             chr|
|        coalesce|
+----------------+
only showing top 10 rows

## Complex Types

### Structs

In [19]:
# Pack columns a and b into a single struct column (aliased `struct`)
# while keeping the original columns alongside it.
struct_sql = '''
SELECT a, b, (a, b) AS struct FROM df_table
'''
df1 = spark.sql(struct_sql)

In [20]:
# Register the struct-bearing DataFrame so it can be queried from SQL as df1_table.
df1.createOrReplaceTempView(name='df1_table')

In [21]:
# Display the DataFrame that carries the new struct column.
# `df1_table` is only the name of the SQL temp view, not a Python variable,
# so the DataFrame object `df1` is what must be shown here.
df1.show()

+---+---+------+
|  a|  b|struct|
+---+---+------+
|  0|  a|[0, a]|
|  1|  a|[1, a]|
|  2|  b|[2, b]|
|  3|  b|[3, b]|
|  4|  c|[4, c]|
|  5|  c|[5, c]|
+---+---+------+

In [23]:
# Unpack the struct again: its fields are addressed with dot notation.
struct_fields = spark.sql('''
SELECT struct.a, struct.b FROM df1_table
''')
struct_fields.show()

+---+---+
|  a|  b|
+---+---+
|  0|  a|
|  1|  a|
|  2|  b|
|  3|  b|
|  4|  c|
|  5|  c|
+---+---+

### Lists

In [39]:
# Collapse each b group into one row whose `list` column holds that group's a values.
collect_sql = '''
SELECT b, collect_list(a) as list
FROM df_table
GROUP BY b
ORDER BY b
'''
df2 = spark.sql(collect_sql)

In [40]:
# Register the list-bearing DataFrame so it can be queried from SQL as df2_table.
df2.createOrReplaceTempView(name='df2_table')

In [41]:
# Print the grouped DataFrame: one row per b value with its collected list of a values.
df2.show()

+---+------+
|  b|  list|
+---+------+
|  a|[0, 1]|
|  b|[2, 3]|
|  c|[4, 5]|
+---+------+

In [43]:
# Reverse the collect_list step: explode emits one output row per list element.
exploded_rows = spark.sql('''
SELECT b, explode(list) as a
FROM df2_table
''')
exploded_rows.show()

+---+---+
|  b|  a|
+---+---+
|  a|  0|
|  a|  1|
|  b|  2|
|  b|  3|
|  c|  4|
|  c|  5|
+---+---+

### Maps

In [44]:
from pyspark.sql.functions import create_map

In [45]:
from itertools import chain

In [68]:
# Build a one-entry map per row (key taken from column b, value from column a)
# and append it as a column named `map` next to the original columns.
b_to_a_map = create_map('b', 'a').alias('map')
df3 = df.select('a', 'b', b_to_a_map)

In [69]:
# Register the map-bearing DataFrame so it can be queried from SQL as df3_table.
df3.createOrReplaceTempView(name='df3_table')

In [70]:
# Print the DataFrame with its new map column (one key -> value entry per row).
df3.show()

+---+---+--------+
|  a|  b|     map|
+---+---+--------+
|  0|  a|[a -> 0]|
|  1|  a|[a -> 1]|
|  2|  b|[b -> 2]|
|  3|  b|[b -> 3]|
|  4|  c|[c -> 4]|
|  5|  c|[c -> 5]|
+---+---+--------+

In [74]:
# Look up key 'c' in every row's map. Rows whose map has no 'c' entry come back
# with a null in that column, and na.drop() removes them before printing.
c_lookup = spark.sql('''
SELECT a, b, map['c'] FROM df3_table
''')
c_lookup.na.drop().show()

+---+---+------+
|  a|  b|map[c]|
+---+---+------+
|  4|  c|     4|
|  5|  c|     5|
+---+---+------+