# Spark SQL

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the Spark session used by every cell below (getOrCreate reuses an existing one).
# NOTE(review): the plain "local" master is documented as running with a single worker
# thread, so "spark.executor.cores" probably has no effect here -- confirm whether
# "local[4]" was the intent.
spark = SparkSession.builder.master("local").appName("BIOS-823").config(
    "spark.executor.cores", 4
).getOrCreate()

In [3]:
import pandas as pd

In [4]:
# Six-row toy dataset: integer column `a` (0..5) and string column `b`
# ('a', 'a', 'b', 'b', 'c', 'c'), built in pandas and handed to Spark.
df = spark.createDataFrame(
    pd.DataFrame({"a": range(6), "b": list("aabbcc")})
)

In [5]:
# Print the Spark DataFrame as a text table to check the pandas conversion.
df.show()

+---+---+
|  a|  b|
+---+---+
|  0|  a|
|  1|  a|
|  2|  b|
|  3|  b|
|  4|  c|
|  5|  c|
+---+---+



In [6]:
# Register `df` under the name 'df_table' so spark.sql() queries can refer to it.
df.createOrReplaceTempView('df_table')

## Standard SQL Queries

In [7]:
# Simplest possible query: every column and row of the 'df_table' view.
# spark.sql() returns a DataFrame, so .show() prints the result.
spark.sql('''
SELECT * FROM df_table
''').show()

+---+---+
|  a|  b|
+---+---+
|  0|  a|
|  1|  a|
|  2|  b|
|  3|  b|
|  4|  c|
|  5|  c|
+---+---+



In [8]:
# Row filtering with WHERE: keep only the rows whose `b` column equals the string 'b'.
spark.sql('''
SELECT * 
FROM df_table
WHERE b='b'
''').show()

+---+---+
|  a|  b|
+---+---+
|  2|  b|
|  3|  b|
+---+---+



In [9]:
# Aggregate per group: mean and standard deviation of `a` for each value of `b`,
# with the groups sorted by `b`.
# For two values one apart, std comes out as ~0.7071 (sqrt(0.5)), i.e. the
# sample standard deviation (n - 1 denominator), not the population one (0.5).
spark.sql('''
SELECT b, mean(a) AS avg, std(a) AS std
FROM df_table
GROUP BY b
ORDER BY b
''').show()

+---+---+------------------+
|  b|avg|               std|
+---+---+------------------+
|  a|0.5|0.7071067811865476|
|  b|2.5|0.7071067811865476|
|  c|4.5|0.7071067811865476|
+---+---+------------------+



## Functions

In [10]:
# List the SQL functions whose names match the pattern "c*" (names starting with 'c').
# show(10) prints only the first 10 matching rows.
spark.sql('''
SHOW FUNCTIONS "c*"
''').show(10)

+----------------+
|        function|
+----------------+
|     cardinality|
|            case|
|            cast|
|            cbrt|
|            ceil|
|         ceiling|
|            char|
|     char_length|
|character_length|
|             chr|
+----------------+
only showing top 10 rows



## Complex Types

### Structs

In [11]:
# Keep `a` and `b` and add a third column, `struct`, that packs both values into
# a single struct; the parenthesised (a, b) expression is what builds the struct.
df1 = spark.sql('''
SELECT a, b, (a, b) AS struct FROM df_table
''')

In [12]:
# Print df1; the `struct` column displays both packed values together.
df1.show()

+---+---+------+
|  a|  b|struct|
+---+---+------+
|  0|  a|[0, a]|
|  1|  a|[1, a]|
|  2|  b|[2, b]|
|  3|  b|[3, b]|
|  4|  c|[4, c]|
|  5|  c|[5, c]|
+---+---+------+



In [13]:
# Register df1 as 'df1_table' so the struct column can be queried from SQL.
df1.createOrReplaceTempView('df1_table')

In [14]:
# Unpack the struct again: its fields are addressed with dot notation
# (struct.a, struct.b) and come back as ordinary columns named `a` and `b`.
spark.sql('''
SELECT struct.a, struct.b FROM df1_table
''').show()

+---+---+
|  a|  b|
+---+---+
|  0|  a|
|  1|  a|
|  2|  b|
|  3|  b|
|  4|  c|
|  5|  c|
+---+---+



### Lists

In [15]:
# One row per value of `b`, with all of that group's `a` values gathered into an
# array column named `list`; ORDER BY sorts the groups by `b`.
# NOTE(review): ORDER BY applies to the groups only; the element order inside each
# collected array is presumably not guaranteed in general -- confirm before relying on it.
df2 = spark.sql('''
SELECT b, collect_list(a) as list
FROM df_table
GROUP BY b
ORDER BY b
''')

In [16]:
# Register df2 as 'df2_table' so the array column can be queried from SQL.
df2.createOrReplaceTempView('df2_table')

In [17]:
# Print df2: one row per group, with the collected `a` values shown as an array.
df2.show()

+---+------+
|  b|  list|
+---+------+
|  a|[0, 1]|
|  b|[2, 3]|
|  c|[4, 5]|
+---+------+



In [18]:
# Inverse of collect_list: explode() emits one output row per array element,
# repeating the row's `b` value alongside each element (aliased back to `a`).
spark.sql('''
SELECT b, explode(list) as a
FROM df2_table
''').show()

+---+---+
|  b|  a|
+---+---+
|  a|  0|
|  a|  1|
|  b|  2|
|  b|  3|
|  c|  4|
|  c|  5|
+---+---+



### Maps

In [19]:
from pyspark.sql.functions import create_map

In [20]:
from itertools import chain

In [21]:
# Add a map column via the DataFrame API: each row gets a one-entry map whose
# key is that row's `b` value and whose value is its `a` value.
df3 = df.select(
    'a',
    'b',
    create_map('b', 'a').alias('map'),
)

In [22]:
# Register df3 as 'df3_table' so the map column can be queried from SQL.
df3.createOrReplaceTempView('df3_table')

In [23]:
# Print df3; each map is displayed as a single `key -> value` entry.
df3.show()

+---+---+--------+
|  a|  b|     map|
+---+---+--------+
|  0|  a|[a -> 0]|
|  1|  a|[a -> 1]|
|  2|  b|[b -> 2]|
|  3|  b|[b -> 3]|
|  4|  c|[c -> 4]|
|  5|  c|[c -> 5]|
+---+---+--------+



In [24]:
# Look up key 'c' in each row's map using bracket syntax in SQL.
# Only the rows where b = 'c' have that key; na.drop() then removes the rows
# that contain a null, which leaves just the two rows with a value for map['c'].
spark.sql('''
SELECT a, b, map['c'] FROM df3_table
''').na.drop().show()

+---+---+------+
|  a|  b|map[c]|
+---+---+------+
|  4|  c|     4|
|  5|  c|     5|
+---+---+------+



In [25]:
# DataFrame-API version of the same lookup: 'map.c' reads key 'c' from the map
# column and yields a column named `c`, which is then filtered to non-null rows.
(
    df3
    .select('a', 'b', 'map.c')
    .filter('c is not null')
    .show()
)

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  4|  c|  4|
|  5|  c|  5|
+---+---+---+

