# Basic Spark SQL Usage

### Example of using Spark SQL with a Stroom DataFrame

In [44]:
from pyspark.sql.types import *
from pyspark.sql.functions import from_json, col
from IPython.display import display
from pyspark.sql import SparkSession

#### Create a schema using XPaths

N.B. The XPath `@*` extracts both the StreamId and the EventId from the Event and places them into a single field.
This field has unique values, which is handy when working with SQL.

In [45]:
# Schema handed to the Stroom data source. Every column is a string whose
# "xpath" metadata entry names the part of the Event it is extracted from.
# Each entry is (column name, nullable, xpath); only "eventid" is declared
# non-nullable.
mySchema = StructType([
    StructField(fieldName, StringType(), isNullable, metadata={"xpath": xpath})
    for fieldName, isNullable, xpath in (
        ("user", True, "EventSource/User/Id"),
        ("operation", True, "EventDetail/TypeId"),
        ("eventid", False, "@*"),
    )
])

In [46]:
# Obtain the Spark session explicitly instead of relying on a kernel-injected
# `spark` variable: getOrCreate() reuses the active session when one exists
# and creates one otherwise, so the notebook survives Restart & Run All.
# NOTE(review): a newly created session presumably still needs the Stroom
# data source on its classpath -- confirm how the kernel is launched.
spark = SparkSession.builder.getOrCreate()

# Load events through the Stroom data source using the XPath schema defined
# above, then keep only the columns queried later in the notebook.
# NOTE(review): 'idxUserId' is not declared in mySchema; presumably it is
# supplied by the Stroom index itself -- confirm.
stroomDf = (
    spark.read.format('stroom.spark.datasource.StroomDataSource')
    .load(
        token='not required',
        host='localhost:8080',
        protocol='http',
        uri='api/stroom-index/v2',
        index='57a35b9a-083c-4a93-a813-fc3ddfe1ff44',
        pipeline='bb25824e-6369-464a-81e1-876ffe3b95a0',
        schema=mySchema,
    )
    .select('eventid', 'user', 'operation', 'idxUserId')
)

In [None]:
# Bring at most five rows to the driver and render them as a pandas table.
stroomPreview = stroomDf.limit(5).toPandas()
display(stroomPreview.head())

#### Using Spark SQL

To start writing SQL queries, first create a temporary view over the 
Stroom DataFrame created above.

Results are returned as DataFrames themselves, making further operations possible.

In [None]:
# Register the Stroom DataFrame with the SQL engine under the name "userops";
# create-or-replace means re-running this cell is harmless.
stroomDf.createOrReplaceTempView("userops")

# The query result is itself a DataFrame and can be processed further.
userOpsQuery = "select * from userops where user='user1' and operation='0001'"
sqlDf = spark.sql(userOpsQuery)

In [None]:
# Bring at most five matching rows to the driver and render them as a pandas table.
sqlPreview = sqlDf.limit(5).toPandas()
display(sqlPreview.head())

In [None]:
# Count events per (user, operation) pair, busiest pairs first, excluding
# rows whose indexed user id is 'User1'.
# A triple-quoted string replaces the backslash continuations, which break on
# a stray trailing space and embed source indentation in the SQL text.
# NOTE(review): this filter uses 'User1' while the earlier query on this view
# filters on 'user1' -- confirm the intended letter case.
sqlDf2 = spark.sql("""
    select user, operation, count(eventid) as events
    from userops
    where idxUserId != 'User1'
    group by user, operation
    order by events desc
""")

# The whole aggregated result is collected to the driver for display.
display(sqlDf2.toPandas())