# Spark SQL Demo

In [None]:
from pyspark.sql import SparkSession
# Build a Spark session (or reuse an existing one) against a local master,
# then keep a handle on the session's SparkContext.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('My First Spark application')
    .getOrCreate()
)

sc = spark.sparkContext

Read in some data:

In [None]:
# Load the Yelp business records from gzipped JSON.
business = spark.read.json(
    '../non_auto_assignments/data/yelp_academic/yelp_academic_dataset_business.json.gz'
)

In [None]:
# Load the remaining Yelp datasets (check-ins, reviews, tips, users), each
# from its own gzipped JSON file.
checkin = spark.read.json(
    '../non_auto_assignments/data/yelp_academic/yelp_academic_dataset_checkin.json.gz'
)
review = spark.read.json(
    '../non_auto_assignments/data/yelp_academic/yelp_academic_dataset_review.json.gz'
)
tip = spark.read.json(
    '../non_auto_assignments/data/yelp_academic/yelp_academic_dataset_tip.json.gz'
)
user = spark.read.json(
    '../non_auto_assignments/data/yelp_academic/yelp_academic_dataset_user.json.gz'
)

Look at a schema:

In [None]:
# Print the schema (column names and types) of the tip data.
tip.printSchema()

Now look at some data, starting with the first row of `tip`:

In [None]:
# Take the first row of `tip` and convert it to a dict so its fields can be
# iterated as key/value pairs.
tips = tip.first().asDict()

In [None]:
# Print each field of the first tip as "<name>:<TAB><value>", one per line.
for k,v in tips.items():
    print(f'{k}:\t{v}')

Create a temporary view and issue a SQL query:

In [None]:
# Expose the tip data to SQL under the view name "tip", then count its rows
# with a SQL query.
tip.createOrReplaceTempView("tip")
query = "SELECT COUNT(*) FROM tip"
result = spark.sql(query)

Show the results:

In [None]:
# Display the result of the COUNT(*) query.
result.show()

Does it match the row count we get directly from the DataFrame?

In [None]:
# Count the rows directly on the DataFrame, for comparison with the SQL
# COUNT(*) result.
tip.count()

Go ahead and create temp views for all our DataFrames:

In [None]:
# Register every loaded dataset as a temp view named after its variable, so
# each one can be referenced by that name in SQL.
for view_name, frame in (
    ("business", business),
    ("checkin", checkin),
    ("tip", tip),
    ("review", review),
    ("user", user),
):
    frame.createOrReplaceTempView(view_name)

## Simple SQL query

In [None]:
# Select every column of the businesses whose state is 'QC' and show them.
query = "SELECT * FROM business WHERE state='QC'"
result = spark.sql(query)
result.show()

We often count things:

In [None]:
# Count the businesses whose state is 'QC'.
query = "SELECT count(*) FROM business WHERE state='QC'"
result = spark.sql(query)
result.show()

Clean up the column name:

In [None]:
# Same count as before, with the output column aliased to `the_count`.
query = "SELECT count(*) AS the_count FROM business WHERE state='QC'"
result = spark.sql(query)
result.show()

Show some grouping (and query formatting):

In [None]:
# Count businesses per state. The query is kept in a triple-quoted string so
# it can be laid out over several lines.
query = """
SELECT state,count(*) 
    FROM business 
    GROUP BY state
"""
result = spark.sql(query)
result.show()

In [None]:
# Count businesses per state, alias the count column, and list the states
# with the most businesses first.
query = """
SELECT state,count(*) as count
    FROM business 
    GROUP BY state
    ORDER BY count(*) DESC
"""
result = spark.sql(query)
result.show()

Joining two tables:

In [None]:
# Pair each tip's text with the name of the business it refers to, matching
# on business_id. LEFT JOIN from `tip` keeps every tip, including any whose
# business_id has no match in `business`.
query = """
SELECT business.name, tip.text 
    FROM tip 
    LEFT JOIN business 
        ON tip.business_id = business.business_id
"""
result = spark.sql(query)
result.show()

## Sampling

In [None]:
# Draw a 1 percent sample of the check-in rows and count how many rows the
# sample contains.
query = """
SELECT * FROM checkin TABLESAMPLE (1 PERCENT)
"""
result = spark.sql(query)
result.count()

## User-defined functions

In [None]:
def square(x):
    """Return ``x`` multiplied by itself."""
    product = x * x
    return product

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
# Wrap `square` as a Spark UDF whose declared return type is IntegerType.
# Passing the function itself (not a forwarding lambda) lets the UDF carry
# the name "square".
square_udf_int = udf(square, IntegerType())

In [None]:
# Register the IntegerType-typed UDF under the SQL name 'square_udf_int'.
# Registering the plain Python function `square` with no return type would
# make the SQL function return strings, so ORDER BY on its output would sort
# text ("9" > "16") instead of numbers.
spark.udf.register('square_udf_int', square_udf_int)

In [None]:
# Apply the registered `square_udf_int` SQL function to each tip's
# compliment_count and list the largest squared values first.
query = """
SELECT square_udf_int(compliment_count) AS compliments_squared 
   FROM tip 
   ORDER BY compliments_squared DESC
"""
result = spark.sql(query)
result.show()

## Pandas interoperability

In [None]:
# Count businesses per state, most businesses first; `result` is converted to
# pandas in the next cell.
query = """
SELECT state,count(*) as count
    FROM business 
    GROUP BY state
    ORDER BY count(*) DESC
"""
result = spark.sql(query)
result.show()

In [None]:
# Convert the Spark query result into a pandas object.
pandas_result = result.toPandas()

In [None]:
# Show the converted result as this cell's output.
pandas_result