# Dataframes 3

Process structured data with Spark.

Here we will read a CSV file using Spark and perform queries.

We will specify the schema explicitly instead of letting Spark infer it.

## Step 1 - Initialize Spark

In [None]:
# Create a SparkSession only if one is not already defined in this kernel
# (e.g. when the notebook is launched from a pyspark shell/kernel that
# pre-populates `spark`).
try:
    spark
except NameError:
    import findspark
    findspark.init()  # uses SPARK_HOME
    print("Spark found in : ", findspark.find())

    import pyspark
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    # Use a unique temp dir as the warehouse dir, so we can run multiple
    # Spark sessions from the same working directory.
    # Keep a reference in `tmpdir`: a TemporaryDirectory removes its directory
    # when the object is cleaned up.
    import tempfile
    tmpdir = tempfile.TemporaryDirectory()

    config = ( SparkConf()
             .setAppName("TestApp")
             .setMaster("local[*]")
             # Spark property names carry the 'spark.' prefix; the bare
             # 'executor.memory' key is not a recognized Spark setting.
             .set('spark.executor.memory', '2g')
             .set('spark.sql.warehouse.dir', tmpdir.name)
             .set("some_property", "some_value") # another example
             )

    spark = SparkSession.builder.config(conf=config).getOrCreate()

# uiWebUrl looks like 'http://<host>:<port>'; the third ':'-separated piece is the port.
print('Spark UI running on port ' + spark.sparkContext.uiWebUrl.split(':')[2])

## Step 2 - Define Schema

We will define the schema explicitly: each column's name, its data type, and whether it may contain nulls.

In [None]:
from pyspark.sql.types import *

# Explicit schema for the house-sales CSV, built from (column name, Spark type)
# pairs in file order. The third StructField argument (True) marks every
# column as nullable.
my_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("ID", StringType()),
        ("Date", StringType()),
        ("SalePrice", IntegerType()),
        ("PropertyID", StringType()),
        ("PropertyType", StringType()),
        ("Bedrooms", IntegerType()),
        ("Bathrooms", FloatType()),
        ("SqFtTotLiving", IntegerType()),
        ("SqFtLot", IntegerType()),
        ("YrBuilt", IntegerType()),
        ("ZipCode", IntegerType()),
    )
])

## Step 2b - Read Data

In [None]:
# Read the CSV with the explicit schema (no inference); the first line of the
# file is treated as a header row.
house_sales = (
    spark.read
    .option("header", True)
    .schema(my_schema)
    .csv('../data/house-sales/house-sales-simplified.csv')
)

# Show the resulting column types, then a sample of the rows.
house_sales.printSchema()
house_sales.show()

## Step 3 - Perform an aggregate query

Find the average sale price for each number of bedrooms.

In [None]:
# Average SalePrice per Bedrooms value, listed in ascending Bedrooms order.
(
    house_sales
    .groupBy('Bedrooms')
    .avg('SalePrice')
    .orderBy('Bedrooms')
    .show()
)

## Step 4 - Bonus Lab

Specify "Date" as `DateType` in the schema, then re-read the data.