In [7]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [8]:
!head data/RealEstate.csv

MLS,Location,Price,Bedrooms,Bathrooms,Size,Price SQ Ft,Status
132842,Arroyo Grande,795000.00,3,3,2371,335.30,Short Sale
134364,Paso Robles,399000.00,4,3,2818,141.59,Short Sale
135141,Paso Robles,545000.00,4,3,3032,179.75,Short Sale
135712,Morro Bay,909000.00,4,4,3540,256.78,Short Sale
136282,Santa Maria-Orcutt,109900.00,3,1,1249,87.99,Short Sale
136431,Oceano,324900.00,3,3,1800,180.50,Short Sale
137036,Santa Maria-Orcutt,192900.00,4,2,1603,120.34,Short Sale
137090,Santa Maria-Orcutt,215000.00,3,2,1450,148.28,Short Sale
137159,Morro Bay,999000.00,4,3,3360,297.32,Short Sale


In [9]:
schema = 'MLS string, Location string, Price double, Bedrooms int, Bathrooms int, Size double, PriceSqFt double, Status string'
df = spark.read.csv('data/RealEstate.csv', sep=',', header=True, schema=schema)

In [10]:
# Average house price per sq ft using SQL query
df.createOrReplaceTempView('real_estate')

spark.sql('''
SELECT
  Location,
  AVG(PriceSqFt) AS AvgPriceSqFt
FROM real_estate
GROUP BY
  Location
ORDER BY
  2 ASC''').show()

+-------------------+------------------+
|           Location|      AvgPriceSqFt|
+-------------------+------------------+
|         New Cuyama|             34.05|
|        Bakersfield|             69.69|
|          King City| 71.51333333333334|
|         Greenfield|             91.58|
|    Santa Margarita|             95.38|
|            Soledad|102.69333333333333|
|        Out Of Area|116.23333333333333|
|          Guadalupe|           120.175|
|           Coalinga|124.34285714285714|
| Santa Maria-Orcutt|147.58871698113194|
|             Lompoc|             149.9|
|             Lompoc|159.87115384615387|
|         San Miguel|163.16071428571425|
|            Bradley|            166.81|
|            Creston|            181.76|
| Santa Maria-Orcutt|183.03692307692307|
|             Nipomo|187.92333333333332|
|        Paso Robles|191.17752941176474|
|         Los Alamos|191.99333333333334|
|            Solvang|           193.305|
+-------------------+------------------+
only showing top

In [11]:
# Average house price per sq ft using SQL api
df.groupBy('Location') \
  .avg('PriceSqFt') \
  .orderBy('avg(PriceSqFt)') \
  .show()

+-------------------+------------------+
|           Location|    avg(PriceSqFt)|
+-------------------+------------------+
|         New Cuyama|             34.05|
|        Bakersfield|             69.69|
|          King City| 71.51333333333334|
|         Greenfield|             91.58|
|    Santa Margarita|             95.38|
|            Soledad|102.69333333333333|
|        Out Of Area|116.23333333333333|
|          Guadalupe|           120.175|
|           Coalinga|124.34285714285714|
| Santa Maria-Orcutt|147.58871698113194|
|             Lompoc|             149.9|
|             Lompoc|159.87115384615387|
|         San Miguel|163.16071428571425|
|            Bradley|            166.81|
|            Creston|            181.76|
| Santa Maria-Orcutt|183.03692307692307|
|             Nipomo|187.92333333333332|
|        Paso Robles|191.17752941176474|
|         Los Alamos|191.99333333333334|
|            Solvang|           193.305|
+-------------------+------------------+
only showing top