In [1]:
from pyspark.sql import SparkSession
# Build the notebook's SparkSession against a local master ('local[*]');
# getOrCreate() hands back an already-running session if one exists.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

In [5]:
!head data/RealEstate.csv

MLS,Location,Price,Bedrooms,Bathrooms,Size,Price SQ Ft,Status
132842,Arroyo Grande,795000.00,3,3,2371,335.30,Short Sale
134364,Paso Robles,399000.00,4,3,2818,141.59,Short Sale
135141,Paso Robles,545000.00,4,3,3032,179.75,Short Sale
135712,Morro Bay,909000.00,4,4,3540,256.78,Short Sale
136282,Santa Maria-Orcutt,109900.00,3,1,1249,87.99,Short Sale
136431,Oceano,324900.00,3,3,1800,180.50,Short Sale
137036,Santa Maria-Orcutt,192900.00,4,2,1603,120.34,Short Sale
137090,Santa Maria-Orcutt,215000.00,3,2,1450,148.28,Short Sale
137159,Morro Bay,999000.00,4,3,3360,297.32,Short Sale


In [11]:
# Column types are declared up front as a DDL string instead of being inferred.
# Note the seventh column is named PriceSqFt here, whereas the file's header
# row spells it 'Price SQ Ft'.
schema = 'MLS string, Location string, Price double, Bedrooms int, Bathrooms int, Size double, PriceSqFt double, Status string'

# Load the comma-separated listings file; the first line is a header row.
df = (
    spark.read
    .schema(schema)
    .option('sep', ',')
    .option('header', True)
    .csv('data/RealEstate.csv')
)

In [57]:
# Mean Price per Bedrooms value via the DataFrame API (groupBy + avg).
# The aggregated column is called 'avg(Price)'; each result row is turned
# into a plain (bedrooms, mean_price) tuple before collecting to the driver.
(
    df.groupBy('Bedrooms')
      .avg('Price')
      .rdd
      .map(lambda row: (row['Bedrooms'], row['avg(Price)']))
      .collect()
)

[(1, 169981.81818181818),
 (6, 603225.0),
 (3, 359062.20649651974),
 (5, 657858.0645161291),
 (4, 483475.6497175141),
 (7, 325000.0),
 (10, 699000.0),
 (2, 266356.3739837398),
 (0, 293450.0)]

In [55]:
# Same mean Price per Bedrooms value, computed by hand on the underlying RDD
# (reduceByKey + mapValues):
#   1. key every listing by Bedrooms with a (price, 1) pair,
#   2. add the pairs element-wise to get (price_sum, listing_count) per key,
#   3. divide the sum by the count to obtain the mean.
(
    df.rdd
      .map(lambda listing: (listing.Bedrooms, (listing.Price, 1)))
      .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
      .mapValues(lambda sum_and_count: sum_and_count[0] / sum_and_count[1])
      .collect()
)

[(0, 293450.0),
 (1, 169981.81818181818),
 (2, 266356.3739837398),
 (3, 359062.20649651974),
 (4, 483475.6497175141),
 (5, 657858.0645161291),
 (6, 603225.0),
 (7, 325000.0),
 (10, 699000.0)]