# Analyzing House Sales Data
We have house sales data in CSV format. We are going to use Spark to load and analyze the data.

In [None]:
from urllib.parse import urlparse

try:
    # Reuse the session when a `spark` name is already defined in this kernel.
    spark
except NameError:
    # No session yet: make the parent directory importable, then build one
    # with the project's init_spark helper.
    import os
    import sys

    top_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))
    if top_dir not in sys.path:
        sys.path.append(top_dir)

    from init_spark import init_spark
    spark = init_spark()

# uiWebUrl can be None (web UI disabled), and splitting on ':' breaks for
# IPv6 hosts or URLs without an explicit port, so parse it as a URL instead.
ui_url = spark.sparkContext.uiWebUrl
ui_port = urlparse(ui_url).port if ui_url else None
if ui_port is not None:
    print('Spark UI running on port ' + str(ui_port))
elif ui_url:
    print('Spark UI running at ' + ui_url)
else:
    print('Spark UI is disabled')
spark

## Explore the data
The data is in the `data/house-sales` directory.  
Go ahead and look at some [sample house sales data](../data/house-sales/house-sales-sample.csv).   
It is CSV data that looks like this:

```
Date,SalePrice,SqFtLot,SqFtTotLiving,Bathrooms,Bedrooms,YrBuilt,ZipCode
1/3/06,436000,6923,2850,3,4,1947,98199
1/26/06,595000,7000,2970,2.25,3,1967,98199
2/7/06,618000,7680,2010,1.75,5,1950,98199
2/9/06,432500,7000,1670,1.5,4,1963,98199
2/17/06,725000,6000,4220,4.5,8,1957,98199
3/1/06,998000,5854,3680,3,4,1967,98199

```

## Step-1: Load CSV Data
We are also going to let Spark figure out the schema.

In [None]:
# Path of the CSV to load (relative path).
data_location = "data/house-sales/house-sales-simplified.csv"
# Alternative remote locations (presumably the same dataset; uncomment one to use it):
# data_location =  's3://elephantscale-public/data/house-prices/house-sales-simplified.csv'
# data_location = 'https://elephantscale-public.s3.amazonaws.com/data/house-prices/house-sales-simplified.csv'

# header: take column names from the first line of the file.
# inferSchema: let Spark work out the column types from the data.
sales = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv(data_location)
)


In [None]:
# Number of rows in the loaded DataFrame
sales.count()

In [None]:
# Preview the first rows (these are show()'s defaults, spelled out explicitly)
sales.show(n=20, truncate=True)

In [None]:
# Print the DataFrame's schema (column names and types)
sales.printSchema()

## Step-2: Ready, Set, Analyze!

In [None]:
# Summary statistics for the 'SalePrice' column only
sales.describe(["SalePrice"]).show()

In [None]:
# Summary statistics for all columns, returned as a pandas DataFrame
sales.describe().toPandas()

In [None]:
# Report: number of sales for each bedroom count
(
    sales
    .groupBy(sales["Bedrooms"])
    .count()
    .show()
)


In [None]:
# Same report, ordered by the number of bedrooms
(
    sales
    .groupBy("Bedrooms")
    .count()
    .sort("Bedrooms")
    .show()
)