In [1]:
#importing pyspark
import pyspark
from pyspark.sql.functions import col

In [2]:
# import sparksession
from pyspark.sql import SparkSession

In [3]:
# Build the SparkSession for this notebook (getOrCreate reuses an already-running
# session if one exists), registered under the application name 'Test'.
spark = (
    SparkSession.builder
    .appName('Test')
    .getOrCreate()
)

In [4]:
# Display the session object as the cell output to confirm it was created
spark

In [5]:
# Load the CSV into a DataFrame: the first row supplies the column names and
# Spark infers each column's data type from the values.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('real_estate_price_size_year_view.csv')
)

In [6]:
# Preview the first five rows of the loaded data
df.show(n=5)

+----------+-------+----+-----------+
|     price|   size|year|       view|
+----------+-------+----+-----------+
|234314.144| 643.09|2015|No sea view|
|228581.528| 656.22|2009|No sea view|
|281626.336| 487.29|2018|   Sea view|
|401255.608|1504.75|2015|No sea view|
|458674.256|1275.46|2009|   Sea view|
+----------+-------+----+-----------+
only showing top 5 rows



In [7]:
# Print the column names, inferred types and nullability as a tree
df.printSchema()

root
 |-- price: double (nullable = true)
 |-- size: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- view: string (nullable = true)



In [8]:
# Confirm that the loaded object is a Spark DataFrame
type(df)

pyspark.sql.dataframe.DataFrame

In [9]:
# List the column names
df.columns

['price', 'size', 'year', 'view']

In [10]:
# Look at just the price and size columns (first five rows)
df.select('price', 'size').show(n=5)

+----------+-------+
|     price|   size|
+----------+-------+
|234314.144| 643.09|
|228581.528| 656.22|
|281626.336| 487.29|
|401255.608|1504.75|
|458674.256|1275.46|
+----------+-------+
only showing top 5 rows



In [11]:
# List each column together with its data type as (name, type) pairs
df.dtypes

[('price', 'double'), ('size', 'double'), ('year', 'int'), ('view', 'string')]

In [12]:
# Use describe() to summarize the DataFrame: count, mean, stddev, min and max per column.
# The mean price and mean size shown here are the reference values for the analysis below.
df.describe().show()

+-------+------------------+-----------------+-----------------+-----------+
|summary|             price|             size|             year|       view|
+-------+------------------+-----------------+-----------------+-----------+
|  count|               100|              100|              100|        100|
|   mean|292289.47015999985|         853.0242|           2012.6|       null|
| stddev| 77051.72752473492|297.9419506683147|4.729020655869068|       null|
|    min|        154282.128|           479.75|             2006|No sea view|
|    max|        500681.128|          1842.51|             2018|   Sea view|
+-------+------------------+-----------------+-----------------+-----------+



In [13]:
# Checking for null values in a single pass over the data (one Spark job
# instead of one filter/count job per column): flag nulls as 1/0 in every
# column, sum the flags, and keep the columns whose null count is positive.
null_flags = df.select([col(column).isNull().cast('int').alias(column) for column in df.columns])
null_counts = null_flags.groupBy().sum(*df.columns).first()
# sum() yields None when the DataFrame has no rows, so treat a missing count as zero
null_columns = [column for column, n_nulls in zip(df.columns, null_counts) if (n_nulls or 0) > 0]

In [14]:
# Show the names of the columns that contain at least one null (empty list means none)
null_columns

[]

There are no null values in the data

### Analysis of our Data

Checking for houses with prices greater than the mean

In [15]:
# Compute the mean price from the data instead of pasting the value printed by
# describe(), so the filter stays correct if the dataset changes.
mean_price = df.agg({'price': 'avg'}).first()[0]
df.filter(df['price'] > mean_price).show(10)

+----------+-------+----+-----------+
|     price|   size|year|       view|
+----------+-------+----+-----------+
|401255.608|1504.75|2015|No sea view|
|458674.256|1275.46|2009|   Sea view|
|331101.344| 682.26|2018|   Sea view|
|494778.992|1842.51|2009|   Sea view|
|418753.008|1009.25|2018|No sea view|
|444192.008|1300.96|2006|   Sea view|
|440201.616|1379.72|2006|   Sea view|
|299416.976|1027.76|2018|No sea view|
|412569.472|1207.45|2015|   Sea view|
| 362519.72| 1103.3|2018|   Sea view|
+----------+-------+----+-----------+
only showing top 10 rows



In [16]:
# Houses priced above the mean; the mean is computed from the data rather than
# hard-coded, so the count stays correct if the dataset changes.
mean_price = df.agg({'price': 'avg'}).first()[0]
dfa = df.filter(df['price'] > mean_price)
dfa.count()

45

There are 45 houses with price above the mean

In [17]:
# Checking for sea-view houses that are priced over the mean.
# The mean is computed from the data rather than hard-coded.
mean_price = df.agg({'price': 'avg'}).first()[0]
df1 = df.filter((df['price'] > mean_price) & (df['view'] == 'Sea view'))

In [18]:
# Number of sea-view houses priced above the mean
df1.count()

29

There are 29 houses with a sea view and 16 houses without, suggesting that house prices are a bit higher when there is a sea view.

In [19]:
# We will check for houses with both price and size above their respective means.
# Both means are computed from the data in one aggregation rather than hard-coded.
means = df.agg({'price': 'avg', 'size': 'avg'}).first()
mean_price, mean_size = means['avg(price)'], means['avg(size)']
df.filter((df['price'] > mean_price) & (df['size'] > mean_size)).show(10)

+----------+-------+----+-----------+
|     price|   size|year|       view|
+----------+-------+----+-----------+
|401255.608|1504.75|2015|No sea view|
|458674.256|1275.46|2009|   Sea view|
|494778.992|1842.51|2009|   Sea view|
|418753.008|1009.25|2018|No sea view|
|444192.008|1300.96|2006|   Sea view|
|440201.616|1379.72|2006|   Sea view|
|299416.976|1027.76|2018|No sea view|
|412569.472|1207.45|2015|   Sea view|
| 362519.72| 1103.3|2018|   Sea view|
|406852.304| 1334.1|2015|   Sea view|
+----------+-------+----+-----------+
only showing top 10 rows



In [20]:
# Count the houses whose price and size are both above their respective means.
# Both means are computed from the data in one aggregation rather than hard-coded.
means = df.agg({'price': 'avg', 'size': 'avg'}).first()
mean_price, mean_size = means['avg(price)'], means['avg(size)']
df2 = df.filter((df['price'] > mean_price) & (df['size'] > mean_size))
df2.count()

33

This means that of the 45 houses with above-average prices, most (33) also had above-average size.

In [21]:
# Total the prices within each year to see which year accumulated the highest prices
df.groupBy('year').agg({'price': 'sum'}).show()

+----+-----------------+
|year|       sum(price)|
+----+-----------------+
|2018|       7914419.64|
|2015|9596620.560000002|
|2006|6743314.543999999|
|2009|      4974592.272|
+----+-----------------+



We can see that 2015 had the highest total house prices while 2009 had the lowest.

In [22]:
# What was the distribution of houses per year? Count the rows in each year group.
df.groupBy('year').count().show()

+----+-----+
|year|count|
+----+-----+
|2018|   26|
|2015|   33|
|2006|   24|
|2009|   17|
+----+-----+



2015 had the most entries and 2009 the fewest, which tallies with our sums of prices: the yearly totals largely track how many houses were recorded in each year.

In [23]:
# Total the prices for each view category to see which one has the higher sum
df.groupBy('view').agg({'price': 'sum'}).show()

+-----------+--------------------+
|       view|          sum(price)|
+-----------+--------------------+
|   Sea view|1.5908449192000002E7|
|No sea view|1.3320497824000001E7|
+-----------+--------------------+



Sea view had the higher price sum; let's also check whether the counts are the same.

In [24]:
# Count the houses in each view category to put the price sums in context
df.groupBy('view').count().show()

+-----------+-----+
|       view|count|
+-----------+-----+
|   Sea view|   49|
|No sea view|   51|
+-----------+-----+



The counts are almost the same, with No sea view slightly higher (51 vs. 49), meaning that the houses with a sea view are pricier on average.