In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, dayofweek, hour, udf
from pyspark.sql.types import StringType

# Obtain the session for this notebook under the application name "dh3382-hw2";
# getOrCreate() hands back an already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("dh3382-hw2")
    .getOrCreate()
)

In [0]:
# Read the bakery CSV: first line is the header, column types are inferred by Spark.
bakery_data_raw = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('/FileStore/tables/BreadBasket_DMS.csv')
)
# Drop the rows whose Item is the literal placeholder 'NONE'.
bakery_data = bakery_data_raw.where(col('Item') != 'NONE')

In [0]:
##### QUESTION 1: Show the total number bought by item, per day, between 11AM and 1PM #####

# Keep only rows whose Time lies within 11:00:00-13:00:00 (both ends included).
bakery_data_lunch = bakery_data.where(col('Time').between('11:00:00', '13:00:00'))
# One row per (Item, Date) pair; Quantity is the number of matching rows in the window.
bakery_q1_res = (
    bakery_data_lunch
    .groupBy('Item', 'Date')
    .agg(count('*').alias('Quantity'))
)
# Print the answer to question 1 (show() displays the first 20 rows by default).
bakery_q1_res.show()

+-----------------+----------+--------+
|             Item|      Date|Quantity|
+-----------------+----------+--------+
|         Focaccia|2016-11-03|       1|
|          Tartine|2016-11-04|       1|
|            Bread|2016-12-13|       4|
|           Coffee|2017-01-05|       7|
|     Scandinavian|2017-01-20|       1|
|         Art Tray|2017-01-24|       1|
|            Bread|2017-03-22|       6|
| Coffee granules |2017-03-25|       1|
|           Muffin|2016-11-14|       1|
|           Coffee|2016-11-24|       7|
|Gingerbread syrup|2016-12-21|       1|
|           Coffee|2017-01-06|       8|
|     Scandinavian|2017-01-07|       3|
|            Bread|2017-01-11|       6|
|           Coffee|2017-02-01|       4|
|     Chicken Stew|2017-02-08|       1|
|    Hot chocolate|2017-02-09|       1|
|       Farm House|2017-02-12|       1|
|          Brownie|2017-02-13|       1|
|            Bread|2017-02-27|       5|
+-----------------+----------+--------+
only showing top 20 rows



In [0]:
##### QUESTION 2: Show the top 3 (by qty) items bought by Daypart, by DayType #####
# udf to define part of day
def day_part(hour):
    """Classify an hour of the day into a coarse part-of-day label.

    Args:
        hour: Hour of the day as an integer (0-23), or None.

    Returns:
        "Morning" for hours 6-11, "Afternoon" for hours 12-17 and "Night"
        for every other hour (18-23 and 0-5). None is returned when
        ``hour`` is None so that a missing value stays missing instead of
        raising inside the UDF.
    """
    if hour is None:
        return None
    # Use a chained comparison. The previous `hour >= 6 & hour < 12` was parsed
    # as `hour >= (6 & hour) < 12` because `&` binds tighter than comparison
    # operators, which is true for every non-negative hour, so every row was
    # labelled "Morning".
    if 6 <= hour < 12:
        return "Morning"
    # Explicit lower bound so that early hours (0-5) are not counted as afternoon.
    if 12 <= hour < 18:
        return "Afternoon"
    return "Night"

# Expose day_part as a Spark UDF that yields a string column.
day_part_udf = udf(day_part, StringType())

In [0]:
# Flag weekend rows: dayofweek() numbers Sunday as 1 and Saturday as 7.
bakery_weekend = bakery_data.withColumn('is_weekend', dayofweek(col('Date')).isin([1, 7]))
# Replace the Time column with just its hour so it can be bucketed by day_part_udf.
bakery_hour = bakery_weekend.withColumn('Time', hour(col('Time')))
# Keep the DataFrame itself in bakery_daypart. show() only prints and returns
# None, so chaining it onto the assignment would leave the variable unusable.
bakery_daypart = bakery_hour.withColumn('daypart', day_part_udf(col('Time')))
bakery_daypart.show(115)

+----------+----+-----------+--------------------+----------+-------+
|      Date|Time|Transaction|                Item|is_weekend|daypart|
+----------+----+-----------+--------------------+----------+-------+
|2016-10-30|   9|          1|               Bread|      true|Morning|
|2016-10-30|  10|          2|        Scandinavian|      true|Morning|
|2016-10-30|  10|          2|        Scandinavian|      true|Morning|
|2016-10-30|  10|          3|       Hot chocolate|      true|Morning|
|2016-10-30|  10|          3|                 Jam|      true|Morning|
|2016-10-30|  10|          3|             Cookies|      true|Morning|
|2016-10-30|  10|          4|              Muffin|      true|Morning|
|2016-10-30|  10|          5|              Coffee|      true|Morning|
|2016-10-30|  10|          5|              Pastry|      true|Morning|
|2016-10-30|  10|          5|               Bread|      true|Morning|
|2016-10-30|  10|          6|           Medialuna|      true|Morning|
|2016-10-30|  10|   

In [0]:
##### QUESTION 3: The total number of entities by “rpt_area_desc” #####
# The restaurant file separates fields with ';' rather than ',', so the
# delimiter is set explicitly; header row and type inference are enabled.
rest_data_raw = (
    spark.read
    .option('header', True)
    .option('delimiter', ';')
    .option('inferSchema', True)
    .csv('/FileStore/tables/Restaurants_in_Durham_County_NC.csv')
)

In [0]:
# Count the rows for each Rpt_Area_Desc as 'Total' and order by that count, largest first.
# The DataFrame is assigned first and displayed afterwards: show() returns None,
# so chaining it onto the assignment would store None in rest_q3_res.
rest_q3_res = (
    rest_data_raw.groupBy(col('Rpt_Area_Desc'))
    .agg(count('*').alias('Total'))
    .sort(col('Total'), ascending=False)
)
# Display the three largest groups.
rest_q3_res.show(3)

+--------------+-----+
| Rpt_Area_Desc|Total|
+--------------+-----+
|  Food Service| 1093|
|Swimming Pools|  420|
|   Summer Food|  242|
+--------------+-----+
only showing top 3 rows



In [0]:
#####  QUESTION 4: Show the top 10 regions with the biggest percentage decrease in population, for the years 1990-2000 #####

# Load the population table with a header row and inferred column types.
pop_data_raw = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('/FileStore/tables/populationbycountry19802010millions.csv')
)

# The first column has no usable name in the file, so it is picked by position
# from the column list and exposed as 'Region' next to the two year columns.
# NOTE(review): the question heading for this cell mentions 1990-2000, while the
# columns compared here are 1990 and 2010 — confirm which end year is intended.
cols = pop_data_raw.columns
pop_data_clean_header = pop_data_raw.select(col(cols[0]).alias('Region'), '1990', '2010')

# Discard rows where either year holds a placeholder ('NA' or '--') instead of a figure
# (e.g. Antarctica, Wake Island, Croatia, Former U.S.S.R.).
missing_markers = ['NA', '--']
pop_data_clean_pop_nums = pop_data_clean_header.where(
    (~col('1990').isin(missing_markers)) & (~col('2010').isin(missing_markers))
)
print(pop_data_clean_pop_nums.count())

# Discard the remaining rows that are not wanted in the ranking: the aggregate
# regions plus 'Western Sahara', exactly the names listed here.
excluded_regions = [
    'World',
    'North America',
    'Central & South America',
    'Eurasia',
    'Western Sahara',
    'Asia & Oceania',
]
pop_data_clean_regions = pop_data_clean_pop_nums.where(~col('Region').isin(excluded_regions))

196


In [0]:
# Absolute change between the two year columns, keeping only the rows whose
# change is zero or negative (i.e. regions without positive growth).
pop_decrease_gross = (
    pop_data_clean_regions
    .withColumn('gross_increase', col('2010') - col('1990'))
    .where(col('gross_increase') <= 0)
)

# Relative change, expressed as a fraction of the 1990 value (negative = decline).
pop_decrease_perc = pop_decrease_gross.withColumn(
    'perc_increase', col('gross_increase') / col('1990')
)

# Order ascending so the largest decline comes first; keep only Region and perc_increase.
pop_q4_res = pop_decrease_perc.orderBy('perc_increase').select('Region', 'perc_increase')

# Display the three regions with the largest decline.
# NOTE(review): the question heading asks for the top 10 — confirm the intended row count.
pop_q4_res.show(3)

+------------+--------------------+
|      Region|       perc_increase|
+------------+--------------------+
|  Montserrat| -0.5228331780055917|
|Cook Islands|-0.37520391517128876|
|    Bulgaria|-0.19622600778274873|
+------------+--------------------+
only showing top 3 rows

