In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list, count, dayofweek, explode, hour, lit, lower, regexp_replace, row_number, size, split, sum, udf
from pyspark.ml.feature import NGram
from pyspark.sql.types import StringType
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("dh3382-hw2").getOrCreate()
BAKERY_PATH = '/FileStore/tables/BreadBasket_DMS.csv'
REST_PATH = '/FileStore/tables/Restaurants_in_Durham_County_NC.csv'
POP_PATH = '/FileStore/tables/populationbycountry19802010millions.csv'
WORD_DIR_PATH = '/FileStore/tables/hw1text/'

In [0]:
# read and clean bakery data
bakery_data = spark.read.options(header=True, inferSchema=True).csv(BAKERY_PATH)
# remove NONE entries
bakery_data = bakery_data.filter(col('Item') != 'NONE')

In [0]:
##### QUESTION 1: Show the total number bought by item, per day, between 11AM and 1PM #####

# filter out all transactions outside of specified time range (11:00-13:00 inclusive)
bakery_data_lunch = bakery_data.filter(col('Time').between('11:00:00', '13:00:00'))
# group by Item and Day purchased, then get total count of items per day purchased between 11:00-13:00
bakery_q1_res = bakery_data_lunch.groupBy(col('Item'), col('Date')).count().withColumnRenamed('count', 'Quantity')
# DF with answer to question 1
bakery_q1_res.show()

+-----------------+----------+--------+
|             Item|      Date|Quantity|
+-----------------+----------+--------+
|         Focaccia|2016-11-03|       1|
|          Tartine|2016-11-04|       1|
|            Bread|2016-12-13|       4|
|           Coffee|2017-01-05|       7|
|     Scandinavian|2017-01-20|       1|
|         Art Tray|2017-01-24|       1|
|            Bread|2017-03-22|       6|
| Coffee granules |2017-03-25|       1|
|           Muffin|2016-11-14|       1|
|           Coffee|2016-11-24|       7|
|Gingerbread syrup|2016-12-21|       1|
|           Coffee|2017-01-06|       8|
|     Scandinavian|2017-01-07|       3|
|            Bread|2017-01-11|       6|
|           Coffee|2017-02-01|       4|
|     Chicken Stew|2017-02-08|       1|
|    Hot chocolate|2017-02-09|       1|
|       Farm House|2017-02-12|       1|
|          Brownie|2017-02-13|       1|
|            Bread|2017-02-27|       5|
+-----------------+----------+--------+
only showing top 20 rows



In [0]:
##### QUESTION 2: Show the top 3 (by qty) items bought by Daypart, by DayType #####
#udf to define weekend/weekday
def weekend(is_weekend):
    if is_weekend:
        return "Weekend"
    else:
        return "Weekday"

weekend_udf = udf(lambda is_weekend: weekend(is_weekend), StringType())


# udf to define part of day
def day_part(date):
    if (date.hour >= 7) & (date.hour < 12):
        return "Morning"
    elif date.hour < 17:
        return "Afternoon"
    else:
        return "Night"

day_part_udf = udf(lambda hour: day_part(hour), StringType())

In [0]:
# use weekend udf to add Daytype column, pass in boolean with sparkSQL dayofweek and isin functions
bakery_weekend = bakery_data.withColumn('Daytype', weekend_udf(dayofweek(col('Date') ).isin([1, 7]) ) )
# use day_part udf to add Daypart column
bakery_daypart = bakery_weekend.withColumn('Daypart', day_part_udf(col('Time') ) )
#group by Daypart and Daytype and count purchases
bakery_purchases = bakery_daypart.groupBy(col('Daypart'), col("Daytype"), col('Item') ).count().withColumnRenamed('count', 'Purchases')

# partition by Daypart, Daytype and find top 3 Purchases for each Daypart/Daytime combination
windowBakery = Window.partitionBy('Daypart', 'Daytype').orderBy(col('Purchases').desc())
bakery_top_purchases = bakery_purchases.withColumn("row",row_number()\
  .over(windowBakery)) \
  .filter(col("row") <= 3) \

# Collect all items into single row grouped by Daypart/Daytype
bakery_q2_res = bakery_top_purchases.groupBy(col('Daypart'), col("Daytype")).agg(collect_list('Item').alias('Top_3_Items'))
# Select columns in order Daypart/Top_3_Items/Daytype for cleaner output format
bakery_q2_res.select(col('Daypart'), col('Top_3_Items'), col('Daytype') ).show(truncate=False)


+---------+------------------------------------------+-------+
|Daypart  |Top_3_Items                               |Daytype|
+---------+------------------------------------------+-------+
|Afternoon|[Coffee, Bread, Tea]                      |Weekday|
|Afternoon|[Coffee, Bread, Tea]                      |Weekend|
|Morning  |[Coffee, Bread, Pastry]                   |Weekday|
|Morning  |[Coffee, Bread, Pastry]                   |Weekend|
|Night    |[Coffee, Bread, Tea]                      |Weekday|
|Night    |[Coffee, Tshirt, Afternoon with the baker]|Weekend|
+---------+------------------------------------------+-------+



In [0]:
##### QUESTION 3: The total number of entities by “rpt_area_desc” #####
# set European style csv delimiter (;)
rest_data_raw = spark.read.options(header=True, delimiter=';', inferSchema=True)\
    .csv(REST_PATH)

In [0]:
# group by rpt area desc, aggregate count as 'Total', then sort in descending order and show top 3
rest_q3_res = rest_data_raw.groupBy(col('Rpt_Area_Desc'))\
    .agg(count('*').alias('Total') )\
    .sort(col('Total'), ascending=False)\
    .show(3)

+--------------+-----+
| Rpt_Area_Desc|Total|
+--------------+-----+
|  Food Service| 1093|
|Swimming Pools|  420|
|   Summer Food|  242|
+--------------+-----+
only showing top 3 rows



In [0]:
#####  QUESTION 4: Show the top 10 regions with the biggest percentage decrease in population, for the years 1990-2000 #####

pop_data_raw = spark.read.options(header=True, inferSchema=True)\
    .csv(POP_PATH)

# first column name is Null in file, find col name through columns attribute and rename first column to region
cols = pop_data_raw.columns 
pop_data_clean_header = pop_data_raw.select(col(cols[0]).alias('Region'), col('1990'), col('2000') )

# remove regions with no population data (e.g. Antarctica, Wake Island, Croatia, Former U.S.S.R.)
pop_data_clean_pop_nums = pop_data_clean_header.filter((col('1990')  != 'NA') &  (col('2000')  != 'NA') )
pop_data_clean_pop_nums = pop_data_clean_pop_nums.filter((col('1990')  != '--') &  (col('2000')  != '--') )
print(pop_data_clean_pop_nums.count() )

# remove leftover aggregate regions
pop_data_clean_regions = pop_data_clean_pop_nums\
    .filter((col('Region') != 'World') & (col('Region') != 'North America') & (col('Region') != 'Central & South America') & (col('Region') != 'Eurasia') & (col('Region') != 'Western Sahara') & (col('Region') != 'Asia & Oceania') )

196


In [0]:
# calculate gross increase
pop_decrease_gross = pop_data_clean_regions.withColumn('gross_increase', col('2000') - col('1990') )

# remove regions with positive population growth
pop_decrease_gross = pop_decrease_gross.filter(col('gross_increase') <= 0)

# calculate percentage of growth increase
pop_decrease_perc = pop_decrease_gross.withColumn('perc_increase', col('gross_increase')/col('1990') )

# create DF sorted in ascending order of pop increase, only contains Region and perc_increase 
pop_q4_res = pop_decrease_perc.sort(col('perc_increase') )\
    .select(col('Region'), col('perc_increase') )

# show top 3 results
pop_q4_res.show(3)

+------------+--------------------+
|      Region|       perc_increase|
+------------+--------------------+
|  Montserrat| -0.6318732525629077|
|    Bulgaria|-0.12092718374010437|
|Cook Islands|-0.11310494834148986|
+------------+--------------------+
only showing top 3 rows



In [0]:
##### QUESTION 5 #####
"""
Do word count exercise using pyspark. Ignore punctuation and normalize to lower case. Replace characters NOT in this set: [0-9a-z] with space.
"""
word_df = spark.read.text(WORD_DIR_PATH)
# normalize to lower case
word_df_lower = word_df.select(lower(col('value')).alias('words') )
# replace punctuation with space
word_df_parsed = word_df_lower.withColumn('words', regexp_replace('words', '[^a-z0-9]', ' ') )

In [0]:
# create count column with number of words per row
word_df_with_count = word_df_parsed.withColumn("count", size(split('words', " ") ) )
# sum count column for Q5 answer
word_q5_res = word_df_with_count.agg(sum(col('count')).alias('Final_Word_Count') )
word_q5_res.show()

+----------------+
|Final_Word_Count|
+----------------+
|         4243319|
+----------------+



In [0]:
##### QUESTION 6: Find the 10 most common bigrams #####

# convert strings to arrays of strings for NGram transformation
word_df_str_arrays = word_df_parsed.withColumn('words', split('words', ' ') )

# transform word_df to bigram_df using NGram feature
bigram = NGram(n=2).setInputCol('words').setOutputCol('bigrams')
bigram_df = bigram.transform(word_df_str_arrays)

# remove unnecessary words column
bigram_df = bigram_df.select(col('bigrams'))
# explode bigrams so each has own separate row
bigram_df = bigram_df.withColumn('bigrams', explode(col('bigrams') ) )
# group by bigram and count occurences
bigram_df = bigram_df.groupBy(col('bigrams') ).count()
# sort on bigram count for q5 res
bigram_q5_res = bigram_df.sort('count', ascending=False)
# show top 10 bigrams
bigram_q5_res.show(10)

+-------+------+
|bigrams| count|
+-------+------+
|       |769824|
|      p| 77053|
|     p | 76093|
|    the| 30312|
|      s| 21879|
| of the| 17436|
|    and| 13404|
| in the| 12777|
|     h |  8928|
|      h|  8903|
+-------+------+
only showing top 10 rows

