In [28]:
import time

In [29]:
from pyspark import SparkContext, SparkConf
# If this throws a socket error, stop the context and re-run the code again, this error came up for us, 
# and we didn't know how to fix it, but re-running the code fixed it
conf = SparkConf().set('spark.executor.memory', '16g')\
                    .set('spark.driver.memory', '16g')\
                    .setAppName("beer_analysis")\
                    .setMaster('local[*]')
sc = SparkContext(conf=conf)
sc

In [30]:
elapsed_times_list = []
start_time_overall = time.time()

In [31]:
# STEP 1
start_time1 = time.time()

rdd = sc.textFile('beer_reviews.csv')
header = rdd.first()
rdd = rdd.filter(lambda line: line != header)
split_rdd = rdd.map(lambda x: x.split(','))

end_time1 = time.time()
elapsed_time1 = end_time1 - start_time1
elapsed_times_list.append(elapsed_time1)
print(f"Elapsed time: {elapsed_time1} seconds")

Elapsed time: 1.1826894283294678 seconds


In [32]:
# STEP 2
start_time2 = time.time()

review_overall_rdd = split_rdd.map(lambda x: (float(x[4]), x))
mapped_rdd = review_overall_rdd.filter(lambda x: x[0] == 5)
mapped_rdd = mapped_rdd.map(lambda x: x[1])

end_time2 = time.time()
elapsed_time2 = end_time2 - start_time2
elapsed_times_list.append(elapsed_time2)
print(f"Elapsed time: {elapsed_time2} seconds")


mapped_rdd.count()

Elapsed time: 0.0 seconds


85526

In [33]:
# STEP 3
start_time3 = time.time()

processed_rdd = split_rdd.map(lambda x: (x[2], (float(x[4]), x[11])))
top_reviews_rdd = processed_rdd.reduceByKey(lambda x,y: x if x[0] > y[0] else y)

end_time3 = time.time()
elapsed_time3 = end_time3 - start_time3
elapsed_times_list.append(elapsed_time3)
print(f"Elapsed time: {elapsed_time3} seconds")

top_reviews_rdd.take(1)

Elapsed time: 0.010090827941894531 seconds


[('Pacific Coast Brewing Company', (4.5, 'Killer Whale Stout'))]

In [34]:
# STEP 4
start_time4 = time.time()

processed_rdd = split_rdd.map(lambda x: (x[2], (float(x[4]),1)))
summed_ratings_rdd = processed_rdd.reduceByKey(lambda x,y: (x[0]+y[0], x[1]+y[1]))
avg_reviews_rdd = summed_ratings_rdd.mapValues(lambda x: round(x[0]/x[1], 3))

end_time4 = time.time()
elapsed_time4 = end_time4 - start_time4
elapsed_times_list.append(elapsed_time4)
print(f"Elapsed time: {elapsed_time4} seconds")

avg_reviews_rdd.take(1)

Elapsed time: 0.023337125778198242 seconds


[('Pacific Coast Brewing Company', 3.432)]

In [35]:
# STEP 5
start_time5 = time.time()

processed_rdd = split_rdd.map(lambda x: (x[8], 1))
counts_rdd = processed_rdd.reduceByKey(lambda x,y: x+y)

end_time5 = time.time()
elapsed_time5 = end_time5 - start_time5
elapsed_times_list.append(elapsed_time5)
print(f"Elapsed time: {elapsed_time5} seconds")

counts_rdd.take(1)

Elapsed time: 0.023610591888427734 seconds


[('Rauchbier', 3758)]

In [36]:
# STEP 6
start_time6 = time.time()

processed_rdd = split_rdd.map(lambda x: (x[8], (float(x[5]), 1)))

sum_per_style_rdd = processed_rdd.reduceByKey(lambda x, y: (x[0]+y[0], x[1]+y[1]))
# key(sum(review_aroma, count)
sum_per_style_rdd = sum_per_style_rdd.mapValues(lambda x: (x[0], x[1], (x[0]/x[1])))
# key(sum(aroma), count, mean)

squared_diffs = processed_rdd.join(sum_per_style_rdd).mapValues(lambda x: ((x[0][0] - x[1][2]) ** 2, 1))
sum_squared_diffs = squared_diffs.reduceByKey(lambda x,y: (x[0]+y[0], x[1]+y[1]))
variance_per_style = sum_squared_diffs.mapValues(lambda x: round(x[0]/x[1], 3))

end_time6 = time.time()
elapsed_time6 = end_time6 - start_time6
elapsed_times_list.append(elapsed_time6)
print(f"Elapsed time: {elapsed_time6} seconds")

variance_per_style.take(1)

Elapsed time: 0.034923553466796875 seconds


[('Rauchbier', 0.408)]

In [37]:
end_time_overall = time.time()
elapsed_time_visualisation = end_time_overall - start_time_overall
elapsed_time_no_visualisation = sum(elapsed_times_list)
print(f"Elapsed time: {elapsed_time_no_visualisation} seconds without visualising data")
print(f"Elapsed time: {elapsed_time_visualisation} seconds with visualising data")

Elapsed time: 1.2746515274047852 seconds without visualising data
Elapsed time: 44.17678499221802 seconds with visualising data


In [38]:
sc.stop()