# Spark 101 Exercises

In [74]:
import pandas as pd
import numpy as np
import pyspark
from pydataset import data
spark = pyspark.sql.SparkSession.builder.getOrCreate()
from pyspark.sql.functions import concat, sum, avg, min, max, count, mean
from pyspark.sql.functions import lit
from pyspark.sql.functions import col, expr
from pyspark.sql.functions import month, year, quarter
from pyspark.sql.functions import regexp_extract, regexp_replace

Create a jupyter notebook or python script named spark101 for this exercise.

1. Create a spark data frame that contains your favorite programming languages.

In [2]:
# The name of the column should be language
df = pd.DataFrame({"language": ["python", "java", "javascript", "sql", "ruby"]})

In [3]:
# View the schema of the dataframe
sp_df = spark.createDataFrame(df)
sp_df

DataFrame[language: string]

In [4]:
# Output the shape of the dataframe
sp_df.count()

5

In [5]:
# Show the first 5 records in the dataframe
sp_df.show(5)

+----------+
|  language|
+----------+
|    python|
|      java|
|javascript|
|       sql|
|      ruby|
+----------+



2. Load the mpg dataset as a spark dataframe.

    * Create 1 column of output that contains a message like the one below:

    `The 1999 audi a4 has a 4 cylinder engine.`

   

In [6]:
mpg = spark.createDataFrame(data("mpg"))

In [7]:
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [73]:
mpg.select(concat(lit("The "), mpg.year, lit(" "), mpg.manufacturer, lit(" "), mpg.model, lit(" has a "), mpg.cyl, lit(" cylinder engine.")).alias("column")).show(5, truncate = False)

+-----------------------------------------+
|column                                   |
+-----------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine.|
|The 1999 audi a4 has a 4 cylinder engine.|
|The 2008 audi a4 has a 4 cylinder engine.|
|The 2008 audi a4 has a 4 cylinder engine.|
|The 1999 audi a4 has a 6 cylinder engine.|
+-----------------------------------------+
only showing top 5 rows



For each vehicle.

    * Transform the trans column so that it only contains either manual or auto.

In [9]:
mpg.filter(mpg["trans"].like("%manual%")).show(5)

+------------+----------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|     model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+----------+-----+----+---+----------+---+---+---+---+-------+
|        audi|        a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|        a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|        a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|a4 quattro|  2.0|2008|  4|manual(m6)|  4| 20| 28|  p|compact|
+------------+----------+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [97]:
mpg.select(
    "trans",
    regexp_extract("trans", r"^(\w+)", 1).alias("transm")
    
).show(5)

+----------+------+
|     trans|transm|
+----------+------+
|  auto(l5)|  auto|
|manual(m5)|manual|
|manual(m6)|manual|
|  auto(av)|  auto|
|  auto(l5)|  auto|
+----------+------+
only showing top 5 rows



In [101]:
mpg = mpg.withColumn(
    "transmission", 
    regexp_extract("trans", r"^(\w+)", 1).alias("transm")

).drop("trans")

In [102]:
mpg.show(5)

+------------+-----+-----+----+---+---+---+---+---+-------+------------+
|manufacturer|model|displ|year|cyl|drv|cty|hwy| fl|  class|transmission|
+------------+-----+-----+----+---+---+---+---+---+-------+------------+
|        audi|   a4|  1.8|1999|  4|  f| 18| 29|  p|compact|        auto|
|        audi|   a4|  1.8|1999|  4|  f| 21| 29|  p|compact|      manual|
|        audi|   a4|  2.0|2008|  4|  f| 20| 31|  p|compact|      manual|
|        audi|   a4|  2.0|2008|  4|  f| 21| 30|  p|compact|        auto|
|        audi|   a4|  2.8|1999|  6|  f| 16| 26|  p|compact|        auto|
+------------+-----+-----+----+---+---+---+---+---+-------+------------+
only showing top 5 rows



3. Load the tips dataset as a spark dataframe.

    * What percentage of observations are smokers?
    * Create a column that contains the tip percentage
    * Calculate the average tip percentage for each combination of sex and smoker.

In [10]:
tips = spark.createDataFrame(data("tips"))

In [11]:
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [12]:
# What percentage of observations are smokers?

tips.groupBy("smoker").agg(count(tips.smoker) / tips.count()).show(5)

+------+---------------------+
|smoker|(count(smoker) / 244)|
+------+---------------------+
|    No|   0.6188524590163934|
|   Yes|  0.38114754098360654|
+------+---------------------+



In [13]:
# Create a column that contains the tip percentage
tip_percentage = tips.select(tips.tip / tips.total_bill)

In [62]:
tips = tips.withColumn(
    "tip_percentage", expr("Round((tip / total_bill) * 100)")
)

tips.show()

+----------+----+------+------+---+------+----+--------------+
|total_bill| tip|   sex|smoker|day|  time|size|tip_percentage|
+----------+----+------+------+---+------+----+--------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|           6.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|          16.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|          17.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|          14.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|          15.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|          19.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|          23.0|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|          12.0|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|          13.0|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|          22.0|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|          17.0|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|          14.0|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|        

In [16]:
# Calculate the average tip 
# percentage for each combination of sex and smoker.

tips.groupBy(tips.sex, tips.smoker).agg(avg(tips.tip)).show()

+------+------+-----------------+
|   sex|smoker|         avg(tip)|
+------+------+-----------------+
|  Male|    No| 3.11340206185567|
|  Male|   Yes|3.051166666666667|
|Female|    No|2.773518518518518|
|Female|   Yes|2.931515151515151|
+------+------+-----------------+



4. Use the seattle weather dataset referenced in the lesson to answer the questions below.

    * Convert the temperatures to farenheight.
    * Which month has the most rain, on average?
    * Which year was the windiest?
    * What is the most frequent type of weather in January?
    * What is the average high and low tempurature on sunny days in July in 2013 and 2014?
    * What percentage of days were rainy in q3 of 2015?
    * For each year, find what percentage of days it rained (had non-zero precipitation).

In [17]:
from vega_datasets import data

In [18]:
weather = data.seattle_weather().assign(date=lambda df: df.date.astype(str))

In [19]:
weather = spark.createDataFrame(weather)

In [20]:
weather.show(5)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 5 rows



In [22]:
# convert the temperatures to farenheight
weather = weather.withColumn(
    "temp_max_f", expr("Round(temp_max * (9/5) + 32)")
).withColumn(
    "temp_min_f", expr("Round(temp_min * (9/5) + 32)")
).drop("temp_max", "temp_min")

In [23]:
weather.show(5)

+----------+-------------+----+-------+----------+----------+
|      date|precipitation|wind|weather|temp_max_f|temp_min_f|
+----------+-------------+----+-------+----------+----------+
|2012-01-01|          0.0| 4.7|drizzle|      55.0|      41.0|
|2012-01-02|         10.9| 4.5|   rain|      51.0|      37.0|
|2012-01-03|          0.8| 2.3|   rain|      53.0|      45.0|
|2012-01-04|         20.3| 4.7|   rain|      54.0|      42.0|
|2012-01-05|          1.3| 6.1|   rain|      48.0|      37.0|
+----------+-------------+----+-------+----------+----------+
only showing top 5 rows



In [29]:
# Which month has the most rain, on average?

( 
    weather.withColumn("month", month("date"))
    .groupBy("month")
    .agg(sum("precipitation").alias("total_rain"))
    .sort("month")
    .show()
    
)

# It seems that the answer is Nov.

+-----+------------------+
|month|        total_rain|
+-----+------------------+
|    1|465.99999999999994|
|    2|             422.0|
|    3|             606.2|
|    4|             375.4|
|    5|             207.5|
|    6|             132.9|
|    7|              48.2|
|    8|             163.7|
|    9|235.49999999999997|
|   10|             503.4|
|   11|             642.5|
|   12| 622.7000000000002|
+-----+------------------+



In [32]:
# Which year was the windiest?

( 
    weather.withColumn("year", year("date"))
    .groupBy("year")
    .agg(sum("wind").alias("total_wind"))
    .sort("year")
    .show()
    
)

# 2012 was the windiest year

+----+------------------+
|year|        total_wind|
+----+------------------+
|2012|            1244.7|
|2013|1100.8000000000006|
|2014|1236.5000000000007|
|2015|1153.3000000000002|
+----+------------------+



In [35]:
# What is the most frequent type of weather in January?

( 
    weather.withColumn("month", month("date"))
    .groupBy("month", weather.weather)
    .agg(count("weather"))
    .sort("month")
    .show(5)
    
)

# fog seems to be the most common type of weather

+-----+-------+--------------+
|month|weather|count(weather)|
+-----+-------+--------------+
|    1|   snow|             8|
|    1|   rain|            35|
|    1|drizzle|            10|
|    1|    sun|            33|
|    1|    fog|            38|
+-----+-------+--------------+
only showing top 5 rows



In [55]:
# What is the average high and low 
# tempurature on sunny days in July in 2013 and 2014?

(
    weather.filter(month("date") == 7)
    .filter(year("date") >= 2013)
    .filter(year("date") <= 2014)
    .filter(weather.weather == "sun")
    .groupBy("weather")
    .agg(mean("temp_max_f"), mean("temp_min_f"))
    .show()
)

+-------+-----------------+-----------------+
|weather|  avg(temp_max_f)|  avg(temp_min_f)|
+-------+-----------------+-----------------+
|    sun|80.28846153846153|57.53846153846154|
+-------+-----------------+-----------------+



In [45]:
# What percentage of days were rainy in q3 of 2015?

(

    weather.filter(year('date') == 2015)
    .filter(quarter("date") == 3)
    .filter(weather.weather == "rain")
    .groupby("weather")
    .agg(count("date") / 92)
    .show()
)

+-------+--------------------+
|weather|  (count(date) / 92)|
+-------+--------------------+
|   rain|0.021739130434782608|
+-------+--------------------+



In [53]:
# For each year, find what percentage of 
# days it rained (had non-zero precipitation)

(

    weather.withColumn("year", year("date"))
    .filter(weather.precipitation > 0)
    .groupBy("year")
    .agg(count("date") / 365.25)
    .sort("year")
    .show()
)

+----+----------------------+
|year|(count(date) / 365.25)|
+----+----------------------+
|2012|   0.48459958932238195|
|2013|    0.4161533196440794|
|2014|    0.4106776180698152|
|2015|    0.3942505133470226|
+----+----------------------+

