In [1]:
import numpy as np
import pyspark
from pyspark.sql.functions import *
import pandas as pd
import pydataset

spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Exercises
## Create a directory named spark within your ds-methodologies repository. 
### This is where you will do the exercises for this module.

### Create a jupyter notebook or python script named spark101 for this exercise.

#### 1. Create a spark data frame that contains your favorite programming languages.
    - The name of the column should be language
    - View the schema of the dataframe
    - Output the shape of the dataframe
    - Show the first 5 records in the dataframe

In [2]:
my_favorite_languages = {
    "language": ["Python", "C Programming", "Java", "C++", "JavaScript", "HTML"]
}

In [3]:
my_favorite_languages_df = pd.DataFrame(my_favorite_languages)
my_favorite_languages_df

Unnamed: 0,language
0,Python
1,C Programming
2,Java
3,C++
4,JavaScript
5,HTML


In [4]:
df = spark.createDataFrame(pd.DataFrame(my_favorite_languages_df))

In [5]:
# view my_favorite_languages_spark_df dataframe
df.show()

+-------------+
|     language|
+-------------+
|       Python|
|C Programming|
|         Java|
|          C++|
|   JavaScript|
|         HTML|
+-------------+



In [6]:
# view my_favorite_languages_spark_df Schema
df.printSchema()

root
 |-- language: string (nullable = true)



In [7]:
# view my_favorite_languages_spark_df shape
print("DataFrame shape: ", df.count(), " x ", len(df.columns))

DataFrame shape:  6  x  1


In [8]:
# First 5 records
df.show(5)

+-------------+
|     language|
+-------------+
|       Python|
|C Programming|
|         Java|
|          C++|
|   JavaScript|
+-------------+
only showing top 5 rows



#### 2. Load the mpg dataset as a spark dataframe.

In [9]:
mpg = spark.createDataFrame(pydataset.data("mpg"))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



> a. Create 1 column of output that contains a message like the one below:

`The 1999 audi a4 has a 4 cylinder engine.`

    For each vehicle.

In [10]:
mpg.select(
    concat(
        lit("The "),
        col("year"),
        lit(" "),
        col("manufacturer"),
        lit(" "),
        col("model"),
        lit("has a "),
        col("cyl"),
        lit("cylinder engine."),
    ).alias("vehicle_cylinder-desc")
).show(truncate=False)

+------------------------------------------------------------+
|vehicle_cylinder-desc                                       |
+------------------------------------------------------------+
|The 1999 audi a4has a 4cylinder engine.                     |
|The 1999 audi a4has a 4cylinder engine.                     |
|The 2008 audi a4has a 4cylinder engine.                     |
|The 2008 audi a4has a 4cylinder engine.                     |
|The 1999 audi a4has a 6cylinder engine.                     |
|The 1999 audi a4has a 6cylinder engine.                     |
|The 2008 audi a4has a 6cylinder engine.                     |
|The 1999 audi a4 quattrohas a 4cylinder engine.             |
|The 1999 audi a4 quattrohas a 4cylinder engine.             |
|The 2008 audi a4 quattrohas a 4cylinder engine.             |
|The 2008 audi a4 quattrohas a 4cylinder engine.             |
|The 1999 audi a4 quattrohas a 6cylinder engine.             |
|The 1999 audi a4 quattrohas a 6cylinder engine.       

> b. Transform the trans column so that it only contains either manual or auto.

In [11]:
mpg.select(
    when(mpg.trans.like("auto%"), "auto").otherwise("manual").alias("trans_when")
).show()

+----------+
|trans_when|
+----------+
|      auto|
|    manual|
|    manual|
|      auto|
|      auto|
|    manual|
|      auto|
|    manual|
|      auto|
|    manual|
|      auto|
|      auto|
|    manual|
|      auto|
|    manual|
|      auto|
|      auto|
|      auto|
|      auto|
|      auto|
+----------+
only showing top 20 rows



#### 3. Load the `tips` dataset as a spark dataframe.

    a. What percentage of observations are smokers?
    b. Create a column that contains the tip percentage
    c. Calculate the average tip percentage for each combination of sex and smoker.

In [12]:
tips = spark.createDataFrame(pydataset.data("tips"))
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



- What percentage of observations are smokers?

In [13]:
tips.groupBy("smoker").count().withColumn(
    "percent",
    concat(round((col("count") / tips.count() * 100), 0).cast("int"), lit("%")),
).show()

+------+-----+-------+
|smoker|count|percent|
+------+-----+-------+
|    No|  151|    62%|
|   Yes|   93|    38%|
+------+-----+-------+



- Create a column that contains the tip percentage

In [14]:
tips = tips.withColumn("tip_percent", round((col("tip") / col("total_bill") * 100), 2))
tips.show(5)

+----------+----+------+------+---+------+----+-----------+
|total_bill| tip|   sex|smoker|day|  time|size|tip_percent|
+----------+----+------+------+---+------+----+-----------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|       5.94|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|      16.05|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|      16.66|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|      13.98|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|      14.68|
+----------+----+------+------+---+------+----+-----------+
only showing top 5 rows



- Calculate the avg tip % for each combo of sex and smoker

In [15]:
tips.groupBy("smoker", "sex").agg(
    mean(col("tip_percent")).alias("avg_tip_percent")
).show()

+------+------+------------------+
|smoker|   sex|   avg_tip_percent|
+------+------+------------------+
|    No|Female|15.691111111111113|
|    No|  Male|16.066597938144326|
|   Yes|  Male|15.276666666666666|
|   Yes|Female| 18.21454545454545|
+------+------+------------------+



#### 3. Use the `seattle weather dataset` referenced in the lesson to answer the questions below.

In [16]:
from vega_datasets import data

weather = data.seattle_weather()
weather = spark.createDataFrame(weather)
weather.show(5)

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|     8.9|     2.8| 6.1|   rain|
+-------------------+-------------+--------+--------+----+-------+
only showing top 5 rows



a. Convert the temperatures to farenheight.
$$(0^\circ C * 9/5) + 32 = 32^\circ F$$

In [17]:
weather = weather.withColumn("temp_max", (col("temp_max") * 9 / 5 + 32)).withColumn(
    "temp_min", (col("temp_min") * 9 / 5 + 32)
)

In [18]:
weather.show(5)

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|   55.04|    41.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|   51.08|   37.04| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|   53.06|   44.96| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|   53.96|   42.08| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|   48.02|   37.04| 6.1|   rain|
+-------------------+-------------+--------+--------+----+-------+
only showing top 5 rows



b. Which month has the most rain, on average?

In [19]:
(
    weather.groupBy(month("date").alias("month"), year("date").alias("year"))
    .agg(sum("precipitation").alias("total_monthly_precipitation"))
    .groupBy("month")
    .agg(mean("total_monthly_precipitation").alias("avg_monthly_rain"))
    .sort(col("avg_monthly_rain").desc())
    .first()
)

Row(month=11, avg_monthly_rain=160.625)

c. Which year was the windiest?

In [20]:
(
    weather.groupBy(year("date").alias("year"))
    .agg(sum("wind").alias("total_winds"))
    .sort(col("total_winds").desc())
    .first()
)

Row(year=2012, total_winds=1244.7000000000003)

d. What is the most frequent type of weather in January?

In [21]:
(
    weather.withColumn("month", month("date"))
    .filter(col("month") == 1)
    .groupBy("weather")
    .count()
    .sort(col("count").desc())
    .show()
)

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
|   rain|   35|
|    sun|   33|
|drizzle|   10|
|   snow|    8|
+-------+-----+



e. What is the average high and low tempurature on sunny days in July in 2013 and 2014?

In [22]:
(
    weather.filter(month("date") == 7)
    .filter(year("date") > 2012)
    .filter(year("date") < 2015)
    .filter(col("weather") == lit("sun"))
    .agg(
        avg("temp_max").alias("average_high_temp"),
        avg("temp_min").alias("average_low_temp"),
    )
    .show()
)

+-----------------+-----------------+
|average_high_temp| average_low_temp|
+-----------------+-----------------+
|80.29192307692308|57.52884615384615|
+-----------------+-----------------+



f. What percentage of days were rainy in q3 of 2015?

In [23]:
# measure a rainy day by weather == rain
(
    weather.filter(year("date") == 2015)
    .filter(quarter("date") == 3)
    .select(when(col("weather") == "rain", 1).otherwise(0).alias("rain"))
    .agg(mean("rain"))
    .show()
)

+--------------------+
|           avg(rain)|
+--------------------+
|0.021739130434782608|
+--------------------+



g. For each year, find what percentage of days it rained (had non-zero precipitation).

In [24]:
(
    weather.withColumn("rain", (when(col("precipitation") > 0, 1).otherwise(0)))
    .groupBy(year("date").alias("year"))
    .agg(mean(col("rain")).alias("pct_days_with_rain"))
    .show()
)

+----+-------------------+
|year| pct_days_with_rain|
+----+-------------------+
|2015|0.39452054794520547|
|2013|0.41643835616438357|
|2014|  0.410958904109589|
|2012|0.48360655737704916|
+----+-------------------+

