# Spark 101 Exercises

In [106]:
import pyspark
import pandas as pd

from pydataset import data
from vega_datasets import data

from pyspark.sql.functions import lit, concat
from pyspark.sql.functions import regexp_extract, regexp_replace, when

## Exercise 1
Create a spark data frame that contains your favorite programming languages.

- The name of the column should be language

In [8]:
pd_df = pd.DataFrame({ "language" : ['python', 'sql', 'java', 'javascript', 'c', 'r', 'c++', 'swift', 'c#']
    
})

In [9]:
pd_df

Unnamed: 0,language
0,python
1,sql
2,java
3,javascript
4,c
5,r
6,c++
7,swift
8,c#


In [11]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [12]:
sp_df = spark.createDataFrame(pd_df)

- View the schema of the dataframe

In [22]:
sp_df.printSchema()

root
 |-- language: string (nullable = true)



In [20]:
sp_df.describe().show()

+-------+--------+
|summary|language|
+-------+--------+
|  count|       9|
|   mean|    null|
| stddev|    null|
|    min|       c|
|    max|   swift|
+-------+--------+



- Output the shape of the dataframe

In [25]:
sp_df.columns

['language']

In [26]:
sp_df.count()

9

- Show the first 5 records in the dataframe

In [30]:
sp_df.show(5)

+----------+
|  language|
+----------+
|    python|
|       sql|
|      java|
|javascript|
|         c|
+----------+
only showing top 5 rows



## Exercise 2
Load the mpg dataset as a spark dataframe.

In [32]:
mpg = spark.createDataFrame(data("mpg"))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



Create 1 column of output that contains a message like the one below:


The 1999 audi a4 has a 4 cylinder engine.
For each vehicle.

In [59]:
mpg.select(concat(lit("The "), mpg.year, lit(" "), mpg.manufacturer, lit(" "), mpg.model, lit(" has a "), mpg.cyl, lit(" cylinder engine."))).show(truncate=False)

+------------------------------------------------------------------------------+
|concat(The , year,  , manufacturer,  , model,  has a , cyl,  cylinder engine.)|
+------------------------------------------------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine.                                     |
|The 1999 audi a4 has a 4 cylinder engine.                                     |
|The 2008 audi a4 has a 4 cylinder engine.                                     |
|The 2008 audi a4 has a 4 cylinder engine.                                     |
|The 1999 audi a4 has a 6 cylinder engine.                                     |
|The 1999 audi a4 has a 6 cylinder engine.                                     |
|The 2008 audi a4 has a 6 cylinder engine.                                     |
|The 1999 audi a4 quattro has a 4 cylinder engine.                             |
|The 1999 audi a4 quattro has a 4 cylinder engine.                             |
|The 2008 audi a4 quattro ha

Transform the trans column so that it only contains either manual or auto.

In [102]:
mpg.select(regexp_replace("trans", r"\([^)]*\)", "").alias("transonly")).show()

+---------+
|transonly|
+---------+
|     auto|
|   manual|
|   manual|
|     auto|
|     auto|
|   manual|
|     auto|
|   manual|
|     auto|
|   manual|
|     auto|
|     auto|
|   manual|
|     auto|
|   manual|
|     auto|
|     auto|
|     auto|
|     auto|
|     auto|
+---------+
only showing top 20 rows



In [105]:
mpg = mpg.withColumn("trans", when(mpg.trans.startswith("a"), "auto").otherwise("manual")).show(6)

+------------+-----+-----+----+---+------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl| trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto|  f| 16| 26|  p|compact|
|        audi|   a4|  2.8|1999|  6|manual|  f| 18| 26|  p|compact|
+------------+-----+-----+----+---+------+---+---+---+---+-------+
only showing top 6 rows



## Exercise 3
Load the tips dataset as a spark dataframe.

What percentage of observations are smokers?
Create a column that contains the tip percentage
Calculate the average tip percentage for each combination of sex and smoker.

In [None]:
tips.groupBy('smoker').agg(round(count(tips.smoker)/ tips.count(),2)).show()

In [108]:
tips.groupBy(tips.smoker, tips.sex).agg(round(avg(tips.tip/tips.total_bill * 100), 2)).show()

NameError: name 'tips' is not defined

In [None]:
tips.filter(tips.smoker == "Yes").count()/tips.count()

In [None]:
tips = tips.withColumn('tip_percentage', expr('Round((tip/total_bill) * 100)'))

## Exercise4
Use the seattle weather dataset referenced in the lesson to answer the questions below.

In [120]:
seattle = spark.createDataFrame(data.seattle_weather())

In [124]:
seattle.show()

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06 00:00:00|          2.5|     4.4|     2.2| 2.2|   rain|
|2012-01-07 00:00:00|          0.0|     7.2|     2.8| 2.3|   rain|
|2012-01-08 00:00:00|          0.0|    10.0|     2.8| 2.0|    sun|
|2012-01-09 00:00:00|          4.3|     9.4|     5.0| 3.4|   rain|
|2012-01-10 00:00:00|          1.0|     6.1|     0.6| 3.4|   rain|
|2012-01-11 00:00:00|          0.0|     6.1|    -1.1| 5.1|    sun|
|2012-01-12 00:00:00|          0.0|     6.1|    -1.7| 1.9|    

Convert the temperatures to farenheight.
Which month has the most rain, on average?
Which year was the windiest?
What is the most frequent type of weather in January?

In [121]:
seattle.createOrReplaceTempView("seattle")

In [122]:
spark.sql(
"""
SELECT weather, count(weather)
FROM temp
where month(date) = 1
group by weather

"""
).show(5)

AnalysisException: Table or view not found: temp; line 3 pos 5;
'Aggregate ['weather], ['weather, unresolvedalias('count('weather), None)]
+- 'Filter ('month('date) = 1)
   +- 'UnresolvedRelation [temp]


What is the average high and low temperature on sunny days in July in 2013 and 2014?
What percentage of days were rainy in q3 of 2015?
For each year, find what percentage of days it rained (had non-zero precipitation).

In [None]:
weather.filter(month("date") == 1).groupBy("weather").agg(count("weather")).show()