## Spark Exercises

In [6]:
import pandas as pd
import pyspark
from pydataset import data
from vega_datasets import data as v_data
import pyspark.sql.functions as F

### I. Create a spark data frame that contains your favorite programming languages.

1) The name of the column should be language

In [7]:
# Reuse the active Spark session if there is one; otherwise create it.
session_builder = pyspark.sql.SparkSession.builder
spark = session_builder.getOrCreate()

In [14]:
# Build a single-column pandas frame first, then hand it to Spark.
languages_pdf = pd.DataFrame(
    {'language': ['python', 'javascript', 'matlab', 'sql', 'javascript', 'r']}
)
df = spark.createDataFrame(languages_pdf)
df

DataFrame[language: string]

In [15]:
# Print the rows of the languages DataFrame as a text table.
df.show()

+----------+
|  language|
+----------+
|    python|
|javascript|
|    matlab|
|       sql|
|javascript|
|         r|
+----------+



2) View the schema of the dataframe

In [10]:
# Print each column's name, data type, and nullability.
df.printSchema()

root
 |-- language: string (nullable = true)



3) Output the shape of the dataframe

In [11]:
# Assemble a (rows, columns) pair: count() gives the number of rows and
# the length of the column list gives the number of columns.
n_rows = df.count()
n_cols = len(df.columns)
n_rows, n_cols

(6, 1)

4) Show the first 5 records in the dataframe

In [12]:
# Limit the printed table to the first 5 rows.
df.show(5)

+----------+
|  language|
+----------+
|    python|
|javascript|
|    matlab|
|       sql|
|javascript|
+----------+
only showing top 5 rows



### II. Load the mpg dataset as a spark dataframe.

In [16]:
# Load the 'mpg' dataset from pydataset and convert it to a Spark DataFrame.
mpg_pdf = data('mpg')
mpg = spark.createDataFrame(mpg_pdf)
mpg.show(n=5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



1) Create 1 column of output that contains a message like the one below:

> The 1999 audi a4 has a 4 cylinder engine.

Produce this message for each vehicle.

In [17]:
# Assemble the sentence piece by piece. Fixed text is wrapped in F.lit so it
# can be concatenated with the per-row column values.
sentence_parts = [
    F.lit('The '),
    mpg.year,
    F.lit(' '),
    mpg.manufacturer,
    F.lit(' '),
    mpg.model,
    F.lit(' has a '),
    mpg.cyl,
    F.lit(' cylinder engine.'),
]
col = F.concat(*sentence_parts).alias('description')

# truncate=False keeps the full sentence visible in the printed table.
mpg.select(col).show(5, truncate=False)

+-----------------------------------------+
|description                              |
+-----------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine.|
|The 1999 audi a4 has a 4 cylinder engine.|
|The 2008 audi a4 has a 4 cylinder engine.|
|The 2008 audi a4 has a 4 cylinder engine.|
|The 1999 audi a4 has a 6 cylinder engine.|
+-----------------------------------------+
only showing top 5 rows



2) Transform the trans column so that it only contains either manual or auto.

In [18]:
# Capture group 1 is the leading word before the parenthesised detail,
# e.g. 'auto(l5)' -> 'auto' and 'manual(m5)' -> 'manual'.
trans_pattern = r'(^\w+?)\(.*\)'
extracted = F.regexp_extract(mpg.trans, trans_pattern, 1)
simple_trans = extracted.alias('simple_trans')
mpg.select(simple_trans).show(5)

+------------+
|simple_trans|
+------------+
|        auto|
|      manual|
|      manual|
|        auto|
|        auto|
+------------+
only showing top 5 rows



### III. Load the tips dataset as a spark dataframe.

In [19]:
# Load the 'tips' dataset from pydataset and convert it to a Spark DataFrame.
tips_pdf = data('tips')
tips = spark.createDataFrame(tips_pdf)
tips.show(n=5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



1) What percentage of observations are smokers?

In [22]:
# Flag each observation: 1 when the smoker column is 'Yes', 0 otherwise.
is_smoker = tips['smoker'] == 'Yes'
smoker_or_no = F.when(is_smoker, 1).otherwise(0)
smoker_or_no

Column<b'CASE WHEN (smoker = Yes) THEN 1 ELSE 0 END'>

In [23]:
# Preview the 0/1 smoker flag; the column is unaliased, so the header is
# the raw CASE WHEN expression.
tips.select(smoker_or_no).show()

+------------------------------------------+
|CASE WHEN (smoker = Yes) THEN 1 ELSE 0 END|
+------------------------------------------+
|                                         0|
|                                         0|
|                                         0|
|                                         0|
|                                         0|
|                                         0|
|                                         0|
|                                         0|
|                                         0|
|                                         0|
|                                         0|
|                                         0|
|                                         0|
|                                         0|
|                                         0|
|                                         0|
|                                         0|
|                                         0|
|                                         0|
|         