In [1]:
#pyspark library
import pyspark

#creating spark object (necessary to creating spark dfs)
spark = pyspark.sql.SparkSession.builder.getOrCreate()

import numpy as np
import pandas as pd

from pydataset import data

from pyspark.sql.functions import sum, mean, min, max, concat, lit, regexp_extract, regexp_replace, when, asc, desc, col, expr
from pyspark.sql.functions import month, year, quarter

### 1. Create a spark data frame that contains your favorite programming languages.

#### The name of the column should be `language`

In [2]:
pandas_dataframe = pd.DataFrame({"language": ['python', 'html', 'java', 'c++', 'javascript', 'typescript', 'spark']})

df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[language: string]

#### View the schema of the dataframe

In [3]:
df.printSchema()

root
 |-- language: string (nullable = true)



#### Output the shape of the dataframe

In [4]:
print((df.count(), len(df.columns)))

(7, 1)


#### Show the first 5 records in the dataframe

In [5]:
df.show(5)

+----------+
|  language|
+----------+
|    python|
|      html|
|      java|
|       c++|
|javascript|
+----------+
only showing top 5 rows



### 2. Load the `mpg` dataset as a spark dataframe.

In [6]:
#loading mpg dataset from pydataset
mpg = spark.createDataFrame(data('mpg'))
mpg

DataFrame[manufacturer: string, model: string, displ: double, year: bigint, cyl: bigint, trans: string, drv: string, cty: bigint, hwy: bigint, fl: string, class: string]

In [7]:
#mpg schema
mpg.printSchema()

root
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- displ: double (nullable = true)
 |-- year: long (nullable = true)
 |-- cyl: long (nullable = true)
 |-- trans: string (nullable = true)
 |-- drv: string (nullable = true)
 |-- cty: long (nullable = true)
 |-- hwy: long (nullable = true)
 |-- fl: string (nullable = true)
 |-- class: string (nullable = true)



In [8]:
#mpg shape
print((mpg.count(), len(mpg.columns)))

(234, 11)


In [9]:
#first 5 rows
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



#### a. Create 1 column of output that contains a message like the one below:

`The 1999 audi a4 has a 4 cylinder engine.`


For each vehicle.

In [10]:
mpg.select(concat(lit('The '), 
                  mpg.year, 
                  lit(' '), 
                  mpg.manufacturer,
                  lit(' '),
                  mpg.model,
                  lit(' has a '), 
                  mpg.cyl, 
                  lit(' cylinder engine.')).alias('message')).show(5)


+--------------------+
|             message|
+--------------------+
|The 1999 audi a4 ...|
|The 1999 audi a4 ...|
|The 2008 audi a4 ...|
|The 2008 audi a4 ...|
|The 1999 audi a4 ...|
+--------------------+
only showing top 5 rows



#### b. Transform the trans column so that it only contains either manual or auto.

In [11]:
mpg.select(
        'trans',
        regexp_replace('trans', r'\(\w+\)$', '').alias('transformed_trans')).show()

+----------+-----------------+
|     trans|transformed_trans|
+----------+-----------------+
|  auto(l5)|             auto|
|manual(m5)|           manual|
|manual(m6)|           manual|
|  auto(av)|             auto|
|  auto(l5)|             auto|
|manual(m5)|           manual|
|  auto(av)|             auto|
|manual(m5)|           manual|
|  auto(l5)|             auto|
|manual(m6)|           manual|
|  auto(s6)|             auto|
|  auto(l5)|             auto|
|manual(m5)|           manual|
|  auto(s6)|             auto|
|manual(m6)|           manual|
|  auto(l5)|             auto|
|  auto(s6)|             auto|
|  auto(s6)|             auto|
|  auto(l4)|             auto|
|  auto(l4)|             auto|
+----------+-----------------+
only showing top 20 rows



### 3. Load the tips dataset as a spark dataframe.

In [12]:
tips = spark.createDataFrame(data('tips'))
tips.show(3)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
+----------+----+------+------+---+------+----+
only showing top 3 rows



#### a. What percentage of observations are smokers?

In [13]:
(tips.where(tips.smoker == 'Yes').count() / (tips.count())) * 100

38.114754098360656

In [14]:
#alternate w/ sql 
tips.createOrReplaceTempView("tips")

spark.sql('''
SELECT ((SELECT COUNT(smoker)
FROM tips
WHERE smoker = 'Yes') / 
(SELECT COUNT(smoker)
FROM tips)) AS pct_smokers
FROM tips
''').show(1)

+-------------------+
|        pct_smokers|
+-------------------+
|0.38114754098360654|
+-------------------+
only showing top 1 row



#### b. Create a column that contains the tip percentage

In [15]:
tips = tips.select('*', (tips.tip / tips.total_bill).alias('tip-pct'))
tips.show(3)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip-pct|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
+----------+----+------+------+---+------+----+-------------------+
only showing top 3 rows



In [16]:
#alternate w/ sql
spark.sql('''
SELECT *, 
(tip / total_bill) AS tip_pct
FROM tips
''').show(3)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
+----------+----+------+------+---+------+----+-------------------+
only showing top 3 rows



#### c. Calculate the average tip percentage for each combination of sex and smoker.

In [17]:
tips.groupby('sex').pivot('smoker').mean('tip-pct').show()

+------+------------------+-------------------+
|   sex|                No|                Yes|
+------+------------------+-------------------+
|Female|0.1569209707691836|0.18215035269941035|
|  Male|0.1606687151291298| 0.1527711752024851|
+------+------------------+-------------------+



In [18]:
#alternate
tips.groupBy('sex', 'smoker').agg(mean('tip-pct').alias('avg')).show()

+------+------+-------------------+
|   sex|smoker|                avg|
+------+------+-------------------+
|  Male|    No| 0.1606687151291298|
|  Male|   Yes| 0.1527711752024851|
|Female|    No| 0.1569209707691836|
|Female|   Yes|0.18215035269941035|
+------+------+-------------------+



### 4. Use the seattle weather dataset referenced in the lesson to answer the questions below.

In [19]:
from vega_datasets import data

weather = data.seattle_weather().assign(date=lambda df: df.date.astype(str))
weather = spark.createDataFrame(weather)
weather.show(6)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 6 rows



#### Convert the temperatures to farenheight.

In [20]:
#define function to convert temps to f
def cel_to_far(celsius):
    farenheight = (celsius * (9/5)) + 32
    return farenheight

In [21]:
weather.select(
    '*',
    cel_to_far(weather.temp_max).alias('temp_max_f'),
    cel_to_far(weather.temp_min).alias('temp_min_f')).show(5)

+----------+-------------+--------+--------+----+-------+------------------+----------+
|      date|precipitation|temp_max|temp_min|wind|weather|        temp_max_f|temp_min_f|
+----------+-------------+--------+--------+----+-------+------------------+----------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|55.040000000000006|      41.0|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|             51.08|     37.04|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|             53.06|     44.96|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|             53.96|     42.08|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|48.019999999999996|     37.04|
+----------+-------------+--------+--------+----+-------+------------------+----------+
only showing top 5 rows



#### Which month has the most rain, on average?

In [22]:
(weather.withColumn('month', month('date'))
    .groupBy('month')
    .agg(mean('precipitation').alias('avg_rainfall'))
    .sort(desc('avg_rainfall'))
    .show(1))

+-----+-----------------+
|month|     avg_rainfall|
+-----+-----------------+
|   11|5.354166666666667|
+-----+-----------------+
only showing top 1 row



#### Which year was the windiest?

In [23]:
(weather.withColumn('year', year('date'))
     .groupBy('year')
     .agg(mean('wind').alias('avg_wind'))
     .sort(desc('avg_wind'))
     .show(1))

+----+-----------------+
|year|         avg_wind|
+----+-----------------+
|2012|3.400819672131147|
+----+-----------------+
only showing top 1 row



In [24]:
#alternate 
(weather.withColumn("year", year("date"))
    .groupBy("year")
    .agg(mean("wind").alias("Avg_Wind"))
    .sort("year")
    .show())

+----+------------------+
|year|          Avg_Wind|
+----+------------------+
|2012| 3.400819672131147|
|2013|3.0158904109589044|
|2014|3.3876712328767136|
|2015|  3.15972602739726|
+----+------------------+



#### What is the most frequent type of weather in January?

In [25]:
weather = weather.withColumn('month', month('date')).withColumn('year', year('date')).withColumn('quarter', quarter('date'))

In [26]:
weather.crosstab('month', 'weather').sort('month_weather', ascending=True).show()

+-------------+-------+---+----+----+---+
|month_weather|drizzle|fog|rain|snow|sun|
+-------------+-------+---+----+----+---+
|            1|     10| 38|  35|   8| 33|
|           10|      4| 55|  20|   0| 45|
|           11|      3| 50|  25|   0| 42|
|           12|      2| 54|  23|   5| 40|
|            2|      4| 36|  40|   3| 30|
|            3|      3| 36|  37|   6| 42|
|            4|      4| 34|  20|   1| 61|
|            5|      1| 25|  16|   0| 82|
|            6|      2| 14|  19|   0| 85|
|            7|      8| 13|  14|   0| 89|
|            8|      8| 16|   6|   0| 94|
|            9|      5| 40|   4|   0| 71|
+-------------+-------+---+----+----+---+



#### What is the average high and low temperature on sunny days in July in 2013 and 2014?

In [27]:
weather.filter(expr('(year == 2013 or year == 2014) and month == 07')).groupby('weather').pivot('year').mean('temp_min').show()


+-------+------------------+------------------+
|weather|              2013|              2014|
+-------+------------------+------------------+
|    fog|13.133333333333335|14.440000000000001|
|   rain|              15.0|              15.0|
|    sun|13.981481481481483|14.400000000000002|
+-------+------------------+------------------+



In [28]:
weather.filter(expr('(year == 2013 or year == 2014) and month == 07')).groupby('weather').pivot('year').mean('temp_max').show()


+-------+------------------+------------------+
|weather|              2013|              2014|
+-------+------------------+------------------+
|    fog| 22.96666666666667|25.439999999999998|
|   rain|              22.2|              29.4|
|    sun|26.585185185185193|            27.092|
+-------+------------------+------------------+



#### What percentage of days were rainy in q3 of 2015?

In [29]:
q3_15 = weather.filter(expr('year == 2015 and quarter == 3'))


In [30]:
q3_15.where(weather.weather == 'rain').count() / q3_15.count()


0.021739130434782608

#### For each year, find what percentage of days it rained (had non-zero precipitation).

In [31]:
(weather.filter(expr('year == 2012')).filter(expr('precipitation > 0')).count()) / (weather.filter(expr('year == 2012')).count())


0.48360655737704916

In [32]:
(weather.filter(expr('year == 2013')).filter(expr('precipitation > 0')).count()) / (weather.filter(expr('year == 2013')).count())


0.41643835616438357

In [33]:
(weather.filter(expr('year == 2014')).filter(expr('precipitation > 0')).count()) / (weather.filter(expr('year == 2014')).count())


0.410958904109589

In [34]:
(weather.filter(expr('year == 2015')).filter(expr('precipitation > 0')).count()) / (weather.filter(expr('year == 2015')).count())


0.39452054794520547