In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import pyspark
spark = pyspark.sql.SparkSession.builder.getOrCreate()
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/05 11:52:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Exercises
## 1. Create a spark data frame that contains your favorite programming languages.

    The name of the column should be language
    View the schema of the dataframe
    Output the shape of the dataframe
    Show the first 5 records in the dataframe

In [2]:
# create a list of languages in order to create a dataframe with the column language
language_list = ['python', 'java', 'c', 'c++', 'pascal', 'basic', 'perl']
pandas_df = pd.DataFrame({'language':language_list})
pandas_df

Unnamed: 0,language
0,python
1,java
2,c
3,c++
4,pascal
5,basic
6,perl


In [3]:
# make the spark dataframe
df = spark.createDataFrame(pandas_df)
df

DataFrame[language: string]

In [4]:
df.show()

                                                                                

+--------+
|language|
+--------+
|  python|
|    java|
|       c|
|     c++|
|  pascal|
|   basic|
|    perl|
+--------+



In [5]:
df.describe().show()

[Stage 3:>                                                          (0 + 8) / 8]

+-------+--------+
|summary|language|
+-------+--------+
|  count|       7|
|   mean|    null|
| stddev|    null|
|    min|   basic|
|    max|  python|
+-------+--------+



                                                                                

In [6]:
df.printSchema()

root
 |-- language: string (nullable = true)



In [7]:
df.dtypes

[('language', 'string')]

In [8]:
# shape: rows x columns
df.count(), len(df.columns)

(7, 1)

# Exercises
## 2. Load the mpg dataset as a spark dataframe.

    a. Create 1 column of output that contains a message like the one below:

        The 1999 audi a4 has a 4 cylinder engine.

    For each vehicle.

    b. Transform the trans column so that it only contains either manual or auto.

In [9]:
from pydataset import data

# create spark df with mpg data
mpg = spark.createDataFrame(data('mpg'))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [10]:
mpg.groupBy('manufacturer').count().show()

[Stage 10:>                                                         (0 + 8) / 8]

+------------+-----+
|manufacturer|count|
+------------+-----+
|        audi|   18|
|   chevrolet|   19|
|       dodge|   37|
|        ford|   25|
|     hyundai|   14|
|       honda|    9|
|  land rover|    4|
|     lincoln|    3|
|        jeep|    8|
|     mercury|    4|
|      nissan|   13|
|     pontiac|    5|
|      toyota|   34|
|      subaru|   14|
|  volkswagen|   27|
+------------+-----+



                                                                                

In [11]:
# import functions from pyspark.sql.functions
from pyspark.sql.functions import concat, col, lit, expr

In [12]:
# add a new column, description, with a sentence formed from year, make, model, cylinder
mpg.withColumn(
    'description',
    concat(lit('The '), mpg.year, lit(' '), mpg.manufacturer, lit(' '), mpg.model, lit(' has a '),
           mpg.cyl, lit(' cylinder engine.') )
).show(15, truncate=False)

+------------+----------+-----+----+---+----------+---+---+---+---+-------+-------------------------------------------------+
|manufacturer|model     |displ|year|cyl|trans     |drv|cty|hwy|fl |class  |description                                      |
+------------+----------+-----+----+---+----------+---+---+---+---+-------+-------------------------------------------------+
|audi        |a4        |1.8  |1999|4  |auto(l5)  |f  |18 |29 |p  |compact|The 1999 audi a4 has a 4 cylinder engine.        |
|audi        |a4        |1.8  |1999|4  |manual(m5)|f  |21 |29 |p  |compact|The 1999 audi a4 has a 4 cylinder engine.        |
|audi        |a4        |2.0  |2008|4  |manual(m6)|f  |20 |31 |p  |compact|The 2008 audi a4 has a 4 cylinder engine.        |
|audi        |a4        |2.0  |2008|4  |auto(av)  |f  |21 |30 |p  |compact|The 2008 audi a4 has a 4 cylinder engine.        |
|audi        |a4        |2.8  |1999|6  |auto(l5)  |f  |16 |26 |p  |compact|The 1999 audi a4 has a 6 cylinder engine.  

In [13]:
mpg.show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

In [14]:
# get a couple strings out of the trans column to experiment on with regex
auto_string = mpg.first()['trans']
man_string = mpg.collect()[1]['trans']
auto_string, man_string

('auto(l5)', 'manual(m5)')

In [15]:
import re
from pyspark.sql.functions import regexp_extract, regexp_replace

In [16]:
# goal is to transform trans so that contents are only 'auto' or 'manual'
re.sub(r'\(.+\)', '', man_string)

'manual'

In [17]:
# make a column with only 'auto' or 'manual' from the trans column
mpg.select(
    mpg.trans,
    regexp_replace('trans', r'\(.+\)', '').alias('auto_or_manual')
).show()

+----------+--------------+
|     trans|auto_or_manual|
+----------+--------------+
|  auto(l5)|          auto|
|manual(m5)|        manual|
|manual(m6)|        manual|
|  auto(av)|          auto|
|  auto(l5)|          auto|
|manual(m5)|        manual|
|  auto(av)|          auto|
|manual(m5)|        manual|
|  auto(l5)|          auto|
|manual(m6)|        manual|
|  auto(s6)|          auto|
|  auto(l5)|          auto|
|manual(m5)|        manual|
|  auto(s6)|          auto|
|manual(m6)|        manual|
|  auto(l5)|          auto|
|  auto(s6)|          auto|
|  auto(s6)|          auto|
|  auto(l4)|          auto|
|  auto(l4)|          auto|
+----------+--------------+
only showing top 20 rows



# Exercises
## 3. Load the tips dataset as a spark dataframe.

    What percentage of observations are smokers?
    Create a column that contains the tip percentage
    Calculate the average tip percentage for each combination of sex and smoker.

In [18]:
tips = spark.createDataFrame(data('tips'))
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [19]:
tips.count(), len(tips.columns)

(244, 7)

In [20]:
# groupby('column name').count().show() is a little bit like value_counts()
tips.groupby('smoker').count().show()

+------+-----+
|smoker|count|
+------+-----+
|    No|  151|
|   Yes|   93|
+------+-----+



In [21]:
tips.filter(tips.smoker == 'Yes').count() / tips.count()

0.38114754098360654

In [22]:
temp = tips.groupby('smoker').count()
temp.show()

+------+-----+
|smoker|count|
+------+-----+
|    No|  151|
|   Yes|   93|
+------+-----+



In [23]:
total = tips.count()
percent = col('count') / total * 100
percent

Column<'((count / 244) * 100)'>

In [24]:
from pyspark.sql.functions import round, avg

In [25]:
tips.groupby('smoker').count().select('smoker', 'count', round(percent,1).alias('percent')).show()

+------+-----+-------+
|smoker|count|percent|
+------+-----+-------+
|    No|  151|   61.9|
|   Yes|   93|   38.1|
+------+-----+-------+



In [26]:
# add a column to tips with the tip percentage
tips = tips.withColumn(
    'tip_percentage',
    round((tips.tip / tips.total_bill) * 100, 1)
)
tips.show()

+----------+----+------+------+---+------+----+--------------+
|total_bill| tip|   sex|smoker|day|  time|size|tip_percentage|
+----------+----+------+------+---+------+----+--------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|           5.9|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|          16.1|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|          16.7|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|          14.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|          14.7|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|          18.6|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|          22.8|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|          11.6|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|          13.0|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|          21.9|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|          16.7|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|          14.2|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|        

In [27]:
# average tip percentage for each combination of sex and smoker
tips.groupby('sex','smoker').agg(round(avg(tips.tip_percentage),1).alias('avg tip')).show()

+------+------+-------+
|   sex|smoker|avg tip|
+------+------+-------+
|  Male|    No|   16.1|
|Female|    No|   15.7|
|  Male|   Yes|   15.3|
|Female|   Yes|   18.2|
+------+------+-------+



# Exercises
## 4. Use the seattle weather dataset referenced in the lesson to answer the questions below.

    Convert the temperatures to fahrenheit.
    Which month has the most rain, on average?
    Which year was the windiest?
    What is the most frequent type of weather in January?
    What is the average high and low temperature on sunny days in July in 2013 and 2014?
    What percentage of days were rainy in q3 of 2015?
    For each year, find what percentage of days it rained (had non-zero precipitation).

In [28]:
from vega_datasets import data

weather = data.seattle_weather().assign(date=lambda df: df.date.astype(str))
weather = spark.createDataFrame(weather)
weather.show(5)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 5 rows



In [29]:
# Convert temperatures from C to F in the temp_max and temp_min columns

# I was trying to avoid doing it this way because if I had a lot of columns it would not be ideal,
# BUT, this works, and nothing else I was trying was working
weather = weather.select(
    'date', 'precipitation', 
    expr('round(temp_max * 9 / 5 + 32, 1)').alias('temp_max_f'),
    expr('round(temp_min * 9 / 5 + 32, 1)').alias('temp_min_f'),
    'wind', 'weather'
)

weather.show()

+----------+-------------+----------+----------+----+-------+
|      date|precipitation|temp_max_f|temp_min_f|wind|weather|
+----------+-------------+----------+----------+----+-------+
|2012-01-01|          0.0|      55.0|      41.0| 4.7|drizzle|
|2012-01-02|         10.9|      51.1|      37.0| 4.5|   rain|
|2012-01-03|          0.8|      53.1|      45.0| 2.3|   rain|
|2012-01-04|         20.3|      54.0|      42.1| 4.7|   rain|
|2012-01-05|          1.3|      48.0|      37.0| 6.1|   rain|
|2012-01-06|          2.5|      39.9|      36.0| 2.2|   rain|
|2012-01-07|          0.0|      45.0|      37.0| 2.3|   rain|
|2012-01-08|          0.0|      50.0|      37.0| 2.0|    sun|
|2012-01-09|          4.3|      48.9|      41.0| 3.4|   rain|
|2012-01-10|          1.0|      43.0|      33.1| 3.4|   rain|
|2012-01-11|          0.0|      43.0|      30.0| 5.1|    sun|
|2012-01-12|          0.0|      43.0|      28.9| 1.9|    sun|
|2012-01-13|          0.0|      41.0|      27.0| 1.3|    sun|
|2012-01

In [30]:
# Another way to do it
# weather = (weather.withColumn('temp_max_f', expr('round(temp_max * 9 / 5 + 32, 1)'))
#     .withColumn ('temp_min_f', expr('round(temp_min * 9 / 5 + 32, 1)'))
#     .drop('temp_max', 'temp_min')
# )

In [31]:
from pyspark.sql.functions import asc, desc

In [32]:
weather.sort(desc('date')).show()

+----------+-------------+----------+----------+----+-------+
|      date|precipitation|temp_max_f|temp_min_f|wind|weather|
+----------+-------------+----------+----------+----+-------+
|2015-12-31|          0.0|      42.1|      28.2| 3.5|    sun|
|2015-12-30|          0.0|      42.1|      30.2| 3.4|    sun|
|2015-12-29|          0.0|      45.0|      33.1| 2.6|    fog|
|2015-12-28|          1.5|      41.0|      35.1| 1.3|    fog|
|2015-12-27|          8.6|      39.9|      35.1| 2.9|    fog|
|2015-12-26|          0.0|      39.9|      32.0| 2.5|    sun|
|2015-12-25|          5.8|      41.0|      36.0| 1.5|    fog|
|2015-12-24|          2.5|      42.1|      36.0| 4.3|    fog|
|2015-12-23|          6.1|      41.0|      37.0| 7.6|    fog|
|2015-12-22|          4.6|      46.0|      37.0| 5.0|    fog|
|2015-12-21|         27.4|      42.1|      37.0| 4.3|    fog|
|2015-12-20|          4.3|      46.0|      39.9| 6.7|    fog|
|2015-12-19|          0.0|      46.9|      37.0| 4.1|    fog|
|2015-12

In [33]:
# Next question - display average monthly rainfall over the 4 years in the data set

In [34]:
# Following cells are how I can answer the question with a pandas dataframe
# Goal - display average monthly rainfall over the 4 years in the data set
wx = data.seattle_weather()
wx['year'] = wx.date.dt.year
wx['month'] = wx.date.dt.month
wx.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather,year,month
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle,2012,1
1,2012-01-02,10.9,10.6,2.8,4.5,rain,2012,1
2,2012-01-03,0.8,11.7,7.2,2.3,rain,2012,1
3,2012-01-04,20.3,12.2,5.6,4.7,rain,2012,1
4,2012-01-05,1.3,8.9,2.8,6.1,rain,2012,1


In [35]:
wx.groupby(by=['year','month']).precipitation.sum().groupby('month').mean

<bound method GroupBy.mean of <pandas.core.groupby.generic.SeriesGroupBy object at 0x10ca2b7c0>>

In [36]:
# NOW, how can I do it with a spark dataframe
weather.show(5)

+----------+-------------+----------+----------+----+-------+
|      date|precipitation|temp_max_f|temp_min_f|wind|weather|
+----------+-------------+----------+----------+----+-------+
|2012-01-01|          0.0|      55.0|      41.0| 4.7|drizzle|
|2012-01-02|         10.9|      51.1|      37.0| 4.5|   rain|
|2012-01-03|          0.8|      53.1|      45.0| 2.3|   rain|
|2012-01-04|         20.3|      54.0|      42.1| 4.7|   rain|
|2012-01-05|          1.3|      48.0|      37.0| 6.1|   rain|
+----------+-------------+----------+----------+----+-------+
only showing top 5 rows



In [37]:
from pyspark.sql.functions import month, year, quarter, sum

In [38]:
# Whoa. This worked. Let's see if I can round it
(
    weather.withColumn('year', year('date'))
    .withColumn('month', month('date'))
    .groupby('year', 'month').agg(sum('precipitation').alias('avg_month_precip'))
    .groupby('month').agg(avg('avg_month_precip').alias('avg_month_precip'))
    .sort('month').show()
)    

+-----+------------------+
|month|  avg_month_precip|
+-----+------------------+
|    1|116.49999999999999|
|    2|             105.5|
|    3|            151.55|
|    4|             93.85|
|    5|            51.875|
|    6|            33.225|
|    7|             12.05|
|    8|            40.925|
|    9| 58.87499999999999|
|   10|            125.85|
|   11|           160.625|
|   12|           155.675|
+-----+------------------+



In [39]:
# Whoa again. Lots of iteration to get to this.  But success!
# monthly average rainfalls are listed, rounded to the nearest decimal
(
    weather.withColumn('year', year('date'))
    .withColumn('month', month('date'))
    .groupby('year', 'month').agg(sum('precipitation').alias('avg_month_precip'))
    .groupby('month').agg(avg('avg_month_precip').alias('avg_month_precip'))
    .sort('month')
).select('month', round('avg_month_precip', 1).alias('avg_month_precip')).show()

+-----+----------------+
|month|avg_month_precip|
+-----+----------------+
|    1|           116.5|
|    2|           105.5|
|    3|           151.6|
|    4|            93.9|
|    5|            51.9|
|    6|            33.2|
|    7|            12.1|
|    8|            40.9|
|    9|            58.9|
|   10|           125.9|
|   11|           160.6|
|   12|           155.7|
+-----+----------------+



In [40]:
# thought I could do it quicker, but you need the groupby year in there to get the average monthly rainfall
# if you don't, you'll get the average daily rainfall for each month (which is what this is)
(
    weather.withColumn('month', month('date'))
    .groupby('month').agg(avg('precipitation').alias('avg_month_precip'))
    .sort('month')
).show()

+-----+-------------------+
|month|   avg_month_precip|
+-----+-------------------+
|    1| 3.7580645161290316|
|    2|  3.734513274336283|
|    3|  4.888709677419355|
|    4|  3.128333333333333|
|    5| 1.6733870967741935|
|    6| 1.1075000000000002|
|    7|0.38870967741935486|
|    8| 1.3201612903225806|
|    9| 1.9624999999999997|
|   10|  4.059677419354839|
|   11|  5.354166666666667|
|   12|  5.021774193548389|
+-----+-------------------+



In [41]:
# Which year was the windiest (highest average wind)
(
    weather.withColumn('year', year('date'))
    .groupby('year').agg(avg('wind').alias('avg_yearly_wind'))
    .select('year', round('avg_yearly_wind',2).alias('avg_yearly_wind'))
    .sort(desc('avg_yearly_wind'))
    .show()
)

+----+---------------+
|year|avg_yearly_wind|
+----+---------------+
|2012|            3.4|
|2014|           3.39|
|2015|           3.16|
|2013|           3.02|
+----+---------------+



In [42]:
# what is the most frequenty type of weather in January? --- fog
(
    weather.withColumn('month', month('date'))
    .where('month = 1')
    .groupby('weather')
    .count()
    .sort(desc('count'))
    .show()
)

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
|   rain|   35|
|    sun|   33|
|drizzle|   10|
|   snow|    8|
+-------+-----+



In [43]:
# what is the average high and low temps on sunny days in July in 2013 and 2014
(
    weather.withColumn('year', year('date'))
    .withColumn('month', month('date'))
    .where('month = 7')
    .where('weather = "sun"')
    .groupby('year', 'month')
    .agg(avg('temp_max_f').alias('avg_max'), avg('temp_min_f').alias('avg_min'))

).select('year', 'month', round('avg_max', 1).alias('avg_max_temp_sun_days'), round('avg_min', 1).alias('avg_min_temp_sun_days')).where('year > 2012').where('year < 2015').show()

+----+-----+---------------------+---------------------+
|year|month|avg_max_temp_sun_days|avg_min_temp_sun_days|
+----+-----+---------------------+---------------------+
|2013|    7|                 79.9|                 57.1|
|2014|    7|                 80.8|                 57.9|
+----+-----+---------------------+---------------------+



In [44]:
from pyspark.sql.functions import concat, sum, avg, min, max, count, mean

In [45]:
# what percentage of days were rainy in q3 of 2015?
# ... having trouble getting percentages
(
    weather.withColumn('year', year('date'))
    .withColumn('month', month('date'))
    .where('year = 2015')
    .where('month > 6')
    .where('month < 10')
    .groupby('weather')
    .count()

).show()

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   21|
|drizzle|    5|
|   rain|    2|
|    sun|   64|
+-------+-----+



In [46]:
# look at this! I can boolean mask!
weather[weather.weather == 'drizzle'].show(3)

+----------+-------------+----------+----------+----+-------+
|      date|precipitation|temp_max_f|temp_min_f|wind|weather|
+----------+-------------+----------+----------+----+-------+
|2012-01-01|          0.0|      55.0|      41.0| 4.7|drizzle|
|2012-01-27|          0.0|      44.1|      28.0| 1.4|drizzle|
|2012-02-15|          0.0|      45.0|      33.1| 1.8|drizzle|
+----------+-------------+----------+----------+----+-------+
only showing top 3 rows



In [47]:
# add year and date to weather df
weather = (
    weather.withColumn('year', year('date'))
    .withColumn('month', month('date'))
)
weather.show(3)

+----------+-------------+----------+----------+----+-------+----+-----+
|      date|precipitation|temp_max_f|temp_min_f|wind|weather|year|month|
+----------+-------------+----------+----------+----+-------+----+-----+
|2012-01-01|          0.0|      55.0|      41.0| 4.7|drizzle|2012|    1|
|2012-01-02|         10.9|      51.1|      37.0| 4.5|   rain|2012|    1|
|2012-01-03|          0.8|      53.1|      45.0| 2.3|   rain|2012|    1|
+----------+-------------+----------+----------+----+-------+----+-----+
only showing top 3 rows



In [48]:
# get a temp df with only the year and quarter in question
temp = weather[(weather.year == 2015) & (weather.month > 6) & (weather.month < 10)].groupby('weather').count()
temp.show()

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   21|
|drizzle|    5|
|   rain|    2|
|    sun|   64|
+-------+-----+



In [49]:
# i want to add a percentage column to temp above, but I need to get the total count somehow
total = temp.select(sum(temp['count']).alias('sum_value'))
total.collect()[0][0]

92

In [50]:
# or just like this
temp.select(sum(temp['count'])).collect()[0][0]

92

In [51]:
# use above code to create the percent column
temp.withColumn('percentage', round((temp['count'] / temp.select(sum(temp['count'])).collect()[0][0] * 100),1)).show()

+-------+-----+----------+
|weather|count|percentage|
+-------+-----+----------+
|    fog|   21|      22.8|
|drizzle|    5|       5.4|
|   rain|    2|       2.2|
|    sun|   64|      69.6|
+-------+-----+----------+



In [52]:
# LAST QUESTION!
# For each year, find what percentage of days it rained (had non-zero precipitation).
weather.show(5)

+----------+-------------+----------+----------+----+-------+----+-----+
|      date|precipitation|temp_max_f|temp_min_f|wind|weather|year|month|
+----------+-------------+----------+----------+----+-------+----+-----+
|2012-01-01|          0.0|      55.0|      41.0| 4.7|drizzle|2012|    1|
|2012-01-02|         10.9|      51.1|      37.0| 4.5|   rain|2012|    1|
|2012-01-03|          0.8|      53.1|      45.0| 2.3|   rain|2012|    1|
|2012-01-04|         20.3|      54.0|      42.1| 4.7|   rain|2012|    1|
|2012-01-05|          1.3|      48.0|      37.0| 6.1|   rain|2012|    1|
+----------+-------------+----------+----------+----+-------+----+-----+
only showing top 5 rows



In [53]:
# just getting a feel for the number of days with rain
weather[weather.precipitation == 0].count()

838

In [54]:
weather.count()

1461

In [55]:
# this is neat, but ultimately not as helpful as I hoped
weather.groupby('month').pivot('year').sum('precipitation').sort('month').show()

+-----+------------------+------------------+------------------+------------------+
|month|              2012|              2013|              2014|              2015|
+-----+------------------+------------------+------------------+------------------+
|    1|173.29999999999998|105.69999999999997| 93.99999999999999| 92.99999999999999|
|    2|              92.3|40.300000000000004|155.20000000000002|134.19999999999996|
|    3|             183.0|              69.7|240.00000000000003|113.49999999999997|
|    4| 68.09999999999998|149.60000000000002|106.10000000000001| 51.59999999999999|
|    5|52.199999999999996| 60.49999999999999| 79.99999999999999|14.799999999999999|
|    6|              75.1|              33.1|18.800000000000004|5.8999999999999995|
|    7|              26.3|               0.0|              19.6|               2.3|
|    8|               0.0|              34.4| 45.99999999999999|              83.3|
|    9|0.8999999999999999|156.79999999999998|56.699999999999996|            

In [56]:
# Now, we're talking.  used cast to cast a boolean to an int and later sum it and count it to help calculate percentage
temp = (
    weather.withColumn('is_rain_day', (weather.precipitation == 0).cast('int'))
    .groupby('year')
    .agg(sum('is_rain_day').alias('rainy_days'), count('is_rain_day').alias('total_days'))
    .withColumn('percent_rainy', expr('round(rainy_days / total_days * 100,1)'))
    
)
temp.show()

+----+----------+----------+-------------+
|year|rainy_days|total_days|percent_rainy|
+----+----------+----------+-------------+
|2012|       189|       366|         51.6|
|2013|       213|       365|         58.4|
|2014|       215|       365|         58.9|
|2015|       221|       365|         60.5|
+----+----------+----------+-------------+

