In [1]:
import pyspark
import pandas as pd
import numpy as np
from pyspark.sql import functions as F
from pydataset import data

In [2]:
# Define the data
data = {
    'Languages': [
        'Python',
        'Java',
        'JavaScript',
        'C',
        'C++',
        'C#',
        'R',
        'Go',
        'Swift',
        'PHP',
        'Ruby',
        'Kotlin',
        'TypeScript',
        'Scala',
        'Rust',
        'Lua',
        'Perl',
        'SQL',
        'Bash/Shell/PowerShell',
        'Matlab',
    ],
    'First Appeared': [
        1991,
        1995,
        1995,
        1972,
        1985,
        2000,
        1993,
        2007,
        2014,
        1995,
        1995,
        2011,
        2012,
        2003,
        2010,
        1993,
        1987,
        1974,
        1989,
        1984,
    ],
}

# Create the DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print(df)

                Languages  First Appeared
0                  Python            1991
1                    Java            1995
2              JavaScript            1995
3                       C            1972
4                     C++            1985
5                      C#            2000
6                       R            1993
7                      Go            2007
8                   Swift            2014
9                     PHP            1995
10                   Ruby            1995
11                 Kotlin            2011
12             TypeScript            2012
13                  Scala            2003
14                   Rust            2010
15                    Lua            1993
16                   Perl            1987
17                    SQL            1974
18  Bash/Shell/PowerShell            1989
19                 Matlab            1984


Create a spark data frame that contains your favorite programming languages.

- The name of the column should be language

In [3]:
#  initialize a session
spark = pyspark.sql.SparkSession.builder.getOrCreate()

23/05/22 12:09:44 WARN Utils: Your hostname, Gabriels-iMac.local resolves to a loopback address: 127.0.0.1; using 172.20.10.10 instead (on interface en1)
23/05/22 12:09:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/22 12:09:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/22 12:09:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# using the spark session, turn our tabular data into a spark df
langs = spark.createDataFrame(df)

In [5]:
langs = langs.withColumnRenamed('languages', 'language')

- View the schema of the dataframe

In [6]:
langs.printSchema()

root
 |-- language: string (nullable = true)
 |-- First Appeared: long (nullable = true)



- Output the shape of the dataframe

In [7]:
print(f'{langs.count()} rows, {len(langs.columns)} columns')

[Stage 0:>                                                          (0 + 4) / 4]

20 rows, 2 columns


                                                                                

- Show the first 5 records in the dataframe

In [8]:
langs.show(5)

+----------+--------------+
|  language|First Appeared|
+----------+--------------+
|    Python|          1991|
|      Java|          1995|
|JavaScript|          1995|
|         C|          1972|
|       C++|          1985|
+----------+--------------+
only showing top 5 rows



Load the mpg dataset as a spark dataframe.

In [9]:
mpg = spark.createDataFrame(data('mpg'))

TypeError: 'dict' object is not callable

Create 1 column of output that contains a message like the one below:


The 1999 audi a4 has a 4 cylinder engine.
For each vehicle.

In [None]:
mpg.select(
    (F.concat(
        F.lit('The '),
        mpg.year,
        F.lit(' '),
        mpg.manufacturer,
        F.lit(' '),
        mpg.model,
        F.lit(' has a '),
        mpg.cyl,
        F.lit(' cylinder engine')
    )).alias('statement')).show(5, truncate=False)

Transform the trans column so that it only contains either manual or auto.

In [None]:
mpg.select(
    (
        F.when(
        mpg.trans.contains('auto'), 'auto'
    ).otherwise('manual')
    ).alias('trans_type')
).show()

Load the tips dataset as a spark dataframe.

In [None]:
# make the variable tips,
# use the spark session to create a tips dataframe
# tips dataframe comes from pydataset tips function
tips = spark.createDataFrame(data('tips'))

- What percentage of observations are smokers?

In [None]:
tips.columns

In [None]:
tips.show(5)

In [None]:
(tips.filter(tips.smoker == 'Yes').count() / tips.count()) * 100

In [None]:
tips.groupby(tips.smoker).agg(
    F.round((F.count('time') / tips.count()),2).alias('perc')
).show()

- Create a column that contains the tip percentage

In [None]:
tips.withColumn(
    'tip_perc',
    F.round((tips.tip / tips.total_bill) * 100, 2)
).select('tip','total_bill','tip_perc').show(5)

- Calculate the average tip percentage for each combination of sex and smoker.

In [None]:
tips.withColumn(
    'tip_perc',
    F.round((tips.tip / tips.total_bill) * 100, 2)
).groupby('sex','smoker').agg(
    F.round(F.mean('tip_perc'), 2).alias('avg_perc')).show()

In [None]:
tips = tips.withColumn(
    'tip_perc',
    F.round((tips.tip / tips.total_bill) * 100, 2)
)

In [None]:
tips.withColumn(
    'tip_perc',
    F.round((tips.tip / tips.total_bill) * 100, 2)
).groupby('sex').pivot('smoker').agg(
    F.round(F.avg('tip_perc'), 2).alias('avg_perc')).show()

Use the seattle weather dataset referenced in the lesson to answer the questions below.

In [11]:
from vega_datasets import data
weather = data.seattle_weather()
weather = spark.createDataFrame(weather)

In [12]:
# head equivalent:
weather.show(2, vertical=True)

-RECORD 0----------------------------
 date          | 2012-01-01 00:00:00 
 precipitation | 0.0                 
 temp_max      | 12.8                
 temp_min      | 5.0                 
 wind          | 4.7                 
 weather       | drizzle             
-RECORD 1----------------------------
 date          | 2012-01-02 00:00:00 
 precipitation | 10.9                
 temp_max      | 10.6                
 temp_min      | 2.8                 
 wind          | 4.5                 
 weather       | rain                
only showing top 2 rows



- Convert the temperatures to fahrenheit.

In [13]:
weather = weather.withColumn('temp_min', 
                   weather.temp_min * (9/5) + 32
                  ).withColumn(
    'temp_max', 
    weather.temp_max * (9/5) + 32
                  )

In [14]:
weather.select('temp_min', 'temp_max').show(5)

+--------+------------------+
|temp_min|          temp_max|
+--------+------------------+
|    41.0|55.040000000000006|
|   37.04|             51.08|
|   44.96|             53.06|
|   42.08|             53.96|
|   37.04|48.019999999999996|
+--------+------------------+
only showing top 5 rows



- Which month has the most rain, on average?

In [15]:
weather.filter(weather.weather == 'rain'
).\
withColumn(
    'month',
    F.month(weather.date)
).groupby('month').agg(
    (F.mean('precipitation')).alias('mean_precip'),
    (F.sum('precipitation')).alias('total_precip')
    ).sort('total_precip').show()

+-----+-------------------+------------------+
|month|        mean_precip|      total_precip|
+-----+-------------------+------------------+
|    9|0.22499999999999998|0.8999999999999999|
|    7| 1.8785714285714286|              26.3|
|    8|  6.433333333333334|              38.6|
|    5| 3.2624999999999997|52.199999999999996|
|    4|  3.429999999999999| 68.59999999999998|
|    6|  3.952631578947368|              75.1|
|   12| 5.0260869565217385|115.59999999999998|
|    2| 3.1725000000000003|             126.9|
|    3|  4.921621621621622|             182.1|
|   10|              9.675|             193.5|
|   11|               8.42|             210.5|
|    1|  6.614285714285714|231.49999999999997|
+-----+-------------------+------------------+



- Which year was the windiest?

In [19]:
weather.withColumn(
    'year',
    F.year(weather.date)
).groupby(
'year'
).agg(
 (F.mean('wind')).alias('mean_wind'),
    (F.sum('wind')).alias('total_wind')   
).sort(F.col('total_wind').desc()).first()

Row(year=2012, mean_wind=3.400819672131147, total_wind=1244.6999999999998)

- What is the most frequent type of weather in January?

In [20]:
weather.filter(
    F.month(weather.date) == 1
).groupby('weather').count().sort(F.col('count').desc()).show()

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
|   rain|   35|
|    sun|   33|
|drizzle|   10|
|   snow|    8|
+-------+-----+



- What is the average high and low temperature on sunny days in July in 2013 and 2014?

In [16]:
# steps ahead:
# filter for: july (month)
# year (2013, 2014)
# weather: sunny
# aggregate on high and low temp averages
weather.filter(
    (F.month(weather.date) == 7) &
    (F.year(weather.date).isin(['2013','2014'])) &
    (weather.weather == 'sun')
).agg(
    F.round(F.mean('temp_max'),2).alias('avg_high'), 
      F.round(F.mean('temp_min'), 2).alias('avg_low')
).show()

+--------+-------+
|avg_high|avg_low|
+--------+-------+
|   80.29|  57.53|
+--------+-------+



- What percentage of days were rainy in q3 of 2015?

In [17]:
# filter for q3:
# filter for year
# calculate percentage of rainy days
weather.filter(
    F.quarter(weather.date) == 3).filter(
    F.year(weather.date) == 2015
).select(
    (F.when(weather.weather == 'rain', 1).otherwise(0)).alias('rain')
).agg((F.round(F.mean('rain') * 100, 2)).alias('perc_rain')).show()

+---------+
|perc_rain|
+---------+
|     2.17|
+---------+



- For each year, find what percentage of days it rained (had non-zero precipitation).

In [18]:
# nonzero precipitation
weather.withColumn(
    'year',
    F.year(weather.date)).withColumn(
    'has_rained',
    F.expr('precipitation > 0').cast('int')
).groupby('year').agg(
    F.round(F.avg('has_rained') * 100, 2).alias('percent_precip')
).show()
    

+----+--------------+
|year|percent_precip|
+----+--------------+
|2012|         48.36|
|2013|         41.64|
|2014|          41.1|
|2015|         39.45|
+----+--------------+

