# Caching Data

# 1 Load Data

In [1]:
# Base location of the weather data set; the yearly measurement directories
# and the "isd-history" station file are read relative to this prefix
storageLocation = "s3://dimajix-training/data/weather"

## 1.1 Load Measurements

In [2]:
from pyspark.sql.functions import *
from functools import reduce

# Read every year from 2003 through 2014 as raw text lines and tag each line
# with the year it came from; the per-year DataFrames are kept in a Python list
raw_weather_per_year = [
    spark.read.text(storageLocation + "/" + str(year)).withColumn("year", lit(year))
    for year in range(2003, 2015)
]

# Fold the per-year DataFrames pairwise into one DataFrame covering all years
raw_weather = reduce(
    lambda combined, yearly: combined.union(yearly),
    raw_weather_per_year,
)

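For the rest of this notebook we use only a single year (2003) instead of the union of all years, which keeps the execution plans small and readable.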

In [3]:
# Rebind raw_weather to the 2003 data only, tagged with a constant "year" column
raw_weather = (
    spark.read
    .text(storageLocation + "/2003")
    .withColumn("year", lit(2003))
)

In [4]:
def _fixed_width(start, length):
    """Return the slice of the raw text column `value` that begins at `start` and spans `length` characters."""
    return substring(col("value"), start, length)


def _scaled_by_ten(start, length):
    """Return a fixed-width slice cast to float and divided by 10.0."""
    return _fixed_width(start, length).cast("float") / lit(10.0)


# Cut the fixed-width raw line into named columns; the numeric measurements
# are divided by ten, all other fields stay strings
weather = raw_weather.select(
    col("year"),
    _fixed_width(5, 6).alias("usaf"),
    _fixed_width(11, 5).alias("wban"),
    _fixed_width(16, 8).alias("date"),
    _fixed_width(24, 4).alias("time"),
    _fixed_width(42, 5).alias("report_type"),
    _fixed_width(61, 3).alias("wind_direction"),
    _fixed_width(64, 1).alias("wind_direction_qual"),
    _fixed_width(65, 1).alias("wind_observation"),
    _scaled_by_ten(66, 4).alias("wind_speed"),
    _fixed_width(70, 1).alias("wind_speed_qual"),
    _scaled_by_ten(88, 5).alias("air_temperature"),
    _fixed_width(93, 1).alias("air_temperature_qual"),
)

# Show the first ten extracted records as a pandas DataFrame
weather.limit(10).toPandas()

Unnamed: 0,year,usaf,wban,date,time,report_type,wind_direction,wind_direction_qual,wind_observation,wind_speed,wind_speed_qual,air_temperature,air_temperature_qual
0,2003,703160,25624,20030101,0,SY-MT,10,5,N,5.2,5,-0.6,5
1,2003,703160,25624,20030101,17,FM-16,20,1,N,4.6,1,-2.0,1
2,2003,703160,25624,20030101,53,FM-15,10,5,N,5.2,5,-2.8,5
3,2003,703160,25624,20030101,100,NSRDB,999,9,9,999.9,9,999.9,9
4,2003,703160,25624,20030101,153,FM-15,10,5,N,6.2,5,-2.2,5
5,2003,703160,25624,20030101,200,NSRDB,999,9,9,999.9,9,999.9,9
6,2003,703160,25624,20030101,253,FM-15,10,5,N,7.2,5,-3.3,5
7,2003,703160,25624,20030101,300,NSRDB,999,9,9,999.9,9,999.9,9
8,2003,703160,25624,20030101,353,FM-15,20,5,N,6.2,5,-1.1,5
9,2003,703160,25624,20030101,400,NSRDB,999,9,9,999.9,9,999.9,9


## 1.2 Load Station Metadata

In [5]:
# Load the station metadata CSV; its first line supplies the column names
stations = (
    spark.read
    .option("header", True)
    .csv(storageLocation + "/isd-history")
)

# Show the first ten station records as a pandas DataFrame
stations.limit(10).toPandas()

Unnamed: 0,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),BEGIN,END
0,7005,99999,CWOS 07005,,,,,,,20120127,20120127
1,7011,99999,CWOS 07011,,,,,,,20111025,20121129
2,7018,99999,WXPOD 7018,,,,0.0,0.0,7018.0,20110309,20130730
3,7025,99999,CWOS 07025,,,,,,,20120127,20120127
4,7026,99999,WXPOD 7026,AF,,,0.0,0.0,7026.0,20120713,20141120
5,7034,99999,CWOS 07034,,,,,,,20121024,20121106
6,7037,99999,CWOS 07037,,,,,,,20111202,20121125
7,7044,99999,CWOS 07044,,,,,,,20120127,20120127
8,7047,99999,CWOS 07047,,,,,,,20120613,20120717
9,7052,99999,CWOS 07052,,,,,,,20121129,20121130


# 2 Caching Data

## 2.1 Original Execution Plan

In [6]:
# Match every measurement to its station via the (usaf, wban) key pair
station_match = (weather.usaf == stations.USAF) & (weather.wban == stations.WBAN)
df = weather.join(stations, station_match)

# Temperature restricted to rows whose quality flag equals 1; rows with any
# other flag yield NULL here (no otherwise() branch)
# NOTE: min/max below are the Spark column functions from the wildcard import
# of pyspark.sql.functions, not the Python builtins
checked_temperature = when(df.air_temperature_qual == lit(1), df.air_temperature)

result = df.groupBy(df.CTRY, df.year).agg(
    min(checked_temperature).alias('min_temp'),
    max(checked_temperature).alias('max_temp'),
)

# Print the extended execution plan
result.explain(True)

== Parsed Logical Plan ==
'Aggregate [CTRY#127, year#84], [CTRY#127, year#84, min(CASE WHEN (air_temperature_qual#98 = 1) THEN air_temperature#97 END) AS min_temp#255, max(CASE WHEN (air_temperature_qual#98 = 1) THEN air_temperature#97 END) AS max_temp#257]
+- AnalysisBarrier
      +- Join Inner, ((usaf#87 = USAF#124) && (wban#88 = WBAN#125))
         :- Project [year#84, substring(value#82, 5, 6) AS usaf#87, substring(value#82, 11, 5) AS wban#88, substring(value#82, 16, 8) AS date#89, substring(value#82, 24, 4) AS time#90, substring(value#82, 42, 5) AS report_type#91, substring(value#82, 61, 3) AS wind_direction#92, substring(value#82, 64, 1) AS wind_direction_qual#93, substring(value#82, 65, 1) AS wind_observation#94, (cast(cast(substring(value#82, 66, 4) as float) as double) / cast(10.0 as double)) AS wind_speed#95, substring(value#82, 70, 1) AS wind_speed_qual#96, (cast(cast(substring(value#82, 88, 5) as float) as double) / cast(10.0 as double)) AS air_temperature#97, substring(val

## 2.2 Caching Weather

In [25]:
# Mark the weather DataFrame for caching; the call returns the DataFrame itself
# (presumably nothing is materialized until an action runs — see Spark docs)
weather.cache()

DataFrame[year: int, usaf: string, wban: string, date: string, time: string, report_type: string, wind_direction: string, wind_direction_qual: string, wind_observation: string, wind_speed: double, wind_speed_qual: string, air_temperature: double, air_temperature_qual: string]

In [26]:
# Run an action over all records of weather (presumably to populate the cache
# requested in the previous step)
weather.count()

1798753

In [13]:
# Print the extended plan output (parsed, analyzed, ... plans) of weather
weather.explain(True)

== Parsed Logical Plan ==
'Project [unresolvedalias('year, None), substring('value, 5, 6) AS usaf#87, substring('value, 11, 5) AS wban#88, substring('value, 16, 8) AS date#89, substring('value, 24, 4) AS time#90, substring('value, 42, 5) AS report_type#91, substring('value, 61, 3) AS wind_direction#92, substring('value, 64, 1) AS wind_direction_qual#93, substring('value, 65, 1) AS wind_observation#94, (cast(substring('value, 66, 4) as float) / 10.0) AS wind_speed#95, substring('value, 70, 1) AS wind_speed_qual#96, (cast(substring('value, 88, 5) as float) / 10.0) AS air_temperature#97, substring('value, 93, 1) AS air_temperature_qual#98]
+- AnalysisBarrier
      +- Project [value#82, 2003 AS year#84]
         +- Relation[value#82] text

== Analyzed Logical Plan ==
year: int, usaf: string, wban: string, date: string, time: string, report_type: string, wind_direction: string, wind_direction_qual: string, wind_observation: string, wind_speed: double, wind_speed_qual: string, air_temperatur

Note the InMemoryRelation! Caching is always a two-step operation:
* Creating the cache (InMemoryRelation)
* Using the cache (InMemoryTableScan)

In [8]:
# Rebuild the min/max temperature aggregation on the joined DataFrame `df`
# (defined in an earlier cell) and print its extended execution plan.
# Only rows whose temperature quality flag equals 1 contribute a value.
flagged_temperature = when(df.air_temperature_qual == lit(1), df.air_temperature)

result = df.groupBy(df.CTRY, df.year).agg(
    min(flagged_temperature).alias('min_temp'),
    max(flagged_temperature).alias('max_temp'),
)
result.explain(True)

== Parsed Logical Plan ==
'Aggregate [CTRY#127, year#84], [CTRY#127, year#84, min(CASE WHEN (air_temperature_qual#98 = 1) THEN air_temperature#97 END) AS min_temp#358, max(CASE WHEN (air_temperature_qual#98 = 1) THEN air_temperature#97 END) AS max_temp#360]
+- AnalysisBarrier
      +- Join Inner, ((usaf#87 = USAF#124) && (wban#88 = WBAN#125))
         :- Project [year#84, substring(value#82, 5, 6) AS usaf#87, substring(value#82, 11, 5) AS wban#88, substring(value#82, 16, 8) AS date#89, substring(value#82, 24, 4) AS time#90, substring(value#82, 42, 5) AS report_type#91, substring(value#82, 61, 3) AS wind_direction#92, substring(value#82, 64, 1) AS wind_direction_qual#93, substring(value#82, 65, 1) AS wind_observation#94, (cast(cast(substring(value#82, 66, 4) as float) as double) / cast(10.0 as double)) AS wind_speed#95, substring(value#82, 70, 1) AS wind_speed_qual#96, (cast(cast(substring(value#82, 88, 5) as float) as double) / cast(10.0 as double)) AS air_temperature#97, substring(val

Things to note:
* Cache always consists of two steps:
  * InMemoryRelation (building the cache)
  * InMemoryTableScan (reading the cache)
* Cache contains ALL columns of weather
* Filter operation of JOIN is performed after caching

Caching is an optimization barrier!

## 2.3 Uncaching Data

In [27]:
# Drop the cached data of weather again; True is passed as the first
# positional argument (the `blocking` flag in PySpark's API — TODO confirm
# for the Spark version in use)
weather.unpersist(True)

DataFrame[year: int, usaf: string, wban: string, date: string, time: string, report_type: string, wind_direction: string, wind_direction_qual: string, wind_observation: string, wind_speed: double, wind_speed_qual: string, air_temperature: double, air_temperature_qual: string]

In [28]:
# Print the extended plan output of weather again, this time after unpersist
weather.explain(True)

== Parsed Logical Plan ==
'Project [unresolvedalias('year, None), substring('value, 5, 6) AS usaf#87, substring('value, 11, 5) AS wban#88, substring('value, 16, 8) AS date#89, substring('value, 24, 4) AS time#90, substring('value, 42, 5) AS report_type#91, substring('value, 61, 3) AS wind_direction#92, substring('value, 64, 1) AS wind_direction_qual#93, substring('value, 65, 1) AS wind_observation#94, (cast(substring('value, 66, 4) as float) / 10.0) AS wind_speed#95, substring('value, 70, 1) AS wind_speed_qual#96, (cast(substring('value, 88, 5) as float) / 10.0) AS air_temperature#97, substring('value, 93, 1) AS air_temperature_qual#98]
+- AnalysisBarrier
      +- Project [value#82, 2003 AS year#84]
         +- Relation[value#82] text

== Analyzed Logical Plan ==
year: int, usaf: string, wban: string, date: string, time: string, report_type: string, wind_direction: string, wind_direction_qual: string, wind_observation: string, wind_speed: double, wind_speed_qual: string, air_temperatur

Strange: the execution plan still shows an `InMemoryRelation`, although the Spark UI shows that the storage has been freed.

In [29]:
# Run an action over all records again to see whether the cache gets rebuilt
weather.count()

1798753

Running the action did not repopulate the cache either. This seems to be a bug in PySpark.

# 3 Cache Levels

In [10]:
from pyspark.storagelevel import StorageLevel

# Walk through the available storage levels one after another.
#
# persist() on a DataFrame that is already persisted is ignored by Spark (it
# only logs a warning), so calling persist() repeatedly would leave only the
# FIRST level in effect. Each level is therefore preceded by an unpersist()
# so that it is actually applied.
#
# NOTE(review): MEMORY_ONLY_SER and MEMORY_ONLY_SER_2 are deprecated in
# PySpark 2.x and no longer exist in PySpark 3 — confirm against the Spark
# version this notebook targets.
storage_levels = [
    # single replica
    StorageLevel.MEMORY_ONLY,
    StorageLevel.MEMORY_ONLY_SER,
    StorageLevel.DISK_ONLY,
    StorageLevel.MEMORY_AND_DISK,
    # "_2" variants of the levels above
    StorageLevel.MEMORY_ONLY_2,
    StorageLevel.MEMORY_ONLY_SER_2,
    StorageLevel.DISK_ONLY_2,
    StorageLevel.MEMORY_AND_DISK_2,
]

for level in storage_levels:
    # Drop any previous persistence first; a no-op if nothing is persisted
    weather.unpersist(True)
    weather.persist(level)

# Keep the DataFrame as the cell's last expression so it is displayed
weather


DataFrame[year: int, usaf: string, wban: string, date: string, time: string, report_type: string, wind_direction: string, wind_direction_qual: string, wind_observation: string, wind_speed: double, wind_speed_qual: string, air_temperature: double, air_temperature_qual: string]