# Checkpointing DataFrames

# 1 Load Data

In [1]:
storageLocation = "s3://dimajix-training/data/weather"

## 1.1 Load Measurements

In [2]:
from pyspark.sql.functions import *
from functools import reduce

# Read in all years, store them in an Python array
raw_weather_per_year = [spark.read.text(storageLocation + "/" + str(i)).withColumn("year", lit(i)) for i in range(2003,2015)]

# Union all years together
raw_weather = reduce(lambda l,r: l.union(r), raw_weather_per_year)                        

Use a single year to keep execution plans small

In [3]:
raw_weather = spark.read.text(storageLocation + "/2003").withColumn("year", lit(2003))

In [4]:
weather = raw_weather.select(
    col("year"),
    substring(col("value"),5,6).alias("usaf"),
    substring(col("value"),11,5).alias("wban"),
    substring(col("value"),16,8).alias("date"),
    substring(col("value"),24,4).alias("time"),
    substring(col("value"),42,5).alias("report_type"),
    substring(col("value"),61,3).alias("wind_direction"),
    substring(col("value"),64,1).alias("wind_direction_qual"),
    substring(col("value"),65,1).alias("wind_observation"),
    (substring(col("value"),66,4).cast("float") / lit(10.0)).alias("wind_speed"),
    substring(col("value"),70,1).alias("wind_speed_qual"),
    (substring(col("value"),88,5).cast("float") / lit(10.0)).alias("air_temperature"),
    substring(col("value"),93,1).alias("air_temperature_qual")
)
    
weather.limit(10).toPandas()

Unnamed: 0,year,usaf,wban,date,time,report_type,wind_direction,wind_direction_qual,wind_observation,wind_speed,wind_speed_qual,air_temperature,air_temperature_qual
0,2003,703160,25624,20030101,0,SY-MT,10,5,N,5.2,5,-0.6,5
1,2003,703160,25624,20030101,17,FM-16,20,1,N,4.6,1,-2.0,1
2,2003,703160,25624,20030101,53,FM-15,10,5,N,5.2,5,-2.8,5
3,2003,703160,25624,20030101,100,NSRDB,999,9,9,999.9,9,999.9,9
4,2003,703160,25624,20030101,153,FM-15,10,5,N,6.2,5,-2.2,5
5,2003,703160,25624,20030101,200,NSRDB,999,9,9,999.9,9,999.9,9
6,2003,703160,25624,20030101,253,FM-15,10,5,N,7.2,5,-3.3,5
7,2003,703160,25624,20030101,300,NSRDB,999,9,9,999.9,9,999.9,9
8,2003,703160,25624,20030101,353,FM-15,20,5,N,6.2,5,-1.1,5
9,2003,703160,25624,20030101,400,NSRDB,999,9,9,999.9,9,999.9,9


## 1.2 Load Station Metadata

In [5]:
stations = spark.read \
    .option("header", True) \
    .csv(storageLocation + "/isd-history")

# Display first 10 records    
stations.limit(10).toPandas()

Unnamed: 0,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),BEGIN,END
0,7005,99999,CWOS 07005,,,,,,,20120127,20120127
1,7011,99999,CWOS 07011,,,,,,,20111025,20121129
2,7018,99999,WXPOD 7018,,,,0.0,0.0,7018.0,20110309,20130730
3,7025,99999,CWOS 07025,,,,,,,20120127,20120127
4,7026,99999,WXPOD 7026,AF,,,0.0,0.0,7026.0,20120713,20141120
5,7034,99999,CWOS 07034,,,,,,,20121024,20121106
6,7037,99999,CWOS 07037,,,,,,,20111202,20121125
7,7044,99999,CWOS 07044,,,,,,,20120127,20120127
8,7047,99999,CWOS 07047,,,,,,,20120613,20120717
9,7052,99999,CWOS 07052,,,,,,,20121129,20121130


# 3 Truncating Execution Plans

In [13]:
spark.sparkContext.setCheckpointDir("/tmp/checkpoints")

In [20]:
weather.checkpoint()

DataFrame[year: int, usaf: string, wban: string, date: string, time: string, report_type: string, wind_direction: string, wind_direction_qual: string, wind_observation: string, wind_speed: double, wind_speed_qual: string, air_temperature: double, air_temperature_qual: string]

In [21]:
weather.explain()

== Physical Plan ==
*(1) Project [2003 AS year#297, substring(value#295, 5, 6) AS usaf#823, substring(value#295, 11, 5) AS wban#824, substring(value#295, 16, 8) AS date#825, substring(value#295, 24, 4) AS time#826, substring(value#295, 42, 5) AS report_type#827, substring(value#295, 61, 3) AS wind_direction#828, substring(value#295, 64, 1) AS wind_direction_qual#829, substring(value#295, 65, 1) AS wind_observation#830, (cast(cast(substring(value#295, 66, 4) as float) as double) / 10.0) AS wind_speed#831, substring(value#295, 70, 1) AS wind_speed_qual#832, (cast(cast(substring(value#295, 88, 5) as float) as double) / 10.0) AS air_temperature#833, substring(value#295, 93, 1) AS air_temperature_qual#834]
+- *(1) FileScan text [value#295] Batched: false, Format: Text, Location: InMemoryFileIndex[s3://dimajix-training/data/weather/2003], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<value:string>


In [16]:
weather.unpersist()

DataFrame[year: int, usaf: string, wban: string, date: string, time: string, report_type: string, wind_direction: string, wind_direction_qual: string, wind_observation: string, wind_speed: double, wind_speed_qual: string, air_temperature: double, air_temperature_qual: string]

In [24]:
weather.select("usaf").explain(True)

== Parsed Logical Plan ==
'Project [unresolvedalias('usaf, None)]
+- AnalysisBarrier
      +- Project [year#297, substring(value#295, 5, 6) AS usaf#823, substring(value#295, 11, 5) AS wban#824, substring(value#295, 16, 8) AS date#825, substring(value#295, 24, 4) AS time#826, substring(value#295, 42, 5) AS report_type#827, substring(value#295, 61, 3) AS wind_direction#828, substring(value#295, 64, 1) AS wind_direction_qual#829, substring(value#295, 65, 1) AS wind_observation#830, (cast(cast(substring(value#295, 66, 4) as float) as double) / cast(10.0 as double)) AS wind_speed#831, substring(value#295, 70, 1) AS wind_speed_qual#832, (cast(cast(substring(value#295, 88, 5) as float) as double) / cast(10.0 as double)) AS air_temperature#833, substring(value#295, 93, 1) AS air_temperature_qual#834]
         +- Project [value#295, 2003 AS year#297]
            +- Relation[value#295] text

== Analyzed Logical Plan ==
usaf: string
Project [usaf#823]
+- Project [year#297, substring(value#295, 5,