# Broadcast Variables

# 1 Load Data

In [4]:
# Base location of the weather data set; the per-year measurement files and
# the station list ("isd-history") are read from paths below this prefix.
storageLocation = "s3://dimajix-training/data/weather"

## 1.1 Load Measurements

In [5]:
from pyspark.sql.functions import *
from functools import reduce

# Read every year from 2003 through 2014 as raw text and keep the resulting
# DataFrames in a Python list; each one is tagged with a literal "year" column.
raw_weather_per_year = [
    spark.read.text(storageLocation + "/" + str(yr)).withColumn("year", lit(yr))
    for yr in range(2003, 2015)
]

# Fold the per-year DataFrames into a single DataFrame via pairwise union
raw_weather = reduce(lambda combined, nxt: combined.union(nxt), raw_weather_per_year)

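For the remainder of this notebook we use only a single year (2003), which keeps the execution plans small and readable. The next cell therefore replaces the multi-year `raw_weather` defined above.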

In [6]:
# Rebind raw_weather to the 2003 data only, tagged with a literal "year" column
raw_weather = (
    spark.read
    .text(storageLocation + "/2003")
    .withColumn("year", lit(2003))
)

In [7]:
# Cut each fixed-width text line (column "value") into named fields.
# Column.substr(start, length) takes a 1-based start position, the same
# convention as substring(). Wind speed and air temperature are additionally
# cast to float and divided by 10.
weather = raw_weather.select(
    col("year"),
    col("value").substr(5, 6).alias("usaf"),
    col("value").substr(11, 5).alias("wban"),
    col("value").substr(16, 8).alias("date"),
    col("value").substr(24, 4).alias("time"),
    col("value").substr(42, 5).alias("report_type"),
    col("value").substr(61, 3).alias("wind_direction"),
    col("value").substr(64, 1).alias("wind_direction_qual"),
    col("value").substr(65, 1).alias("wind_observation"),
    (col("value").substr(66, 4).cast("float") / lit(10.0)).alias("wind_speed"),
    col("value").substr(70, 1).alias("wind_speed_qual"),
    (col("value").substr(88, 5).cast("float") / lit(10.0)).alias("air_temperature"),
    col("value").substr(93, 1).alias("air_temperature_qual"),
)

## 1.2 Load Station Metadata

In [8]:
# Load the station list as CSV; the "header" option makes Spark take the
# column names from the first line of the file.
stations = (
    spark.read
    .option("header", True)
    .csv(storageLocation + "/isd-history")
)

We convert the stations DataFrame into a plain Python dictionary (key: `usaf` + `wban`, value: `ctry`), since we want to discuss broadcast variables.

In [20]:
# Collect (usaf + wban) -> ctry pairs to the driver and build the lookup
# dictionary in one step, so py_stations is only ever bound to a dict.
py_stations = {
    row["key"]: row["ctry"]
    for row in stations.select(
        concat(stations["usaf"], stations["wban"]).alias("key"),
        stations["ctry"],
    ).collect()
}

# Peek at the first ten entries
list(py_stations.items())[:10]

[('00700599999', None),
 ('00701199999', None),
 ('00701899999', None),
 ('00702599999', None),
 ('00702699999', 'AF'),
 ('00703499999', None),
 ('00703799999', None),
 ('00704499999', None),
 ('00704799999', None),
 ('00705299999', None)]

# 2 Using Broadcast Variables

## 2.1 Create a UDF

In [27]:
def lookup_country(usaf, wban):
    """Return the country code of a weather station, or None if it is unknown.

    The lookup key is the concatenation of ``usaf`` and ``wban``, which is how
    the keys of the ``py_stations`` dictionary are built.

    Args:
        usaf: USAF station id as a string, or None.
        wban: WBAN station id as a string, or None.

    Returns:
        The value stored in ``py_stations`` for the combined key, or None when
        either id is missing or the key is not present.
    """
    # A missing id cannot form a key; without this guard `None + str` (or
    # `str + None`) raises TypeError instead of reporting "unknown".
    if usaf is None or wban is None:
        return None
    return py_stations.get(usaf + wban)
    
# Try the lookup once with ids that exist in py_stations and once with ids
# that do not
print(lookup_country(usaf="007026", wban="99999"))
print(lookup_country(usaf="123", wban="456"))

AF
None


## 2.2 Not Using a Broadcast Variable

In [31]:
@udf('string')
def lookup_country(usaf, wban):
    """Spark UDF: country code for a station, or None if it is unknown.

    This variant reads the driver-side ``py_stations`` dictionary directly,
    i.e. without going through a broadcast variable.

    Args:
        usaf: USAF station id as a string (None for SQL NULL).
        wban: WBAN station id as a string (None for SQL NULL).

    Returns:
        The value stored in ``py_stations`` for ``usaf + wban``, or None when
        either id is NULL or the key is not present.
    """
    # Spark hands SQL NULL to a Python UDF as None; concatenating None with a
    # string would raise TypeError and fail the whole job.
    if usaf is None or wban is None:
        return None
    return py_stations.get(usaf + wban)

In [32]:
# Attach the country of each measurement's station via the UDF, then pull a
# ten-row sample into pandas for display
result = weather.withColumn(
    "country",
    lookup_country(weather.usaf, weather.wban),
)
result.limit(10).toPandas()

Unnamed: 0,year,usaf,wban,date,time,report_type,wind_direction,wind_direction_qual,wind_observation,wind_speed,wind_speed_qual,air_temperature,air_temperature_qual,country
0,2003,703160,25624,20030101,0,SY-MT,10,5,N,5.2,5,-0.6,5,US
1,2003,703160,25624,20030101,17,FM-16,20,1,N,4.6,1,-2.0,1,US
2,2003,703160,25624,20030101,53,FM-15,10,5,N,5.2,5,-2.8,5,US
3,2003,703160,25624,20030101,100,NSRDB,999,9,9,999.9,9,999.9,9,US
4,2003,703160,25624,20030101,153,FM-15,10,5,N,6.2,5,-2.2,5,US
5,2003,703160,25624,20030101,200,NSRDB,999,9,9,999.9,9,999.9,9,US
6,2003,703160,25624,20030101,253,FM-15,10,5,N,7.2,5,-3.3,5,US
7,2003,703160,25624,20030101,300,NSRDB,999,9,9,999.9,9,999.9,9,US
8,2003,703160,25624,20030101,353,FM-15,20,5,N,6.2,5,-1.1,5,US
9,2003,703160,25624,20030101,400,NSRDB,999,9,9,999.9,9,999.9,9,US


In [33]:
# Print the physical plan; the Python UDF call appears in it as a
# BatchEvalPython step.
result.explain()

== Physical Plan ==
*(2) Project [2003 AS year#84, substring(value#82, 5, 6) AS usaf#87, substring(value#82, 11, 5) AS wban#88, substring(value#82, 16, 8) AS date#89, substring(value#82, 24, 4) AS time#90, substring(value#82, 42, 5) AS report_type#91, substring(value#82, 61, 3) AS wind_direction#92, substring(value#82, 64, 1) AS wind_direction_qual#93, substring(value#82, 65, 1) AS wind_observation#94, (cast(cast(substring(value#82, 66, 4) as float) as double) / 10.0) AS wind_speed#95, substring(value#82, 70, 1) AS wind_speed_qual#96, (cast(cast(substring(value#82, 88, 5) as float) as double) / 10.0) AS air_temperature#97, substring(value#82, 93, 1) AS air_temperature_qual#98, pythonUDF0#192 AS country#174]
+- BatchEvalPython [lookup_country(substring(value#82, 5, 6), substring(value#82, 11, 5))], [value#82, pythonUDF0#192]
   +- *(1) FileScan text [value#82] Batched: false, Format: Text, Location: InMemoryFileIndex[s3://dimajix-training/data/weather/2003], PartitionFilters: [], Pushed

## 2.2 Using a Broadcast Variable

In [38]:
# Expose the station dictionary through a Spark broadcast variable; the UDF
# below reads it via `.value` instead of referencing py_stations directly.
bc_stations = spark.sparkContext.broadcast(py_stations)

@udf('string')
def lookup_country(usaf, wban):
    """Spark UDF: country code for a station, read from the broadcast dict.

    Args:
        usaf: USAF station id as a string (None for SQL NULL).
        wban: WBAN station id as a string (None for SQL NULL).

    Returns:
        The value stored in ``bc_stations.value`` for ``usaf + wban``, or None
        when either id is NULL or the key is not present.
    """
    # Spark hands SQL NULL to a Python UDF as None; concatenating None with a
    # string would raise TypeError and fail the whole job.
    if usaf is None or wban is None:
        return None
    return bc_stations.value.get(usaf + wban)

In [39]:
# Same enrichment as before, now using the UDF that reads the broadcast
# dictionary; show a ten-row sample as a pandas DataFrame
result = weather.withColumn(
    "country",
    lookup_country(weather.usaf, weather.wban),
)
result.limit(10).toPandas()

Unnamed: 0,year,usaf,wban,date,time,report_type,wind_direction,wind_direction_qual,wind_observation,wind_speed,wind_speed_qual,air_temperature,air_temperature_qual,country
0,2003,703160,25624,20030101,0,SY-MT,10,5,N,5.2,5,-0.6,5,US
1,2003,703160,25624,20030101,17,FM-16,20,1,N,4.6,1,-2.0,1,US
2,2003,703160,25624,20030101,53,FM-15,10,5,N,5.2,5,-2.8,5,US
3,2003,703160,25624,20030101,100,NSRDB,999,9,9,999.9,9,999.9,9,US
4,2003,703160,25624,20030101,153,FM-15,10,5,N,6.2,5,-2.2,5,US
5,2003,703160,25624,20030101,200,NSRDB,999,9,9,999.9,9,999.9,9,US
6,2003,703160,25624,20030101,253,FM-15,10,5,N,7.2,5,-3.3,5,US
7,2003,703160,25624,20030101,300,NSRDB,999,9,9,999.9,9,999.9,9,US
8,2003,703160,25624,20030101,353,FM-15,20,5,N,6.2,5,-1.1,5,US
9,2003,703160,25624,20030101,400,NSRDB,999,9,9,999.9,9,999.9,9,US


In [40]:
# Print the physical plan for the broadcast-based UDF; the UDF call again
# appears as a BatchEvalPython step.
result.explain()

== Physical Plan ==
*(2) Project [2003 AS year#84, substring(value#82, 5, 6) AS usaf#87, substring(value#82, 11, 5) AS wban#88, substring(value#82, 16, 8) AS date#89, substring(value#82, 24, 4) AS time#90, substring(value#82, 42, 5) AS report_type#91, substring(value#82, 61, 3) AS wind_direction#92, substring(value#82, 64, 1) AS wind_direction_qual#93, substring(value#82, 65, 1) AS wind_observation#94, (cast(cast(substring(value#82, 66, 4) as float) as double) / 10.0) AS wind_speed#95, substring(value#82, 70, 1) AS wind_speed_qual#96, (cast(cast(substring(value#82, 88, 5) as float) as double) / 10.0) AS air_temperature#97, substring(value#82, 93, 1) AS air_temperature_qual#98, pythonUDF0#247 AS country#229]
+- BatchEvalPython [lookup_country(substring(value#82, 5, 6), substring(value#82, 11, 5))], [value#82, pythonUDF0#247]
   +- *(1) FileScan text [value#82] Batched: false, Format: Text, Location: InMemoryFileIndex[s3://dimajix-training/data/weather/2003], PartitionFilters: [], Pushed

## 2.3 Pandas UDFs

In [43]:
from pyspark.sql.functions import pandas_udf, PandasUDFType

@pandas_udf('string', PandasUDFType.SCALAR)
def lookup_country(usaf, wban):
    """Scalar pandas UDF: country code per station, read from the broadcast dict.

    Args:
        usaf: pandas Series holding the USAF station ids of one batch.
        wban: pandas Series holding the WBAN station ids of one batch.

    Returns:
        A pandas Series with, for each row, the value stored in
        ``bc_stations.value`` for ``usaf + wban`` (None if the key is absent).
    """
    station_by_key = bc_stations.value
    # Element-wise string concatenation yields one lookup key per row
    station_keys = usaf + wban
    return station_keys.apply(lambda key: station_by_key.get(key))

In [44]:
# Same enrichment once more, this time through the pandas UDF; show a
# ten-row sample as a pandas DataFrame
result = weather.withColumn(
    "country",
    lookup_country(weather.usaf, weather.wban),
)
result.limit(10).toPandas()

Unnamed: 0,year,usaf,wban,date,time,report_type,wind_direction,wind_direction_qual,wind_observation,wind_speed,wind_speed_qual,air_temperature,air_temperature_qual,country
0,2003,703160,25624,20030101,0,SY-MT,10,5,N,5.2,5,-0.6,5,US
1,2003,703160,25624,20030101,17,FM-16,20,1,N,4.6,1,-2.0,1,US
2,2003,703160,25624,20030101,53,FM-15,10,5,N,5.2,5,-2.8,5,US
3,2003,703160,25624,20030101,100,NSRDB,999,9,9,999.9,9,999.9,9,US
4,2003,703160,25624,20030101,153,FM-15,10,5,N,6.2,5,-2.2,5,US
5,2003,703160,25624,20030101,200,NSRDB,999,9,9,999.9,9,999.9,9,US
6,2003,703160,25624,20030101,253,FM-15,10,5,N,7.2,5,-3.3,5,US
7,2003,703160,25624,20030101,300,NSRDB,999,9,9,999.9,9,999.9,9,US
8,2003,703160,25624,20030101,353,FM-15,20,5,N,6.2,5,-1.1,5,US
9,2003,703160,25624,20030101,400,NSRDB,999,9,9,999.9,9,999.9,9,US
