In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Reuse a Spark session that already exists in the notebook namespace;
# otherwise build a local one with 64G of driver memory.
if 'spark' not in locals():
    spark = (
        SparkSession.builder
        .master("local[*]")
        .config("spark.driver.memory", "64G")
        .getOrCreate()
    )

# Show the session object as the cell output
spark

# Get Data from S3

First we load the data source containing raw weather measurements from S3. Since the data doesn't follow any well-known format (like CSV or JSON), we load it as raw text data and extract all required information. 

But first let's load a single year, just to get an impression of the data

In [15]:
# Base S3 location of the weather data set; the readers below append sub-paths to it
storageLocation = "s3://dimajix-training/data/weather"

Read in the year 2003 as `text` using the `spark.read.text` method. The data can be found at `storageLocation + "/2003"` and should be stored in a variable called `raw_weather_2003`. Also, using `limit` and `toPandas`, retrieve the first 10 rows and display them as a Pandas DataFrame.

In [16]:
# Read the 2003 records as unparsed text lines (one row per line)
raw_weather_2003 = spark.read.text(storageLocation + "/2003")
# Convert only the first 10 rows to a Pandas DataFrame for display
raw_weather_2003.limit(10).toPandas()

Unnamed: 0,value
0,0494703160256242003010100003+55200-162717SY-MT...
1,0228703160256242003010100174+55200-162730FM-16...
2,044070316025624200301010053C+55200-162717FM-15...
3,0071703160256242003010101009+55200-162717NSRDB...
4,042770316025624200301010153C+55200-162717FM-15...
5,0071703160256242003010102009+55200-162717NSRDB...
6,046870316025624200301010253C+55200-162717FM-15...
7,0071703160256242003010103009+55200-162717NSRDB...
8,041570316025624200301010353C+55200-162717FM-15...
9,0054703160256242003010104009+55200-162717NSRDB...


## Read in all years

Now we read in all years by creating a union. We also add the year as a logical partition column; this will be used later.

In [17]:
from functools import reduce

# One DataFrame per year (2003 through 2019), each tagged with a constant "year" column
raw_weather_per_year = [
    spark.read.text(storageLocation + "/" + str(year)).withColumn("year", lit(year))
    for year in range(2003, 2020)
]

# Fold the per-year DataFrames into a single one via pairwise union
raw_weather = reduce(lambda combined, current: combined.union(current), raw_weather_per_year)

# Show the first 10 records
raw_weather.limit(10).toPandas()

Unnamed: 0,value,year
0,0494703160256242003010100003+55200-162717SY-MT...,2003
1,0228703160256242003010100174+55200-162730FM-16...,2003
2,044070316025624200301010053C+55200-162717FM-15...,2003
3,0071703160256242003010101009+55200-162717NSRDB...,2003
4,042770316025624200301010153C+55200-162717FM-15...,2003
5,0071703160256242003010102009+55200-162717NSRDB...,2003
6,046870316025624200301010253C+55200-162717FM-15...,2003
7,0071703160256242003010103009+55200-162717NSRDB...,2003
8,041570316025624200301010353C+55200-162717FM-15...,2003
9,0054703160256242003010104009+55200-162717NSRDB...,2003


## Extract Information

The raw data is not exactly nice to work with, so we need to extract the relevant information by using appropriate substr operations.

In [18]:
# Slice the fixed-width "value" string into named fields.
# Each substr(start, length) call uses a 1-based start position.
weather = raw_weather.select(
    col("year"),
    # Station identifiers
    col("value").substr(5, 6).alias("usaf"),
    col("value").substr(11, 5).alias("wban"),
    # Observation timestamp and report type
    col("value").substr(16, 8).alias("date"),
    col("value").substr(24, 4).alias("time"),
    col("value").substr(42, 5).alias("report_type"),
    # Wind fields; the raw speed reading is divided by 10
    col("value").substr(61, 3).alias("wind_direction"),
    col("value").substr(64, 1).alias("wind_direction_qual"),
    col("value").substr(65, 1).alias("wind_observation"),
    (col("value").substr(66, 4).cast("float") / lit(10.0)).alias("wind_speed"),
    col("value").substr(70, 1).alias("wind_speed_qual"),
    # Air temperature; the raw reading is divided by 10
    (col("value").substr(88, 5).cast("float") / lit(10.0)).alias("air_temperature"),
    col("value").substr(93, 1).alias("air_temperature_qual"),
)

# Show the first 10 extracted records
weather.limit(10).toPandas()

Unnamed: 0,year,usaf,wban,date,time,report_type,wind_direction,wind_direction_qual,wind_observation,wind_speed,wind_speed_qual,air_temperature,air_temperature_qual
0,2003,703160,25624,20030101,0,SY-MT,10,5,N,5.2,5,-0.6,5
1,2003,703160,25624,20030101,17,FM-16,20,1,N,4.6,1,-2.0,1
2,2003,703160,25624,20030101,53,FM-15,10,5,N,5.2,5,-2.8,5
3,2003,703160,25624,20030101,100,NSRDB,999,9,9,999.9,9,999.9,9
4,2003,703160,25624,20030101,153,FM-15,10,5,N,6.2,5,-2.2,5
5,2003,703160,25624,20030101,200,NSRDB,999,9,9,999.9,9,999.9,9
6,2003,703160,25624,20030101,253,FM-15,10,5,N,7.2,5,-3.3,5
7,2003,703160,25624,20030101,300,NSRDB,999,9,9,999.9,9,999.9,9
8,2003,703160,25624,20030101,353,FM-15,20,5,N,6.2,5,-1.1,5
9,2003,703160,25624,20030101,400,NSRDB,999,9,9,999.9,9,999.9,9


## Read in Station Metadata

Fortunately, station metadata is stored as CSV, so we can directly read it using Spark's `spark.read.csv` mechanism. The data can be found at `storageLocation + '/isd-history'`.

You should also specify the `DataFrameReader` option `header` to be `True`, this will use the first line of the CSV for creating column names.

Store the result in a variable called `stations` and again print the first 10 lines using the `toPandas()` method.

In [19]:
# Load the station metadata; the first CSV line supplies the column names
stations = spark.read.csv(storageLocation + "/isd-history", header=True)

# Show the first 10 records
stations.limit(10).toPandas()

Unnamed: 0,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),BEGIN,END
0,7005,99999,CWOS 07005,,,,,,,20120127,20120127
1,7011,99999,CWOS 07011,,,,,,,20111025,20121129
2,7018,99999,WXPOD 7018,,,,0.0,0.0,7018.0,20110309,20130730
3,7025,99999,CWOS 07025,,,,,,,20120127,20120127
4,7026,99999,WXPOD 7026,AF,,,0.0,0.0,7026.0,20120713,20141120
5,7034,99999,CWOS 07034,,,,,,,20121024,20121106
6,7037,99999,CWOS 07037,,,,,,,20111202,20121125
7,7044,99999,CWOS 07044,,,,,,,20120127,20120127
8,7047,99999,CWOS 07047,,,,,,,20120613,20120717
9,7052,99999,CWOS 07052,,,,,,,20121129,20121130


# Process Data

Now we want to perform a simple analysis on the data: Calculate minimum and maximum wind speed and air temperature per country and year. This needs to be performed in three steps:

1. Join weather data and stations on the columns 'usaf' and 'wban'. Note that column names are case sensitive!
2. Group the data by the relevant columns year and country
3. Perform min/max aggregations. Also pay attention to the fields `air_temperature_qual` and `wind_speed_qual`, where "1" means valid value

**Since processing the full date range may take a considerable amount of time, you might first want to start with a single year. This can be done by temporarily replacing `raw_weather` with `raw_weather_2003`**

In [28]:
# Attach the station metadata (including the country code CTRY) to every measurement
df = weather.join(stations, (weather.usaf == stations.USAF) & (weather.wban == stations.WBAN))

# NOTE: `min`, `max` and `when` are the Spark functions brought in by
# `from pyspark.sql.functions import *`, not the Python builtins.
#
# The quality flags are one-character strings, so they are compared against the
# string "1". Comparing against the integer 1 would rely on an implicit
# string-to-number cast, which may fail on non-numeric flag values when Spark's
# ANSI mode is enabled. `when` without `otherwise` yields NULL for rows that do
# not pass the check, and the aggregates skip NULLs.
result = df.groupBy(df.CTRY, df.year).agg(
        min(when(df.air_temperature_qual == lit("1"), df.air_temperature)).alias('min_temp'),
        max(when(df.air_temperature_qual == lit("1"), df.air_temperature)).alias('max_temp'),
        min(when(df.wind_speed_qual == lit("1"), df.wind_speed)).alias('min_wind'),
        max(when(df.wind_speed_qual == lit("1"), df.wind_speed)).alias('max_wind')
    )

# Collect the aggregated result (one row per country and year) into Pandas for display
pdf = result.toPandas()
pdf

Unnamed: 0,CTRY,year,min_temp,max_temp,min_wind,max_wind
0,CA,2006,-43.0,35.9,0.0,31.4
1,SC,2006,20.0,32.0,0.0,20.6
2,IC,2013,-11.0,19.0,0.0,25.0
3,UK,2014,-6.0,30.4,0.0,20.6
4,AM,2011,-16.0,42.0,0.0,14.0
5,SF,2012,2.2,36.0,0.0,10.3
6,GM,2005,-14.0,31.0,0.0,14.4
7,NO,2007,-35.0,29.0,0.0,26.0
8,PO,2010,-1.6,38.0,0.0,21.6
9,FR,2010,-13.3,36.1,0.0,17.5
