# Weather Data Analytics
This notebook performs some basic weather data analytics using the PySpark RDD interface.

## Helper Methods
First we need some helper methods for converting the raw data into something that we can work with. We use Python dictionaries instead of classes, since custom classes cannot be used within Zeppelin due to serialization issues.

In [3]:
def _get_float(str):
    """
    Helper method for converting a string to a float. If this is not possible, None will be returned instead
    """
    if len(str) == 0:
        return None
    try:
        return float(str)
    except ValueError:
        return None


def extract_station(line):
    """
    Parse one raw CSV line of the station list into a station dictionary.

    The line is split on every comma and all double quotes are removed from the
    resulting columns. The first eleven columns are used: latitude, longitude and
    elevation are converted with _get_float (None if they are not numeric), all
    other fields are kept as text.

    NOTE: the split does not honour quoting, so a quoted field that itself contains
    a comma would shift all following columns.

    :param line: A single line of the station CSV file
    :returns: A Python dictionary with the keys usaf, wban, name, country, state,
        icao, latitude, longitude, elevation, date_begin and date_end
    :raises IndexError: if the line has fewer than eleven columns
    """
    fields = [raw.replace('"', '') for raw in line.split(',')]

    station = {}
    # Columns 0-5 are plain text
    text_keys = ('usaf', 'wban', 'name', 'country', 'state', 'icao')
    for position, key in enumerate(text_keys):
        station[key] = fields[position]
    # Columns 6-8 are numeric and may be empty
    numeric_keys = ('latitude', 'longitude', 'elevation')
    for position, key in enumerate(numeric_keys, 6):
        station[key] = _get_float(fields[position])
    # Columns 9 and 10 are kept as raw text
    station['date_begin'] = fields[9]
    station['date_end'] = fields[10]
    return station


def extract_weather(line):
    """
    Parse one fixed-width weather record into a measurement dictionary.

    Every field is taken from a fixed character position of the line. The numeric
    text of air temperature and wind speed is converted to a float and divided by
    10. The matching quality flag is True only if the quality character is '1'.

    :param line: A single raw weather record
    :returns: A Python dictionary with the keys date, time, usaf, wban,
        airTemperatureQuality, airTemperature, windSpeedQuality and windSpeed
    :raises IndexError: if the line is too short to contain a quality character
    :raises ValueError: if a numeric field cannot be converted to a float
    """
    def tenths(text):
        # The raw numeric fields are divided by ten after parsing
        return float(text) / 10

    return {
        'date': line[15:23],
        'time': line[23:27],
        'usaf': line[4:10],
        'wban': line[10:15],
        'airTemperatureQuality': line[92] == '1',
        'airTemperature': tenths(line[87:92]),
        'windSpeedQuality': line[69] == '1',
        'windSpeed': tenths(line[65:69]),
    }

## Test extraction methods

In [5]:
# Load the station list and parse every CSV line into a station dictionary.
# NOTE: the header line of the file is parsed like any other line, so it shows
# up as the first record of the RDD.
stations = sc.textFile('/user/cloudera/data/weather/isd-history.csv').map(lambda line: extract_station(line))
for s in stations.take(5):
    # A parenthesized single argument prints the same with the Python 2 print
    # statement and with the Python 3 print function
    print(s)

{'date_begin': u'BEGIN', 'name': u'STATION NAME', 'country': u'CTRY', 'date_end': u'END', 'usaf': u'USAF', 'longitude': None, 'icao': u'ICAO', 'state': u'STATE', 'wban': u'WBAN', 'latitude': None, 'elevation': None}
{'date_begin': u'20120127', 'name': u'CWOS 07005', 'country': u'', 'date_end': u'20120127', 'usaf': u'007005', 'longitude': None, 'icao': u'', 'state': u'', 'wban': u'99999', 'latitude': None, 'elevation': None}
{'date_begin': u'20111025', 'name': u'CWOS 07011', 'country': u'', 'date_end': u'20121129', 'usaf': u'007011', 'longitude': None, 'icao': u'', 'state': u'', 'wban': u'99999', 'latitude': None, 'elevation': None}
{'date_begin': u'20110309', 'name': u'WXPOD 7018', 'country': u'', 'date_end': u'20130730', 'usaf': u'007018', 'longitude': 0.0, 'icao': u'', 'state': u'', 'wban': u'99999', 'latitude': 0.0, 'elevation': 7018.0}
{'date_begin': u'20120127', 'name': u'CWOS 07025', 'country': u'', 'date_end': u'20120127', 'usaf': u'007025', 'longitude': None, 'icao': u'', 'stat

In [6]:
# Load all raw weather records of 2014 and parse every line into a measurement dictionary
weather = sc.textFile('/user/cloudera/data/weather/2014').map(lambda line: extract_weather(line))
for w in weather.take(5):
    # A parenthesized single argument prints the same with the Python 2 print
    # statement and with the Python 3 print function
    print(w)

{'airTemperature': -13.6, 'windSpeedQuality': True, 'usaf': u'010060', 'windSpeed': 3.0, 'wban': u'99999', 'time': u'0100', 'date': u'20140101', 'airTemperatureQuality': True}
{'airTemperature': -14.2, 'windSpeedQuality': True, 'usaf': u'010060', 'windSpeed': 2.0, 'wban': u'99999', 'time': u'0200', 'date': u'20140101', 'airTemperatureQuality': True}
{'airTemperature': -10.7, 'windSpeedQuality': True, 'usaf': u'010060', 'windSpeed': 4.0, 'wban': u'99999', 'time': u'0400', 'date': u'20140101', 'airTemperatureQuality': True}
{'airTemperature': -11.2, 'windSpeedQuality': True, 'usaf': u'010060', 'windSpeed': 3.0, 'wban': u'99999', 'time': u'0500', 'date': u'20140101', 'airTemperatureQuality': True}
{'airTemperature': -10.0, 'windSpeedQuality': True, 'usaf': u'010060', 'windSpeed': 5.0, 'wban': u'99999', 'time': u'0600', 'date': u'20140101', 'airTemperatureQuality': True}


# Join Data Sets

In order to analyse the data, we need to join the weather data with the station data, so that we get more detailed information about where the weather was actually recorded.

In [7]:
# Both data sets are keyed by the concatenation of the usaf and wban identifiers
station_index = stations.keyBy(lambda data: data['usaf'] + data['wban'])
weather_index = weather.keyBy(lambda data: data['usaf'] + data['wban'])
# Every joined record has the shape (key, (weather, station)); only keys that
# are present in both RDDs are kept by join
joined_weather = weather_index.join(station_index)

for d in joined_weather.take(5):
    # A parenthesized single argument prints the same with the Python 2 print
    # statement and with the Python 3 print function
    print(d)

(u'71060099999', ({'airTemperature': -6.6, 'windSpeedQuality': True, 'usaf': u'710600', 'windSpeed': 1.5, 'wban': u'99999', 'time': u'0000', 'date': u'20140101', 'airTemperatureQuality': True}, {'date_begin': u'19840101', 'name': u'NORDEGG CS  ALTA', 'country': u'CA', 'date_end': u'20151121', 'usaf': u'710600', 'longitude': -116.067, 'icao': u'CXND', 'state': u'', 'wban': u'99999', 'latitude': 52.467, 'elevation': 1362.0}))
(u'71060099999', ({'airTemperature': -7.6, 'windSpeedQuality': True, 'usaf': u'710600', 'windSpeed': 1.5, 'wban': u'99999', 'time': u'0000', 'date': u'20140101', 'airTemperatureQuality': True}, {'date_begin': u'19840101', 'name': u'NORDEGG CS  ALTA', 'country': u'CA', 'date_end': u'20151121', 'usaf': u'710600', 'longitude': -116.067, 'icao': u'CXND', 'state': u'', 'wban': u'99999', 'latitude': 52.467, 'elevation': 1362.0}))
(u'71060099999', ({'airTemperature': -7.0, 'windSpeedQuality': True, 'usaf': u'710600', 'windSpeed': 2.1, 'wban': u'99999', 'time': u'0100', 'da

## Create appropriate Keys
We want to analyze the data grouped by country and year. So we need to create appropriate keys.

In [8]:
def extract_country_year_weather(data):
    """
    Re-key one joined record by country and year.

    :param data: A (key, (weather, station)) tuple as produced by joining the
        weather data with the station data
    :returns: A ((country, year), weather) tuple, where country is taken from the
        station and year is the first four characters of the weather date
    """
    joined = data[1]
    country = joined[1]['country']
    measurement = joined[0]
    year = measurement['date'][0:4]
    return ((country, year), measurement)

# Re-key every joined record by (country, year); only the weather measurement is kept as the value
weather_per_country_and_year = joined_weather.map(extract_country_year_weather)

## Perform Aggregation
We want to extract the minimum and maximum of wind speed and of temperature. We also need to handle measurements that are not valid (i.e. windSpeedQuality is False or airTemperatureQuality is False): these must not contribute to the corresponding minimum and maximum.

We will implement custom aggregation functions that work on dictionaries

In [9]:
def nullsafe_min(a, b):
    """
    Helper method for taking the min of two values. Also gracefully handles None values

    :param a: First value, or None
    :param b: Second value, or None
    :returns: The smaller of both values. If one of them is None, the other one is
        returned (which is None if both are None)
    """
    if a is None:
        return b
    if b is None:
        return a
    # Compare directly instead of importing min from the __builtin__ module, which
    # only exists in Python 2. The expression returns exactly what min(a, b)
    # returns, runs on Python 2 and Python 3, and cannot be affected by another
    # definition of min that might be in scope.
    return b if b < a else a


def nullsafe_max(a, b):
    """
    Helper method for taking the max of two values. Also gracefully handles None values

    :param a: First value, or None
    :param b: Second value, or None
    :returns: The larger of both values. If one of them is None, the other one is
        returned (which is None if both are None)
    """
    if a is None:
        return b
    if b is None:
        return a
    # Compare directly instead of importing max from the __builtin__ module, which
    # only exists in Python 2. The expression returns exactly what max(a, b)
    # returns, runs on Python 2 and Python 3, and cannot be affected by another
    # definition of max that might be in scope.
    return b if b > a else a


# Starting value of the aggregation: no minimum or maximum has been seen yet,
# so every entry is None
zero_wmm = dict.fromkeys(('minTemperature', 'maxTemperature', 'minWindSpeed', 'maxWindSpeed'))


def reduce_wmm(wmm, data):
    """
    Merge a single weather measurement into existing min/max information. Neither
    argument is modified, a new dictionary is returned instead.

    A measurement only contributes to the temperature range if its
    airTemperatureQuality flag is set, and only to the wind speed range if its
    windSpeedQuality flag is set. Otherwise the existing values are carried over.

    :param wmm: A Python dictionary representing min/max information
    :param data: A Python dictionary representing weather measurement information
    :returns: A new Python dictionary representing min/max information
    """
    def merged(flag_key, value_key, low_key, high_key):
        # Measurements flagged as invalid leave the current range untouched
        if not data[flag_key]:
            return wmm[low_key], wmm[high_key]
        return (nullsafe_min(wmm[low_key], data[value_key]),
                nullsafe_max(wmm[high_key], data[value_key]))

    low_temp, high_temp = merged('airTemperatureQuality', 'airTemperature', 'minTemperature', 'maxTemperature')
    low_wind, high_wind = merged('windSpeedQuality', 'windSpeed', 'minWindSpeed', 'maxWindSpeed')

    return {
        'minTemperature': low_temp,
        'maxTemperature': high_temp,
        'minWindSpeed': low_wind,
        'maxWindSpeed': high_wind,
    }


def combine_wmm(left, right):
    """
    Combine two sets of min/max information into a new one. Neither argument is
    modified. Entries that are None on one side are taken from the other side.

    :param left: First Python dictionary representing min/max information
    :param right: Second Python dictionary representing min/max information
    :returns: A new Python dictionary representing combined min/max information
    """
    return {
        'minTemperature': nullsafe_min(left['minTemperature'], right['minTemperature']),
        'maxTemperature': nullsafe_max(left['maxTemperature'], right['maxTemperature']),
        'minWindSpeed': nullsafe_min(left['minWindSpeed'], right['minWindSpeed']),
        'maxWindSpeed': nullsafe_max(left['maxWindSpeed'], right['maxWindSpeed']),
    }

In [10]:
# Aggregate min/max information per year and country: reduce_wmm merges single
# measurements into a min/max dictionary, combine_wmm merges two such dictionaries
weather_minmax = weather_per_country_and_year.aggregateByKey(zero_wmm,reduce_wmm, combine_wmm)

for m in weather_minmax.take(5):
    # A parenthesized single argument prints the same with the Python 2 print
    # statement and with the Python 3 print function
    print(m)


((u'EZ', u'2014'), {'maxWindSpeed': 16.5, 'maxTemperature': 33.0, 'minWindSpeed': 0.0, 'minTemperature': -15.0})
((u'UK', u'2014'), {'maxWindSpeed': 20.6, 'maxTemperature': 30.4, 'minWindSpeed': 0.0, 'minTemperature': -6.0})
((u'SF', u'2014'), {'maxWindSpeed': 13.4, 'maxTemperature': 37.4, 'minWindSpeed': 0.0, 'minTemperature': 0.9})
((u'SC', u'2014'), {'maxWindSpeed': 30.4, 'maxTemperature': 32.0, 'minWindSpeed': 0.0, 'minTemperature': 20.0})
((u'CH', u'2014'), {'maxWindSpeed': 15.0, 'maxTemperature': 34.0, 'minWindSpeed': 0.0, 'minTemperature': 11.0})


# Format Output

We want to create CSV data, so we need to reformat the Python dicts into nicely formatted strings.

In [11]:
def format_result(row):
    """
    Format one aggregated record as a UTF-8 encoded CSV line.

    The columns are country, year, minimum temperature, maximum temperature,
    minimum wind speed and maximum wind speed. A missing (None) value is written
    as 0.0, so it cannot be told apart from a real value of 0.0 in the output.

    :param row: A ((country, year), minmax) tuple, where minmax is a Python
        dictionary containing min/max information
    :returns: The CSV line encoded as UTF-8
    """
    key, stats = row
    country, year = key[0], key[1]
    stat_names = ('minTemperature', 'maxTemperature', 'minWindSpeed', 'maxWindSpeed')
    numbers = tuple(stats[name] or 0.0 for name in stat_names)
    text = "%s,%s,%f,%f,%f,%f" % ((country, year) + numbers)
    # Encode as UTF-8, or we might experience some problems
    return text.encode('utf-8')

# Format every aggregated record as a CSV line and fetch all lines to the driver
result = weather_minmax.map(format_result).collect()

for l in result:
    # A parenthesized single argument prints the same with the Python 2 print
    # statement and with the Python 3 print function
    print(l)

EZ,2014,-15.000000,33.000000,0.000000,16.500000
UK,2014,-6.000000,30.400000,0.000000,20.600000
SF,2014,0.900000,37.400000,0.000000,13.400000
SC,2014,20.000000,32.000000,0.000000,30.400000
CH,2014,11.000000,34.000000,0.000000,15.000000
SW,2014,-34.500000,28.900000,1.000000,16.000000
IT,2014,-6.800000,24.000000,0.000000,20.600000
FI,2014,-28.600000,30.300000,0.000000,18.000000
US,2014,-37.200000,41.200000,0.000000,31.000000
AS,2014,0.900000,45.600000,0.000000,14.400000
JA,2014,-0.500000,33.900000,0.000000,19.600000
PO,2014,-1.000000,32.000000,0.000000,15.400000
GK,2014,2.000000,24.000000,0.000000,21.100000
MY,2014,19.000000,36.000000,0.000000,9.800000
AU,2014,-11.000000,34.000000,0.000000,16.500000
DA,2014,-9.000000,30.200000,0.000000,17.000000
GM,2014,-9.000000,31.000000,0.000000,13.400000
FR,2014,-9.000000,36.100000,0.000000,16.500000
PL,2014,-15.000000,32.000000,0.000000,14.900000
RS,2014,-28.900000,30.500000,0.000000,11.000000
LU,2014,-10.000000,32.100000,0.000000,13.400000
BE,2014,-