In [1]:
from pyspark import SparkConf, SparkContext

# Configure a Spark application named "MaxTemperatures" with the "local" master.
conf = SparkConf().setMaster("local").setAppName("MaxTemperatures")
# getOrCreate returns the SparkContext already active in this kernel if there
# is one (in which case `conf` is not re-applied) and creates it otherwise, so
# re-running this cell no longer fails because a context already exists.
sc = SparkContext.getOrCreate(conf=conf)

In [2]:
# 'ITE00100554,18000101,TMIN,-148,,,E,' => ('ITE00100554', 'TMIN', 5.36)  (approximately; float rounding applies)
def parseLine(line):
    """Parse one comma-separated weather record into a 3-tuple.

    Column 0 is taken as the station ID, column 2 as the entry type, and
    column 3 as a numeric reading. The reading is scaled by 0.1 and then run
    through the Celsius-to-Fahrenheit formula (value * 9/5 + 32). The
    conversion is applied to every row regardless of its entry type.

    Returns:
        (stationID, entryType, temperature) where temperature is a float.

    Raises:
        IndexError: if the line has fewer than four comma-separated columns.
        ValueError: if column 3 cannot be converted by float().
    """
    columns = line.split(',')
    reading = float(columns[3])
    # Same operation order as before, so floating-point results are unchanged.
    fahrenheit = reading * 0.1 * (9.0 / 5.0) + 32.0
    return (columns[0], columns[2], fahrenheit)

In [3]:
# Location of the raw CSV, named once so it is easy to spot and repoint.
inputPath = "file:///home/dmadhok/spark_course/1800.csv"
# Read the file into an RDD of text lines, then preview the first five.
lines = sc.textFile(inputPath)
lines.take(5)

['ITE00100554,18000101,TMAX,-75,,,E,',
 'ITE00100554,18000101,TMIN,-148,,,E,',
 'GM000010962,18000101,PRCP,0,,,E,',
 'EZE00100082,18000101,TMAX,-86,,,E,',
 'EZE00100082,18000101,TMIN,-135,,,E,']

In [4]:
# Convert every raw text line into a (stationID, entryType, temperature) tuple.
parsedLines = lines.map(f=parseLine)
# Preview the first five parsed records.
parsedLines.take(num=5)

[('ITE00100554', 'TMAX', 18.5),
 ('ITE00100554', 'TMIN', 5.359999999999999),
 ('GM000010962', 'PRCP', 32.0),
 ('EZE00100082', 'TMAX', 16.52),
 ('EZE00100082', 'TMIN', 7.699999999999999)]

In [5]:
# Keep only the records whose entry type is exactly "TMAX".
# Equality replaces the previous substring test so that an entry type which
# merely contains "TMAX" cannot slip through the filter.
# NOTE: despite its name, minTemps holds TMAX rows here; the name is kept
# because later cells read it.
minTemps = parsedLines.filter(lambda record: record[1] == "TMAX")
minTemps.take(5)

[('ITE00100554', 'TMAX', 18.5),
 ('EZE00100082', 'TMAX', 16.52),
 ('ITE00100554', 'TMAX', 21.2),
 ('EZE00100082', 'TMAX', 24.08),
 ('ITE00100554', 'TMAX', 27.86)]

In [6]:
def stationAndTemperature(record):
    """Project a (stationID, entryType, temperature) record to (stationID, temperature)."""
    return (record[0], record[2])

# Drop the entry type so each element is a (stationID, temperature) pair.
stationTemps = minTemps.map(stationAndTemperature)
stationTemps.take(5)

[('ITE00100554', 18.5),
 ('EZE00100082', 16.52),
 ('ITE00100554', 21.2),
 ('EZE00100082', 24.08),
 ('ITE00100554', 27.86)]

In [7]:
# Find the max temperature for each weather station by combining each
# station's values pairwise with the built-in max.
# NOTE: this rebinds minTemps from 3-field records to (stationID, temperature)
# pairs, so the cell that builds stationTemps cannot be re-run after this one
# without first re-running the TMAX filter cell.
minTemps = stationTemps.reduceByKey(max)
minTemps.take(5)

[('ITE00100554', 90.14000000000001), ('EZE00100082', 90.14000000000001)]

In [8]:
# Bring the per-station results back to the driver.
results = minTemps.collect()

In [9]:
# Print one line per station: the ID, a tab, then the temperature with two
# decimals and an "F" suffix.
for stationID, temperature in results:
    print(stationID + "\t{:.2f}F".format(temperature))

ITE00100554	90.14F
EZE00100082	90.14F
