In [1]:
from pyspark import SparkConf, SparkContext
import collections

In [2]:
# Build the SparkContext: master "local" (this machine, not a cluster) plus an app name.
# NOTE(review): the app name says "Min", but the cells below filter on TMAX and keep
# the highest value per location — confirm which one is intended.
conf = (
    SparkConf()
    .setMaster("local")
    .setAppName("MinTemperatures")
)
sc = SparkContext(conf=conf)
sc

In [3]:
def parser(line):
    """Convert one comma-separated record into ``(location, (date, temp))``.

    Field 0 is used as the location, field 1 as the date (left as a string)
    and field 3 is converted with ``int`` to give the temperature. Field 2
    and anything after field 3 are ignored.

    Raises:
        IndexError: if the record has fewer than four comma-separated fields.
        ValueError: if field 3 is not an integer literal.
    """
    fields = line.split(",")
    location, date = fields[0], fields[1]
    return (location, (date, int(fields[3])))


In [5]:
# Keeps only the lines that contain "TMAX" (max temperature) and parses each one
# into (location, (date, temp)). The filter is a substring test on the raw line,
# applied before parsing.
rdd = (
    sc.textFile("temperatures_1800.csv")
    .filter(lambda x: "TMAX" in x)
    .map(parser)
)
rdd.take(5)


[('ITE00100554', ('18000101', -75)),
 ('EZE00100082', ('18000101', -86)),
 ('ITE00100554', ('18000102', -60)),
 ('EZE00100082', ('18000102', -44)),
 ('ITE00100554', ('18000103', -23))]

In [6]:
# Reduce by key (location), keeping the (date, temp) pair with the HIGHEST
# temperature; when two temperatures are equal the left operand's pair is kept.
# NOTE(review): despite its name, rdd_min holds per-location maxima. The name is
# left unchanged because the printing cell further down reads it.
rdd_min = rdd.reduceByKey(lambda x, y: y if x[1] < y[1] else x)
rdd_min.collect()


[('ITE00100554', ('18000819', 323)), ('EZE00100082', ('18000818', 323))]

In [7]:
# Print "<location> <temp>" for each reduced record; the date is not shown.
for location, (_date, temp) in rdd_min.collect():
    print(location, temp)
    

ITE00100554 323
EZE00100082 323
