In [2]:
from google.colab import drive

# Mount the user's Google Drive into the Colab filesystem so the course
# data files can be read through ordinary file paths.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
DIR = '/content/gdrive/My Drive/Spark_course/data/'

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

import findspark
findspark.init("spark-2.4.5-bin-hadoop2.7")# SPARK_HOME

In [0]:
from pyspark import SparkConf, SparkContext

# Configure a single-threaded local Spark application.
conf = SparkConf().setMaster("local").setAppName("MinTemperatures")
# getOrCreate returns the already-running context when this cell is executed
# again; constructing SparkContext(conf=conf) a second time raises because
# only one SparkContext may be active at a time.
sc = SparkContext.getOrCreate(conf=conf)

In [0]:
def parseLine(line):
    """Turn one comma-separated record into a (station, type, temperature) tuple.

    Args:
        line: A single text line whose comma-separated columns contain the
            station identifier at index 0, the entry type at index 2 and a
            numeric reading at index 3.

    Returns:
        A tuple ``(stationID, entryType, temperature)`` where ``temperature``
        is the reading scaled by 0.1 and then mapped through ``* 9/5 + 32``
        (presumably tenths of a degree Celsius converted to Fahrenheit --
        confirm against the data set's documentation).

    Raises:
        IndexError: If the line has fewer than four columns.
        ValueError: If the fourth column is not parseable as a float.
    """
    columns = line.split(',')
    station, kind = columns[0], columns[2]
    # Keep the multiplication order unchanged so results are bit-identical.
    fahrenheit = float(columns[3]) * 0.1 * (9.0 / 5.0) + 32.0
    return (station, kind, fahrenheit)

In [0]:
# Load the 1800 observations from the shared data directory (DIR is defined
# in the setup cell) instead of repeating the absolute path here.
lines = sc.textFile(os.path.join(DIR, "1800.csv"))

# Each record becomes a (stationID, entryType, temperature) tuple.
parsedLines = lines.map(parseLine)
# Keep only minimum-temperature entries. Compare for equality: a substring
# test would also accept any entry type that merely contains "TMIN".
minTemps = parsedLines.filter(lambda x: x[1] == "TMIN")


In [7]:
# Inspect the filtered records. collect() brings every element of the RDD
# back to the driver as a Python list, so this is only suitable for small data.
minTemps.collect()

[('ITE00100554', 'TMIN', 5.359999999999999),
 ('EZE00100082', 'TMIN', 7.699999999999999),
 ('ITE00100554', 'TMIN', 9.5),
 ('EZE00100082', 'TMIN', 8.599999999999998),
 ('ITE00100554', 'TMIN', 23.72),
 ('EZE00100082', 'TMIN', 18.86),
 ('ITE00100554', 'TMIN', 29.66),
 ('EZE00100082', 'TMIN', 18.68),
 ('ITE00100554', 'TMIN', 30.919999999999998),
 ('EZE00100082', 'TMIN', 21.56),
 ('ITE00100554', 'TMIN', 34.34),
 ('EZE00100082', 'TMIN', 21.740000000000002),
 ('ITE00100554', 'TMIN', 33.8),
 ('EZE00100082', 'TMIN', 23.0),
 ('ITE00100554', 'TMIN', 34.52),
 ('EZE00100082', 'TMIN', 26.42),
 ('ITE00100554', 'TMIN', 36.14),
 ('EZE00100082', 'TMIN', 23.72),
 ('ITE00100554', 'TMIN', 37.58),
 ('EZE00100082', 'TMIN', 18.5),
 ('ITE00100554', 'TMIN', 39.38),
 ('EZE00100082', 'TMIN', 20.84),
 ('ITE00100554', 'TMIN', 37.22),
 ('EZE00100082', 'TMIN', 21.2),
 ('ITE00100554', 'TMIN', 34.34),
 ('EZE00100082', 'TMIN', 21.2),
 ('ITE00100554', 'TMIN', 36.14),
 ('EZE00100082', 'TMIN', 25.7),
 ('ITE00100554', 'TMIN

In [8]:
# Inspect all parsed records (every entry type, not just TMIN). collect()
# returns the whole RDD to the driver, so this is only suitable for small data.
parsedLines.collect()

[('ITE00100554', 'TMAX', 18.5),
 ('ITE00100554', 'TMIN', 5.359999999999999),
 ('GM000010962', 'PRCP', 32.0),
 ('EZE00100082', 'TMAX', 16.52),
 ('EZE00100082', 'TMIN', 7.699999999999999),
 ('ITE00100554', 'TMAX', 21.2),
 ('ITE00100554', 'TMIN', 9.5),
 ('GM000010962', 'PRCP', 32.0),
 ('EZE00100082', 'TMAX', 24.08),
 ('EZE00100082', 'TMIN', 8.599999999999998),
 ('ITE00100554', 'TMAX', 27.86),
 ('ITE00100554', 'TMIN', 23.72),
 ('GM000010962', 'PRCP', 32.72),
 ('EZE00100082', 'TMAX', 30.2),
 ('EZE00100082', 'TMIN', 18.86),
 ('ITE00100554', 'TMAX', 32.0),
 ('ITE00100554', 'TMIN', 29.66),
 ('GM000010962', 'PRCP', 32.0),
 ('EZE00100082', 'TMAX', 22.1),
 ('EZE00100082', 'TMIN', 18.68),
 ('ITE00100554', 'TMAX', 33.8),
 ('ITE00100554', 'TMIN', 30.919999999999998),
 ('GM000010962', 'PRCP', 32.0),
 ('EZE00100082', 'TMAX', 24.8),
 ('EZE00100082', 'TMIN', 21.56),
 ('ITE00100554', 'TMAX', 34.34),
 ('ITE00100554', 'TMIN', 34.34),
 ('GM000010962', 'PRCP', 32.0),
 ('EZE00100082', 'TMAX', 24.98),
 ('EZE00

In [0]:
# Drop the entry type, leaving (stationID, temperature) pairs.
stationTemps = minTemps.map(lambda x: (x[0], x[2]))
# Keep the smallest temperature per station. The reduced RDD gets its own
# name: rebinding `minTemps` here would change it from triples to pairs, so
# re-running this cell (or the earlier minTemps.collect() cell) would break.
stationMinTemps = stationTemps.reduceByKey(min)
results = stationMinTemps.collect()

In [10]:
# Show the collected per-station results as the cell output.
results

[('ITE00100554', 5.359999999999999), ('EZE00100082', 7.699999999999999)]

In [11]:
# Print one tab-separated line per station with the temperature rounded to
# two decimals and suffixed with "F".
for station, temperature in results:
    formatted = "\t{:.2f}F".format(temperature)
    print(station + formatted)


ITE00100554	5.36F
EZE00100082	7.70F


In [0]:
# pandas groupby --> spark: reduceByKey