In [1]:
import csv
import math
import os
import random
import sys

import pandas as pd
import pyspark
# Location of the temperature readings CSV. Set the TEMPERATURE_READINGS_PATH environment
# variable to point at a copy elsewhere; otherwise the original machine-specific path is used.
tr_path = os.environ.get(
    "TEMPERATURE_READINGS_PATH",
    "/home/flennic/Documents/big-data-analytics-ressources/temperature-readings-big.csv",
)

In [2]:
# getOrCreate hands back the context that is already running (if any), so re-executing this
# cell in a live kernel no longer fails because a second SparkContext cannot be started.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("Temperature"))

**Task 1a:** What are the lowest and highest temperatures measured each year for the period 1950 to 2014? Provide the list sorted in descending order with respect to the maximum temperature. Extend the program to include the station number (not the station name) where the maximum/minimum temperature was measured.

**Answer:** Since exercise 1a) only extends exercise 1) by the station number, we solve both in a single program to save computation time.

In [3]:
%%time

# Task 1 / 1a: per-year highest and lowest temperature, with the reporting station, 1950-2014.
temperature_reading = sc.textFile(tr_path)

# Every input line is split on ";". Field 0 is used as the station number, the first four
# characters of field 1 as the year and field 3 as the temperature.
# Result: (year, (station, temperature)) pairs; all values are still strings at this point.
readings = temperature_reading.map(lambda line: line.split(";"))
readings = readings.map(lambda fields: (fields[1][0:4], (fields[0], fields[3])))
readings = readings.filter(lambda pair: 1950 <= int(pair[0]) <= 2014)

# Track the warmest and the coldest reading of every year in ONE reduceByKey pass: each reading
# starts out as both the current maximum and the current minimum of its year.
# Earlier, the readings were reduced twice (so the uncached input was evaluated twice) and the two
# results were combined with RDD.zip, which pairs elements by position and therefore relies on
# both RDDs having identical partitioning and element order instead of matching on the year key.
# The lambdas are kept self-contained (no cell-level helper names) so they serialize cleanly.
extremes = readings.mapValues(lambda reading: (reading, reading))
extremes = extremes.reduceByKey(
    lambda a, b: (
        max(a[0], b[0], key=lambda reading: float(reading[1])),
        min(a[1], b[1], key=lambda reading: float(reading[1])),
    )
)

# Flatten to (year, temp_max, station_max, temp_min, station_min), then sort by the maximum
# temperature in descending order as the task requires.
res = extremes.map(lambda pair: (pair[0], pair[1][0][1], pair[1][0][0], pair[1][1][1], pair[1][1][0]))
res = res.sortBy(lambda row: float(row[1]), ascending=False)

# NOTE(review): `global df` has no effect at true module scope; it is presumably here so that the
# assignment reaches the notebook namespace when the cell body runs under `%%time` - confirm
# before removing.
global df
# The third column used to be misspelled 'tation_max'; it now matches the Task 1b frame.
df = pd.DataFrame(res.collect(), columns=['year', 'temp_max', 'station_max', 'temp_min', 'station_min'])

CPU times: user 393 ms, sys: 136 ms, total: 530 ms
Wall time: 31min 5s


In [4]:
# Show the first rows of the Task 1a result frame assigned to `df` in the previous cell.
df.head()

Unnamed: 0,year,temp_max,tation_max,temp_min,station_min
0,1975,36.1,86200,-37.0,157860
1,1992,35.4,63600,-36.1,179960
2,1994,34.7,117160,-40.5,179960
3,2010,34.4,75250,-41.7,191910
4,2014,34.4,96560,-42.5,192840


**Task 1b**: Same exercise, this time as a non-parallel version.

In [5]:
def csv_read_lazy(csvfile, delimiter = ",", encoding = "utf-8"):
    """Lazily yield the rows of a delimited text file.

    The file is opened when iteration starts and stays open until the generator is
    exhausted or closed, so only one row at a time has to be held in memory.

    Args:
        csvfile: Path of the file to read.
        delimiter: Field separator passed to ``csv.reader``.
        encoding: Text encoding used to decode the file.

    Yields:
        One list of strings per row, in file order.
    """
    # newline="" passes line endings to the csv module untouched, as its documentation asks for;
    # without it, "\r\n" inside quoted fields is silently rewritten to "\n".
    with open(csvfile, encoding = encoding, newline = "") as f:
        yield from csv.reader(f, delimiter = delimiter)

In [12]:
%%time

def get_temperature_information(min_year = 1950, max_year = 2014):
    """Find each year's highest and lowest temperature in a single sequential pass.

    Rows are streamed from ``tr_path`` (";"-separated). Field 0 is read as the station
    number, the first four characters of field 1 as the year and field 3 as the
    temperature. Years without any reading in the file do not appear in the result.
    On ties the reading that occurs first in the file is kept.

    Args:
        min_year: First year (inclusive) to consider.
        max_year: Last year (inclusive) to consider.

    Returns:
        A DataFrame with the columns ``year``, ``temp_max``, ``station_max``,
        ``temp_min`` and ``station_min``, sorted by ``temp_max`` in descending order.

    Raises:
        ValueError: If a year, or the station/temperature of a row inside the year
            range, cannot be converted to a number.
        IndexError: If a row has fewer than four fields.
    """
    # year -> [temp_max, station_max, temp_min, station_min]; a year gets an entry only once a
    # reading for it has been seen, so no +/-Inf placeholders have to be filtered out afterwards.
    extremes = {}

    for row in csv_read_lazy(tr_path, delimiter = ";"):
        year = int(row[1][0:4])
        if not (min_year <= year <= max_year):
            continue

        # Parse the remaining fields only for rows that survive the year filter.
        station = int(row[0])
        temp = float(row[3])

        current = extremes.get(year)
        if current is None:
            # The first reading of a year is both its maximum and its minimum.
            extremes[year] = [temp, station, temp, station]
            continue

        if temp > current[0]:
            current[0], current[1] = temp, station
        if temp < current[2]:
            current[2], current[3] = temp, station

    # Build the frame in one go (rows in ascending year order) instead of appending row by row.
    records = [[year] + extremes[year] for year in sorted(extremes)]
    df = pd.DataFrame(records, columns=['year', 'temp_max', 'station_max', 'temp_min', 'station_min'])
    df = df.astype({"year": int, "station_max": int, "station_min": int})

    return df.sort_values(by=['temp_max'], ascending=False)

# NOTE(review): `global df` has no effect at true module scope; it is presumably here so that the
# assignment reaches the notebook namespace when the cell body runs under `%%time` - confirm
# before removing.
global df
# Run the sequential version with its default year range (1950-2014).
df = get_temperature_information()

CPU times: user 15min 26s, sys: 10.8 s, total: 15min 37s
Wall time: 15min 48s


In [13]:
# Show the first rows of the frame returned by get_temperature_information().
df.head()

Unnamed: 0,year,temp_max,station_max,temp_min,station_min
25,1975,36.1,86200,-37.0,157860
42,1992,35.4,63600,-36.1,179960
44,1994,34.7,117160,-40.5,179960
64,2014,34.4,96560,-42.5,192840
60,2010,34.4,75250,-41.7,191910
