In [14]:
# Input files and output directory used by the whole notebook
_LAB6_DIR = "/data/students/bigdata-01QYD/Lab6_DBD"
register_path = _LAB6_DIR + "/register.csv"
stations_path = _LAB6_DIR + "/stations.csv"
outputPath = "res_out_Lab6/"
# Criticality cut-off applied when filtering the (station, timeslot) pairs
treshold = 0.4

In [15]:
# register.csv format: stationID\tTimeStamp\tUsedSlots\tFreeSlots
# Load the file as an RDD of raw text lines (one element per line)
registerRDD = sc.textFile(register_path)
# We need to filter out the header and the unusable readings
def filterHeadANDwrongData(line):
    """Tell whether a raw register.csv line is a usable reading.

    Args:
        line: one text line, expected to hold four tab-separated fields
            (stationID, TimeStamp, UsedSlots, FreeSlots).

    Returns:
        True when at least one of UsedSlots / FreeSlots is different from 0.
        False for the header line (it starts with "stat"), for lines with
        fewer than four fields, for lines whose slot counts are not integers,
        and for readings where both slot counts are 0.
    """
    if line.startswith("stat"):
        return False

    fields = line.split("\t")
    if len(fields) < 4:
        # Blank or truncated line: indexing fields[2]/fields[3] would raise
        # IndexError and abort the whole Spark job.
        return False

    try:
        used_slots = int(fields[2])
        free_slots = int(fields[3])
    except ValueError:
        # Non-numeric slot counts cannot be used by the later steps either.
        return False

    return used_slots != 0 or free_slots != 0
    
    
# Keep only the lines accepted by filterHeadANDwrongData
# (no header, no readings with both slot counts equal to 0)
filteredRegisterRDD = registerRDD.filter(filterHeadANDwrongData)

In [16]:
# --------------Task 1 ----------------
# We have to identify the most critical timeslot for each station
# "day of the week-hour" is a timeslot and is associated with all the readings associated with that pair
# criticality of a station S_i in the timeslot T_j =
# (num of readings with free slots == 0 for (S_i, T_j)) / (total num of readings for (S_i, T_j))
from datetime import datetime

# Build the ((S_i, T_j), free slots) pair for one reading
def createPairs(line):
    """Turn one register line into ((stationID, (weekday, hour)), freeSlots).

    The timeslot is made of the abbreviated weekday name (strftime "%a") and
    the hour of the reading's timestamp; the free-slot count is parsed as int.
    """
    fields = line.split("\t")
    reading_time = datetime.strptime(fields[1], "%Y-%m-%d %H:%M:%S")
    timeslot = (reading_time.strftime("%a"), reading_time.hour)
    key = (fields[0], timeslot)
    return (key, int(fields[3]))

In [17]:
# One ((stationID, (weekday, hour)), freeSlots) pair per valid reading
pairsRDD = filteredRegisterRDD.map(createPairs)

In [18]:
# Gather all the free-slot readings that share the same (station, timeslot) key
# and mark the grouped RDD for caching
pairsValueGroupedRDD = pairsRDD.groupByKey().cache()
# Debugging aid (collect() brings every group to the driver):
#pairsValueGroupedRDD = pairsValueGroupedRDD.mapValues(lambda value: list(value)).collect()

In [19]:
# Criticality value of a single (station, timeslot) pair
def computeCriticalityValue(line):
    """Return (key, share of the readings in line[1] that are equal to 0).

    line is a (key, readings) pair; the key is passed through unchanged.
    """
    key, readings = line[0], line[1]
    full_count = sum(1 for free_slots in readings if free_slots == 0)
    return (key, full_count / len(readings))
    
# ((stationID, (weekday, hour)), criticality) for every grouped pair
pairsCritValueRDD = pairsValueGroupedRDD.map(computeCriticalityValue)

In [20]:
# We keep only the pairs whose criticality value is >= treshold
pairsCritValueFilteredRDD = pairsCritValueRDD.filter(lambda pair: pair[1] >= treshold)

In [23]:
# Re-key by station: (stationID, ((weekday, hour), criticality))
pairsReMapS_idTupleRDD = pairsCritValueFilteredRDD.map(
    lambda keyed: (keyed[0][0], (keyed[0][1], keyed[1]))
)

In [25]:
# groupByKey gathers, for each station, all of its ((weekday, hour), criticalValue) tuples;
# the grouped RDD is marked for caching
pairsReMappedGroupedRDD = pairsReMapS_idTupleRDD.groupByKey().cache()

In [26]:
# Turn the grouped values of each station into a plain Python list.
# NOTE: this rebinds the same variable name to the new, list-valued RDD.
pairsReMappedGroupedRDD = pairsReMappedGroupedRDD.mapValues(lambda v: list(v))

In [27]:
# We define a function to select the most critical timeslot for each station
def returnMostCritical(pair):
    """Pick the most critical timeslot of one station.

    Args:
        pair: (stationID, timeslots) where timeslots is an iterable of
            ((weekday, hour), criticality) tuples.

    Returns:
        (stationID, ((weekday, hour), criticality)) for the selected timeslot.
        The slot with the highest criticality wins; ties are broken by the
        smallest hour and then by the smallest weekday string.

    Raises:
        ValueError: if the station has no timeslots at all.
    """
    s_id, timeslots = pair[0], pair[1]

    # A single ordering key replaces the previous multi-pass search, which
    # looped forever (index never advanced) on a criticality+hour tie and
    # failed when every criticality was <= 0.1.
    most_critical = min(
        timeslots,
        key=lambda ts: (-ts[1], ts[0][1], ts[0][0]),
    )
    return (s_id, most_critical)

                
# Apply the selection to every station: (stationID, ((weekday, hour), criticality)).
# NOTE(review): the same result could be obtained with reduceByKey, without grouping first.
pairSidMostCriticalRDD = pairsReMappedGroupedRDD.map(returnMostCritical)

In [30]:
# We need to store using a KML file: the output file must contain one marker of type
# Placemark for each pair characterized by:
# StationID, WeekDay and Hour, CriticalityValue, Longitude and Latitude

# Stations file: tab-separated lines; the first three fields are used
stationsRDD = sc.textFile(stations_path)

# (stationID, (field 1, field 2)); each line is split only once
mappedStationRDD = (
    stationsRDD
    .map(lambda line: line.split("\t"))
    .map(lambda fields: (fields[0], (fields[1], fields[2])))
)

# Join on the station ID:
# (stationID, (((weekday, hour), criticality), (field 1, field 2)))
joinedRDD = pairSidMostCriticalRDD.join(mappedStationRDD)
#joinedRDD.collect()

In [None]:
# Formatting for the KML file
def kmlFormatMapping(pair):
    """Render one joined record as a KML Placemark string.

    Args:
        pair: (stationID, (((weekday, hour), criticality), (longitude, latitude))).

    Returns:
        A single-line "<Placemark>...</Placemark>" string carrying the station
        ID as name, the DayWeek / Hour / Criticality values as ExtendedData
        and "longitude,latitude" as the point coordinates.
    """
    name = pair[0]
    (day, hour), critValue = pair[1][0]
    longitude, latitude = pair[1][1]

    # Single-quoted literals keep the double quotes around the attribute
    # values; the previous ""DayWeek"" form silently dropped them, and a stray
    # "3" was appended after the hour.
    return (
        '<Placemark><name>{}</name>'
        '<ExtendedData>'
        '<Data name="DayWeek"><value>{}</value></Data>'
        '<Data name="Hour"><value>{}</value></Data>'
        '<Data name="Criticality"><value>{}</value></Data>'
        '</ExtendedData>'
        '<Point><coordinates>{},{}</coordinates></Point>'
        '</Placemark>'
    ).format(name, day, hour, critValue, longitude, latitude)

# Render every joined record as a Placemark string, bring everything into one
# partition and write the result to the output directory
outputRDD = joinedRDD.map(kmlFormatMapping).coalesce(1)
outputRDD.saveAsTextFile(outputPath)
            

In [32]:
# Number of Placemark strings in the output RDD
outputRDD.count()

48