# TASK A

In [22]:
# Location of the input log and of the two output folders
inputPath = "/user/s292129/data/exam_ex3_data/PerformanceLog.txt"
outputPath1 = "res_out_ExExemple3_1/"
outputPath2 = "res_out_ExExemple3_2/"
# Thresholds compared against the average CPU and RAM usage computed below
CPUthr = 10.0
RAMthr = 1.5

In [23]:
# Load the log file as an RDD with one element per text line
inputRDD = sc.textFile(inputPath)

In [24]:
# Keep only the lines that start with "2018/05/"; the result is cached
# because several later transformations start from it
filteredRDD = inputRDD.filter(lambda line: line.startswith("2018/05/")).cache()

In [25]:
def _cpuPairFromLine(line):
    """Map one comma-separated line to ((field 2, hour), (field 3, 1)).

    The hour is the part of field 1 that precedes the first ':'.
    """
    fields = line.split(",")
    key = (fields[2], fields[1].split(":")[0])
    return (key, (fields[3], 1))


# One ((field 2, hour), (CPU field as text, 1)) pair per filtered line
cpuUsagesCountRDD = filteredRDD.map(_cpuPairFromLine)

In [26]:
def _ramPairFromLine(line):
    """Map one comma-separated line to ((field 2, hour), (field 4, 1)).

    The hour is the part of field 1 that precedes the first ':'.
    """
    fields = line.split(",")
    key = (fields[2], fields[1].split(":")[0])
    return (key, (fields[4], 1))


# One ((field 2, hour), (RAM field as text, 1)) pair per filtered line
ramUsagesCountRDD = filteredRDD.map(_ramPairFromLine)

In [27]:
def computeSumCount(value_pair1, value_pair2):
    """Merge two (usage, count) pairs into a (usage sum, count sum) pair.

    Each usage component is converted with float() before the addition, so
    it may be either a number or a numeric string.
    """
    usage_a, count_a = value_pair1[0], value_pair1[1]
    usage_b, count_b = value_pair2[0], value_pair2[1]

    total_usage = float(usage_a) + float(usage_b)
    total_count = count_a + count_b
    return (total_usage, total_count)

# Alternative: combineByKey(lambda inputElem: (inputElem, 1),
#                           lambda intermed, inputElem: (intermed[0]+inputElem, intermed[1]+1),
#                           lambda intermed1, intermed2: (intermed1[0]+intermed2[0], intermed1[1]+intermed2[1]))
# followed by mapValues(lambda v: v[0]/v[1]) to obtain the average

In [29]:
def _meanOfSumCount(sumCount):
    """Return sum / count for a (sum, count) pair.

    The sum goes through float() first because a key seen only once still
    carries the value produced by the map step, untouched by the reducer.
    """
    return float(sumCount[0]) / sumCount[1]


# Per-key (usage sum, number of records) for CPU and RAM
cpuSumCountRDD = cpuUsagesCountRDD.reduceByKey(computeSumCount)
ramSumCountRDD = ramUsagesCountRDD.reduceByKey(computeSumCount)

# Per-key average usage
averageCpuUsageRDD = cpuSumCountRDD.mapValues(_meanOfSumCount)
averageRamUsageRDD = ramSumCountRDD.mapValues(_meanOfSumCount)

In [33]:
# Keep the keys whose average CPU usage exceeds CPUthr and, separately,
# the keys whose average RAM usage exceeds RAMthr
filteredCpuRDD = averageCpuUsageRDD.filter(lambda pair: pair[1] > CPUthr)
filteredRamRDD = averageRamUsageRDD.filter(lambda pair: pair[1] > RAMthr)

# Join on the key: only keys present in both filtered RDDs survive
finalRDD = filteredCpuRDD.join(filteredRamRDD)
# the alternative by prof is more efficient (no final join op.)

In [None]:
# Only the keys of the joined RDD are written to the first output folder
finalRDD.keys().saveAsTextFile(outputPath1)

# DATAFRAME BASED SOLUTION


In [None]:
# Read the headerless CSV log. Without a header Spark names the columns
# _c0, _c1, ...; the first five are renamed to the names used by the SQL
# queries on the "statistics" view. The field order (date, time, vsID,
# cpuUsage, ramUsage) follows the positions used by the RDD-based solution.
inputDF = spark.read.load(inputPath,
                          format="csv",
                          header=False,
                          inferSchema=True)\
    .withColumnRenamed("_c0", "date")\
    .withColumnRenamed("_c1", "time")\
    .withColumnRenamed("_c2", "vsID")\
    .withColumnRenamed("_c3", "cpuUsage")\
    .withColumnRenamed("_c4", "ramUsage")

# Expose the DataFrame to Spark SQL under the name "statistics"
inputDF.createOrReplaceTempView("statistics")

In [None]:
def extractHour(time):
    """Return the part of *time* that precedes its first ':' separator."""
    hour, _, _ = time.partition(":")
    return hour

# Register extractHour so SQL queries can call it by name
spark.udf.register("extractHour", extractHour)

In [None]:
# Select the (vsID, hour) pairs of May 2018 whose average CPU usage exceeds
# CPUthr and whose average RAM usage exceeds RAMthr.
# The query runs through spark.sql because DataFrame objects have no sql()
# method. The numeric thresholds are converted with str() because
# concatenating str and float raises TypeError.
selected_vsIDHour = spark.sql("SELECT vsID, extractHour(time) as Hour \
            FROM statistics \
            WHERE date >= '2018/05/01' AND date <= '2018/05/31' \
            GROUP BY vsID, extractHour(time) \
            HAVING avg(cpuUsage) > " + str(CPUthr) + " AND avg(ramUsage) > " + str(RAMthr))

In [None]:
# "outputPath" was never defined. This is the alternative (DataFrame)
# solution of Task A, so it targets the Task A folder.
# NOTE(review): run only one of the two Task A solutions, or confirm the
# intended folder; the write fails if the folder already exists.
selected_vsIDHour.write.csv(outputPath1, header=False)

# TASK B  - RDD Based solution

In [54]:
def _cpuByVsDateHour(line):
    """Build the ((vsID, date, hour), cpuUsage) pair for one log line.

    The hour is converted to int and the CPU usage to float.
    """
    fields = line.split(',')
    key = (fields[2], fields[0], int(fields[1].split(':')[0]))
    return (key, float(fields[3]))


# extract ((vsID, date, hour), cpuUsage) pairs
vsID_dateRDD = filteredRDD.map(_cpuByVsDateHour)

In [55]:
# Maximum CPU usage per (vsID, date, hour); the builtin max applied to two
# values is the same pairwise reducer as an explicit two-argument lambda
key_maxPerHourRDD = vsID_dateRDD.reduceByKey(max)

In [56]:
# Keep only the (vsID, date, hour) entries whose maximum CPU usage is
# above 90 or below 10; everything in between is dropped
key_maxPerHourRDD = key_maxPerHourRDD.filter(lambda pair: \
                                    pair[1] > 90 or pair[1] < 10)

In [64]:
##### Alternative Method #########
def mapMaxValue(pair):
    """Turn a ((vsID, date, hour), maxCpu) pair into per-day counters.

    Returns ((vsID, date), (high, low)), where high is 1 when maxCpu > 90
    and low is 1 when maxCpu < 10. A value in [10, 90] yields the neutral
    counters (0, 0) rather than an implicit None, so the output can always
    be summed component-wise by a reduceByKey.
    """
    vsID = pair[0][0]
    date = pair[0][1]
    maxValue = pair[1]

    if maxValue > 90:
        counters = (1, 0)
    elif maxValue < 10:
        counters = (0, 1)
    else:
        # Neither critical nor under-used: contributes nothing to either count
        counters = (0, 0)
    return ((vsID, date), counters)


# Convert each per-hour maximum into (high, low) counters keyed by (vsID, date)
mappedRDD = key_maxPerHourRDD.map(mapMaxValue)
# Sum the two counters component-wise for each (vsID, date)
keys_countersRDD = mappedRDD.reduceByKey(lambda v1, v2: (v1[0]+v2[0], v1[1]+v2[1]))
# Keep the keys for which both counters are at least 8
outputRDD = keys_countersRDD.filter(lambda pair: pair[1][0] >= 8\
                        and pair[1][1] >= 8).keys()
#######################################

In [58]:
# Re-key by (vsID, date) and move the hour into the value: (hour, maxCpu)
key_hourMaxRDD = key_maxPerHourRDD.map(lambda pair:\
                                       ((pair[0][0], pair[0][1]),\
                                      (pair[0][2], pair[1])))

In [59]:
# Gather all (hour, maxCpu) values that share the same (vsID, date) key
groupedRDD = key_hourMaxRDD.groupByKey()

In [61]:
def countForUnbalanced(listOfTuples):
    """Count the distinct hours with high usage and with low usage.

    listOfTuples is an iterable of (hour, usage) entries. Returns the pair
    (number of distinct hours with usage > 90,
     number of distinct hours with usage < 10).

    Sets replace the original lists: they de-duplicate the hours on
    insertion and avoid scanning a list for every membership test.
    """
    hoursGreatUsages = set()
    hoursLowUsages = set()

    for entry in listOfTuples:
        hour = entry[0]
        usage = entry[1]
        if usage > 90:
            hoursGreatUsages.add(hour)
        elif usage < 10:
            hoursLowUsages.add(hour)

    return (len(hoursGreatUsages), len(hoursLowUsages))

# Replace each group of (hour, maxCpu) values with its (high, low) hour counts
keys_countersRDD = groupedRDD.mapValues(countForUnbalanced)

In [63]:
# Inspect the per-(vsID, date) counters
keys_countersRDD.collect()

[(('VS3', '2018/05/01'), (15, 8)), (('VS2', '2018/05/01'), (1, 0))]

In [62]:
# Keep the (vsID, date) keys for which both the high-usage and the
# low-usage hour counts are at least 8
outputRDD = keys_countersRDD.filter(lambda pair: pair[1][0] >= 8\
                                           and pair[1][1] >= 8).keys()
outputRDD.collect()

[('VS3', '2018/05/01')]

# Task B, Spark-SQL solution

In [None]:
# For every (vsID, date, hour) of May 2018 compute the maximum CPU usage
# (column MaxCPU) and keep only the hours whose maximum is above 90 or
# below 10. The result is published as the temporary view "tempView".
spark.sql("SELECT vsID, date, extractHour(time) AS Hour,\
           max(cpuUsage) as MaxCPU \
          FROM statistics \
          WHERE date >= '2018/05/01' AND date <= '2018/05/31' \
          GROUP BY vsID, date, Hour \
          HAVING max(cpuUsage) > 90 OR max(cpuUsage) < 10" )\
.createOrReplaceTempView("tempView")

# Both queries read tempView, whose per-hour maximum column is named MaxCPU
# (there is no cpuUsage column in that view). WHERE must come before
# GROUP BY, so the row filter is applied before the rows are grouped.

# (vsID, date) pairs with at least 8 hours whose maximum CPU usage is > 90
query1 = spark.sql("SELECT vsID, date \
           FROM tempView \
           WHERE MaxCPU > 90 \
           GROUP BY vsID, date \
           HAVING count(*) >= 8")

# (vsID, date) pairs with at least 8 hours whose maximum CPU usage is < 10
query2 = spark.sql("SELECT vsID, date \
           FROM tempView \
           WHERE MaxCPU < 10 \
           GROUP BY vsID, date \
           HAVING count(*) >= 8")

# Pairs that satisfy both conditions
outputDF = query1.intersect(query2)

In [None]:
## ALTERNATIVE BY PROF
def greathan90(value):
    """Return 1 when *value* is strictly greater than 90, otherwise 0."""
    return 1 if value > 90 else 0
    
def lessthan10(value):
    """Return 1 when *value* is strictly less than 10, otherwise 0."""
    return 1 if value < 10 else 0

# Register both flag functions for use in SQL. The explicit "int" return
# type makes SUM add the flags as numbers; without it the registered UDF
# returns strings.
spark.udf.register("greathan90", greathan90, "int")
spark.udf.register("lessthan10", lessthan10, "int")

# Single pass over tempView (one row per (vsID, date, hour) with its MaxCPU):
# per (vsID, date), count the hours above 90 and the hours below 10 and
# require at least 8 of each.
# NOTE(review): the original was a sketch with "..." placeholders; the
# SELECT/FROM/GROUP BY parts are reconstructed from the other Task B
# solutions, so confirm against the intended query.
query = spark.sql("SELECT vsID, date \
                   FROM tempView \
                   GROUP BY vsID, date \
                   HAVING SUM(greathan90(MaxCPU)) >= 8 \
                    AND SUM(lessthan10(MaxCPU)) >= 8")