In [1]:
import pyspark
import random
# Reuse the already-running SparkContext when this cell is executed again:
# calling the SparkContext constructor a second time in the same kernel
# fails because only one context may be active at a time.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("Pi"))

In [2]:
# Paths of the input text files
dataCenters_path = "DataCenters.txt"
dailyPowerConsumption_path = "DailyPowerConsumption.txt"
companies_path = "Companies.txt"

In [3]:
# Output names for the two parts of the exercise
# NOTE(review): presumably output locations; not referenced by the visible cells
output1, output2 = "solutionOutPart1", "solutionOutPart2"

In [4]:
# Load the input files as RDDs of text lines

# Daily power consumption records, one per line: CodDC,Date,kWh
pwrConsRDD = sc.textFile(dailyPowerConsumption_path)

# Data center records, one per line: CodDC,CodC,City,Country,Continent
# Cached because several later cells read it
dataCenterRDD = sc.textFile(dataCenters_path).cache()

In [5]:
# Inspect the data center records (collect() returns every element to the driver)
dataCenterRDD.collect()

['DC1,C2,Nice,France,Europe',
 'DC2,C2,Nice,France,Europe',
 'DC3,C2,Rome,Italy,Europe',
 'DC4,C3,Milan,Italy,Europe',
 'DC5,C3,Milan,Italy,Europe',
 'DC6,C3,Atlanta,USA,America',
 'DC7,C3,Atlanta,USA,America',
 'DC8,C1,Atlanta,USA,America',
 'DC9,C1,Buenos Aires,Argentina,America',
 'DC10,C1,Buenos Aires,Argentina,America']

In [6]:
# Inspect the power consumption records (collect() returns every element to the driver)
pwrConsRDD.collect()

['DC1,2020/01/12,1020',
 'DC2,2020/01/12,1020',
 'DC3,2020/01/12,1020',
 'DC4,2020/01/12,1020',
 'DC5,2020/01/12,1020',
 'DC6,2020/01/12,1020',
 'DC7,2020/01/12,1020',
 'DC8,2020/01/12,1020',
 'DC9,2020/01/12,1020',
 'DC10,2020/01/12,1020',
 'DC1,2020/01/13,1020',
 'DC2,2020/01/13,1020',
 'DC3,2020/01/13,1020',
 'DC4,2020/01/13,1020',
 'DC5,2020/01/13,1020',
 'DC6,2020/01/13,1020',
 'DC7,2020/01/13,1020',
 'DC8,2020/01/13,1020',
 'DC9,2020/01/13,20',
 'DC10,2020/01/13,20',
 'DC1,2020/01/14,1020',
 'DC2,2020/01/14,1020',
 'DC3,2020/01/14,1020',
 'DC4,2020/01/14,1020',
 'DC5,2020/01/14,1020',
 'DC6,2020/01/14,1020',
 'DC7,2020/01/14,1020',
 'DC8,2020/01/14,1020',
 'DC9,2020/01/14,1020',
 'DC10,2020/01/14,320',
 'DC1,2021/01/14,120',
 'DC2,2021/01/14,120',
 'DC3,2021/01/14,120',
 'DC4,2021/01/14,120',
 'DC5,2021/01/14,120',
 'DC6,2021/01/14,1500',
 'DC7,2021/01/14,1500',
 'DC8,2021/01/14,1500',
 'DC9,2021/01/14,1500',
 'DC10,2021/01/14,1500',
 'DC1,2021/01/15,120',
 'DC2,2021/01/15,120',


In [7]:
# Count the total number of data centers worldwide and compute the threshold:
# the smallest number of data centers that is at least 90% of all of them.
# Integer ceiling arithmetic replaces int(count * 0.9): truncation rounds the
# threshold down whenever the count is not a multiple of 10 (e.g. 11 data
# centers -> 9, only ~82%), so a ">= threshold" check would accept too much.
numDataCenters = dataCenterRDD.count()
threshold = (9 * numDataCenters + 9) // 10

In [8]:
# Display the computed threshold
print(threshold)

9


In [9]:
# For each date, count the records whose kWh value is at least 1000
# key = date
# value = number of such records on that date
highPwrConsDCPerDay = (
    pwrConsRDD
    .map(lambda record: record.split(","))
    .filter(lambda parts: float(parts[2]) >= 1000)
    .map(lambda parts: (parts[1], 1))
    .reduceByKey(lambda countA, countB: countA + countB)
)

In [10]:
# Inspect the per-date counts
highPwrConsDCPerDay.collect()

[('2020/01/12', 10),
 ('2020/01/13', 8),
 ('2020/01/14', 9),
 ('2021/01/14', 5),
 ('2021/01/15', 5)]

In [11]:
# Keep only the dates whose count reaches the threshold, dropping the count
res1 = (
    highPwrConsDCPerDay
    .filter(lambda dateCount: dateCount[1] >= threshold)
    .map(lambda dateCount: dateCount[0])
)

In [12]:
# Result of part 1: the selected dates
res1.collect()

['2020/01/12', '2020/01/14']

In [13]:
#########################################
# PART 2
#########################################
# Consider the power consumptions and keep only the entries related to year 2021
# and obtain the following pairRDD
# key = codDC
# value = kWh
# and use a reduceByKey to sum the power consumption for the entire year for
# each data center

def mapCodDCpwrCons(line):
    """Turn one comma-separated consumption line into a (codDC, kWh) pair.

    The first field is returned unchanged as the key; the third field is
    converted to float and returned as the value.
    """
    parts = line.split(",")
    return (parts[0], float(parts[2]))
    

# Total consumption per data center over the records whose date starts with "2021"
yearlyPwrCons = (
    pwrConsRDD
    .filter(lambda record: record.split(",")[1][:4] == "2021")
    .map(mapCodDCpwrCons)
    .reduceByKey(lambda total, kWh: total + kWh)
)

In [14]:
# for each data center, keep the continent information
# key = codDC
# value = continent
def mapCodDCContinent(line):
    """Turn one comma-separated data center line into a (codDC, continent) pair.

    The first field is the key and the fifth field is the value.
    """
    parts = line.split(",")
    return (parts[0], parts[4])


# Pair RDD built from the data center records: key = codDC, value = continent
dcAndContinent = dataCenterRDD.map(mapCodDCContinent)

In [15]:
# Inner-join the 2021 consumption of each data center with its continent.
# The joined values are (kWhPerDataCenter2021, continent) pairs; re-key them as
# key = continent
# value = (1, kWhPerDataCenter2021)   -- the 1 counts one data center
continentOnePwr = (
    yearlyPwrCons
    .join(dcAndContinent)
    .values()
    .map(lambda kwhContinent: (kwhContinent[1], (1, kwhContinent[0])))
)

In [16]:
# For each continent, add up the data-center counters and the 2021 power
# consumption, then divide the total consumption by the number of data centers.
# key = continent
# value = (number of data centers, avg power consumption in the year 2021)
# Cached because later cells read this RDD more than once
numDCandAvgPwrCons = (
    continentOnePwr
    .reduceByKey(lambda acc, cur: (acc[0] + cur[0], acc[1] + cur[1]))
    .mapValues(lambda countTotal: (countTotal[0], countTotal[1] / countTotal[0]))
    .cache()
)

In [17]:
# Component-wise maximum over the continents:
# (maximum number of data centers, maximum avg consumption in 2021).
# The two maxima are computed independently of each other.
maxDCAndConsumptionPerContinentThresholds = (
    numDCandAvgPwrCons
    .values()
    .reduce(lambda left, right: tuple(map(max, left, right)))
)

In [18]:
# Keep the continents that reach both maxima at the same time.
# NOTE(review): the average is compared with exact float equality.
maxNumDC, maxAvgPwr = maxDCAndConsumptionPerContinentThresholds
res2 = (
    numDCandAvgPwrCons
    .filter(lambda kv: kv[1][0] == maxNumDC and kv[1][1] == maxAvgPwr)
    .keys()
)

In [19]:
# Result of part 2: the selected continents
res2.collect()

['America']