## Exercise 1
### - Given a text file of 10,000 lines, each line contains a pair of (key,value)
### - Calculate the average value for each key
### - Apply groupByKey() and reduceByKey() functions
Compare the results and the running time of the two approaches.

### Initialize Spark

In [5]:
# Make the local PySpark installation importable; this has to run before the
# `from pyspark.sql import SparkSession` cell further down.
import findspark
findspark.init()

### Generate text file with 10,000 lines of (key,value)

In [3]:
# Generate a text file of 10,000 lines, one "key,value" pair per line.
#   key   - random integer between 1 and 1000 (inclusive)
#   value - random integer between 1 and 10000 (inclusive)
# Key and value are separated by a comma.
import os
import random

num_lines = 10000

file_name = "data/keyValue.txt"

# open() does not create missing directories, so make sure the target folder
# exists; otherwise a fresh checkout fails with FileNotFoundError.
out_dir = os.path.dirname(file_name)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Plain "w" is sufficient: the file is only written here, never read back.
with open(file_name, "w") as f:
    for i in range(num_lines):
        key = random.randint(1, 1000)
        value = random.randint(1, 10000)
        f.write(f"{key},{value}\n")

print(f"Generated {num_lines} lines of key-value pairs in {file_name}")

Generated 10000 lines of key-value pairs in data/keyValue.txt


In [4]:
# Generate a text file of 10,000 lines, one "key,value" pair per line.
#   key   - random uppercase letter between 'A' and 'Z'
#   value - random integer between 1 and 10000 (inclusive)
# Key and value are separated by a comma.
import os
import random

num_lines = 10000

file_name = "data/keyValuePairs.txt"

# open() does not create missing directories, so make sure the target folder
# exists; otherwise a fresh checkout fails with FileNotFoundError.
out_dir = os.path.dirname(file_name)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Plain "w" is sufficient: the file is only written here, never read back.
with open(file_name, "w") as f:
    for i in range(num_lines):
        # key is a random uppercase letter (code points 65..90, i.e. 'A'..'Z')
        key = chr(random.randint(ord("A"), ord("Z")))
        value = random.randint(1, 10000)
        f.write(f"{key},{value}\n")

print(f"Generated {num_lines} lines of key-value pairs in {file_name}")

Generated 10000 lines of key-value pairs in data/keyValuePairs.txt


## Operations

**Create a Spark Session**

In [6]:
from pyspark.sql import SparkSession

# Build the session for this notebook under the application name
# "KeyValuePairs"; getOrCreate() hands back an already running session
# if there is one instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("KeyValuePairs")
    .getOrCreate()
)

**Load the dataset**

In [8]:
# Read the generated file into an RDD with one string element per text line.
keyValue_rdd = spark.sparkContext.textFile("data/keyValuePairs.txt")

In [9]:
# Peek at the first five raw lines to confirm the "key,value" layout.
keyValue_rdd.take(5)

['U,3590', 'I,7294', 'Y,9201', 'N,4197', 'U,1573']

**Parse the pairs**

In [10]:
# Sanity check: the RDD should hold one element per generated line (10,000).
keyValue_rdd.count()

10000

In [11]:
def parse_pair(line):
    """Turn one "key,value" text line into a ``(key, int(value))`` tuple.

    The line is split a single time and both fields are read from that one
    result, rather than splitting the same string once per field.

    Raises:
        IndexError: if the line contains no comma.
        ValueError: if the second field is not an integer literal.
    """
    fields = line.split(',')
    return fields[0], int(fields[1])


# Pair RDD of (str key, int value), e.g. ('U', 3590).
pairs = keyValue_rdd.map(parse_pair)

In [12]:
# NOTE(review): collect() returns every element of the RDD to the driver, so
# this cell prints all 10,000 pairs; pairs.take(n) is enough for inspection.
pairs.collect()

[('U', 3590),
 ('I', 7294),
 ('Y', 9201),
 ('N', 4197),
 ('U', 1573),
 ('A', 503),
 ('T', 4148),
 ('X', 2174),
 ('G', 6057),
 ('Q', 8936),
 ('X', 8858),
 ('T', 9712),
 ('J', 4137),
 ('Q', 6995),
 ('U', 1894),
 ('A', 8576),
 ('C', 805),
 ('Z', 4456),
 ('Z', 5993),
 ('S', 9932),
 ('I', 9406),
 ('Q', 117),
 ('T', 6087),
 ('L', 7589),
 ('Y', 4130),
 ('A', 2204),
 ('G', 2110),
 ('Y', 9713),
 ('F', 3184),
 ('J', 3779),
 ('Y', 1023),
 ('W', 9305),
 ('H', 9806),
 ('N', 4661),
 ('Q', 6383),
 ('S', 1449),
 ('D', 1047),
 ('V', 3125),
 ('V', 5470),
 ('W', 7922),
 ('J', 6908),
 ('N', 2942),
 ('F', 7344),
 ('G', 6407),
 ('L', 9035),
 ('F', 9025),
 ('V', 4827),
 ('A', 8174),
 ('G', 7377),
 ('R', 4026),
 ('K', 6818),
 ('K', 5526),
 ('P', 8116),
 ('F', 3415),
 ('L', 3739),
 ('L', 967),
 ('I', 7606),
 ('S', 1621),
 ('F', 9040),
 ('P', 3841),
 ('I', 8005),
 ('Z', 7072),
 ('O', 7809),
 ('M', 4958),
 ('B', 3185),
 ('H', 279),
 ('H', 8440),
 ('J', 6818),
 ('F', 9693),
 ('R', 3501),
 ('E', 6839),
 ('B', 4981

In [13]:
# Show only the first five parsed pairs rather than collecting the whole RDD.
pairs.take(5)

[('U', 3590), ('I', 7294), ('Y', 9201), ('N', 4197), ('U', 1573)]

#### Calculate the average value for each key using groupByKey() and reduceByKey()

In [14]:
# Group by key
grouped = pairs.groupByKey()
grouped.collect()

[('N', <pyspark.resultiterable.ResultIterable at 0x1ca1bac3dc0>),
 ('J', <pyspark.resultiterable.ResultIterable at 0x1ca1bac3d00>),
 ('C', <pyspark.resultiterable.ResultIterable at 0x1ca1bac23e0>),
 ('S', <pyspark.resultiterable.ResultIterable at 0x1ca1bac3550>),
 ('L', <pyspark.resultiterable.ResultIterable at 0x1ca1bac34f0>),
 ('W', <pyspark.resultiterable.ResultIterable at 0x1ca1bac3490>),
 ('R', <pyspark.resultiterable.ResultIterable at 0x1ca1bac3430>),
 ('K', <pyspark.resultiterable.ResultIterable at 0x1ca1bac33d0>),
 ('O', <pyspark.resultiterable.ResultIterable at 0x1ca1bac1d20>),
 ('U', <pyspark.resultiterable.ResultIterable at 0x1ca1bac0a90>),
 ('I', <pyspark.resultiterable.ResultIterable at 0x1ca1bac36a0>),
 ('Y', <pyspark.resultiterable.ResultIterable at 0x1ca1bac1f90>),
 ('A', <pyspark.resultiterable.ResultIterable at 0x1ca1bac3c40>),
 ('T', <pyspark.resultiterable.ResultIterable at 0x1ca1bac1330>),
 ('X', <pyspark.resultiterable.ResultIterable at 0x1ca1bac1b10>),
 ('G', <py

In [15]:
def _mean_of_group(values):
    """Arithmetic mean of all values grouped under a single key."""
    return sum(values) / len(values)


# groupByKey() has already gathered every value of a key into one iterable,
# so the average is computed over the complete group in a single step.
averages_groupByKey = grouped.mapValues(_mean_of_group)

# Bring the (key, average) pairs to the driver and show them.
averages_groupByKey_result = averages_groupByKey.collect()
print("Averages using groupByKey():", averages_groupByKey_result)

Averages using groupByKey(): [('N', 5300.175710594315), ('J', 4966.862842892768), ('C', 5078.232876712329), ('S', 4881.608), ('L', 5007.2691358024695), ('W', 5132.850602409639), ('R', 4851.172588832487), ('K', 4781.3378378378375), ('O', 4785.649253731343), ('U', 5012.187335092348), ('I', 5070.219178082192), ('Y', 4936.622784810126), ('A', 5251.349246231156), ('T', 5069.953488372093), ('X', 5192.601648351649), ('G', 5039.108747044917), ('Q', 5065.3730964467), ('Z', 5191.403183023873), ('F', 5077.657458563536), ('H', 4961.325242718447), ('D', 5042.558265582656), ('V', 5062.815789473684), ('P', 4910.549872122762), ('M', 5221.0329670329675), ('B', 4815.994520547945), ('E', 5037.313019390582)]


In [16]:
import time

In [17]:
# Start timing
start_time_groupByKey = time.time()

# Group by key and count occurrences
grouped_pairs = pairs.groupByKey()
counts_groupByKey = grouped_pairs.mapValues(len)

# Collect and print the results
counts_groupByKey_result = counts_groupByKey.collect()
end_time_groupByKey = time.time()

print("Counts using groupByKey():", counts_groupByKey_result)
print("Time taken by groupByKey():", end_time_groupByKey - start_time_groupByKey, "seconds")


Counts using groupByKey(): [('N', 387), ('J', 401), ('C', 365), ('S', 375), ('L', 405), ('W', 415), ('R', 394), ('K', 370), ('O', 402), ('U', 379), ('I', 365), ('Y', 395), ('A', 398), ('T', 387), ('X', 364), ('G', 423), ('Q', 394), ('Z', 377), ('F', 362), ('H', 412), ('D', 369), ('V', 380), ('P', 391), ('M', 364), ('B', 365), ('E', 361)]
Time taken by groupByKey(): 3.1848678588867188 seconds


In [18]:
# Reduce by key
sum_counts = pairs.mapValues(lambda x: (x, 1))\
    .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))

averages_reduceByKey = sum_counts.mapValues(lambda x: x[0]/x[1])

averages_reduceByKey_result = averages_reduceByKey.collect()
print("Averages using reduceByKey():", averages_reduceByKey_result)

Averages using reduceByKey(): [('N', 5300.175710594315), ('J', 4966.862842892768), ('C', 5078.232876712329), ('S', 4881.608), ('L', 5007.2691358024695), ('W', 5132.850602409639), ('R', 4851.172588832487), ('K', 4781.3378378378375), ('O', 4785.649253731343), ('U', 5012.187335092348), ('I', 5070.219178082192), ('Y', 4936.622784810126), ('A', 5251.349246231156), ('T', 5069.953488372093), ('X', 5192.601648351649), ('G', 5039.108747044917), ('Q', 5065.3730964467), ('Z', 5191.403183023873), ('F', 5077.657458563536), ('H', 4961.325242718447), ('D', 5042.558265582656), ('V', 5062.815789473684), ('P', 4910.549872122762), ('M', 5221.0329670329675), ('B', 4815.994520547945), ('E', 5037.313019390582)]


In [19]:
# Start timing
start_time_reduceByKey = time.time()

# Count occurrences using reduceByKey
counts_reduceByKey = pairs.mapValues(lambda value: 1).reduceByKey(lambda x, y: x + y)

# Collect and print the results
counts_reduceByKey_result = counts_reduceByKey.collect()
end_time_reduceByKey = time.time()

print("Counts using reduceByKey():", counts_reduceByKey_result)
print("Time taken by reduceByKey():", end_time_reduceByKey - start_time_reduceByKey, "seconds")


Counts using reduceByKey(): [('N', 387), ('J', 401), ('C', 365), ('S', 375), ('L', 405), ('W', 415), ('R', 394), ('K', 370), ('O', 402), ('U', 379), ('I', 365), ('Y', 395), ('A', 398), ('T', 387), ('X', 364), ('G', 423), ('Q', 394), ('Z', 377), ('F', 362), ('H', 412), ('D', 369), ('V', 380), ('P', 391), ('M', 364), ('B', 365), ('E', 361)]
Time taken by reduceByKey(): 3.404052734375 seconds


In [20]:
# Compare the results
groupByKey_dict = dict(averages_groupByKey_result)
reduceByKey_dict = dict(averages_reduceByKey_result)

comparison = [(key, groupByKey_dict[key], reduceByKey_dict[key]) for key in groupByKey_dict if groupByKey_dict[key] != reduceByKey_dict[key]]

if not comparison:
    print("The results are the same.")
else:
    print("Differences found:", comparison)

The results are the same.
