In [1]:
import findspark
findspark.init()

In [2]:
import os

from pyspark import SparkConf, SparkContext

In [3]:
# Spark configuration: run locally ("local" master) under the application name "KeyValueRDD".
conf = SparkConf().setMaster("local").setAppName("KeyValueRDD")
# getOrCreate() returns the already-running SparkContext if there is one, so re-running this
# cell in a live kernel no longer fails with "Cannot run multiple SparkContexts at once",
# which is what calling the SparkContext(conf=conf) constructor a second time does.
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
# rdd.map(lambda x: (x, 1))  -> a key-value (pair) RDD is just an RDD whose elements are (key, value) tuples

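# rdd.reduceByKey(lambda x,y : x+y)   --> a transformation (lazy, returns a new RDD), NOT an action;
#                                         it merges all values of the same key using the given function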
# groupByKey()
# sortByKey()
# rdd.values()  -> to get the rdd of only values
# rdd.keys()  -> to get the rdd of only keys


# SQL style joins on 2 key-value RDDs
# join , rightOuterJoin , leftOuterJoin , cogroup , subtractByKey

# With key-value data, use mapValues() and flatMapValues() when your transformation does not modify the keys:
# they pass only the value to your function and retain the original RDD's partitioning.

In [11]:
# Location of the input CSV. The default is the original machine-specific path; set the
# FAKEFRIENDS_CSV environment variable to point at the file on any other machine instead
# of editing this cell.
FAKEFRIENDS_CSV = os.environ.get(
    "FAKEFRIENDS_CSV",
    "file:///Users/hdagar3/Documents/Spark_Things/Spark_Course_Files_FraneKane/fakefriends.csv",
)
# textFile() yields an RDD with one string element per line of the file.
rdd = sc.textFile(FAKEFRIENDS_CSV)

In [12]:
# Sanity check: show the Python class of the object returned by sc.textFile().
print(type(rdd))

<class 'pyspark.rdd.RDD'>


In [13]:
def split_line(line):
    """Parse one comma-separated record into an ``(age, friends)`` tuple.

    The fields at index 2 and index 3 of the comma-split line are converted to
    ``int`` and returned, in that order; every other field is ignored.
    (Presumably the record layout is id,name,age,friends -- confirm against the CSV.)

    Raises:
        IndexError: if the line has fewer than four comma-separated fields.
        ValueError: if either of the two fields is not a valid integer.
    """
    fields = line.split(',')
    return int(fields[2]), int(fields[3])

In [14]:
# Parse every input line into an (age, number_of_friends) tuple; map() produces exactly
# one output element per input line.
ageFriendsRDD = rdd.map(split_line)

In [15]:
# Show the Python class of the mapped RDD (the recorded output shows it differs from
# the class of the RDD returned by textFile()).
print(type(ageFriendsRDD))

<class 'pyspark.rdd.PipelinedRDD'>


In [16]:
# top(10) returns the 10 largest elements in descending order. Tuples compare by their
# first item (age) and then by the second (friends), so the sample shows the highest age.
print(ageFriendsRDD.top(10))

[(69, 491), (69, 470), (69, 431), (69, 361), (69, 236), (69, 148), (69, 116), (69, 75), (69, 15), (69, 9)]


In [17]:
# groupedRDD = ageFriendsRDD.groupByKey()

In [18]:
# print(groupedRDD.collect())

In [19]:
# mapValues() passes only the value (the friend count) to the function and leaves the key
# (the age) untouched -- unlike map(), which would receive the whole (key, value) tuple.
# Each record becomes (age, (friends, 1)); the trailing 1 is a per-record counter that is
# summed per age in the reduceByKey() step to get the number of people of that age.
modifiedAgeFriendsRDD = ageFriendsRDD.mapValues(lambda friends: (friends, 1))
print(modifiedAgeFriendsRDD.top(10))

[(69, (491, 1)), (69, (470, 1)), (69, (431, 1)), (69, (361, 1)), (69, (236, 1)), (69, (148, 1)), (69, (116, 1)), (69, (75, 1)), (69, (15, 1)), (69, (9, 1))]


In [20]:
def _add_pairs(left, right):
    """Add two (friends_total, people_count) tuples element by element."""
    return (left[0] + right[0], left[1] + right[1])


# reduceByKey() calls the function with two *values* of the same key (never the key itself);
# here both arguments are (friends_total, people_count) tuples, so the result per age is
# (sum of friends, number of people).
reducedRDD = modifiedAgeFriendsRDD.reduceByKey(_add_pairs)

In [25]:
# Peek at the three largest keys (ages) with their (friends_total, people_count) values.
print(reducedRDD.top(3))
# collect() brings the whole RDD back to the driver; print the Python type it returns.
print(type(reducedRDD.collect()))

[(69, (2352, 10)), (68, (2696, 10)), (67, (3434, 16))]
<class 'list'>


In [26]:
# count() is an action: every call launches a Spark job. Run it once and reuse the result
# for both prints instead of computing the same count twice.
distinctAgeCount = reducedRDD.count()  # one element per distinct age after reduceByKey()
print(distinctAgeCount)
print(type(distinctAgeCount))

52
<class 'int'>


In [27]:
# Average number of friends per age: summed friend count divided by the number of people.
# Keys (ages) are unchanged, so mapValues() is used rather than map().
finalRDD = reducedRDD.mapValues(lambda totals: totals[0] / totals[1])

In [28]:
# Spot-check the (age, average_friends) pairs for the three highest ages.
print(finalRDD.top(3))

[(69, 235.2), (68, 269.6), (67, 214.625)]


In [30]:
# collect() returns every (age, average_friends) pair to the driver as a Python list.
results = finalRDD.collect()

# Sort on the driver so the averages are printed in ascending age order.
for result in sorted(results):
    print(result)

(18, 343.375)
(19, 213.27272727272728)
(20, 165.0)
(21, 350.875)
(22, 206.42857142857142)
(23, 246.3)
(24, 233.8)
(25, 197.45454545454547)
(26, 242.05882352941177)
(27, 228.125)
(28, 209.1)
(29, 215.91666666666666)
(30, 235.8181818181818)
(31, 267.25)
(32, 207.9090909090909)
(33, 325.3333333333333)
(34, 245.5)
(35, 211.625)
(36, 246.6)
(37, 249.33333333333334)
(38, 193.53333333333333)
(39, 169.28571428571428)
(40, 250.8235294117647)
(41, 268.55555555555554)
(42, 303.5)
(43, 230.57142857142858)
(44, 282.1666666666667)
(45, 309.53846153846155)
(46, 223.69230769230768)
(47, 233.22222222222223)
(48, 281.4)
(49, 184.66666666666666)
(50, 254.6)
(51, 302.14285714285717)
(52, 340.6363636363636)
(53, 222.85714285714286)
(54, 278.0769230769231)
(55, 295.53846153846155)
(56, 306.6666666666667)
(57, 258.8333333333333)
(58, 116.54545454545455)
(59, 220.0)
(60, 202.71428571428572)
(61, 256.22222222222223)
(62, 220.76923076923077)
(63, 384.0)
(64, 281.3333333333333)
(65, 298.2)
(66, 276.4444444444444)
(67, 214.625)
(68, 269.6)
(69, 235.2)

In [None]:
# END