In [3]:
import findspark
# Called before pyspark is imported in the next cell; presumably this locates the local
# Spark installation and makes the pyspark package importable (confirm in the findspark docs).
findspark.init()

In [4]:
from pyspark import SparkConf,SparkContext

In [6]:
# Run Spark with the "local" master under the application name "KeyValueRDD".
conf = SparkConf().setMaster("local").setAppName("KeyValueRDD")
# getOrCreate() returns the context that is already running when this cell is executed
# again; constructing SparkContext(conf=conf) a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
# Note: if a context already exists, it is reused as-is and `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf=conf)

In [10]:
# rdd.map(lambda x : (x,1))  -> this is how key value RDD is formed

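# rdd.reduceByKey(lambda x,y : x+y)   --> merges the values of each key with the given function; it is a transformation (lazy, returns a new RDD), not an action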
# groupByKey()
# sortByKey()
# rdd.values()  -> to get the rdd of only values
# rdd.keys()  -> to get the rdd of only keys


# SQL style joins on 2 key-value RDDs
# join , rightOuterJoin , leftOuterJoin , cogroup , subtractByKey

# With key-value data, use mapValues() and flatMapValues() if your transformation doesn't affect the keys, i.e. when the
# keys are not being modified

In [11]:
# file:// URI of the fakefriends CSV; this is an absolute, machine-specific location.
friends_csv_uri = "file:///Users/hdagar3/Documents/Spark_Things/Spark_Course_Files/fakefriends.csv"
# Load the file as an RDD of text lines (each line is later parsed by split_line).
rdd = sc.textFile(friends_csv_uri)

In [36]:
# Inspect the Python type of the object returned by sc.textFile().
print(type(rdd))

<class 'pyspark.rdd.RDD'>


In [37]:
def split_line(line):
    """Parse one comma-separated record into an ``(age, friends)`` tuple.

    Args:
        line: A single text line whose third and fourth comma-separated
            fields (indexes 2 and 3) hold integers.

    Returns:
        A 2-tuple ``(age, friends)`` of ints taken from fields 2 and 3.

    Raises:
        IndexError: If the line has fewer fields than required.
        ValueError: If a required field is not a valid integer.
    """
    fields = line.split(',')
    # Field 2 is converted before field 3 is looked up, so a bad age is reported first.
    return (int(fields[2]), int(fields[3]))

In [38]:
ageFriendsRDD = rdd.map(split_line)  # key-value RDD of (age, number of friends): one pair per input line

In [56]:
# Inspect the Python type of the object returned by rdd.map().
print(type(ageFriendsRDD))

<class 'pyspark.rdd.PipelinedRDD'>


In [57]:
# print(ageFriendsRDD.collect())

In [58]:
# groupedRDD = ageFriendsRDD.groupByKey()

In [59]:
# print(groupedRDD.collect())

In [60]:
# mapValues() hands the function only the value of each pair (never the key), so
# `friends` below is the friend count. Pairing it with 1 gives (friends, 1), which lets
# a later reduce accumulate a running total and a running count side by side.
modifiedAgeFriendsRDD = ageFriendsRDD.mapValues(lambda friends: (friends, 1))

In [61]:
# reduceByKey() passes the function two values that share a key -- keys themselves are
# never passed in. Each value here is a (friends_total, count) tuple, so the tuples are
# added component-wise to get the total friends and the number of people for every age.
reducedRDD = modifiedAgeFriendsRDD.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)

In [63]:
# collect() is an action: every call re-runs the whole lineage (read, parse, reduce).
# Collect once and reuse the list for both the value and its type.
reduced_pairs = reducedRDD.collect()
print(reduced_pairs)
print(type(reduced_pairs))

[(33, (3904, 12)), (26, (4115, 17)), (55, (3842, 13)), (40, (4264, 17)), (68, (2696, 10)), (59, (1980, 9)), (37, (2244, 9)), (54, (3615, 13)), (38, (2903, 15)), (27, (1825, 8)), (53, (1560, 7)), (57, (3106, 12)), (56, (1840, 6)), (43, (1614, 7)), (36, (2466, 10)), (22, (1445, 7)), (35, (1693, 8)), (45, (4024, 13)), (60, (1419, 7)), (67, (3434, 16)), (19, (2346, 11)), (30, (2594, 11)), (51, (2115, 7)), (25, (2172, 11)), (21, (2807, 8)), (42, (1821, 6)), (49, (1108, 6)), (48, (2814, 10)), (50, (1273, 5)), (39, (1185, 7)), (32, (2287, 11)), (58, (1282, 11)), (64, (3376, 12)), (31, (2138, 8)), (52, (3747, 11)), (24, (1169, 5)), (20, (825, 5)), (62, (2870, 13)), (41, (2417, 9)), (44, (3386, 12)), (69, (2352, 10)), (65, (1491, 5)), (61, (2306, 9)), (28, (2091, 10)), (66, (2488, 9)), (46, (2908, 13)), (29, (2591, 12)), (18, (2747, 8)), (47, (2099, 9)), (34, (1473, 6)), (63, (1536, 4)), (23, (2463, 10))]
<class 'list'>


In [66]:
# count() is an action too; evaluate it once instead of launching two identical jobs.
distinct_age_count = reducedRDD.count()
print(distinct_age_count)
print(type(distinct_age_count))

52
<class 'int'>


In [70]:
# Every value is a (friends_total, count) tuple; total / count is the average number
# of friends for that age. The key (age) is left untouched by mapValues().
finalRDD = reducedRDD.mapValues(lambda total_and_count: total_and_count[0] / total_and_count[1])

In [71]:
# Show all (age, average friends) pairs as one list.
print(finalRDD.collect())

[(33, 325.3333333333333), (26, 242.05882352941177), (55, 295.53846153846155), (40, 250.8235294117647), (68, 269.6), (59, 220.0), (37, 249.33333333333334), (54, 278.0769230769231), (38, 193.53333333333333), (27, 228.125), (53, 222.85714285714286), (57, 258.8333333333333), (56, 306.6666666666667), (43, 230.57142857142858), (36, 246.6), (22, 206.42857142857142), (35, 211.625), (45, 309.53846153846155), (60, 202.71428571428572), (67, 214.625), (19, 213.27272727272728), (30, 235.8181818181818), (51, 302.14285714285717), (25, 197.45454545454547), (21, 350.875), (42, 303.5), (49, 184.66666666666666), (48, 281.4), (50, 254.6), (39, 169.28571428571428), (32, 207.9090909090909), (58, 116.54545454545455), (64, 281.3333333333333), (31, 267.25), (52, 340.6363636363636), (24, 233.8), (20, 165.0), (62, 220.76923076923077), (41, 268.55555555555554), (44, 282.1666666666667), (69, 235.2), (65, 298.2), (61, 256.22222222222223), (28, 209.1), (66, 276.44444444444446), (46, 223.69230769230768), (29, 215.916

In [72]:
# Bring the (age, average friends) pairs back as a Python list, then print one pair per line.
results = finalRDD.collect()

for result in results:
    print(result)

(33, 325.3333333333333)
(26, 242.05882352941177)
(55, 295.53846153846155)
(40, 250.8235294117647)
(68, 269.6)
(59, 220.0)
(37, 249.33333333333334)
(54, 278.0769230769231)
(38, 193.53333333333333)
(27, 228.125)
(53, 222.85714285714286)
(57, 258.8333333333333)
(56, 306.6666666666667)
(43, 230.57142857142858)
(36, 246.6)
(22, 206.42857142857142)
(35, 211.625)
(45, 309.53846153846155)
(60, 202.71428571428572)
(67, 214.625)
(19, 213.27272727272728)
(30, 235.8181818181818)
(51, 302.14285714285717)
(25, 197.45454545454547)
(21, 350.875)
(42, 303.5)
(49, 184.66666666666666)
(48, 281.4)
(50, 254.6)
(39, 169.28571428571428)
(32, 207.9090909090909)
(58, 116.54545454545455)
(64, 281.3333333333333)
(31, 267.25)
(52, 340.6363636363636)
(24, 233.8)
(20, 165.0)
(62, 220.76923076923077)
(41, 268.55555555555554)
(44, 282.1666666666667)
(69, 235.2)
(65, 298.2)
(61, 256.22222222222223)
(28, 209.1)
(66, 276.44444444444446)
(46, 223.69230769230768)
(29, 215.91666666666666)
(18, 343.375)
(47, 233.22222222222

In [None]:
# END