In [1]:
import findspark
# Locate the local Spark installation so that `pyspark` can be imported in the next cell.
# NOTE(review): presumably this has to run before any `pyspark` import — confirm for this environment.
findspark.init()

In [2]:
from pyspark import SparkConf,SparkContext

In [3]:
# Run Spark in local mode and label the application "KeyValueRDD".
conf = SparkConf().setMaster("local").setAppName("KeyValueRDD")
# getOrCreate() reuses an already-running SparkContext instead of raising
# "Cannot run multiple SparkContexts at once" when this cell is executed again
# in the same kernel. On a fresh kernel it builds the context from `conf`.
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
# rdd.map(lambda x : (x,1))  -> this is how key value RDD is formed

# rdd.reduceByKey(lambda x,y : x+y)   --> it is a transformation (lazy, returns a new RDD), not an action; rdd.reduce() is the action
# groupByKey()
# sortByKey()
# rdd.values()  -> to get the rdd of only values
# rdd.keys()  -> to get the rdd of only keys


# SQL style joins on 2 key-value RDDs
# join , rightOuterJoin , leftOuterJoin , cogroup , subtractByKey

# With key-value data, use mapValues() and flatMapValues() when your transformation does not touch the keys, i.e. only
# the values are modified (these functions also retain the original RDD's partitioning)

In [5]:
# Read the friends CSV as an RDD of text lines.
# NOTE(review): absolute, machine-specific path — adjust it before running on another machine.
rdd = sc.textFile(
    "file:///Users/hdagar3/Documents/Spark_Things/Spark_Course_Files/fakefriends.csv"
)

In [6]:
# Show the Python class of the object returned by textFile().
print(type(rdd))

<class 'pyspark.rdd.RDD'>


In [7]:
def split_line(line):
    """Parse one CSV record and return ``(age, friends)`` as a tuple of ints.

    Column index 2 is read as the age and column index 3 as the number of
    friends. The record is parsed with the ``csv`` module rather than
    ``line.split(',')`` so that a quoted field containing a comma
    (e.g. ``"Smith, Will"``) does not shift the column positions.

    Args:
        line: a single line of comma-separated text.

    Returns:
        ``(age, friends)``, both converted with ``int()``.

    Raises:
        IndexError: if the record has fewer than four columns.
        ValueError: if column 2 or 3 is not an integer literal.
    """
    import csv  # function-scope import keeps this cell independent of the import cells

    # csv.reader expects an iterable of lines; feed it the single record.
    # The [] default makes an empty record fail below with IndexError.
    info = next(csv.reader([line]), [])
    age = int(info[2])
    friends = int(info[3])
    return (age, friends)

In [8]:
# Turn every text line into an (age, number_of_friends) pair; each input line yields exactly one pair.
ageFriendsRDD = rdd.map(split_line)

In [9]:
# The result of map() is reported as a PipelinedRDD (see the printed type).
print(type(ageFriendsRDD))

<class 'pyspark.rdd.PipelinedRDD'>


In [11]:
# top(10) returns the 10 largest pairs in descending order; tuples compare by age first, then by friend count.
print(ageFriendsRDD.top(10))

[(69, 491), (69, 470), (69, 431), (69, 361), (69, 236), (69, 148), (69, 116), (69, 75), (69, 15), (69, 9)]


In [58]:
# groupedRDD = ageFriendsRDD.groupByKey()

In [59]:
# print(groupedRDD.collect())

In [13]:
# mapValues() (unlike map()) passes only the value of each pair to the lambda and
# leaves the key untouched, so `friends` here is the friend count alone.
# Pair it with 1 so that friend totals and person counts can be summed together later.
modifiedAgeFriendsRDD = ageFriendsRDD.mapValues(lambda friends: (friends, 1))
print(modifiedAgeFriendsRDD.top(10))

[(69, (491, 1)), (69, (470, 1)), (69, (431, 1)), (69, (361, 1)), (69, (236, 1)), (69, (148, 1)), (69, (116, 1)), (69, (75, 1)), (69, (15, 1)), (69, (9, 1))]


In [61]:
# reduceByKey() calls the lambda with two *values* that share a key (the key itself is
# never passed). Both arguments are (friends, count) tuples, so add them element-wise
# to get (total friends, number of people) per age.
reducedRDD = modifiedAgeFriendsRDD.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)

In [76]:
# collect() is an action: every call runs a Spark job and copies the whole RDD to
# the driver. Materialise the list once and reuse it for both prints.
reduced_rows = reducedRDD.collect()
print(reduced_rows)
print(type(reduced_rows))

In [66]:
# count() is an action: every call runs a Spark job. Count once and reuse the integer.
reduced_count = reducedRDD.count()
print(reduced_count)
print(type(reduced_count))

52
<class 'int'>


In [70]:
# Each value is (total friends, number of people) for one age; dividing the two
# gives the average number of friends for that age.
finalRDD = reducedRDD.mapValues(lambda total_and_count: total_and_count[0] / total_and_count[1])

In [75]:
# Bring every (age, average friends) pair to the driver and print them as a single list.
print(finalRDD.collect())

In [74]:
# Collect the per-age averages to the driver as a Python list.
results = finalRDD.collect()

# Print one (age, average friends) tuple per line.
for result in results:
    print(result)

In [None]:
# END