In [2]:
from pyspark import SparkConf, SparkContext
import collections

In [3]:
# configures the SparkContext => local machine and not cluster, sets the app name
conf = SparkConf().setMaster("local").setAppName("FriendsAge")
# getOrCreate returns the already-running context if there is one, so re-running
# this cell no longer fails with "Cannot run multiple SparkContexts at once".
# NOTE: when an existing context is reused, `conf` is not re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)
sc

In [4]:
# function that parses age, num of friends and returns the tuple (age, num of friends)
def parse_line(line):
    """Extract (age, num_friends) from one comma-separated record.

    The line is split on "," and the third and fourth columns (indexes 2 and 3)
    are converted to int.

    Args:
        line: a single comma-separated text record.

    Returns:
        A tuple ``(age, num_friends)`` of two ints.

    Raises:
        IndexError: if the line has fewer than four columns.
        ValueError: if either column is not a valid integer.
    """
    columns = line.split(",")
    # age is converted first, then the friend count
    return (int(columns[2]), int(columns[3]))


In [5]:
# create RDD: read fakefriends.csv as text, one RDD element per line of the file
lines = sc.textFile("fakefriends.csv")


In [6]:
# map RDD by calling a parsing function:
# every text line becomes the (age, num_friends) tuple returned by parse_line
rdd = lines.map(parse_line)


In [9]:
# sanity check: number of (age, num_friends) records produced by the parsing step
rdd.count()

500

In [8]:
# peek at the first five parsed (age, num_friends) tuples
rdd.take(5)


[(33, 385), (26, 2), (55, 221), (40, 465), (68, 21)]

In [10]:
# keep the key (age) and turn each value into (num of friends, 1);
# the 1 counts one occurrence of that age so occurrences can be summed later
totals_by_age = rdd.mapValues(lambda num_friends: (num_friends, 1))
totals_by_age.take(5)


[(33, (385, 1)), (26, (2, 1)), (55, (221, 1)), (40, (465, 1)), (68, (21, 1))]

In [11]:
# reduce by key (age): add up both elements of the value tuple
# left and right are the (num of friends, occurrences) values of two entries
# that share the same age
totals_by_age_reduce = totals_by_age.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)
totals_by_age_reduce.take(5)


[(33, (3904, 12)),
 (26, (4115, 17)),
 (55, (3842, 13)),
 (40, (4264, 17)),
 (68, (2696, 10))]

In [14]:
# calculating average number of friends per age:
# total friends / occurrences, truncated to an int, then ordered by age
averages_unsorted = totals_by_age_reduce.mapValues(
    lambda total_and_count: int(total_and_count[0] / total_and_count[1])
)
averages_by_age = averages_unsorted.sortByKey()
averages_by_age.collect()


[(18, 343),
 (19, 213),
 (20, 165),
 (21, 350),
 (22, 206),
 (23, 246),
 (24, 233),
 (25, 197),
 (26, 242),
 (27, 228),
 (28, 209),
 (29, 215),
 (30, 235),
 (31, 267),
 (32, 207),
 (33, 325),
 (34, 245),
 (35, 211),
 (36, 246),
 (37, 249),
 (38, 193),
 (39, 169),
 (40, 250),
 (41, 268),
 (42, 303),
 (43, 230),
 (44, 282),
 (45, 309),
 (46, 223),
 (47, 233),
 (48, 281),
 (49, 184),
 (50, 254),
 (51, 302),
 (52, 340),
 (53, 222),
 (54, 278),
 (55, 295),
 (56, 306),
 (57, 258),
 (58, 116),
 (59, 220),
 (60, 202),
 (61, 256),
 (62, 220),
 (63, 384),
 (64, 281),
 (65, 298),
 (66, 276),
 (67, 214),
 (68, 269),
 (69, 235)]