In [1]:
from pyspark import SparkConf, SparkContext
from operator import add

In [2]:
# Configure Spark to run in local mode under a descriptive application name.
conf = SparkConf().setMaster("local").setAppName("CustomerOrdersSorted")
# getOrCreate() hands back the SparkContext already running in this kernel if
# there is one, so re-executing this cell no longer fails with
# "Cannot run multiple SparkContexts at once". When a context already exists,
# it is returned as-is and `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf = conf)

In [3]:
# Load the orders CSV as an RDD with one string element per line of the file.
# NOTE(review): the name `input` shadows Python's builtin input(); it is kept
# because later cells refer to it.
orders_path = "file:///home/dmadhok/spark_course/customer-orders.csv"
input = sc.textFile(orders_path)

In [4]:
# Peek at the first 10 raw lines to confirm the layout: three comma-separated
# fields per line (parseLine uses the first as the customer ID and the third
# as the dollar amount; the middle field is unused here).
input.take(10)

['44,8602,37.19',
 '35,5368,65.89',
 '2,3391,40.64',
 '47,6694,14.98',
 '29,680,13.08',
 '91,8900,24.59',
 '70,3959,68.68',
 '85,1733,28.53',
 '53,9900,83.55',
 '14,1505,4.32']

In [5]:
def parseLine(line):
    """Convert one comma-separated order record into a (customer ID, amount) pair.

    Args:
        line: A text record such as '44,8602,37.19'.

    Returns:
        A 2-tuple: the first field unchanged (as a string) and the third
        field converted to float.

    Raises:
        IndexError: If the record has fewer than three comma-separated fields.
        ValueError: If the third field cannot be parsed as a float.
    """
    parts = line.split(',')
    # The customer ID stays a string; only the amount needs numeric conversion.
    return (parts[0], float(parts[2]))

In [6]:
# Turn every raw line into a (customerID, dollarAmount) pair via parseLine.
customerDollars = input.map(parseLine)
# Show the first five parsed pairs as a quick sanity check.
customerDollars.take(5)

[('44', 37.19), ('35', 65.89), ('2', 40.64), ('47', 14.98), ('29', 13.08)]

In [7]:
# Spot-check a single customer: keep only the pairs whose ID is '44' and show
# up to 100 of that customer's individual order amounts.
customer44 = customerDollars.filter(lambda record: record[0] == '44')
customer44.take(100)

[('44', 37.19),
 ('44', 99.19),
 ('44', 39.54),
 ('44', 41.88),
 ('44', 66.54),
 ('44', 66.6),
 ('44', 80.96),
 ('44', 93.51),
 ('44', 59.9),
 ('44', 16.47),
 ('44', 10.74),
 ('44', 74.67),
 ('44', 38.25),
 ('44', 29.73),
 ('44', 88.58),
 ('44', 10.13),
 ('44', 86.52),
 ('44', 45.83),
 ('44', 74.23),
 ('44', 0.76),
 ('44', 58.47),
 ('44', 93.8),
 ('44', 73.61),
 ('44', 95.25),
 ('44', 10.55),
 ('44', 93.21),
 ('44', 20.03),
 ('44', 4.04),
 ('44', 37.58),
 ('44', 21.68),
 ('44', 57.47),
 ('44', 63.84),
 ('44', 52.91),
 ('44', 42.26),
 ('44', 75.22),
 ('44', 33.82),
 ('44', 5.93),
 ('44', 56.31),
 ('44', 42.13),
 ('44', 2.71),
 ('44', 84.59),
 ('44', 29.36),
 ('44', 61.0),
 ('44', 85.09),
 ('44', 8.71),
 ('44', 99.88),
 ('44', 15.01),
 ('44', 86.23),
 ('44', 59.6),
 ('44', 15.37),
 ('44', 13.45),
 ('44', 83.93),
 ('44', 7.12),
 ('44', 49.74),
 ('44', 56.1),
 ('44', 99.64),
 ('44', 42.34),
 ('44', 43.35),
 ('44', 95.43),
 ('44', 68.35),
 ('44', 92.06),
 ('44', 77.7),
 ('44', 10.87),
 ('44

In [8]:
# Sum the order amounts per customer ID; operator.add (imported in the first
# cell) performs the same x + y combination as an explicit lambda.
customerDollars = customerDollars.reduceByKey(add)

In [9]:
# Show five of the (customerID, total) pairs produced by the per-customer sum.
customerDollars.take(5)

[('44', 4756.8899999999985),
 ('35', 5155.419999999999),
 ('2', 5994.59),
 ('47', 4316.299999999999),
 ('29', 5032.529999999999)]

In [10]:
# collect() brings the RDD's contents back to the driver as a plain Python
# list; sorting that list orders the (customerID, total) tuples by ID.
# The IDs are strings, so the order is lexicographic ('10' comes before '2').
totals = customerDollars.collect()
totals.sort()
totals

[('0', 5524.949999999998),
 ('1', 4958.600000000001),
 ('10', 4819.700000000001),
 ('11', 5152.290000000002),
 ('12', 4664.589999999998),
 ('13', 4367.62),
 ('14', 4735.030000000001),
 ('15', 5413.510000000001),
 ('16', 4979.06),
 ('17', 5032.679999999999),
 ('18', 4921.27),
 ('19', 5059.4299999999985),
 ('2', 5994.59),
 ('20', 4836.859999999999),
 ('21', 4707.41),
 ('22', 5019.449999999999),
 ('23', 4042.6499999999987),
 ('24', 5259.920000000003),
 ('25', 5057.610000000001),
 ('26', 5250.4),
 ('27', 4915.889999999999),
 ('28', 5000.709999999998),
 ('29', 5032.529999999999),
 ('3', 4659.63),
 ('30', 4990.72),
 ('31', 4765.05),
 ('32', 5496.050000000004),
 ('33', 5254.659999999998),
 ('34', 5330.8),
 ('35', 5155.419999999999),
 ('36', 4278.049999999997),
 ('37', 4735.200000000002),
 ('38', 4898.460000000002),
 ('39', 6193.109999999999),
 ('4', 4815.050000000002),
 ('40', 5186.429999999999),
 ('41', 5637.62),
 ('42', 5696.840000000003),
 ('43', 5368.83),
 ('44', 4756.8899999999985),
 ('4

In [13]:
# Flip each pair to (total, customerID) so the total becomes the key, then
# sort by that key (ascending by default).
# NOTE(review): this cell rebinds customerDollars to the flipped pairs, so
# running it a second time flips them back and sorts by customer ID instead.
flipped = customerDollars.map(lambda pair: (pair[1], pair[0]))
customerDollars = flipped.sortByKey()
customerDollars.collect()

[(3309.38, '45'),
 (3790.570000000001, '79'),
 (3924.230000000001, '96'),
 (4042.6499999999987, '23'),
 (4172.289999999998, '99'),
 (4178.500000000001, '75'),
 (4278.049999999997, '36'),
 (4297.260000000001, '98'),
 (4316.299999999999, '47'),
 (4327.729999999999, '77'),
 (4367.62, '13'),
 (4384.33, '48'),
 (4394.599999999999, '49'),
 (4475.569999999999, '94'),
 (4505.79, '67'),
 (4517.27, '50'),
 (4524.509999999999, '78'),
 (4561.069999999999, '5'),
 (4628.4, '57'),
 (4635.799999999997, '83'),
 (4642.259999999999, '91'),
 (4647.129999999999, '74'),
 (4652.939999999999, '84'),
 (4659.63, '3'),
 (4664.589999999998, '12'),
 (4681.919999999999, '66'),
 (4701.019999999999, '56'),
 (4707.41, '21'),
 (4727.860000000001, '80'),
 (4735.030000000001, '14'),
 (4735.200000000002, '37'),
 (4755.070000000001, '7'),
 (4756.8899999999985, '44'),
 (4765.05, '31'),
 (4812.489999999998, '82'),
 (4815.050000000002, '4'),
 (4819.700000000001, '10'),
 (4830.549999999999, '88'),
 (4836.859999999999, '20'),
 

In [16]:
# Flip every pair again so each element reads (customerID, total) once more;
# the elements stay in the order they already had.
# NOTE(review): like the earlier flip, this rebinds customerDollars, so
# re-running the cell flips the pairs back.
customerDollars = customerDollars.map(lambda pair: (pair[1], pair[0]))
customerDollars.collect()

[('45', 3309.38),
 ('79', 3790.570000000001),
 ('96', 3924.230000000001),
 ('23', 4042.6499999999987),
 ('99', 4172.289999999998),
 ('75', 4178.500000000001),
 ('36', 4278.049999999997),
 ('98', 4297.260000000001),
 ('47', 4316.299999999999),
 ('77', 4327.729999999999),
 ('13', 4367.62),
 ('48', 4384.33),
 ('49', 4394.599999999999),
 ('94', 4475.569999999999),
 ('67', 4505.79),
 ('50', 4517.27),
 ('78', 4524.509999999999),
 ('5', 4561.069999999999),
 ('57', 4628.4),
 ('83', 4635.799999999997),
 ('91', 4642.259999999999),
 ('74', 4647.129999999999),
 ('84', 4652.939999999999),
 ('3', 4659.63),
 ('12', 4664.589999999998),
 ('66', 4681.919999999999),
 ('56', 4701.019999999999),
 ('21', 4707.41),
 ('80', 4727.860000000001),
 ('14', 4735.030000000001),
 ('37', 4735.200000000002),
 ('7', 4755.070000000001),
 ('44', 4756.8899999999985),
 ('31', 4765.05),
 ('82', 4812.489999999998),
 ('4', 4815.050000000002),
 ('10', 4819.700000000001),
 ('88', 4830.549999999999),
 ('20', 4836.859999999999),
 