# Transformations we will learn: 

- `map()` 
- `reduceByKey()`
- `filter()` 


## Load the `dataset`

In [34]:
data_set = 's3://fcc-spark-example/dataset/2023/orders.txt'

## Create `SparkContext`

In [35]:
sc = spark.sparkContext

In [36]:
sc

## Create a `RDD`

In [37]:
rdd1 = sc.textFile(data_set)

In [38]:
rdd1

s3://fcc-spark-example/dataset/2023/orders.txt MapPartitionsRDD[25] at textFile at NativeMethodAccessorImpl.java:0

In [39]:
rdd1.take(10)

                                                                                

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE',
 '6,2013-07-25 00:00:00.0,7130,COMPLETE',
 '7,2013-07-25 00:00:00.0,4530,COMPLETE',
 '8,2013-07-25 00:00:00.0,2911,PROCESSING',
 '9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT',
 '10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT']

## 1. Find the no. of orders based on the `status` 

We want something like 

(CLOSED, 1)

(PENDING_PAYMENT, 1)

(COMPLETE, 1)
..


In [40]:
rdd2 = rdd1.map(lambda line: (line.split(',')[-1], 1)) 

In [41]:
rdd2.take(10)

                                                                                

[('CLOSED', 1),
 ('PENDING_PAYMENT', 1),
 ('COMPLETE', 1),
 ('CLOSED', 1),
 ('COMPLETE', 1),
 ('COMPLETE', 1),
 ('COMPLETE', 1),
 ('PROCESSING', 1),
 ('PENDING_PAYMENT', 1),
 ('PENDING_PAYMENT', 1)]

In [42]:
# Next we want to aggregate 

rdd3 = rdd2.reduceByKey(lambda x, y: x + y)

In [43]:
rdd3.take(10)

                                                                                

[('CLOSED', 7556),
 ('CANCELED', 1428),
 ('PENDING_PAYMENT', 15030),
 ('COMPLETE', 22899),
 ('PROCESSING', 8274),
 ('PAYMENT_REVIEW', 729),
 ('PENDING', 7609),
 ('ON_HOLD', 3798),
 ('SUSPECTED_FRAUD', 1558)]

In [46]:
# Lastly lets sort it based on count

final_rdd = rdd3.sortBy(lambda x: x[1], ascending=False)

                                                                                

In [47]:
final_rdd.take(10)

[('COMPLETE', 22899),
 ('PENDING_PAYMENT', 15030),
 ('PROCESSING', 8274),
 ('PENDING', 7609),
 ('CLOSED', 7556),
 ('ON_HOLD', 3798),
 ('SUSPECTED_FRAUD', 1558),
 ('CANCELED', 1428),
 ('PAYMENT_REVIEW', 729)]

## 2. Find the `premium customers (Top 5 who placed the max number of orders)

In [49]:
rdd1 = sc.textFile(data_set)

In [50]:
rdd1.take(5)

                                                                                

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE']

In [59]:
rdd2 = rdd1.map(lambda x: (x.split(',')[2], 1))

In [60]:
rdd2.take(5)

                                                                                

[('11599', 1), ('256', 1), ('12111', 1), ('8827', 1), ('11318', 1)]

In [61]:
rdd3 = rdd2.reduceByKey(lambda x, y: x + 1)

In [62]:
rdd3.take(5)

                                                                                

[('256', 8), ('12111', 4), ('11318', 5), ('7130', 3), ('2911', 4)]

In [68]:
final_result = rdd3.sortBy(lambda x: x[1], ascending=False)

In [69]:
final_result.take(10)

[('5549', 11),
 ('2433', 11),
 ('12431', 11),
 ('6898', 10),
 ('4435', 10),
 ('3554', 10),
 ('5821', 10),
 ('7176', 10),
 ('5033', 10),
 ('4876', 10)]

## 3. Distinct count of customer who placed atleast one order 

In [70]:
rdd1 = sc.textFile(data_set)

In [90]:
rdd2 = rdd1.map(lambda x: (x.split(',')[2]))

In [91]:
rdd2.take(5)

                                                                                

['11599', '256', '12111', '8827', '11318']

In [92]:
final_result = rdd2.distinct()

In [93]:
final_result.count() # Distinct customers

12405

In [94]:
rdd2.count() # Total no. of orders 

68881

## 4. Which customer has maximum number of COMPLETE orders 

In [85]:
rdd1 = sc.textFile(data_set)

In [101]:
rdd2 = rdd1.map(lambda x: (x.split(',')[2], x.split(',')[3]))

In [102]:
rdd2.take(5)

                                                                                

[('11599', 'CLOSED'),
 ('256', 'PENDING_PAYMENT'),
 ('12111', 'COMPLETE'),
 ('8827', 'CLOSED'),
 ('11318', 'COMPLETE')]

In [104]:
rdd3 = rdd2.filter(lambda x: x[1] == 'COMPLETE')

In [105]:
rdd3.take(5)

[Stage 79:>                                                         (0 + 1) / 1]                                                                                

[('12111', 'COMPLETE'),
 ('11318', 'COMPLETE'),
 ('7130', 'COMPLETE'),
 ('4530', 'COMPLETE'),
 ('2568', 'COMPLETE')]

In [106]:
rdd3.count()

22899

In [108]:
rdd4 = rdd3.map(lambda x: (x[0], 1))

In [109]:
rdd4.take(5)

                                                                                

[('12111', 1), ('11318', 1), ('7130', 1), ('4530', 1), ('2568', 1)]

In [110]:
rdd5 = rdd4.reduceByKey(lambda x, y: x + y)

In [111]:
rdd5.take(5)

                                                                                

[('12111', 2), ('11318', 3), ('7130', 4), ('333', 4), ('656', 2)]

In [112]:
final_result = rdd5.sortBy(lambda x: x[1], ascending=False)

In [113]:
final_result.take(10)

[('9337', 10),
 ('7802', 9),
 ('3710', 9),
 ('749', 9),
 ('221', 8),
 ('5186', 8),
 ('2469', 8),
 ('7910', 8),
 ('5283', 8),
 ('11061', 8)]

# Summary 

- `map()` 

    - 100 ROWs =======> 1000 ROWs
    
    
- `reduceByKey()`

    - 100 ROWs =======> 10 ROWs (if there are 10 distinct Keys)
    
    
- `reduce()`

    - 100 ROWs =======> 1 ROW 
    
- `filter()` 

    - 100 ROWs =======> 0 <= No. of ROWs <= 100 (Depending on the filer )
    
