# Learning PySpark 
### Video series

### Packt Publishing

**Author**: Tomasz Drabas
**Date**:   2017-12-10





# Section 3: Resilient Distributed Datasets - Actions

In this section, we will look at Resilient Distributed Datasets (RDDs) and the actions available on them.

## Read in the data

In [3]:
import datetime as dt

def parseCSVRow(inputRow):
    """Parse one comma-separated line into a typed row.

    Column 0 is converted to a ``datetime`` using the ``%m/%d/%y`` format,
    column 4 to ``int`` and columns 5 and 6 to ``float``; all other columns
    are left as strings.

    The result is wrapped in a list so the function can be handed to
    ``flatMap``: a line that parses contributes exactly one row, while a
    line that does not parse (non-numeric or missing fields, bad date)
    contributes nothing.

    Args:
        inputRow: A single line of text from the CSV file.

    Returns:
        ``[row]`` where ``row`` is the list of parsed fields, or ``[]`` when
        the line cannot be parsed.
    """
    try:
        rowSplit = inputRow.split(',')
        rowSplit[0] = dt.datetime.strptime(rowSplit[0], '%m/%d/%y')
        rowSplit[4] = int(rowSplit[4])

        for i in (5, 6):
            rowSplit[i] = float(rowSplit[i])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Best-effort cleaning: malformed lines are dropped rather than
        # failing the whole job. Only parsing-related errors are caught so
        # that KeyboardInterrupt / SystemExit still propagate.
        return []

    return [rowSplit]

# Read the CSV with 4 as the partition hint and keep only the lines that
# parseCSVRow manages to parse (it returns [] for the rest).
rdd_clean = (
    sc.textFile('../data/sample_data.csv', 4)
    .flatMap(parseCSVRow)
)

## .take(...) action

In [4]:
# Bring just two parsed rows back to the driver for a quick look.
rdd_clean.take(2)

[[datetime.datetime(2016, 1, 6, 0, 0), 'East', 'Jones', 'Pencil', 95, 1.99, 189.05], [datetime.datetime(2016, 2, 9, 0, 0), 'Central', 'Jardine', 'Pencil', 36, 4.99, 179.64]]

In [5]:
# Five rows with the smallest value in column 0 (the parsed order date).
for element in rdd_clean.takeOrdered(5, key=lambda row: row[0]):
    print(element)

[datetime.datetime(2016, 1, 6, 0, 0), 'East', 'Jones', 'Pencil', 95, 1.99, 189.05]
[datetime.datetime(2016, 2, 9, 0, 0), 'Central', 'Jardine', 'Pencil', 36, 4.99, 179.64]
[datetime.datetime(2016, 2, 26, 0, 0), 'Central', 'Gill', 'Pen', 27, 19.99, 539.73]
[datetime.datetime(2016, 3, 15, 0, 0), 'West', 'Sorvino', 'Pencil', 56, 2.99, 167.44]
[datetime.datetime(2016, 4, 1, 0, 0), 'East', 'Jones', 'Binder', 60, 4.99, 299.4]

In [7]:
# Five rows sampled without replacement; the fixed seed keeps the sample
# reproducible between runs.
for element in rdd_clean.takeSample(withReplacement=False, num=5, seed=666):
    print(element)

[datetime.datetime(2017, 8, 7, 0, 0), 'Central', 'Kivell', 'Pen Set', 42, 23.95, 1005.9]
[datetime.datetime(2016, 6, 25, 0, 0), 'Central', 'Morgan', 'Pencil', 90, 4.99, 449.1]
[datetime.datetime(2016, 9, 1, 0, 0), 'Central', 'Smith', 'Desk', 2, 125.0, 250.0]
[datetime.datetime(2017, 4, 10, 0, 0), 'Central', 'Andrews', 'Pencil', 66, 1.99, 131.34]
[datetime.datetime(2016, 5, 22, 0, 0), 'West', 'Thompson', 'Pencil', 32, 1.99, 63.68]

## .collect(...) action

In [8]:
# Sanity check: collect() returns every element, so the length of the
# collected list must equal count().
len(rdd_clean.collect()) == rdd_clean.count()

True

In [10]:
# Keep only rows whose last column (the order total) exceeds 1000, then
# bring all of them back to the driver.
for element in rdd_clean.filter(lambda row: row[-1] > 1000).collect():
    print(element)

[datetime.datetime(2016, 7, 29, 0, 0), 'East', 'Parent', 'Binder', 81, 19.99, 1619.19]
[datetime.datetime(2016, 12, 29, 0, 0), 'East', 'Parent', 'Pen Set', 74, 15.99, 1183.26]
[datetime.datetime(2017, 2, 1, 0, 0), 'Central', 'Smith', 'Binder', 87, 15.0, 1305.0]
[datetime.datetime(2017, 8, 7, 0, 0), 'Central', 'Kivell', 'Pen Set', 42, 23.95, 1005.9]
[datetime.datetime(2017, 10, 14, 0, 0), 'West', 'Thompson', 'Binder', 57, 19.99, 1139.43]
[datetime.datetime(2017, 12, 4, 0, 0), 'Central', 'Jardine', 'Binder', 94, 19.99, 1879.06]

## .reduce(...) action

In [13]:
from operator import add

# Project every row to its last column (the order total) and fold the
# values together with operator.add.
total_value = (
    rdd_clean
    .map(lambda row: row[-1])
    .reduce(add)
)

total_value

18628.38

In [14]:
# Same sum as with operator.add, but with the reducer spelled out inline.
total_value = (
    rdd_clean
    .map(lambda row: row[-1])
    .reduce(lambda left, right: left + right)
)

total_value

18628.38

## .reduceByKey(...) transformation

In [18]:
# Build (region, order total) pairs and sum the totals per region.
sales_by_region = (
    rdd_clean
    .map(lambda row: (row[1], row[-1]))
    .reduceByKey(lambda left, right: left + right)
)

for element in sales_by_region.collect():
    print(element)

('East', 6002.090000000001)
('Central', 10139.57)
('West', 2486.7200000000003)

## .count() action

In [20]:
# Exact number of rows that survived parsing.
rdd_clean.count()

42

In [22]:
# Approximate count: the first argument is the timeout (milliseconds per the
# PySpark API docs -- confirm for your version), 0.9 the requested confidence.
rdd_clean.countApprox(10, confidence=0.9)

42

In [29]:
# Column 2 holds the sales representative's name.
sales = rdd_clean.map(lambda row: row[2])

# Estimated number of distinct names.
sales.countApproxDistinct()

11

In [33]:
# De-duplicate the names; the RDD is kept in a variable because it is used
# again in the .foreach(...) cell.
distinct_sales = sales.distinct()
distinct_sales.collect()

['Jardine', 'Gill', 'Smith', 'Howard', 'Thompson', 'Jones', 'Sorvino', 'Andrews', 'Morgan', 'Parent', 'Kivell']

In [32]:
# Exact number of distinct names, for comparison with countApproxDistinct().
sales.distinct().count()

11

## .foreach(...) action

In [36]:
# Apply print to every element. NOTE(review): foreach returns nothing and the
# function presumably runs on the workers, so the names typically show up in
# the console/worker logs rather than as cell output -- confirm in your setup.
distinct_sales.foreach(print)

## .aggregate(...) action

In [71]:
def seqOp(acc, value):
    # Fold one value into the running (sum, count) pair.
    return (acc[0] + value, acc[1] + 1)


def combOp(left, right):
    # Merge two (sum, count) pairs.
    return (left[0] + right[0], left[1] + right[1])


# Start from (0.0, 0) and compute the overall (sum of totals, row count).
rdd_clean.map(lambda row: row[-1]).aggregate((0.0, 0), seqOp, combOp)

(18628.38, 42)

## .aggregateByKey(...) transformation

In [72]:
def seqOp(acc, value):
    # `value` is the (total, 1) pair built by the first map below; only the
    # total is read here, the count is advanced by a literal 1.
    return (acc[0] + value[0], acc[1] + 1)


def combOp(left, right):
    # Merge two (sum, count) pairs belonging to the same key.
    return (left[0] + right[0], left[1] + right[1])


# Per sales rep (column 2): (name, sum of totals, number of orders, average).
for element in (
    rdd_clean
    .map(lambda row: (row[2], (row[-1], 1)))
    .aggregateByKey((0.0, 0), seqOp, combOp)
    .map(lambda kv: (kv[0], kv[1][0], kv[1][1], kv[1][0] / kv[1][1]))
    .collect()
):
    print(element)

('Jardine', 2812.19, 5, 562.438)
('Gill', 1749.8700000000001, 5, 349.97400000000005)
('Smith', 1641.43, 3, 547.1433333333333)
('Howard', 536.75, 2, 268.375)
('Thompson', 1203.1100000000001, 2, 601.5550000000001)
('Jones', 2363.04, 8, 295.38)
('Sorvino', 1283.6100000000001, 4, 320.90250000000003)
('Andrews', 438.37, 4, 109.5925)
('Morgan', 1387.77, 3, 462.59)
('Parent', 3102.3, 3, 1034.1000000000001)
('Kivell', 2109.94, 3, 703.3133333333334)

## .coalesce(...) action

## .combineByKey(...) action

## .histogram(...) action

## Sorting data

### .sortBy(...) action

### .sortByKey(...) action

## Saving data

### .saveAsTextFile(...) action

### .saveAsPickleFile(...) action

## Descriptive Statistics

### .mean() action

### .stdev() action

### .max() action

### .min() action