# Learning PySpark 
### Video series

### Packt Publishing

**Author**: Tomasz Drabas
**Date**:   2017-12-10





# Section 2: Resilient Distributed Datasets - Transformations

In this section we will look at the Resilient Distributed Datasets (RDDs) and the transformations available.

## Creating RDDs
### Parallelize a collection
#### 1000 random numbers

In [None]:
import numpy as np

# Build 1000 random floats on the driver (np.random.rand() draws uniformly
# from [0, 1)) and distribute them as an RDD with 4 partitions.
# NOTE: `sc` is not defined in this section; presumably it is the SparkContext
# provided by the notebook/PySpark environment.
random_numbers = sc.parallelize([np.random.rand() for _ in range(1000)], 4)
# Evaluating the name shows the RDD object itself, not its contents.
random_numbers

In [2]:
# take(2) is an action: it returns the first two elements to the driver.
random_numbers.take(2)

#### Tuples

In [3]:
# Each element is a 3-tuple; the fields look like (name, age, height) --
# an assumption from the values, the code does not name them.
# No partition count is passed, so Spark chooses the number of partitions.
rdd_random_tuples = sc.parallelize([
      ("Mark",  43, "6'1\"")
    , ("Stella",23, "5'6\"")
    , ("Skye",   6, "3'11\"")
    , ("Albert", 1, "2'7\"")
])
rdd_random_tuples.take(2)

[('Mark', 43, '6\'1"'), ('Stella', 23, '5\'6"')]

### Reading from files
The `sample_data.csv` file was created from the data found here: <http://www.contextures.com/xlSampleData01.html>.

In [4]:
# Read the CSV as plain text: every line becomes one string element, the
# header line included. The second argument is the minimum partition count.
rdd_from_file = sc.textFile('../data/sample_data.csv', 4)
rdd_from_file.take(2)

['OrderDate,Region,Rep,Item,Units,UnitCost,Total', '1/6/16,East,Jones,Pencil,95,1.99,189.05']

## Schema of an RDD
### Structured data

In [5]:
# Every element has the same shape: a (string, int) tuple.
rdd_valid_structured = sc.parallelize([
      ('Tom', 0)
    , ('Spark', 1)
])

### Unstructured data
Sample Linux log data, source: https://www.cyberciti.biz/faq/linux-log-files-location-and-how-do-i-view-logs-files/

In [6]:
# Each element is a single raw log line. The parentheses only group the
# string literals -- without a trailing comma they do not create tuples.
rdd_valid_unstructured = sc.parallelize([
      ('Jul 17 22:04:25 router  dnsprobe[276]: dns query failed')
    , ('Jul 17 22:04:29 router last message repeated 2 times')  
    , ('Jul 17 22:04:29 router  dnsprobe[276]: Primary DNS server Is Down... Switching To Secondary DNS server')
    , ('Jul 17 22:05:08 router  dnsprobe[276]: Switching Back To Primary DNS server')
    , ('Jul 17 22:26:11 debian -- MARK --')
    , ('Jul 17 22:46:11 debian -- MARK --')
    , ('Jul 17 22:47:36 router  -- MARK --')
    , ('Jul 17 22:47:36 router  dnsprobe[276]: dns query failed')
])

### Heterogeneous data

In [7]:
# An RDD does not enforce a schema: this one mixes a tuple, a dict and a list.
rdd_valid_heterogeneous = sc.parallelize([
      ('Toll invoice number', 'TRE2321')
    , {'Last bill balance': 0.00, 'New bill balance': 23.92}
    , ['2017-11-12 12:32:34PM', '2017-11-14 1:32:56PM']
])

## Understanding lazy execution

In [8]:
# .map(...) is a transformation, so this line only records the operation;
# no element is processed until an action is called on names_only.
names_only = rdd_valid_structured.map(lambda element: element[0])

In [9]:
# .collect() is an action: it runs the pending map and returns every element
# to the driver as a list.
names_only.collect()

['Tom', 'Spark']

## .map(...) transformation
### Using named functions and lambdas

In [10]:
def splitOnComma(inputString):
    """Split a comma-separated line into its list of string fields."""
    fields = inputString.split(',')
    return fields

# Apply the named function to every line and print the first two results.
for el in rdd_from_file.map(splitOnComma).take(2):
    print(el)

['OrderDate', 'Region', 'Rep', 'Item', 'Units', 'UnitCost', 'Total']
['1/6/16', 'East', 'Jones', 'Pencil', '95', '1.99', '189.05']

In [11]:
# Same split as splitOnComma, written inline as a lambda.
for el in rdd_from_file.map(lambda element: element.split(',')).take(2):
    print(el)

['OrderDate', 'Region', 'Rep', 'Item', 'Units', 'UnitCost', 'Total']
['1/6/16', 'East', 'Jones', 'Pencil', '95', '1.99', '189.05']

### Transforming data

In [15]:
def convertToFloat(inputString):
    """Convert a value to ``float``, returning ``-666`` when it cannot be parsed.

    The sentinel lets malformed fields (for example those of a CSV header
    row) flow through a ``.map(...)`` and be removed later with a
    ``.filter(...)``.

    Args:
        inputString: Value to convert; typically one field of a split line.

    Returns:
        The parsed ``float``, or the integer sentinel ``-666`` if ``float()``
        rejects the value.
    """
    try:
        return float(inputString)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures map to the sentinel; a bare ``except``
        # would also swallow KeyboardInterrupt and SystemExit.
        return -666

# Split every line into fields, keep the first four as strings and convert
# the remaining ones to floats (unparseable values become the -666 sentinel).
to_filter = (
    rdd_from_file
    .map(lambda line: line.split(','))
    .map(
        lambda fields:
            fields[:4] + [convertToFloat(value) for value in fields[4:]]
    )
)

to_filter.take(2)

[['OrderDate', 'Region', 'Rep', 'Item', -666, -666, -666], ['1/6/16', 'East', 'Jones', 'Pencil', 95.0, 1.99, 189.05]]

## .filter(...) transformation

In [16]:
# Keep only rows whose fifth field (index 4) was converted successfully;
# -666 is the sentinel convertToFloat returns for unparseable values.
filtered = to_filter.filter(lambda element: element[4] != -666)
filtered.take(2)

[['1/6/16', 'East', 'Jones', 'Pencil', 95.0, 1.99, 189.05], ['2017-03-02', 'Central', 'Kivell', 'Binder', 50.0, 19.99, 999.5]]

## .flatMap(...) transformation
### .map to flatMap comparison

In [23]:
def mapExample(inputList):
    """Mimic ``.map(...)``: produce exactly one output item per input item.

    Each item is shallow-copied, the copy's second element is multiplied by
    10, and the copies are returned in a list as long as the input.
    """
    def scaleSecond(item):
        scaled = item.copy()
        scaled[1] *= 10
        return scaled

    return [scaleSecond(item) for item in inputList]

def flatMapExample(inputList):
    """Mimic ``.flatMap(...)``: transform each item, then flatten the results.

    Each item is shallow-copied and the copy's second element is multiplied
    by 10; the elements of every copy are then appended to one flat list.
    """
    flattened = []

    for item in inputList:
        scaled = item.copy()
        scaled[1] *= 10
        # extend() adds the elements themselves rather than the nested list.
        flattened.extend(scaled)

    return flattened

# Four two-element lists used to contrast the two helpers.
sampleList = [
      [1, 3]
    , [2, 4]
    , [3, 5]
    , [4, 6]
]

# mapExample keeps one list per input item; flatMapExample flattens them.
print(mapExample(sampleList))
print(flatMapExample(sampleList))

[[1, 30], [2, 40], [3, 50], [4, 60]]
[1, 30, 2, 40, 3, 50, 4, 60]

### Filtering malformed records

In [17]:
import datetime as dt

def parseCSVRow(inputRow):
    """Parse one comma-separated line into a typed record.

    The result is wrapped in a list so the function can be passed to
    ``.flatMap(...)``: a well-formed row contributes one record, while a
    malformed row (such as a header line) contributes none.

    Args:
        inputRow: A comma-separated string with at least seven fields;
            field 0 is a ``%m/%d/%y`` date, field 4 an integer and
            fields 5 and 6 floats.

    Returns:
        ``[record]``, where ``record`` is the list of fields with the date,
        integer and float columns converted, or ``[]`` if the row cannot be
        parsed.
    """
    try:
        rowSplit = inputRow.split(',')
        rowSplit[0] = dt.datetime.strptime(rowSplit[0], '%m/%d/%y')
        rowSplit[4] = int(rowSplit[4])

        for i in (5, 6):
            rowSplit[i] = float(rowSplit[i])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Expected parse failures only: non-string input, too few fields, or
        # a field that does not match its column type. A bare ``except``
        # would also swallow KeyboardInterrupt and SystemExit.
        return []

    return [rowSplit]

# With .map(...) every input line yields exactly one element, so malformed
# lines show up as empty lists.
rdd_from_file.map(parseCSVRow).take(3)

[[], [[datetime.datetime(2016, 1, 6, 0, 0), 'East', 'Jones', 'Pencil', 95, 1.99, 189.05]], []]

In [18]:
# Drop the empty lists by hand; each surviving record is still wrapped in
# its one-element list.
rdd_from_file.map(parseCSVRow).filter(lambda element: len(element) > 0).take(3)

[[[datetime.datetime(2016, 1, 6, 0, 0), 'East', 'Jones', 'Pencil', 95, 1.99, 189.05]], [[datetime.datetime(2016, 2, 9, 0, 0), 'Central', 'Jardine', 'Pencil', 36, 4.99, 179.64]], [[datetime.datetime(2016, 2, 26, 0, 0), 'Central', 'Gill', 'Pen', 27, 19.99, 539.73]]]

In [19]:
# .flatMap(...) flattens the returned lists: empty lists vanish and each good
# record is unwrapped, replacing the map + filter pair.
rdd_from_file_clean = rdd_from_file.flatMap(parseCSVRow)
rdd_from_file_clean.take(3)

[[datetime.datetime(2016, 1, 6, 0, 0), 'East', 'Jones', 'Pencil', 95, 1.99, 189.05], [datetime.datetime(2016, 2, 9, 0, 0), 'Central', 'Jardine', 'Pencil', 36, 4.99, 179.64], [datetime.datetime(2016, 2, 26, 0, 0), 'Central', 'Gill', 'Pen', 27, 19.99, 539.73]]

## .distinct(...) transformation

In [None]:
# Unique values of the fourth field (index 3; 'Item' in the file's header).
items = rdd_from_file_clean.map(lambda element: element[3]).distinct()
items.collect()

In [None]:
# Number of unique values in the third field (index 2; 'Rep' in the header).
rdd_from_file_clean.map(lambda element: element[2]).distinct().count()

## .sample(...) transformation

In [None]:
# Total number of successfully parsed records.
rdd_from_file_clean.count()

In [None]:
# Sample without replacement (first argument False) using fraction 0.2.
# The fraction is a per-element probability, so the count is only roughly 20%
# of the total; no seed is given, so it can differ between runs.
rdd_from_file_clean.sample(False, 0.2).count()

## .join(...) transformation

In [None]:
# Lookup table as a pair RDD of (region, city).
cities = sc.parallelize([
      ('East', 'Boston')
    , ('Central', 'Chicago')
    , ('West', 'Seattle')
])

# .join(...) works on (key, value) pairs, so each record is first keyed by
# its region (index 1). After the join every element has the form
# (region, (record, city)); the final map appends the city to the record.
# The join is an inner join: records whose region is missing from `cities`
# are dropped.
rdd_from_file_clean \
    .map(lambda element: (element[1], element)) \
    .join(cities) \
    .map(lambda element: element[1][0] + [element[1][1]]) \
    .take(2)

## .repartition(...) transformation

In [None]:
# Number of partitions of the cleaned RDD.
rdd_from_file_clean.getNumPartitions()

In [None]:
# Reshuffle the data into 2 partitions.
# NOTE(review): this repartitions rdd_from_file (the raw lines), not the
# rdd_from_file_clean inspected in the previous cell -- confirm it is intended.
rdd_from_file_repartitioned = rdd_from_file.repartition(2)
rdd_from_file_repartitioned.getNumPartitions()

In [None]:
# Key each record by index 4 (the integer parsed by parseCSVRow), then
# repartition into 2 partitions and sort by key inside each partition.
# The partition function is the identity, so the target partition is
# presumably derived from the key value itself (key modulo the number of
# partitions) -- TODO confirm against the PySpark docs.
# The last map drops the key and turns each record into a tuple.
# NOTE(review): the name starts with 'redd_', which looks like a typo for
# 'rdd_'; it is left unchanged here because other cells use the same name.
redd_from_file_repartitioned_sorted = rdd_from_file_clean \
    .map(lambda element: (element[4], element)) \
    .repartitionAndSortWithinPartitions(2, lambda x: x) \
    .map(lambda element: tuple(element[1]))

# .glom() turns each partition into a list, so collect() shows the records
# grouped per partition.
redd_from_file_repartitioned_sorted.glom() \
    .collect()

In [None]:
# Number of partitions of the repartitioned, sorted RDD.
redd_from_file_repartitioned_sorted.getNumPartitions()