## 6.1.7 Saving the computation state over time

In [41]:
from pyspark.streaming import StreamingContext

In [42]:
# Mini-batch interval in seconds: the stream is cut into one batch (RDD) every 5 s.
batchDurationInSecs = 5
# `sc` is the SparkContext — presumably created earlier in the notebook or by the
# PySpark kernel; verify before running this cell on a fresh kernel.
ssc = StreamingContext(sc, batchDuration=batchDurationInSecs)

#### Monitors the given dir and reads each newly created file
[Ref: textFileStream](http://spark.apache.org/docs/latest/api/python/pyspark.streaming.html#pyspark.streaming.StreamingContext.textFileStream)

In [43]:
# This will create a DStream (discretized stream), representing a sequence of RDDs,
# periodically created from the input stream: every file newly created in the
# monitored directory is read line by line.
# NOTE(review): the directory is an absolute path inside one user's home directory,
# so the notebook only runs on that machine — consider a configurable/relative path.
fileStream = ssc.textFileStream('/home/deepakt/Documents/dev/spark/sia/sia-ch6/dataIn')

In [44]:
# Parse each line of DStream
from datetime import datetime
try:
    _toLong = long  # Python 2: ids stay `long`, exactly as before
except NameError:
    _toLong = int   # Python 3: `int` is already arbitrary-precision


def parseOrder(line):
    """Parse one CSV order line into a list holding zero or one order dict.

    Expected columns: timestamp ('%Y-%m-%d %H:%M:%S'), order id, client id,
    symbol, amount, price, and side ('B' = buy, 'S' = sell).

    The result is a list so the function can be used with ``flatMap``:
    a valid line yields ``[order_dict]``; anything else yields ``[]``.

    Lines are silently skipped (``[]``) when
      * they have fewer than seven comma-separated fields,
      * the side field is neither 'B' nor 'S',
      * the timestamp or any numeric field cannot be parsed.

    :param line: one raw text line from the input stream
    :return: ``[{'time', 'orderId', 'clientId', 'symbol', 'amount', 'price', 'buy'}]`` or ``[]``
    """
    f = line.split(',')
    # A blank/short line has no f[6]; reject it instead of raising IndexError.
    if len(f) < 7 or f[6] not in ('B', 'S'):
        return []
    try:
        return [{
            'time': datetime.strptime(f[0], '%Y-%m-%d %H:%M:%S'),
            'orderId': _toLong(f[1]),
            'clientId': _toLong(f[2]),
            'symbol': f[3],
            'amount': int(f[4]),
            'price': float(f[5]),
            'buy': f[6] == 'B'
        }]
    except ValueError:
        # One corrupt record must not fail the whole streaming batch.
        return []

In [45]:
# flatMap flattens parseOrder's 0-or-1-element lists, so rejected lines simply
# disappear from the stream and every remaining element is one order dict.
ds_orders = fileStream.flatMap(parseOrder)

In [46]:
# Count buys and sells per batch: map every order to (isBuy, 1), then sum per key.
from operator import add
# Plain `1` replaces the Python-2-only `1L` literal (a SyntaxError on Python 3);
# the summed counts are numerically identical.
ds_numPerType = ds_orders.map(lambda order: (order['buy'], 1)).reduceByKey(add)

In [47]:
# One (clientId, amount * price) pair per order. This is NOT yet a total:
# the running total per client is accumulated later with updateStateByKey.
ds_amountPerClient = ds_orders.map(lambda order: (order['clientId'], order['amount'] * order['price']))

#### updateStateByKey(updateFunction)
updateFunction : Function to update state for each key
    
The first argument of this function is a Seq object (a plain Python list in PySpark) with new values of a key that came in the current mini-batch. The second argument is the state value of the key, or None if the state for that key hasn’t been calculated yet. If the state for the key has been calculated, but no new values for the key were received in the current mini-batch, the first argument will be an empty Seq (an empty list). The function should return the new value for the key’s state.

In [48]:
def updateAmountState(l_amount, total_amount):
    """Fold the current mini-batch's amounts of one client into its running total.

    :param l_amount: list of new amounts for the key in this mini-batch (may be empty)
    :param total_amount: previous running total, or None if the key has no state yet
    :return: the new running total for the key
    """
    batch_total = sum(l_amount)
    # Identity test: `is None` is the idiomatic (PEP 8) check and cannot be
    # fooled by a value with a custom __eq__/__ne__.
    if total_amount is None:
        return batch_total
    return batch_total + total_amount


# Running total traded per client, carried across mini-batches.
ds_amountState = ds_amountPerClient.updateStateByKey(updateAmountState)

In [49]:
# Print the first 5 (clientId, running total) pairs of every batch.
ds_amountState.pprint(5)

In [50]:
# In each batch/RDD, reverse-sort by total_amount and pick the top 5 clients.
# transform() is used to apply an RDD-to-RDD function to every RDD of the DStream.

def top5ClientIds(rdd):
    """Return an RDD with the ids of the 5 clients having the largest totals.

    :param rdd: RDD of (clientId, total_amount) pairs
    :return: RDD of at most 5 clientIds, ordered by descending total_amount
    """
    # Pairs are indexed (kv[0], kv[1]) rather than unpacked in the lambda
    # signature: tuple parameters were removed in Python 3 (PEP 3113).
    return (rdd.sortBy(lambda kv: kv[1], ascending=False)
               .map(lambda kv: kv[0])
               .zipWithIndex()                  # (clientId, rank)
               .filter(lambda kv: kv[1] < 5)    # keep ranks 0..4
               .map(lambda kv: kv[0]))


ds_top5Clients = ds_amountState.transform(top5ClientIds)

In [51]:
# Print only the first 2 of the top-5 client ids of every batch.
ds_top5Clients.pprint(2)

### Start merging streams

In [52]:
# Label each per-batch count: key True (buy) -> 'BUYS', key False (sell) -> 'SELLS'.
# The count is wrapped in a one-element list of str, giving (label, [str, ...]) records.
# The pair is indexed instead of unpacked in the lambda signature, because
# tuple parameters are a SyntaxError on Python 3 (PEP 3113).
ds_buySellMetric = ds_numPerType.map(
    lambda kv: ('BUYS' if kv[0] else 'SELLS', [str(kv[1])]))

In [53]:
# Build one ('TOP5CLIENTS', [clientId, ...]) record per batch, with the ids as str.
# repartition(1) followed by glom() gathers all ids of a batch RDD into a single list.
# NOTE(review): "Clinet" is a typo for "Client"; the name is kept because a later cell uses it.
ds_top5ClinetMetric = (
    ds_top5Clients
    .map(str)
    .repartition(1)
    .glom()
    .map(lambda clientIds: ('TOP5CLIENTS', clientIds))
)

In [54]:
# Merge the two metric streams with union(): the records of both streams are
# concatenated per batch — this is not a key-based join.
ds_final = ds_buySellMetric.union(ds_top5ClinetMetric)

In [55]:
# Store the metrics as text: saveAsTextFiles writes one output directory per batch,
# named "<prefix>-<batch time in ms>.<suffix>" (i.e. dataOut/output-*.txt here).
# repartition(1) first, so each batch is written from a single partition.
ds_final.repartition(1).saveAsTextFiles(prefix='dataOut/output', suffix='txt')

#### Checkpointing

`updateStateByKey` expands the RDD’s DAG in each mini-batch, and that can quickly lead to stack overflow exceptions. By periodically checkpointing RDDs, their calculation plan’s dependence on previous mini-batches is broken.

In [56]:
# Checkpoint data goes into the current working directory.
# NOTE(review): the Spark Streaming guide configures checkpointing for stateful
# DStreams via ssc.checkpoint(<dir>) — confirm sc.setCheckpointDir is sufficient here,
# and consider a dedicated directory instead of '.'.
sc.setCheckpointDir('.')

In [57]:
# Start the streaming computation; the pprint() output of each batch is shown below.
ssc.start()

-------------------------------------------
Time: 2017-09-22 14:20:25
-------------------------------------------

-------------------------------------------
Time: 2017-09-22 14:20:25
-------------------------------------------

-------------------------------------------
Time: 2017-09-22 14:20:30
-------------------------------------------
(96L, 2002749.0)
(64L, 2416163.0)
(16L, 2754024.0)
(32L, 2245536.0)
(8L, 3054717.0)
...

-------------------------------------------
Time: 2017-09-22 14:20:30
-------------------------------------------
34
21
...

-------------------------------------------
Time: 2017-09-22 14:20:35
-------------------------------------------
(96L, 4124313.0)
(64L, 6002732.0)
(48L, 4244893.0)
(32L, 5113304.0)
(8L, 5614207.0)
...

-------------------------------------------
Time: 2017-09-22 14:20:35
-------------------------------------------
34
69
...

-------------------------------------------
Time: 2017-09-22 14:20:40
-------------------------------------------


-------------------------------------------
Time: 2017-09-22 14:22:30
-------------------------------------------
(96L, 45695355.0)
(64L, 48928507.0)
(48L, 48819140.0)
(32L, 49356320.0)
(8L, 48336365.0)
...

-------------------------------------------
Time: 2017-09-22 14:22:30
-------------------------------------------
87
23
...

-------------------------------------------
Time: 2017-09-22 14:22:35
-------------------------------------------
(96L, 45695355.0)
(64L, 48928507.0)
(48L, 48819140.0)
(32L, 49356320.0)
(8L, 48336365.0)
...

-------------------------------------------
Time: 2017-09-22 14:22:35
-------------------------------------------
87
23
...

-------------------------------------------
Time: 2017-09-22 14:22:40
-------------------------------------------
(96L, 45695355.0)
(64L, 48928507.0)
(48L, 48819140.0)
(32L, 49356320.0)
(8L, 48336365.0)
...

-------------------------------------------
Time: 2017-09-22 14:22:40
-------------------------------------------
87
23
...

-

In [58]:
# Stop streaming but keep the underlying SparkContext alive — `sc` is used again below.
ssc.stop(stopSparkContext=False)

In [60]:
# Read the result back as one RDD of text lines; the glob matches every
# dataOut/output-*.txt directory.
rdd_allMetrics = sc.textFile("dataOut/output-*.txt/")

In [61]:
# Each line is the string repr of a tuple; it still needs to be converted back into a tuple.
rdd_allMetrics.take(10)

[u"('TOP5CLIENTS', ['87', '23', '70', '15', '10'])",
 u"('TOP5CLIENTS', ['87', '23', '70', '15', '10'])",
 u"('TOP5CLIENTS', ['87', '23', '70', '15', '10'])",
 u"('TOP5CLIENTS', ['87', '23', '70', '15', '10'])",
 u"('TOP5CLIENTS', ['87', '23', '70', '15', '10'])",
 u"('TOP5CLIENTS', ['87', '23', '70', '15', '10'])",
 u"('TOP5CLIENTS', ['87', '23', '70', '15', '10'])",
 u"('TOP5CLIENTS', ['87', '23', '70', '15', '10'])",
 u"('TOP5CLIENTS', ['87', '23', '70', '15', '10'])",
 u"('TOP5CLIENTS', ['87', '23', '70', '15', '10'])"]

In [None]:
# NOTE(review): unfinished scratch expression in an unexecuted cell — `.m` looks like an
# abandoned tab-completion; complete it or remove the cell before sharing the notebook.
ds_amountPerClient.m