# Chapter 6

In [1]:
from pyspark.streaming import StreamingContext

In [2]:
batchDurationInSecs = 5
ssc = StreamingContext(sc, batchDuration=batchDurationInSecs)

#### Monitors the given dir and reads each newly created file
[Ref: textFileStream](http://spark.apache.org/docs/latest/api/python/pyspark.streaming.html#pyspark.streaming.StreamingContext.textFileStream)

In [3]:
# This will create a DStream (discretized stream), representing a sequence of RDDs,
# periodically created from the input stream.
fileStream = ssc.textFileStream('/home/deepakt/Documents/dev/spark/sia/sia-ch6/dataIn')

In [4]:
# Parse each line of DStream
from datetime import datetime
def parseOrder(line):
    f = line.split(',')
    if f[6] == 'B' or f[6] == 'S':
        return [{
            'time': datetime.strptime(f[0], '%Y-%m-%d %H:%M:%S'),
            'orderId': long(f[1]),
            'clientId': long(f[2]),
            'symbol': f[3],
            'amout': int(f[4]),
            'price': float(f[5]),
            'buy': f[6] == 'B'
        }]
    return []

In [5]:
ordersStream = fileStream.flatMap(parseOrder)

In [6]:
ordersStream.pprint(1)

In [7]:
# Count number of buys and sells by creating tuple and reduce
from operator import add
numPerType = ordersStream.map(lambda order: (order['buy'], 1L)).reduceByKey(add)

In [8]:
numPerType.pprint(2)

In [9]:
# Store it to file
numPerType.repartition(1).saveAsTextFiles(prefix='dataOut/output', suffix='txt')

In [10]:
ssc.start()

-------------------------------------------
Time: 2017-09-22 11:08:35
-------------------------------------------
{'orderId': 1L, 'buy': True, 'symbol': u'EPE', 'clientId': 80L, 'time': datetime.datetime(2016, 3, 22, 20, 25, 28), 'price': 51.0, 'amout': 710}
...

-------------------------------------------
Time: 2017-09-22 11:08:35
-------------------------------------------
(False, 4980L)
(True, 5020L)

-------------------------------------------
Time: 2017-09-22 11:08:40
-------------------------------------------
{'orderId': 10001L, 'buy': True, 'symbol': u'ABX', 'clientId': 58L, 'time': datetime.datetime(2016, 3, 22, 20, 25, 28), 'price': 6.0, 'amout': 63}
...

-------------------------------------------
Time: 2017-09-22 11:08:40
-------------------------------------------
(False, 4926L)
(True, 5074L)

-------------------------------------------
Time: 2017-09-22 11:08:45
-------------------------------------------
{'orderId': 30001L, 'buy': False, 'symbol': u'UAL', 'clientId': 5L, 

-------------------------------------------
Time: 2017-09-22 11:10:45
-------------------------------------------

-------------------------------------------
Time: 2017-09-22 11:10:50
-------------------------------------------

-------------------------------------------
Time: 2017-09-22 11:10:50
-------------------------------------------

-------------------------------------------
Time: 2017-09-22 11:10:55
-------------------------------------------

-------------------------------------------
Time: 2017-09-22 11:10:55
-------------------------------------------

-------------------------------------------
Time: 2017-09-22 11:11:00
-------------------------------------------

-------------------------------------------
Time: 2017-09-22 11:11:00
-------------------------------------------

-------------------------------------------
Time: 2017-09-22 11:11:05
-------------------------------------------

-------------------------------------------
Time: 2017-09-22 11:11:05
----------

In [11]:
ssc.stop(False)

In [12]:
# Read the result
allCounts = sc.textFile("dataOut/output-*.txt/")

In [18]:
# Each line is a string; needs to converted to tuple
allCounts.take(10)

[u'(False, 5035L)',
 u'(True, 4965L)',
 u'(False, 4926L)',
 u'(True, 5074L)',
 u'(False, 10067L)',
 u'(True, 9933L)',
 u'(False, 4876L)',
 u'(True, 5124L)',
 u'(False, 9975L)',
 u'(True, 10025L)']

In [30]:
import ast
rdd_buysell = allCounts.map(lambda s: ast.literal_eval(s))

In [31]:
rdd_buysell.take(10)

[(False, 5035L),
 (True, 4965L),
 (False, 4926L),
 (True, 5074L),
 (False, 10067L),
 (True, 9933L),
 (False, 4876L),
 (True, 5124L),
 (False, 9975L),
 (True, 10025L)]

In [33]:
from operator import add
rdd_buysell.reduceByKey(add).collect()

[(False, 94529L), (True, 95471L)]