In [1]:
from pyspark import SparkContext, SparkConf, RDD
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pykafka import KafkaClient
import json

# Spark application settings: connect to the standalone master at
# spark://server1:7077, set spark.cores.max to 9 and set
# spark.cassandra.connection.host to server4.
conf = (
    SparkConf()
    .setAppName("Sales Order Unified Profile Update")
    .setMaster("spark://server1:7077")
    .set("spark.cores.max", 9)
    .set("spark.cassandra.connection.host", "server4")
)

sc = SparkContext(conf=conf)

# Streaming context on top of sc with a batch interval of 10 seconds.
streamingContext = StreamingContext(sc, 10)

In [2]:
from cassandra.cluster import Cluster
from cassandra.query import BatchStatement

class SimpleClient(object):
    """Small wrapper around a Cassandra driver session.

    A client connects lazily via connect(), caches every statement it
    prepares so the same CQL text is prepared only once per connection, and
    releases the underlying cluster connection in close().
    """

    # Hosts passed to the driver's Cluster() as contact points.
    CONTACT_POINTS = ('server4.bigdata.ibm.com',
                      'server5.bigdata.ibm.com',
                      'server6.bigdata.ibm.com')

    # CQL behind createProfile() / updateSalesOrder(). The table name is
    # keyspace-qualified because no "use <keyspace>" is issued on the session.
    CREATE_PROFILE_CQL = ("insert into test.customer_product(customer, product, date) "
                          "values(?,?,?) if not exists")
    UPDATE_SALES_ORDER_CQL = ("update test.customer_product "
                              "set orders[?] = {qty: ?, unitAmt: ?} "
                              "where customer = ? and product = ? and date = ?")

    # Class-level defaults; each instance shadows them once it connects or
    # prepares a statement.
    session = None
    cluster = None
    createProfileStatement = None
    updateSalesOrderStatement = None
    batch = None  # not used by any method here; kept for existing readers
    _prepared = None  # per-instance cache: CQL text -> prepared statement

    def connect(self):
        """Open the session on first use and return it.

        Returns:
            The driver session; repeated calls return the same object.
        """
        if self.session is None:
            # Keep a handle on the cluster so close() can shut it down.
            self.cluster = Cluster(list(self.CONTACT_POINTS))
            self.session = self.cluster.connect()
        return self.session

    def close(self):
        """Shut down the cluster opened by connect() and drop cached state.

        Safe to call when never connected and safe to call repeatedly.
        """
        print('close')
        cluster = self.cluster
        # Clear state first so the client is reusable even if shutdown raises.
        self.cluster = None
        self.session = None
        self.createProfileStatement = None
        self.updateSalesOrderStatement = None
        self._prepared = None
        if cluster is not None:
            cluster.shutdown()

    def _prepare(self, statement):
        """Return the prepared form of `statement`, preparing it at most once."""
        if self._prepared is None:
            self._prepared = {}
        prepared = self._prepared.get(statement)
        if prepared is None:
            prepared = self.session.prepare(statement)
            self._prepared[statement] = prepared
        return prepared

    def createProfile(self, args):
        """Insert a customer_product row if it does not exist yet.

        Args:
            args: bind values in the order [customer, product, date].
        """
        # Prepared on first use; previously this attribute was never set and
        # None was handed to session.execute().
        if self.createProfileStatement is None:
            self.createProfileStatement = self._prepare(self.CREATE_PROFILE_CQL)
        self.session.execute(self.createProfileStatement, args)

    def updateSalesOrder(self, args):
        """Write one entry of the row's orders map.

        Args:
            args: bind values in the order
                [time, qty, unitAmt, customer, product, date].
        """
        if self.updateSalesOrderStatement is None:
            self.updateSalesOrderStatement = self._prepare(self.UPDATE_SALES_ORDER_CQL)
        self.session.execute(self.updateSalesOrderStatement, args)

    def execute(self, statement, parameters=None):
        """Prepare (or reuse) `statement`, execute it, and return the result.

        Args:
            statement: CQL text with `?` bind markers.
            parameters: bind values; omitted or None means no bind values.

        Returns:
            Whatever the session's execute() returns for the statement.
        """
        if parameters is None:
            parameters = []
        return self.session.execute(self._prepare(statement), parameters)

In [3]:
# Kafka input stream, located through the ZooKeeper quorum on
# server1.bigdata.ibm.com:2181, consuming topic 'event' (topics value 3) as
# consumer group "raw-data-to-profile-update".
# Open question from the author: should the quorum address point at server7
# instead of the ZooKeeper on server1?
stream = KafkaUtils.createStream(
    streamingContext,
    zkQuorum='server1.bigdata.ibm.com:2181',
    groupId="raw-data-to-profile-update",
    topics={'event': 3},
)

def processEvent(client, event):
    """Store one sales-order event and refresh the aggregates of its profile row.

    Steps, all issued through ``client.execute``:
      1. insert the (customer, product, date) row if it does not exist;
      2. write the order into the row's ``orders`` map, keyed by the time part
         of the transaction timestamp;
      3. read the row back and total up every entry of ``orders``;
      4. if the total quantity and the order count are both positive, write
         the aggregates back to the row.

    Args:
        client: object exposing ``execute(statement, parameters)``.
        event: mapping with keys 'Customer', 'Product', 'XactionDate', 'Qty'
            and 'UnitAmt'. Characters [0:10] of 'XactionDate' are used as the
            date and [11:19] as the time (presumably an ISO-8601 style
            timestamp; confirm against the producer).

    Returns:
        dict with the row key and the computed aggregates; 'Score' is present
        only when the aggregates were written back.
    """
    customer = event['Customer']
    product = event['Product']
    stamp = event['XactionDate']
    date = stamp[0:10]
    orderTime = stamp[11:19]
    qty = event['Qty']
    unitAmt = event['UnitAmt']

    # Make sure the profile row exists, then record this order in its map.
    client.execute(
        "insert into test.customer_product(customer, product, date) values(?,?,?) if not exists",
        [customer, product, date])
    client.execute(
        "update test.customer_product set orders[?] = {qty: ?, unitAmt: ?} where customer = ? and product = ? and date = ?",
        [orderTime, qty, unitAmt, customer, product, date])

    # Re-read the row so the totals cover every order stored so far.
    rows = client.execute(
        "select * from test.customer_product where customer = ? and product = ? and date = ?",
        [customer, product, date])
    orders = rows[0].orders

    salesCount = 0
    salesQty = 0
    salesAmt = 0.0
    for order in orders.values():
        salesCount += 1
        salesQty += order.qty
        salesAmt += order.qty * order.unitamt

    profile = {'customer': customer, 'product': product, 'date': date}
    profile['LTDSalesCount'] = salesCount
    profile['LTDSalesQty'] = salesQty
    profile['LTDSalesAmt'] = salesAmt
    profile['AvgSalesAmt'] = 0.0
    profile['AvgSalesQty'] = 0.0

    # Nothing to average (and nothing is written back) without positive totals.
    if not (salesQty > 0.0 and salesCount > 0):
        return profile

    # NOTE(review): AvgSalesAmt is amount per unit of quantity, while
    # AvgSalesQty is quantity per order - confirm this asymmetry is intended.
    avgAmt = salesAmt / salesQty
    avgQty = float(salesQty) / salesCount
    profile['AvgSalesAmt'] = avgAmt
    profile['AvgSalesQty'] = avgQty

    #TODO: calculate score
    profile['Score'] = 1

    client.execute("""update test.customer_product 
            set LTDSalesAmt = ?, LTDSalesQty = ?, LTDSalesCount = ?, AvgSalesAmt = ?, AvgSalesQty = ?
            where customer = ? and product = ? and date = ?""",
                   [salesAmt, salesQty, salesCount,
                    avgAmt, avgQty,
                    customer, product, date])

    return profile

def processPartition(iter):
    """Process every Kafka record of one partition through a single client.

    Each record's second element (``record[1]``) is parsed as JSON and handed
    to processEvent. The client is always closed afterwards, and a summary
    line with the number of processed events and the last returned profile is
    printed.

    Args:
        iter: iterable of records for this partition; the name shadows the
            builtin but is kept because it is part of the signature.
    """
    client = SimpleClient()
    # Initialised before the try block so the summary in `finally` also works
    # for an empty partition or when connect() fails; previously both cases
    # raised NameError there and hid the real error.
    count = 0
    lastProfile = None
    try:
        client.connect()
        for record in iter:
            lastProfile = processEvent(client, json.loads(record[1]))
            count += 1
    finally:
        client.close()
        print('######## persist ' + str(count) + ' events ,last event is ' + str(lastProfile))

def processRDD(rdd):
    """Apply processPartition to each partition of ``rdd``."""
    partitionHandler = processPartition
    rdd.foreachPartition(partitionHandler)
    
# Register processRDD to be applied to each RDD the Kafka stream produces.
stream.foreachRDD(processRDD)

In [None]:
# Run the streaming job; this cell blocks until the computation terminates.
streamingContext.start()             # Start the computation
streamingContext.awaitTermination()  # Wait for the computation to terminate