In [1]:
from notebook_client.notebook_client import NotebookClient
# Client object that later cells use to start, inspect and stop jobs.
nc = NotebookClient()
# NOTE(review): presumably sets up the connections the client needs before any
# job call — the implementation is not shown here; confirm in notebook_client.
nc.initialize_connections()

## Populate Kafka topic with some data

In [2]:
from kafka import KafkaProducer
from lib.serializer import value_serializer

topic_name = 'my_test_topic'

def populate_test_topic(count=100, bootstrap_servers='kafka:9092', topic=None):
    """Publish ``count`` small test records to a Kafka topic.

    Each record is the dict ``{'x': i}`` for ``i`` in ``range(count)``; the
    producer is configured with ``value_serializer`` to encode it.

    Args:
        count: Number of records to send. Defaults to 100.
        bootstrap_servers: Broker address(es) handed to ``KafkaProducer``.
            Defaults to ``'kafka:9092'``.
        topic: Destination topic. When ``None`` the module-level
            ``topic_name`` is used (looked up at call time).

    Prints ``'done'`` once every record has been sent and flushed.
    """
    destination = topic_name if topic is None else topic
    producer = KafkaProducer(bootstrap_servers=bootstrap_servers, value_serializer=value_serializer)
    try:
        for i in range(count):
            producer.send(destination, {'x': i})
        # send() only queues the record; flush() blocks until queued records are sent.
        producer.flush()
    finally:
        # Always close the producer, even if a send or flush raises.
        producer.close()
    print('done')

populate_test_topic()

done


## Write Spark job to file

In [5]:
%%writefile jobs/test_job.py

import json

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition


# Text encoding used to decode raw message payloads.
ENCODING = 'utf-8'

def value_deserializer(m):
    """Turn a raw message payload into a Python object.

    ``m`` is decoded to text with ``ENCODING`` and the text is then parsed
    as JSON; the parsed object is returned.
    """
    decoded_text = m.decode(ENCODING)
    parsed = json.loads(decoded_text)
    return parsed

def my_reducer(a, b):
    """Add two operands, each either a record ``{'x': n}`` or a running total.

    A reduce function can be handed partial results on *either* side (for
    example when partial sums from different partitions are merged), so both
    operands are normalised the same way instead of assuming only ``a`` can
    already be a number.

    Args:
        a: A record with an ``'x'`` entry, or a numeric partial sum.
        b: A record with an ``'x'`` entry, or a numeric partial sum.

    Returns:
        The numeric sum of the two operands' values.
    """
    def _amount(operand):
        # Numbers are partial sums already; anything else is treated as a record.
        return operand if isinstance(operand, (int, float)) else operand['x']

    return _amount(a) + _amount(b)

def handle_result(rdd):
    """Print every record of ``rdd`` and then a ``'sending...'`` marker.

    The per-record printing is handed to ``rdd.foreach``; the marker line is
    printed by this function itself after ``foreach`` returns.
    """
    def _show(record):
        print(record)

    rdd.foreach(_show)
    print('sending...')

# Settings for this job, named once so the wiring below reads clearly.
APP_NAME = 'testApp'
BATCH_SECONDS = 10  # StreamingContext batch duration
TOPIC = 'my_test_topic'
BROKER_LIST = 'kafka:9092'
START_PARTITION = 0
START_OFFSET = 0


def _message_value(pair):
    """Return the value of a (key, value) message tuple; the key is None here."""
    return pair[1]


sc = SparkContext(appName=APP_NAME)
ssc = StreamingContext(sc, BATCH_SECONDS)

# Read partition 0 of the topic, starting from offset 0.
topicAndPartition = TopicAndPartition(TOPIC, START_PARTITION)
fromOffsets = {topicAndPartition: START_OFFSET}

kvs = KafkaUtils.createDirectStream(
    ssc,
    [TOPIC],
    {"metadata.broker.list": BROKER_LIST},
    valueDecoder=value_deserializer,
    fromOffsets=fromOffsets,
)
lines = kvs.map(_message_value)
# Collapse each batch with my_reducer, then hand the result to handle_result.
sums = lines.reduce(my_reducer)
sums.foreachRDD(handle_result)

# Start the stream and block until the job is stopped.
ssc.start()
ssc.awaitTermination()

Overwriting jobs/test_job.py


In [None]:
## Execute job (and stop)

In [2]:
# Submit the job script at jobs/test_job.py. The value displayed below
# (4062 in the saved run) is the same id later passed to stop_job.
nc.start_job('jobs/test_job.py')

4062

In [5]:
# NOTE(review): 58 is not the id shown for start_job in the saved run (4062),
# and the saved output here is 'not found' — likely a stale id; confirm which
# job this was meant to query.
nc.job_status(58)

'not found'

In [3]:
# Stop the job by id. 4062 is hard-coded from the saved start_job output and
# will likely differ on a fresh run — update it to the id start_job returns.
nc.stop_job(4062)

'stopped'