## Kafka + Spark streaming

Building protocol buffers.

In [None]:
! python3 -m grpc_tools.protoc -I=. --python_out=. animals.proto

In [None]:
# protocol buffers import statement

In [None]:
s = 
s

In [None]:
# serialize to string


In [None]:
from kafka import KafkaAdminClient, KafkaProducer, KafkaConsumer
from kafka.admin import NewTopic
from kafka.errors import TopicAlreadyExistsError
from kafka import TopicPartition
import random
import time
import threading
from threading import Thread, Lock

In [None]:
# One module-wide mutex shared by every thread that reports through Print().
lock = Lock()


def Print(*args):
    """Thread-safe print: forward *args* to print() while holding `lock`."""
    lock.acquire()
    try:
        print(*args)
    finally:
        # Always hand the mutex back, even if print() raises.
        lock.release()

### Admin

In [None]:
# Kafka bootstrap address; reused by the producers, the consumers and the Spark reader below.
broker = "localhost:9092"
# Admin client; the next cell uses it to create the topics.
admin = KafkaAdminClient(bootstrap_servers=[broker])

### Creating `animals` and `animals-json` topics

In [None]:
# Create the two topics used in this notebook. Each call has its own try/except so
# that an already-existing topic only prints a notice instead of aborting the cell.
# NOTE(review): `???` is a fill-in placeholder. Presumably each call takes a list of
# NewTopic objects (NewTopic is imported above); the consumers below read partitions
# 0-3 of "animals", so that topic needs at least four partitions — confirm.
try:
    admin.create_topics(???)        # protobufs
except TopicAlreadyExistsError:
    print("Topic already exists")
    
try:
    admin.create_topics(???)   # JSON
except TopicAlreadyExistsError:
    print("Topic already exists")

### Producer

In [None]:
# Species names the simulated producers pick from.
animals = "shark dolphin turtle seagull whale".split()
# Single-letter beach names, "A" through "I".
beaches = [*"ABCDEFGHI"]

# Producer loop: sends one message to the "animals" topic, sleeps a second, repeats.
# It never returns (`while True`), which is why it is launched on its own thread below.
# NOTE(review): every `???` is a fill-in placeholder. Presumably `beach` and `animal`
# are random picks from `beaches` / `animals`, and `s` is the protobuf message whose
# serialized bytes become the value, with the beach as the key — confirm.
def animal_gen():
    producer = KafkaProducer(bootstrap_servers=[broker])
    
    while True:
        beach = ???
        animal = ???
        s = ???
        
        producer.send("animals", value=???, key=???)
        time.sleep(1)  # throttle: at most one message per second

# Start the producer in the background so the following cells can run.
threading.Thread(target=animal_gen).start()

### Consumer

### Streaming Group By (count animal occurrences per beach)

In [None]:
# Skeleton of a manually-assigned consumer that tallies sightings per beach.
# `partitions` is the list of partition numbers this consumer should read; the
# default list is only passed along and printed, never mutated.
# NOTE(review): the `???` and the comment-only innermost loop body are fill-in
# placeholders; the function is not runnable until they are completed.
def beach_consumer(partitions=[]):
    counts = {}   # key=beach, value=count
    
    consumer = KafkaConsumer(bootstrap_servers=[broker])
    # NOTE(review): presumably a list of TopicPartition("animals", p) objects,
    # one per entry of `partitions` — confirm.
    consumer.assign(???)
    # Start from the first available offset of every assigned partition.
    consumer.seek_to_beginning()
    
    for i in range(10):      # TODO: loop forever
        # poll() returns a dict that maps each topic-partition to its list of messages.
        batch = consumer.poll(1000)
        for tp, messages in batch.items():
            for msg in messages:
                # s = ???

                # counts dict update
                
        # Report the running totals once per poll, tagged with this consumer's partitions.
        Print(partitions, counts)
        
# Two consumers, each responsible for its own pair of partitions.
threading.Thread(target=beach_consumer, args=([0, 1],)).start()
threading.Thread(target=beach_consumer, args=([2, 3],)).start()

In [None]:
def animal_consumer(partitions=[]):
    """Count sightings per animal over the given partitions of the "animals" topic.

    Reads every assigned partition from its beginning, polls ten times, and
    after each poll prints the partition list together with the running
    counts. Nothing is returned.

    Args:
        partitions: partition numbers of the "animals" topic to read. The
            default list is only read, never mutated.
    """
    counts = {}   # key=animal, value=count

    consumer = KafkaConsumer(bootstrap_servers=[broker])
    try:
        # Manual assignment: this consumer reads exactly these partitions.
        consumer.assign([TopicPartition("animals", p) for p in partitions])
        consumer.seek_to_beginning()
        for _ in range(10):      # TODO: loop forever
            batch = consumer.poll(1000)
            # Only the message lists are needed; the topic-partition keys are ignored.
            for messages in batch.values():
                for msg in messages:
                    s = Sighting.FromString(msg.value)
                    counts[s.animal] = counts.get(s.animal, 0) + 1
            Print(partitions, counts)
    finally:
        # Release the consumer's connections once the loop ends or fails.
        consumer.close()


# Two consumers, each responsible for its own pair of partitions.
threading.Thread(target=animal_consumer, args=([0, 1],)).start()
threading.Thread(target=animal_consumer, args=([2, 3],)).start()

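**Observation:** each consumer only sees the sightings in its own partitions, so the count for a given animal is now split across the two consumers. Producing a single overall total per animal requires extra work to combine the partial counts.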

### Spark streaming

In [None]:
import json

In [None]:
def animal_gen_json():
    """Publish one random JSON sighting per second to the "animals-json" topic.

    The message value is a UTF-8 encoded JSON object with "beach" and
    "animal" fields, matching the "beach string, animal string" schema used
    on the Spark side; the key is the beach name as UTF-8 bytes. The loop
    never returns, so run it on its own thread.
    """
    producer = KafkaProducer(bootstrap_servers=[broker])

    # Snapshot the choices once. A later cell rebinds the global `animals` to
    # a Spark DataFrame; looking the global up on every iteration would then
    # make random.choice() fail and silently kill this thread.
    beach_names = list(beaches)
    animal_names = list(animals)

    while True:
        beach = random.choice(beach_names)
        animal = random.choice(animal_names)

        value = json.dumps({"beach": beach, "animal": animal}).encode("utf-8")
        producer.send("animals-json", value=value, key=bytes(beach, "utf-8"))

        time.sleep(1)

threading.Thread(target=animal_gen_json).start()

In [None]:
# Spark session (with Kafka jar)
from pyspark.sql import SparkSession
# Build (or reuse) the session named "demo". spark.jars.packages names the Kafka
# connector artifact to load (the Scala 2.12 build, version 3.5.0).
# NOTE(review): the connector version presumably has to match the installed Spark — confirm.
spark = (SparkSession.builder.appName("demo")
         .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0')
         #.config("spark.sql.shuffle.partitions", 10)
         .getOrCreate())

In [None]:
# data types
df

In [None]:
# first five rows of data


In [None]:
# spark import statement

In [None]:
# `bad_schema` is not used in this cell; it names a field ("fish") that `schema` does not have.
bad_schema = "beach string, fish string"
schema = "beach string, animal string"

# NOTE(review): this rebinds the module-level name `animals`, which earlier cells bound
# to the list of species names. Any code that looks the global up after this cell runs
# (e.g. a producer thread calling random.choice(animals)) sees the DataFrame instead.
animals = (
    df
    # Cast the key and value columns to strings.
    .select(col("key").cast("string"), col("value").cast("string"))
    # Parse the JSON text in `value` into a struct with the fields listed in `schema`.
    .select("key", from_json("value", schema).alias("value"))
    # Flatten the struct so each of its fields becomes a top-level column next to `key`.
    .select("key", "value.*")
)
animals

In [None]:
animals.limit(5).toPandas()

In [None]:
animals.count()

In [None]:
animals.isStreaming

### Streaming DataFrame

source => transformations => sink

```
# streaming_query = spark.readStream(????).????.writeStream(????)
```

In [None]:
# Streaming source: readStream (not read) yields a streaming DataFrame, which the
# writeStream query further down requires. Subscribes to the "animals-json" topic.
df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", broker)
    .option("subscribe", "animals-json")
    .load()
)

In [None]:
df.isStreaming

In [None]:
from pyspark.sql.functions import col, from_json

schema = "beach string, animal string"

# Same transformation as the batch version earlier in the notebook: cast the key
# and value columns to strings, parse the JSON value with `schema`, then flatten
# the parsed struct into top-level columns next to `key`.
animals = (
    df
    .select(col("key").cast("string"), col("value").cast("string"))
    .select("key", from_json("value", schema).alias("value"))
    .select("key", "value.*")
)
animals

In [None]:
# not supported for streaming
# animals.toPandas()

### Shark Alert Application

### How can we stop the stream?

Alternatively, we can use the variable that we used to save the streaming query.

### Animal Counter Application

In [None]:
# Running count of sightings per animal, written to the console every 5 seconds.
# A streaming aggregation without a watermark cannot be written in "append" mode
# (Spark rejects the query when it is started), so the whole result table is
# emitted on each trigger with "complete" mode instead.
animal_query = (
    animals.groupby("animal").count()
    .writeStream
    .format("console")
    .trigger(processingTime="5 seconds")
    .outputMode("complete")
).start()

In [None]:
animal_query.stop()