In [1]:
# ! python3 -m grpc_tools.protoc -I=. --python_out=. animals.proto

In [2]:
from animals_pb2 import Sighting

In [3]:
s = Sighting(animal="shark", beach="A")
s

beach: "A"
animal: "shark"

In [4]:
s.SerializeToString()

b'\n\x01A\x12\x05shark'

In [5]:
from kafka import KafkaAdminClient, KafkaProducer, KafkaConsumer

In [6]:
broker = "localhost:9092"

In [7]:
admin = KafkaAdminClient(bootstrap_servers=[broker])

In [8]:
from kafka.admin import NewTopic

In [9]:
from kafka.errors import TopicAlreadyExistsError

In [10]:
# Create the two demo topics, each with 4 partitions and replication factor 1.
# Every topic gets its own create call so that one topic already existing does
# not stop the other from being created when the cell is re-run.
for topic_name in (
    "animals",        # protobufs
    "animals-json",   # JSON
):
    try:
        admin.create_topics([NewTopic(topic_name, 4, 1)])
    except TopicAlreadyExistsError:
        pass  # left over from an earlier run; nothing to do

In [11]:
import random, time, threading

def animal_gen():
    """Publish one random protobuf-encoded sighting per second, forever.

    Every record is sent to the "animals" topic. Its key is the beach letter
    encoded as UTF-8 and its value is the serialized ``Sighting`` message.
    The loop has no exit condition, so this function never returns.
    """
    producer = KafkaProducer(bootstrap_servers=[broker])
    beaches = "ABCDEFGHI"
    species = ["shark", "dolphin", "turtle", "seagull"]

    while True:
        where = random.choice(beaches)
        what = random.choice(species)
        payload = Sighting(animal=what, beach=where).SerializeToString()
        producer.send("animals", value=payload, key=where.encode("utf-8"))
        time.sleep(1)

# animal_gen loops forever, so it is run on a background thread instead of
# being called directly from the cell.
threading.Thread(target=animal_gen).start()

# Streaming Group By (count occurrences per beach)

In [12]:
from threading import Thread, Lock

lock = Lock()
def Print(*args, **kwargs):
    """Thread-safe drop-in replacement for the built-in ``print``.

    The module-level ``lock`` is held for the duration of the call, so threads
    that all print through ``Print`` write their output one call at a time.

    Args:
        *args: values to print, forwarded unchanged.
        **kwargs: any keyword accepted by ``print`` (``sep``, ``end``,
            ``file``, ``flush``), forwarded unchanged.
    """
    with lock:
        print(*args, **kwargs)

Print("hi")

hi


In [13]:
from kafka import TopicPartition

In [14]:
def beach_consumer(partitions=None):
    """Count sightings per beach on selected partitions of the "animals" topic.

    The consumer is manually assigned the given partitions, rewound to the
    beginning, and polled ten times. After every poll the running totals are
    reported through ``Print`` together with the partition list. The consumer
    is always closed before returning, including when polling or decoding
    raises.

    Args:
        partitions: partition numbers of the "animals" topic to read.
            ``None`` (the default) is treated as an empty list.
    """
    if partitions is None:
        partitions = []   # fresh list per call instead of a shared mutable default
    counts = {}   # key=beach, value=count

    consumer = KafkaConsumer(bootstrap_servers=[broker])
    try:
        consumer.assign([TopicPartition("animals", p) for p in partitions])
        consumer.seek_to_beginning()
        for _ in range(10):      # TODO: loop forever
            batch = consumer.poll(1000)
            for messages in batch.values():
                for msg in messages:
                    beach = Sighting.FromString(msg.value).beach
                    counts[beach] = counts.get(beach, 0) + 1
            Print(partitions, counts)
    finally:
        # Release the consumer's connections whether the loop finished or failed.
        consumer.close()
# Two consumer threads split the topic's four partitions: [0, 1] and [2, 3].
threading.Thread(target=beach_consumer, args=([0,1],)).start()
threading.Thread(target=beach_consumer, args=([2,3],)).start()

In [15]:
def animal_consumer(partitions=None):
    """Count sightings per animal on selected partitions of the "animals" topic.

    The consumer is manually assigned the given partitions, rewound to the
    beginning, and polled ten times. After every poll the running totals are
    reported through ``Print`` together with the partition list. The consumer
    is always closed before returning, including when polling or decoding
    raises.

    Args:
        partitions: partition numbers of the "animals" topic to read.
            ``None`` (the default) is treated as an empty list.
    """
    if partitions is None:
        partitions = []   # fresh list per call instead of a shared mutable default
    counts = {}   # key=animal, value=count

    consumer = KafkaConsumer(bootstrap_servers=[broker])
    try:
        consumer.assign([TopicPartition("animals", p) for p in partitions])
        consumer.seek_to_beginning()
        for _ in range(10):      # TODO: loop forever
            batch = consumer.poll(1000)
            for messages in batch.values():
                for msg in messages:
                    animal = Sighting.FromString(msg.value).animal
                    counts[animal] = counts.get(animal, 0) + 1
            Print(partitions, counts)
    finally:
        # Release the consumer's connections whether the loop finished or failed.
        consumer.close()
# Two consumer threads split the topic's four partitions: [0, 1] and [2, 3].
threading.Thread(target=animal_consumer, args=([0,1],)).start()
threading.Thread(target=animal_consumer, args=([2,3],)).start()

# Spark Streaming

In [16]:
import random, time, threading, json

def animal_gen_json():
    """Publish one random JSON-encoded sighting per second, forever.

    Every record is sent to the "animals-json" topic. Its key is the beach
    letter encoded as UTF-8 and its value is the UTF-8 JSON text of
    ``{"beach": ..., "animal": ...}``. The loop has no exit condition, so
    this function never returns.
    """
    producer = KafkaProducer(bootstrap_servers=[broker])
    beaches = "ABCDEFGHI"
    species = ["shark", "dolphin", "turtle", "seagull"]

    while True:
        where = random.choice(beaches)
        what = random.choice(species)
        record = {"beach": where, "animal": what}

        producer.send(
            "animals-json",
            value=json.dumps(record).encode("utf-8"),
            key=where.encode("utf-8"),
        )

        time.sleep(1)

# animal_gen_json loops forever, so it is run on a background thread instead
# of being called directly from the cell.
threading.Thread(target=animal_gen_json).start()

In [17]:
# Spark session (with Kafka jar)
from pyspark.sql import SparkSession
# spark.jars.packages names the Kafka connector as a Maven coordinate
# (Scala 2.12 build, version 3.5.0); it is resolved when the session starts.
# NOTE(review): the connector version presumably has to match the installed
# Spark version -- confirm before changing either.
# spark.sql.shuffle.partitions is set to 10 for this demo.
spark = (SparkSession.builder.appName("demo")
         .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0')
         .config("spark.sql.shuffle.partitions", 10)
         .getOrCreate())

[0, 1] {'shark': 129, 'dolphin': 132, 'seagull': 128, 'turtle': 111}
[0, 1] {'shark': 260, 'dolphin': 266, 'seagull': 253, 'turtle': 221}
[0, 1] {'shark': 405, 'dolphin': 380, 'seagull': 379, 'turtle': 336}
[0, 1] {'shark': 536, 'dolphin': 492, 'seagull': 510, 'turtle': 462}
[0, 1] {'shark': 650, 'dolphin': 619, 'seagull': 639, 'turtle': 592}
[2, 3] {'H': 260, 'G': 240}
[2, 3] {'H': 508, 'G': 492}
[2, 3] {'H': 736, 'G': 764}
[2, 3] {'H': 979, 'G': 1021}
[2, 3] {'H': 1229, 'G': 1271}
[2, 3] {'shark': 127, 'turtle': 128, 'dolphin': 125, 'seagull': 120}
[2, 3] {'shark': 226, 'turtle': 235, 'dolphin': 273, 'seagull': 266}
[2, 3] {'shark': 339, 'turtle': 366, 'dolphin': 401, 'seagull': 394}
[2, 3] {'shark': 456, 'turtle': 471, 'dolphin': 561, 'seagull': 512}
[2, 3] {'shark': 576, 'turtle': 597, 'dolphin': 684, 'seagull': 643}
[0, 1] {'C': 181, 'B': 155, 'I': 164}
[0, 1] {'C': 336, 'B': 321, 'I': 343}
[0, 1] {'C': 518, 'B': 476, 'I': 506}
[0, 1] {'C': 698, 'B': 633, 'I': 669}
[0, 1] {'C': 86

Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-899f1142-cf3d-4da9-bd48-d51da8ff492c;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.0 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
:: resolution report :: resolve 1440ms :: artifacts dl 58ms
	:: modules in u

In [18]:
# Batch read (spark.read, not spark.readStream): load the "animals-json"
# topic from the broker as an ordinary, non-streaming DataFrame.
df = (
    spark.read.format("kafka")
    .option("kafka.bootstrap.servers", broker)
    .option("subscribe", "animals-json")
    .load()
)

In [19]:
df.dtypes

[('key', 'binary'),
 ('value', 'binary'),
 ('topic', 'string'),
 ('partition', 'int'),
 ('offset', 'bigint'),
 ('timestamp', 'timestamp'),
 ('timestampType', 'int')]

In [20]:
df.limit(5).toPandas()

23/11/22 19:13:25 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
  if not is_datetime64tz_dtype(pser.dtype):
  if is_datetime64tz_dtype(s.dtype):


Unnamed: 0,key,value,topic,partition,offset,timestamp,timestampType
0,[67],"[123, 34, 98, 101, 97, 99, 104, 34, 58, 32, 34...",animals-json,0,0,2023-11-22 15:37:01.273,0
1,[66],"[123, 34, 98, 101, 97, 99, 104, 34, 58, 32, 34...",animals-json,0,1,2023-11-22 15:37:03.285,0
2,[66],"[123, 34, 98, 101, 97, 99, 104, 34, 58, 32, 34...",animals-json,0,2,2023-11-22 15:37:04.287,0
3,[66],"[123, 34, 98, 101, 97, 99, 104, 34, 58, 32, 34...",animals-json,0,3,2023-11-22 15:37:05.288,0
4,[67],"[123, 34, 98, 101, 97, 99, 104, 34, 58, 32, 34...",animals-json,0,4,2023-11-22 15:37:07.291,0


In [21]:
from pyspark.sql.functions import col, expr, from_json

In [22]:
# DDL-style schema of the JSON object carried in each record's value.
schema = "beach string, animal string"

# The Kafka source exposes key and value as binary columns, so:
#   1. cast both to strings,
#   2. parse the JSON text in "value" into a struct using `schema`,
#   3. expand the struct's fields into top-level columns.
animals = (
    df
    .select(col("key").cast("string"), col("value").cast("string"))
    .select("key", from_json("value", schema).alias("value"))
    .select("key", "value.*")
)
animals

DataFrame[key: string, beach: string, animal: string]

In [23]:
animals.limit(5).toPandas()

23/11/22 19:13:33 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


Unnamed: 0,key,beach,animal
0,C,C,turtle
1,B,B,shark
2,B,B,seagull
3,B,B,seagull
4,C,C,dolphin


In [24]:
animals.count()

23/11/22 19:13:34 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

13008

In [25]:
animals.isStreaming

False

# Streaming DataFrame

In [26]:
# source => transformations => sink
# streaming_query = spark.readStream(????).????.writeStream(????)

In [37]:
# Streaming read of the same topic (spark.readStream). This rebinds `df`,
# which previously held the batch DataFrame.
# startingOffsets="earliest" starts the stream from the oldest records in the
# topic instead of only records that arrive after the query starts.
df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", broker)
    .option("subscribe", "animals-json")
    .option("startingOffsets", "earliest")
    .load()
)

In [38]:
df.isStreaming

True

In [39]:
# Same cast -> parse JSON -> flatten steps as for the batch DataFrame, now
# applied to the streaming `df`. This rebinds `schema` and `animals`.
schema = "beach string, animal string"

animals = (
    df
    .select(col("key").cast("string"), col("value").cast("string"))
    .select("key", from_json("value", schema).alias("value"))
    .select("key", "value.*")
)
animals

DataFrame[key: string, beach: string, animal: string]

In [40]:
# not supported for streaming
# animals.toPandas()

# Shark Alert App

In [31]:
# Shark alert: keep only shark sightings and write them to the console sink,
# triggering a micro-batch every 5 seconds. "append" output mode emits only
# the rows added since the previous trigger.
# .start() launches the query and returns a StreamingQuery handle, which is
# kept so the query can be stopped later.
streaming_query = (
    animals
    .filter("animal='shark'")
    .writeStream
    .format("console")
    .trigger(processingTime="5 seconds")
    .outputMode("append")
).start()
type(streaming_query)

23/11/22 19:13:37 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-29226a41-ec38-4538-8dcc-09eb664e7bf7. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/11/22 19:13:37 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


pyspark.sql.streaming.query.StreamingQuery

In [32]:
streaming_query.stop()
# spark.streams.active[0].stop()

# Animal Counter App

In [41]:
# Animal counter: running count of sightings per animal, written to the
# console every 5 seconds. "complete" output mode re-emits the whole
# aggregated table on each trigger, so every batch lists all animals.
q = (
    animals.groupby("animal").count()
    .writeStream
    .format("console")
    .trigger(processingTime="5 seconds")
    .outputMode("complete")
).start()

23/11/22 19:23:10 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-ce39832b-faf3-410a-a083-e919b5dc9e41. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/11/22 19:23:10 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/11/22 19:23:10 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-------+-----+
| animal|count|
+-------+-----+
|  shark| 3505|
|dolphin| 3527|
|seagull| 3593|
| turtle| 3532|
+-------+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+-------+-----+
| animal|count|
+-------+-----+
|  shark| 3507|
|seagull| 3593|
|dolphin| 3530|
| turtle| 3535|
+-------+-----+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+-------+-----+
| animal|count|
+-------+-----+
|  shark| 3510|
|seagull| 3594|
|dolphin| 3531|
| turtle| 3540|
+-------+-----+



In [42]:
q.stop()