In [1]:
# ! python3 -m grpc_tools.protoc -I=. --python_out=. animals.proto

In [2]:
from animals_pb2 import *

In [3]:
from kafka import KafkaAdminClient, KafkaProducer, KafkaConsumer

In [4]:
# Address of the Kafka broker; the admin client, producer, consumers and Spark
# readers created below all connect through it.
broker = "localhost:9092"

In [5]:
# Admin client, used below to create the demo topics.
admin = KafkaAdminClient(bootstrap_servers=[broker])

In [9]:
from kafka.admin import NewTopic
from kafka.errors import TopicAlreadyExistsError

In [10]:
# Create each demo topic with NewTopic(name, 4, 1) -- 4 partitions,
# replication factor 1. Re-running the cell is fine: a topic that already
# exists raises TopicAlreadyExistsError, which is deliberately ignored.
for topic_name in (
    "animals",       # protobufs
    "animals-json",  # json
):
    try:
        admin.create_topics([NewTopic(topic_name, 4, 1)])
    except TopicAlreadyExistsError:
        pass

In [11]:
# One shared producer; the producer functions defined below all send through it.
producer = KafkaProducer(bootstrap_servers=[broker])

In [12]:
# Send one hand-built sighting to "animals"; the beach letter is also the
# message key. The last expression shows the future returned by send().
key = "A"
value = Sighting(animal="shark", beach=key).SerializeToString()
producer.send("animals", value=value, key=key.encode("utf-8"))

<kafka.producer.future.FutureRecordMetadata at 0x7f4698134d00>

In [13]:
import time, random, threading

def animal_producer():
    """Send one random protobuf-encoded Sighting to "animals" per second.

    Each message picks a random beach letter (A-I) and a random animal; the
    beach letter, UTF-8 encoded, is the message key. The loop has no exit
    condition, so this function never returns.
    """
    beaches = "ABCDEFGHI"
    animal_kinds = ("shark", "dolphin", "turtle", "seagull")
    while True:
        beach = random.choice(beaches)
        animal = random.choice(animal_kinds)

        payload = Sighting(beach=beach, animal=animal).SerializeToString()
        producer.send("animals", payload, beach.encode("utf-8"))
        time.sleep(1)
# Run the producer loop on a background thread so the cell returns immediately.
# The loop has no exit condition, so the thread keeps sending once started.
threading.Thread(target=animal_producer).start()

# Streaming GROUP BY

In [21]:
from threading import Thread, Lock

lock = Lock()
def Print(*args):
    """Call print(*args) while holding the module-level lock.

    Callers on different threads take turns, so one Print call finishes
    writing before the next one starts.
    """
    lock.acquire()
    try:
        print(*args)
    finally:
        lock.release()

Print("hi")

hi


In [14]:
from kafka import TopicPartition

In [23]:
def beach_consumer(parts=[]):
    """Count sightings per beach over some partitions of the "animals" topic.

    Manually assigns the requested partitions, rewinds them to the beginning,
    then polls ten times (poll timeout 1000 ms) and prints the running counts
    after every poll.

    Args:
        parts: partition numbers of "animals" to read. The list is only read,
            never mutated, so the shared default is harmless.
    """
    counts = {}  # key=beach, value=count
    partitions = [TopicPartition("animals", p) for p in parts]
    # Lock-guarded print: this function is run on several threads at once.
    Print(partitions)
    consumer = KafkaConsumer(bootstrap_servers=[broker])
    try:
        consumer.assign(partitions)
        consumer.seek_to_beginning()
        for _ in range(10):
            batch = consumer.poll(1000)
            for messages in batch.values():
                for msg in messages:
                    s = Sighting.FromString(msg.value)
                    counts[s.beach] = counts.get(s.beach, 0) + 1
            Print(parts, counts)
    finally:
        # Release the consumer's broker connections even if assigning,
        # polling or decoding raises; previously it was never closed.
        consumer.close()

# Two consumers run concurrently, one on partitions 0-1 and one on 2-3.
# In the output below each beach shows up under only one of the two threads.
threading.Thread(target=beach_consumer, args=([0,1],)).start()
threading.Thread(target=beach_consumer, args=([2,3],)).start()

[TopicPartition(topic='animals', partition=0), TopicPartition(topic='animals', partition=1)]
[TopicPartition(topic='animals', partition=2), TopicPartition(topic='animals', partition=3)]
[0, 1] {'I': 64, 'C': 46, 'B': 50, 'D': 64, 'E': 46}
[2, 3] {'A': 41, 'F': 50, 'G': 48, 'H': 57}
[2, 3] {'A': 41, 'F': 51, 'G': 48, 'H': 57}
[0, 1] {'I': 64, 'C': 46, 'B': 50, 'D': 64, 'E': 46}
[0, 1] {'I': 64, 'C': 46, 'B': 50, 'D': 65, 'E': 46}
[2, 3] {'A': 41, 'F': 51, 'G': 48, 'H': 57}
[0, 1] {'I': 64, 'C': 46, 'B': 50, 'D': 65, 'E': 46}
[2, 3] {'A': 41, 'F': 51, 'G': 48, 'H': 57}
[0, 1] {'I': 65, 'C': 46, 'B': 50, 'D': 65, 'E': 46}
[2, 3] {'A': 41, 'F': 51, 'G': 49, 'H': 57}
[0, 1] {'I': 65, 'C': 46, 'B': 50, 'D': 65, 'E': 46}
[0, 1] {'I': 65, 'C': 46, 'B': 51, 'D': 65, 'E': 46}
[2, 3] {'A': 41, 'F': 51, 'G': 49, 'H': 57}
[2, 3] {'A': 42, 'F': 51, 'G': 49, 'H': 57}
[0, 1] {'I': 65, 'C': 46, 'B': 51, 'D': 65, 'E': 46}
[2, 3] {'A': 42, 'F': 52, 'G': 49, 'H': 57}
[0, 1] {'I': 65, 'C': 46, 'B': 51, 'D'

# Another GROUP BY, but not by the key

In [24]:
def animal_consumer(parts=[]):
    """Count sightings per animal over some partitions of the "animals" topic.

    Manually assigns the requested partitions, rewinds them to the beginning,
    then polls ten times (poll timeout 1000 ms) and prints the running counts
    after every poll. The counts cover only the partitions given in `parts`.

    Args:
        parts: partition numbers of "animals" to read. The list is only read,
            never mutated, so the shared default is harmless.
    """
    counts = {}  # key=animal, value=count
    partitions = [TopicPartition("animals", p) for p in parts]
    # Lock-guarded print: this function is run on several threads at once.
    Print(partitions)
    consumer = KafkaConsumer(bootstrap_servers=[broker])
    try:
        consumer.assign(partitions)
        consumer.seek_to_beginning()
        for _ in range(10):
            batch = consumer.poll(1000)
            for messages in batch.values():
                for msg in messages:
                    s = Sighting.FromString(msg.value)
                    counts[s.animal] = counts.get(s.animal, 0) + 1
            Print(parts, counts)
    finally:
        # Release the consumer's broker connections even if assigning,
        # polling or decoding raises; previously it was never closed.
        consumer.close()

# Two consumers run concurrently, one on partitions 0-1 and one on 2-3.
# Each thread counts only what is in its own partitions, so (see the output
# below) every animal appears in both threads' counts as a partial total.
threading.Thread(target=animal_consumer, args=([0,1],)).start()
threading.Thread(target=animal_consumer, args=([2,3],)).start()

[TopicPartition(topic='animals', partition=0), TopicPartition(topic='animals', partition=1)]
[TopicPartition(topic='animals', partition=2), TopicPartition(topic='animals', partition=3)]
[0, 1] {'shark': 87, 'turtle': 73, 'dolphin': 77, 'seagull': 84}
[2, 3] {'shark': 74, 'turtle': 59, 'dolphin': 58, 'seagull': 52}
[0, 1] {'shark': 87, 'turtle': 73, 'dolphin': 77, 'seagull': 85}
[2, 3] {'shark': 74, 'turtle': 59, 'dolphin': 58, 'seagull': 52}
[0, 1] {'shark': 87, 'turtle': 73, 'dolphin': 77, 'seagull': 86}
[2, 3] {'shark': 74, 'turtle': 59, 'dolphin': 58, 'seagull': 52}
[0, 1] {'shark': 88, 'turtle': 73, 'dolphin': 77, 'seagull': 86}
[2, 3] {'shark': 74, 'turtle': 59, 'dolphin': 58, 'seagull': 52}
[0, 1] {'shark': 88, 'turtle': 73, 'dolphin': 78, 'seagull': 86}
[2, 3] {'shark': 74, 'turtle': 59, 'dolphin': 58, 'seagull': 52}
[2, 3] {'shark': 74, 'turtle': 60, 'dolphin': 58, 'seagull': 52}
[0, 1] {'shark': 88, 'turtle': 73, 'dolphin': 78, 'seagull': 86}
[2, 3] {'shark': 74, 'turtle': 60,

# Spark Streaming Demos

In [65]:
# Spark session (with Kafka jar)
from pyspark.sql import SparkSession
# Build (or reuse) the session named "demo". spark.jars.packages names the
# spark-sql-kafka connector artifact needed by format("kafka") below, and
# spark.sql.shuffle.partitions is set to 10.
# NOTE(review): the connector coordinate (Scala 2.12, 3.5.0) presumably has to
# match the installed Spark build -- confirm when upgrading.
spark = (SparkSession.builder.appName("demo")
         .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0')
         .config("spark.sql.shuffle.partitions", 10)
         .getOrCreate())

In [26]:
import time, random, threading, json

def animal_json_producer():
    """Send one random JSON-encoded sighting to "animals-json" per second.

    The value is a UTF-8 encoded JSON object with "beach" and "animal"
    fields; the beach letter, UTF-8 encoded, is the message key. The loop has
    no exit condition, so this function never returns.
    """
    beaches = "ABCDEFGHI"
    animal_kinds = ("shark", "dolphin", "turtle", "seagull")
    while True:
        beach = random.choice(beaches)
        animal = random.choice(animal_kinds)

        record = {"beach": beach, "animal": animal}
        payload = json.dumps(record).encode("utf-8")
        producer.send("animals-json", payload, beach.encode("utf-8"))
        time.sleep(1)
# Run the JSON producer loop on a background thread so the cell returns
# immediately. The loop has no exit condition, so it keeps sending once started.
threading.Thread(target=animal_json_producer).start()

In [27]:
# Batch read of the "animals-json" topic: spark.read (as opposed to
# spark.readStream, used further down) gives a regular, non-streaming DataFrame.
df = (
 spark.read.format("kafka")
 .option("kafka.bootstrap.servers", broker)
 .option("subscribe", "animals-json")
 .load()
)

In [29]:
# Column names and types of the Kafka source frame (key and value are binary).
df.dtypes

[('key', 'binary'),
 ('value', 'binary'),
 ('topic', 'string'),
 ('partition', 'int'),
 ('offset', 'bigint'),
 ('timestamp', 'timestamp'),
 ('timestampType', 'int')]

In [31]:
# Row count of the batch DataFrame.
df.count()

23/11/20 16:30:32 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

210

In [33]:
# Peek at five raw rows; key and value are still undecoded bytes at this point.
df.limit(5).toPandas()

23/11/20 16:31:06 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
  if not is_datetime64tz_dtype(pser.dtype):
  if is_datetime64tz_dtype(s.dtype):


Unnamed: 0,key,value,topic,partition,offset,timestamp,timestampType
0,[66],"[123, 34, 98, 101, 97, 99, 104, 34, 58, 32, 34...",animals-json,0,0,2023-11-20 16:27:03.722,0
1,[67],"[123, 34, 98, 101, 97, 99, 104, 34, 58, 32, 34...",animals-json,0,1,2023-11-20 16:27:05.726,0
2,[66],"[123, 34, 98, 101, 97, 99, 104, 34, 58, 32, 34...",animals-json,0,2,2023-11-20 16:27:07.728,0
3,[66],"[123, 34, 98, 101, 97, 99, 104, 34, 58, 32, 34...",animals-json,0,3,2023-11-20 16:27:09.731,0
4,[66],"[123, 34, 98, 101, 97, 99, 104, 34, 58, 32, 34...",animals-json,0,4,2023-11-20 16:27:10.733,0


In [34]:
from pyspark.sql.functions import col, expr, from_json

In [46]:
# Schema string for the JSON object carried in each message value.
schema = "beach string, animal string"
animals = (df
 .select(
     # key and value arrive as binary columns; cast both to strings first.
     col("key").cast("string"),
     col("value").cast("string")
 )
 # Parse the JSON text into a single column, again named "value" ...
 .select("key", from_json("value", schema).alias("value"))
 # ... then expand it so beach and animal become top-level columns.
 .select("key", "value.*")
)
# Preview five parsed rows.
animals.limit(5).toPandas()

23/11/20 16:35:27 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

Unnamed: 0,key,beach,animal
0,B,B,shark
1,C,C,seagull
2,B,B,seagull
3,B,B,turtle
4,B,B,seagull


In [48]:
# False at this point: `animals` was derived from a spark.read (batch) source.
animals.isStreaming

False

# Let's make it a streaming DF

In [49]:
# Same Kafka source and options as the batch read, but through
# spark.readStream. Note that `df` is rebound here, replacing the batch frame.
df = (
 spark.readStream.format("kafka")
 .option("kafka.bootstrap.servers", broker)
 .option("subscribe", "animals-json")
 .load()
)

In [50]:
# Same parsing steps as for the batch frame, now applied to the streaming `df`;
# `animals` is rebound to the streaming result.
schema = "beach string, animal string"
animals = (df
 .select(
     # key and value arrive as binary columns; cast both to strings first.
     col("key").cast("string"),
     col("value").cast("string")
 )
 # Parse the JSON text into a single column, again named "value" ...
 .select("key", from_json("value", schema).alias("value"))
 # ... then expand it so beach and animal become top-level columns.
 .select("key", "value.*")
)

In [51]:
# True now that `animals` is derived from a spark.readStream source.
animals.isStreaming

True

In [53]:
# doesn't work for streaming
# animals.limit(5).toPandas()

In [None]:
# Spark streaming
# source => transformations => sink

# spark.readStream(????).?????.writeStream(????)

# Shark Alert Stream

In [56]:
# Source => transformation => sink: keep only shark sightings and write them
# to the console sink, triggering every 5 seconds in "append" output mode.
# start() launches the query and returns its handle, kept in `q` so the
# query can be stopped later.
q = (
 animals.filter("animal = 'shark'")
 .writeStream.format("console")
 .trigger(processingTime="5 seconds")
 .outputMode("append")
).start()
# Show what kind of object start() returned.
type(q)

23/11/20 16:40:18 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-e7532387-90c9-4810-a33c-46925f2d9897. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/11/20 16:40:18 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


pyspark.sql.streaming.query.StreamingQuery

23/11/20 16:40:18 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


-------------------------------------------
Batch: 0
-------------------------------------------
+---+-----+------+
|key|beach|animal|
+---+-----+------+
+---+-----+------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+---+-----+------+
|key|beach|animal|
+---+-----+------+
+---+-----+------+





-------------------------------------------
Batch: 2
-------------------------------------------
+---+-----+------+
|key|beach|animal|
+---+-----+------+
|  D|    D| shark|
+---+-----+------+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+---+-----+------+
|key|beach|animal|
+---+-----+------+
+---+-----+------+

-------------------------------------------
Batch: 4
-------------------------------------------
+---+-----+------+
|key|beach|animal|
+---+-----+------+
|  B|    B| shark|
|  C|    C| shark|
+---+-----+------+



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+---+-----+------+
|key|beach|animal|
+---+-----+------+
|  C|    C| shark|
+---+-----+------+



                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+---+-----+------+
|key|beach|animal|
+---+-----+------+
|  G|    G| shark|
+---+-----+------+



                                                                                

-------------------------------------------
Batch: 7
-------------------------------------------
+---+-----+------+
|key|beach|animal|
+---+-----+------+
|  D|    D| shark|
+---+-----+------+



                                                                                

-------------------------------------------
Batch: 8
-------------------------------------------
+---+-----+------+
|key|beach|animal|
+---+-----+------+
|  G|    G| shark|
+---+-----+------+



                                                                                

-------------------------------------------
Batch: 9
-------------------------------------------
+---+-----+------+
|key|beach|animal|
+---+-----+------+
|  G|    G| shark|
|  G|    G| shark|
+---+-----+------+



                                                                                

-------------------------------------------
Batch: 10
-------------------------------------------
+---+-----+------+
|key|beach|animal|
+---+-----+------+
|  A|    A| shark|
+---+-----+------+



In [57]:
# Stop the streaming query held in `q`.
q.stop()
# Alternative that goes through the session's list of active queries
# (presumably for when the handle was not kept -- index 0 assumes a single
# active query):
# spark.streams.active[0].stop()

# Streaming GROUP BY on animal

In [63]:
# Running count per animal over the stream, written to the console sink every
# 5 seconds in "complete" output mode. `q` is rebound to this new query.
q = (
 animals.groupby("animal").count()
 .writeStream.format("console")
 .trigger(processingTime="5 seconds")
 .outputMode("complete")
).start()

23/11/20 16:44:10 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-abcb8a15-83e8-4f4b-ab7f-7196cb2b52c0. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/11/20 16:44:10 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/11/20 16:44:10 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
23/11/20 16:44:38 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 5000 milliseconds, but spent 27328 milliseconds


-------------------------------------------
Batch: 0
-------------------------------------------
+------+-----+
|animal|count|
+------+-----+
+------+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+-------+-----+
| animal|count|
+-------+-----+
|seagull|    5|
|  shark|    4|
| turtle|    8|
|dolphin|   10|
+-------+-----+



23/11/20 16:45:02 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 5000 milliseconds, but spent 24781 milliseconds