In [1]:
# ! python3 -m grpc_tools.protoc -I=. --python_out=. animals.proto

In [2]:
from animals_pb2 import *

In [3]:
from kafka import KafkaAdminClient, KafkaProducer, KafkaConsumer

In [4]:
broker = "localhost:9092"

In [5]:
admin = KafkaAdminClient(bootstrap_servers=[broker])

In [6]:
from kafka.admin import NewTopic
from kafka.errors import TopicAlreadyExistsError

In [7]:
# Create both demo topics (4 partitions, replication factor 1 each).
# "animals" carries protobufs, "animals-json" carries json.
# Re-running the cell is harmless: an already-existing topic is skipped.
for topic_name in ("animals", "animals-json"):
    try:
        admin.create_topics([NewTopic(topic_name, 4, 1)])
    except TopicAlreadyExistsError:
        pass

In [8]:
producer = KafkaProducer(bootstrap_servers=[broker])

In [9]:
# Send one protobuf-encoded shark sighting, using the beach name as the key.
key = "A"
value = Sighting(animal="shark", beach=key).SerializeToString()
producer.send("animals", value=value, key=key.encode("utf-8"))

<kafka.producer.future.FutureRecordMetadata at 0x7f08fc140520>

In [10]:
import time, random, threading

def animal_producer():
    """Publish random protobuf-encoded sightings to the "animals" topic, forever.

    Each iteration picks a random beach letter (A-I) and a random animal,
    serializes them as a ``Sighting`` message, sends it keyed by the beach
    letter, and then sleeps for one second.  The loop never returns, so this
    function is meant to be run on a background thread.
    """
    beaches = "ABCDEFGHI"
    animal_names = ["shark", "dolphin", "turtle", "seagull"]
    while True:
        beach = random.choice(beaches)
        animal = random.choice(animal_names)

        value = Sighting(beach=beach, animal=animal).SerializeToString()
        producer.send("animals", value, bytes(beach, "utf-8"))
        time.sleep(1)

# daemon=True: the loop above never ends, so a non-daemon thread would keep
# the interpreter alive when the kernel tries to shut down.
threading.Thread(target=animal_producer, daemon=True).start()

# Streaming GROUP BY

In [11]:
from threading import Thread, Lock

lock = Lock()


def Print(*args):
    """Thread-safe ``print``: hold the module-level lock while writing.

    Takes the same positional arguments as ``print`` and forwards them
    unchanged, so output from concurrent threads is not interleaved.
    """
    lock.acquire()
    try:
        print(*args)
    finally:
        lock.release()


Print("hi")

hi


In [12]:
from kafka import TopicPartition

In [13]:
def beach_consumer(parts=None):
    """Count sightings per beach over the given partitions of "animals".

    Args:
        parts: partition numbers of the "animals" topic to read.  Defaults
            to an empty list.

    The consumer is manually assigned to the partitions, rewound to the
    beginning, and polled ten times (up to 1000 ms per poll).  After every
    poll the running per-beach counts are printed through the thread-safe
    ``Print`` helper.  The consumer is always closed on exit.
    """
    parts = [] if parts is None else parts
    counts = {}  # key=beach, value=count
    partitions = [TopicPartition("animals", p) for p in parts]
    # Use the locked Print: several consumers run on concurrent threads.
    Print(partitions)
    consumer = KafkaConsumer(bootstrap_servers=[broker])
    try:
        consumer.assign(partitions)
        consumer.seek_to_beginning()
        for _ in range(10):
            batch = consumer.poll(timeout_ms=1000)
            for messages in batch.values():
                for msg in messages:
                    s = Sighting.FromString(msg.value)
                    counts[s.beach] = counts.get(s.beach, 0) + 1
            Print(parts, counts)
    finally:
        # Release the broker connections even if polling/decoding fails.
        consumer.close()

threading.Thread(target=beach_consumer, args=([0,1],)).start()
threading.Thread(target=beach_consumer, args=([2,3],)).start()

[TopicPartition(topic='animals', partition=0), TopicPartition(topic='animals', partition=1)]
[TopicPartition(topic='animals', partition=2), TopicPartition(topic='animals', partition=3)]


# Another GROUP BY, but not by the key

In [14]:
def animal_consumer(parts=None):
    """Count sightings per animal over the given partitions of "animals".

    Args:
        parts: partition numbers of the "animals" topic to read.  Defaults
            to an empty list.

    The grouping field (animal) is not the message key (beach), so each
    consumer only sees the sightings stored in its own partitions and its
    counts are partial.  The consumer is manually assigned, rewound to the
    beginning, and polled ten times (up to 1000 ms per poll); the running
    counts are printed after every poll through the thread-safe ``Print``
    helper.  The consumer is always closed on exit.
    """
    parts = [] if parts is None else parts
    counts = {}  # key=animal, value=count
    partitions = [TopicPartition("animals", p) for p in parts]
    # Use the locked Print: several consumers run on concurrent threads.
    Print(partitions)
    consumer = KafkaConsumer(bootstrap_servers=[broker])
    try:
        consumer.assign(partitions)
        consumer.seek_to_beginning()
        for _ in range(10):
            batch = consumer.poll(timeout_ms=1000)
            for messages in batch.values():
                for msg in messages:
                    s = Sighting.FromString(msg.value)
                    counts[s.animal] = counts.get(s.animal, 0) + 1
            Print(parts, counts)
    finally:
        # Release the broker connections even if polling/decoding fails.
        consumer.close()

threading.Thread(target=animal_consumer, args=([0,1],)).start()
threading.Thread(target=animal_consumer, args=([2,3],)).start()

[TopicPartition(topic='animals', partition=0), TopicPartition(topic='animals', partition=1)]
[TopicPartition(topic='animals', partition=2), TopicPartition(topic='animals', partition=3)]


# Spark Streaming Demos

In [15]:
# Build (or reuse) the Spark session.  The Kafka connector is requested as a
# package so the "kafka" source format can be used by the reads below.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("demo")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0")
    .config("spark.sql.shuffle.partitions", 10)
    .getOrCreate()
)

[0, 1] {'shark': 1}
[2, 3] {'shark': 1}
[0, 1] {'C': 1}
[2, 3] {'A': 1}
[0, 1] {'C': 1, 'B': 1}
[0, 1] {'shark': 1, 'seagull': 1}
[2, 3] {'shark': 1}
[2, 3] {'A': 1}
[0, 1] {'C': 1, 'B': 1}
[0, 1] {'shark': 1, 'seagull': 1}
[0, 1] {'C': 2, 'B': 1}
[0, 1] {'shark': 2, 'seagull': 1}
[2, 3] {'shark': 1}
[2, 3] {'A': 1}
[2, 3] {'shark': 1, 'seagull': 1}
[2, 3] {'A': 1, 'H': 1}
[0, 1] {'C': 2, 'B': 1}
[0, 1] {'shark': 2, 'seagull': 1}
[0, 1] {'C': 2, 'B': 1, 'D': 1}
[2, 3] {'shark': 1, 'seagull': 1}
[0, 1] {'shark': 3, 'seagull': 1}
[2, 3] {'A': 1, 'H': 1}
[0, 1] {'C': 2, 'B': 1, 'D': 1}
[0, 1] {'C': 2, 'B': 2, 'D': 1}
[2, 3] {'shark': 1, 'seagull': 1}
[0, 1] {'shark': 3, 'seagull': 1}
[0, 1] {'shark': 3, 'seagull': 1, 'turtle': 1}
[2, 3] {'A': 1, 'H': 1}
[0, 1] {'C': 3, 'B': 2, 'D': 1}
[0, 1] {'shark': 4, 'seagull': 1, 'turtle': 1}
[2, 3] {'shark': 1, 'seagull': 1}
[2, 3] {'A': 1, 'H': 1}
:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.1.j

Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-56d064e8-4f20-48a5-9fc1-48b00e65181d;1.0
	confs: [default]


[2, 3] {'shark': 1, 'seagull': 1, 'turtle': 1}
[2, 3] {'A': 2, 'H': 1}
[2, 3] {'shark': 1, 'seagull': 1, 'turtle': 1}
[2, 3] {'A': 2, 'H': 1}


	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.0 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
downloading https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_2.12/3.5.0/spark-sql-kafka-0-10_2.12-3.5.0.jar ...
	[SUCCESSFUL ] org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.0!spark-sql-kafka-0-10_2.12.jar (79ms)
downloading https://repo1.maven.org/maven2/org/apache/spark/spark-token-provider-kafka-0-10_2.12/3.5.0/spark-token-provider-kafka-0

In [16]:
import time, random, threading, json

def animal_json_producer():
    """Publish random JSON-encoded sightings to the "animals-json" topic, forever.

    Each iteration picks a random beach letter (A-I) and a random animal,
    encodes ``{"beach": ..., "animal": ...}`` as UTF-8 JSON, sends it keyed
    by the beach letter, and then sleeps for one second.  The loop never
    returns, so this function is meant to be run on a background thread.
    """
    beaches = "ABCDEFGHI"
    animal_names = ["shark", "dolphin", "turtle", "seagull"]
    while True:
        beach = random.choice(beaches)
        animal = random.choice(animal_names)

        value = json.dumps({"beach": beach, "animal": animal}).encode("utf-8")
        producer.send("animals-json", value, bytes(beach, "utf-8"))
        time.sleep(1)

# daemon=True: the loop above never ends, so a non-daemon thread would keep
# the interpreter alive when the kernel tries to shut down.
threading.Thread(target=animal_json_producer, daemon=True).start()

In [17]:
# Batch (non-streaming) read of the "animals-json" topic.
df = (
    spark.read
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": broker,
        "subscribe": "animals-json",
    })
    .load()
)

In [18]:
df.dtypes

[('key', 'binary'),
 ('value', 'binary'),
 ('topic', 'string'),
 ('partition', 'int'),
 ('offset', 'bigint'),
 ('timestamp', 'timestamp'),
 ('timestampType', 'int')]

In [19]:
df.count()

23/11/22 15:37:08 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

15

In [20]:
df.limit(5).toPandas()

23/11/22 15:37:20 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
  if not is_datetime64tz_dtype(pser.dtype):
  if is_datetime64tz_dtype(s.dtype):


Unnamed: 0,key,value,topic,partition,offset,timestamp,timestampType
0,[67],"[123, 34, 98, 101, 97, 99, 104, 34, 58, 32, 34...",animals-json,0,0,2023-11-22 15:37:01.273,0
1,[66],"[123, 34, 98, 101, 97, 99, 104, 34, 58, 32, 34...",animals-json,0,1,2023-11-22 15:37:03.285,0
2,[66],"[123, 34, 98, 101, 97, 99, 104, 34, 58, 32, 34...",animals-json,0,2,2023-11-22 15:37:04.287,0
3,[66],"[123, 34, 98, 101, 97, 99, 104, 34, 58, 32, 34...",animals-json,0,3,2023-11-22 15:37:05.288,0
4,[67],"[123, 34, 98, 101, 97, 99, 104, 34, 58, 32, 34...",animals-json,0,4,2023-11-22 15:37:07.291,0


In [21]:
from pyspark.sql.functions import col, expr, from_json

In [22]:
# Decode the binary Kafka key/value columns: the key becomes a string and the
# JSON value is parsed with the schema below and flattened into columns.
schema = "beach string, animal string"
animals = (
    df.select(
        col("key").cast("string").alias("key"),
        from_json(col("value").cast("string"), schema).alias("value"),
    )
    .select("key", "value.*")
)
animals.limit(5).toPandas()

23/11/22 15:37:22 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

Unnamed: 0,key,beach,animal
0,C,C,turtle
1,B,B,shark
2,B,B,seagull
3,B,B,seagull
4,C,C,dolphin


In [23]:
animals.isStreaming

False

# Let's make it a streaming DF

In [49]:
# Streaming read of the "animals-json" topic, starting from the earliest
# offsets.
df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": broker,
        "subscribe": "animals-json",
        "startingOffsets": "earliest",
    })
    .load()
)

In [50]:
# Same decoding as for the batch DataFrame: string key, JSON value parsed
# with the schema below and flattened into columns.
schema = "beach string, animal string"
animals = (
    df.select(
        col("key").cast("string").alias("key"),
        from_json(col("value").cast("string"), schema).alias("value"),
    )
    .select("key", "value.*")
)

In [51]:
animals.isStreaming

True

In [52]:
# doesn't work for streaming
# animals.limit(5).toPandas()

In [53]:
# Spark streaming
# source => transformations => sink

# spark.readStream(????).?????.writeStream(????)

# Shark Alert Stream

In [29]:
# source => filter (sharks only) => console sink, one micro-batch every 5 seconds.
q = (
    animals.where("animal = 'shark'")
    .writeStream
    .format("console")
    .outputMode("append")
    .trigger(processingTime="5 seconds")
    .start()
)
type(q)

23/11/22 15:37:24 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-2143b102-2161-4b08-a16d-60e633776e4d. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/11/22 15:37:25 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


pyspark.sql.streaming.query.StreamingQuery

In [30]:
q.stop()
# spark.streams.active[0].stop()

# Streaming GROUP BY on animal

In [54]:
# Running count per animal; "complete" mode prints the whole result table
# on every 5-second trigger.
q = (
    animals.groupBy("animal")
    .count()
    .writeStream
    .format("console")
    .outputMode("complete")
    .trigger(processingTime="5 seconds")
    .start()
)

23/11/22 15:59:19 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-9dc99422-d06d-4b78-a180-97519cec00d6. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/11/22 15:59:19 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/11/22 15:59:19 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.

-------------------------------------------
Batch: 0
-------------------------------------------
+-------+-----+
| animal|count|
+-------+-----+
|  shark|  319|
|dolphin|  331|
|seagull|  346|
| turtle|  340|
+-------+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+-------+-----+
| animal|count|
+-------+-----+
|  shark|  321|
|seagull|  346|
|dolphin|  331|
| turtle|  341|
+-------+-----+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+-------+-----+
| animal|count|
+-------+-----+
|  shark|  322|
|seagull|  347|
|dolphin|  331|
| turtle|  342|
+-------+-----+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+-------+-----+
| animal|count|
+-------+-----+
|  shark|  324|
|seagull|  348|
|dolphin|  331|
| turtle|  344|
+-------+-----+



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+-------+-----+
| animal|count|
+-------+-----+
|  shark|  325|
|seagull|  348|
|dolphin|  334|
| turtle|  345|
+-------+-----+



In [55]:
q.stop()