In [None]:
# https://github.com/confluentinc/confluent-kafka-python/tree/master/examples

In [1]:
import subprocess
from confluent_kafka.admin import AdminClient, NewTopic, ConfigResource
from confluent_kafka import Producer, Consumer, KafkaError, TopicCollection, KafkaException


In [None]:
# https://medium.com/@mrugankray/create-avro-producer-for-kafka-using-python-f9029d9b2802
# https://www.stackstalk.com/2022/08/avro-producer-consumer-python.html
# https://stackoverflow.com/questions/61509939/how-to-programatically-register-avro-schema-in-kafka-schema-registry-using-pytho
# https://betterprogramming.pub/avro-producer-with-python-and-confluent-kafka-library-4a1a2ed91a24

In [2]:
# Set up AdminClient for topic management
config = { 
    "bootstrap.servers": "localhost:9092"
}
admin_client = AdminClient(config)

In [None]:
def topic_exists(admin, topic):
    
    metadata = admin.list_topics()
    for t in iter(metadata.topics.values()):
        if t.topic == topic:
            return True
    return False

topic_name = 'greg'
if topic_exists(admin_client, topic_name):
    print(f"Topic '{topic_name}' exists.")

In [67]:
# create new topic and return results dictionary
def create_topic(admin, topic, partitions = 1, replicas=1):

    if topic_exists(admin, topic):
        print(f"Topic '{topic}' already exists.")
        return

    # Create a new topic with parameters
    new_topic = NewTopic(topic, num_partitions=partitions, replication_factor=replicas) 
    result_dict = admin.create_topics([new_topic])

    for topic, future in result_dict.items():
        try:
            future.result()  # The result itself is None
            #print("Topic {} created".format(topic))
            print(f"Topic '{topic}' created successfully.")
        except Exception as e:
            print("Failed to create topic {}: {}".format(topic, e))

# Create topic if it doesn't exist

topic_name = 'gregis'

if not topic_exists(admin_client, topic_name):
    create_topic(admin_client, topic_name)
else:
    print(f"Topic '{topic_name}' already exists.")


Topic 'gregis' created successfully.


In [None]:
from pprint import pprint

# Describe the New Kafka Topic
# get max.message.bytes property
def get_topic_config(admin, topic):
    resource = ConfigResource('topic', topic)
    result_dict = admin.describe_configs([resource])
    config_entries = result_dict[resource].result()
    pprint(config_entries)
    max_size = config_entries['max.message.bytes']
    return max_size.value

current_max = get_topic_config(admin_client, topic_name)

In [55]:

def delete_topic(admin_client, topic_name):
    """
    Delete a Kafka topic.(asynchronously)
    Parameters:
    - admin_client: Kafka AdminClient instance.
    - topic_name: Name of the topic to be deleted.
    """
    
    # Check if the topic exists
    if not topic_exists(admin_client, topic_name):
        print(f"Topic '{topic_name}' does not exist.")
        return

    # Delete the topic
    fs = admin_client.delete_topics([topic_name], operation_timeout=20.0)
    # Returns a dict of <topic,future>.

    # Wait for operation to finish.
    for topic, f in fs.items():
        try:
            f.result()  # The result itself is None
            print("Topic {} deleted".format(topic))
        except Exception as e:
            print("Failed to delete topic {}: {}".format(topic, e))


delete_topic(admin_client, topic_name)

Topic gregis deleted


In [None]:
from confluent_kafka.admin import AdminClient, NewTopic

# Create an AdminClient instance
admin_client = AdminClient({"bootstrap.servers": "localhost:9092"})

def describe_topics(a, topic_names):
    """
    Describe a Kafka topic and return information in a dictionary.

    Parameters:
    - admin_client: Kafka AdminClient instance.
    - topic_name: Name of the topic to be described.

    Returns:
    A dictionary containing information about the topic.
    """

    topics = TopicCollection(topic_names)
    futureMap = a.describe_topics(topics, request_timeout=10)

    for topic_name, future in futureMap.items():
        try:
            # Extract information from topic metadata
            print("Topic Information:")
            t = future.result()
            print("Topic name             : {}".format(t.name))
            print("Topic id               : {}".format(t.topic_id))
            print("Partitions             : {}".format(len(t.partitions)))
            #print(f"Partitions: {len(t.partitions)}")
            #print(f"Replication Factor: {t.replication_factor}")
            print("Partition Information")
            for partition in t.partitions:
                #print("    Id                : {}".format(partition.id))
                leader = partition.leader
                print("    Replicas          : {}".format(len(partition.replicas)))
                print(f"    Leader            : {leader}")
            print("")

            
        except Exception as e:
            print("Error while describing topic '{}': {}".format(topic_name, e))
            raise

topic_name = [ 'greg']
describe_topics(admin_client, topic_name)



In [3]:
# Producer Delivery Callback
def callback_report(err, event):
    """
    Reports the success or failure of a message delivery.
    Args:
        err (KafkaError): The error that occurred on None on success.
        msg (Message): The message that was produced or failed.
    """
    if err:
        print(f'Produce to topic {event.topic()} failed for event: {event.key()}')
        return
    else:
        max_length = 10

        val = event.value().decode('utf8')[:max_length] 
        key = event.key().decode('utf8')
        print(f'Messge: {val}, with Key: {key} produced to Topic:{event.topic()}  Part[{event.partition()}] at offset {event.offset()}.')


In [None]:
from faker import Faker
import random

# random Poem generator/iterantor 
def poem_generator():
    fake = Faker()
    
    poem_types = ['Haiku', 'Sonnet', 'FreeVerse', 'Limerick']
    
    while True:
        poem_type = random.choice(poem_types)
        poem_message = fake.sentence(nb_words=random.randint(5, 15))
        
        yield {
            'type': poem_type,
            'message': poem_message
        }

# Example usage:
if __name__ == "__main__":
    poems = poem_generator()

    for _ in range(5):  # Generate and print 5 poems
        poem = next(poems)
        print(f"Type: {poem['type']}, Poem: {poem['message']}")

In [5]:
def write_to_topic(producer, topic_name, message, key):
    """
    Produce a message to a specified Kafka topic using a provided Kafka producer instance.
    Parameters:
    - producer (confluent_kafka.Producer): The Kafka producer instance.
    - topic_name (str): The name of the Kafka topic to which the message will be sent.
    - message (str): The message content to be sent to the Kafka topic.
    - key (str): The key associated with the Kafka message.
    Returns: None
    Usage Example:
    write_to_topic(producer, "example_topic", "Hello, Kafka!", key="example_key")
    """
    try:
        # Produce a message to the specified topic
        producer.produce(topic=topic_name, value=message, key=key, callback=callback_report)
        producer.flush()
    except Exception as e:
        # Handle exceptions raised during message production
        print(f"Error during message production: {e}")

In [None]:
from confluent_kafka import Producer, Consumer, KafkaError, TopicCollection, KafkaException
import time

# Create Producer instance
# Set up Producer for producing messages
producer = Producer({"bootstrap.servers": "localhost:9092"})

# Kafka topic name
topic_name = "greg"

# Poem generator
poems = poem_generator()

try:
    while True:
        poem = next(poems)
        print(f"Type: {poem['type']}, Poem: {poem['message']}")
        
        # Write to Kafka topic
        write_to_topic(producer, topic_name, poem['message'], key=poem['type'])
        
        # Introduce a delay between 1 to 4 seconds
        time.sleep(random.uniform(1, 4))

except KeyboardInterrupt:
    # Handle Ctrl+C gracefully
    print("Stopping Kafka producer.")
finally:
    # Close the Kafka producer
    producer.flush()
    producer.close()


In [7]:
# Assign topic partitions to consumer
def assignment_callback(consumer, partitions):
    """ 
    Callback function invoked when a Kafka consumer is assigned to partitions.
    Parameters:
    - consumer (confluent_kafka.Consumer): The Kafka consumer instance.
    - partitions (list): List of assigned partitions.
    Returns: None
    """
    for p in partitions:
        print(f'Assigned to {p.topic}, partition {p.partition}')

In [None]:
# Set up Consumer for consuming messages
consumer = Consumer({
        "bootstrap.servers": "localhost:9092",
        "group.id": "my-group",
        "auto.offset.reset": "earliest"
    })
topic_to_read = 'greg'
consumer.subscribe([topic_to_read], on_assign=assignment_callback)

try:
    while True:
        event = consumer.poll(1.0)
        if event is None:
            continue
        if event.error():
            raise KafkaException(event.error())
        else:
            val = event.value().decode('utf8')
            key = event.key().decode('utf8')
            partition = event.partition()
            offset = event.offset()
            print(f'Received: {val} from partition [{partition}] at offset {offset}     ')
            # consumer.commit(event)
except KeyboardInterrupt:
    print('Canceled by user.')
finally:
    consumer.close()

In [5]:
import json

def serialize_poem(poem_dict):
    """
    Serialize a poem dictionary to a JSON-formatted string.

    Parameters:
    - poem_dict (dict): The dictionary representing a poem.

    Returns:
    str: JSON-formatted string.
    """
    return json.dumps(poem_dict)

def deserialize_poem(json_string):
    """
    Deserialize a JSON-formatted string to a poem dictionary.

    Parameters:
    - json_string (str): JSON-formatted string.

    Returns:
    dict: Dictionary representing a poem.
    """
    return json.loads(json_string)

# Example usage:
poem = {
    'type': 'Sonnet',
    'message': 'Shall I compare thee to a summer\'s day?'
}

# Serialize the poem dictionary to a JSON-formatted string
serialized_poem = serialize_poem(poem)
print("Serialized Poem:", serialized_poem)

# Deserialize the JSON-formatted string back to a poem dictionary
deserialized_poem = deserialize_poem(serialized_poem)
print("Deserialized Poem:", deserialized_poem)


Serialized Poem: {"type": "Sonnet", "message": "Shall I compare thee to a summer's day?"}
Deserialized Poem: {'type': 'Sonnet', 'message': "Shall I compare thee to a summer's day?"}


In [None]:
''' 
In Kafka, serialization and deserialization are crucial for converting data between the internal binary format used by Kafka and the native format of your data. In the case of Python and Kafka, you often need to serialize your data before sending it to Kafka and deserialize it after receiving it. For simplicity, I'll use JSON as the serialization format.

'''

In [16]:
# BASIC SERIALIZATION using json.dumps(poem)

import json

def serialize_poem(poem_dict):
    """
    Serialize a poem dictionary to a JSON-formatted string.

    Parameters:
    - poem_dict (dict): The dictionary representing a poem.

    Returns:
    str: JSON-formatted string.
    """
    return json.dumps(poem_dict)

# Kafka Procucer

from confluent_kafka import Producer
import json

# Configure Kafka Producer
producer_config = {'bootstrap.servers': 'localhost:9092'}
producer = Producer(producer_config)

# Poem dictionary
poem = {
    'type': 'Sonnet',
    'message': 'Shall I compare thee to a summer\'s day?'
}

# Kafka topic name
topic_name = "greg"

# Serialize and produce the message using JSON 
serialized_poem = serialize_poem(poem)
#serialized_poem = json.dumps(poem)

producer.produce(topic_name, key=poem['type'], value=serialized_poem, callback=callback_report)
producer.flush()


Messge: {"type": ", with Key: Sonnet produced to Topic:greg  Part[0] at offset 105.


0

In [17]:
# BASIC DE-SERIALIZATION using json.dumps(poem)

def deserialize_poem(json_string):
    """
    Deserialize a JSON-formatted string to a poem dictionary.

    Parameters:
    - json_string (str): JSON-formatted string.

    Returns:
    dict: Dictionary representing a poem.
    """
    return json.loads(json_string)

# Assign topic partitions to consumer
def assignment_callback(consumer, partitions):
    """ 
    Callback function invoked when a Kafka consumer is assigned to partitions.
    Parameters:
    - consumer (confluent_kafka.Consumer): The Kafka consumer instance.
    - partitions (list): List of assigned partitions.
    Returns: None
    """
    for p in partitions:
        print(f'Assigned to {p.topic}, partition {p.partition}')

# Kafka Consumer

from confluent_kafka import Consumer, KafkaError
import json

# Configure Kafka Consumer
consumer_config = {
        "bootstrap.servers": "localhost:9092",
        "group.id": "my-group",
        "auto.offset.reset": "earliest"
    }
consumer = Consumer(consumer_config)

# Kafka topic name
topic_name = "greg"

# Subscribe to the topic
consumer.subscribe([topic_name], on_assign=assignment_callback) 

# Poll for messages
while True:
    try: 
        msg = consumer.poll(1.0)  # Adjust the timeout as needed

        if msg is None:
            continue
        if msg.error():
            raise KafkaException(msg.error())
        
        else:
        # Deserialize the message using JSON
            try:
                #deserialized_poem = json.loads(msg.value().decode('utf-8'))
                deserialized_poem =  deserialize_poem(msg.value().decode('utf-8'))
                print("Received Poem: {}".format(deserialized_poem))
            except json.JSONDecodeError as e:
                print("Error decoding JSON: {}".format(e))
    except KeyboardInterrupt:
        break

# Close the consumer
consumer.close()


Assigned to greg, partition 0
Received Poem: {'type': 'Sonnet', 'message': "Shall I compare thee to a summer's day?"}
Received Poem: {'type': 'Sonnet', 'message': "Shall I compare thee to a summer's day?"}


In [None]:
# SERIALIZATION using AVRO 

from avro import schema
from avro.io import DatumWriter, BinaryEncoder
from io import BytesIO

# Poem dictionary
poem_data = {
    'type': 'Sonnet',
    'message': 'Shall I compare thee to a summer\'s day?'
}
"""
Avro Schema for Poem 
{
  "type": "record",
  "name": "Poem",
  "fields": [
    {"name": "type", "type": "string"},
    {"name": "message", "type": "string"}
  ]
}
"""

# Avro schema for the Poem
poem_avro_schema_str = """
{
  "type": "record",
  "name": "Poem",
  "fields": [
    {"name": "type", "type": "string"},
    {"name": "message", "type": "string"}
  ]
}
"""

poem_avro_schema = schema.parse(poem_avro_schema_str)

# Avro serialization

# Create a new BytesIO object named bytes_io, which will be used to hold the Avro-encoded data during serialization.
bytes_io = BytesIO()
# Create Object with the structure for the serialization process.
writer = DatumWriter(poem_avro_schema)
# Module that provides functionality to encode data in binary format according to the Avro specification.
encoder = BinaryEncoder(bytes_io)
# Perform the actual serialization.
writer.write(poem_data, encoder)
# Retrieve the content of the BytesIO object, which now contains the Avro-encoded binary data produced during serialization.
avro_serialized_data = bytes_io.getvalue()

print("Avro Serialized Data:")
print(avro_serialized_data)

# DE-SERIALIZATION using AVRO

from avro.io import DatumReader, BinaryDecoder


poem_avro_schema = schema.parse(poem_avro_schema_str)

# Avro deserialization
bytes_io = BytesIO(avro_serialized_data)
reader = DatumReader(poem_avro_schema)
decoder = BinaryDecoder(bytes_io)
poem_deserialized_data = reader.read(decoder)

print("\nAvro Deserialized Data:")
print(poem_deserialized_data)


""" 

BytesIO is a class in the io module that allows you to treat a bytes-like object / in-memory byte buffer (in this case, avro_serialized_data) as a file-like object.

DatumReader is a class from the avro.io module that reads Avro-encoded data.
DatumWriter is a class from the avro.io module that is responsible for writing Avro-encoded data.

Both take the Avro schema (poem_avro_schema) as an argument. The schema is necessary for serialization/deserialization to interpret the binary data correctly.

BinaryEncoder is another class from the avro.io module that provides functionality to encode data in binary format according to the Avro specification.

BinaryEncoder is created to write the Avro-encoded binary data to the BytesIO object.

The writer.write(poem_data, encoder) operation serializes the data.
The reader.read(decoder) operation reads the binary data, interprets it based on the provided schema, and produces the deserialized data in the form of a Python dictionary 

bytes_io.getvalue() retrieves the content of the BytesIO object, which now contains the Avro-encoded binary data produced during serialization.

"""

In [22]:
# SERIALIZATION using AVRO without Schema Registry

from confluent_kafka import Producer
import avro.schema
from avro.io import DatumWriter, BinaryEncoder
from io import BytesIO

from avro.io import DatumReader, BinaryDecoder
from io import BytesIO

# Kafka Procucer

# Configure Kafka Producer
producer_config = {'bootstrap.servers': 'localhost:9092'}
producer = Producer(producer_config)

# Poem dictionary
poem_data = {
    'type': 'Sonnet',
    'message': 'Shall I compare thee to a summer\'s day?'
}

# Avro schema for the Poem
poem_schema_str = """
{
  "type": "record",
  "name": "Poem",
  "fields": [
    {"name": "type", "type": "string"},
    {"name": "message", "type": "string"}
  ]
}
"""
poem_avro_schema = avro.schema.parse(poem_schema_str)


# Avro serialization
bytes_io = BytesIO()
writer = DatumWriter(poem_avro_schema)
encoder = BinaryEncoder(bytes_io)
writer.write(poem_data, encoder)
avro_serialized_data = bytes_io.getvalue()

print(f"Avro Serialized Data: {bytes_io.getvalue()}")

# Kafka topic name
topic_name = "gavro"

# Produce to Kafka
producer.produce(topic=topic_name, key=poem['type'], value=avro_serialized_data, callback=callback_report)

# Wait for any outstanding messages to be delivered and delivery reports to be received
producer.flush()


Avro Serialized Data: b"\x0cSonnetNShall I compare thee to a summer's day?"
Messge: SonnetNSh, with Key: Sonnet produced to Topic:gavro  Part[1] at offset 0.


0

In [23]:
# DE-SERIALIZATION using AVRO without Schema Registry

# Assign topic partitions to consumer
def assignment_callback(consumer, partitions):
    """ 
    Callback function invoked when a Kafka consumer is assigned to partitions.
    Parameters:
    - consumer (confluent_kafka.Consumer): The Kafka consumer instance.
    - partitions (list): List of assigned partitions.
    Returns: None
    """
    for p in partitions:
        print(f'Assigned to {p.topic}, partition {p.partition}')



from confluent_kafka import Consumer, KafkaException
from avro.io import DatumReader, BinaryDecoder
from io import BytesIO

# Kafka Consumer Configuration
consumer_config = {
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'avro-consumer',
    'auto.offset.reset': 'earliest'
}

# Poem dictionary
poem_data = {
    'type': 'Sonnet',
    'message': 'Shall I compare thee to a summer\'s day?'
}

# Avro schema for the Poem
poem_schema_str = """
{
  "type": "record",
  "name": "Poem",
  "fields": [
    {"name": "type", "type": "string"},
    {"name": "message", "type": "string"}
  ]
}
"""
poem_avro_schema = avro.schema.parse(poem_schema_str)


# Create Kafka Consumer
consumer = Consumer(consumer_config)

# Kafka topic name
topic_name = "gavro"

# Subscribe to the topic
consumer.subscribe([topic_name], on_assign=assignment_callback) 

# Avro deserialization
def deserialize_avro(data, schema):
    reader = DatumReader(schema)
    bytes_io = BytesIO(data)
    decoder = BinaryDecoder(bytes_io)
    return reader.read(decoder)


# Consume messages
try:
    while True:
        msg = consumer.poll(1.0)  # Adjust the timeout as requird

        if msg is None:
            continue
        if msg.error():
            if msg.error().code() == KafkaException._PARTITION_EOF:
                # End of partition event
                print("Reached end of partition, exiting consumer loop.")
                break
            else:
                print("Error: {}".format(msg.error()))
                continue

        # Avro deserialization
        avro_deserialized_data = deserialize_avro(msg.value(), poem_avro_schema)
        print("Received Poem: {}".format(avro_deserialized_data))

except KeyboardInterrupt:
    print("Consumer interrupted.")
finally:
    # Close the consumer
    consumer.close()

Assigned to gavro, partition 0
Assigned to gavro, partition 1
Received Poem: {'type': 'Sonnet', 'message': "Shall I compare thee to a summer's day?"}
Consumer interrupted.


In [None]:
""" 
Schema Registry is a centralized service that manages and stores Avro schemas for Kafka producers and consumers. It ensures that producers and consumers in a Kafka ecosystem agree on a common data format, which is crucial for schema evolution and data compatibility.

The SchemaRegistryClient is used to communicate with the Schema Registry service. It is responsible for registering and retrieving Avro schemas, as well as performing schema-related operations.

When working with the Schema Registry, it's common to use Avro as the data serialization format. Avro is a binary serialization format that includes schema information with the data. The Schema Registry helps manage and store these Avro schemas.

The Schema Registry is typically part of the Confluent Platform.
Need to install the entire Confluent Platform, which includes the Schema Registry.

"""

In [26]:
# Define your custom from_dict function
def obj_to_dict(msg):
    """
    Custom function to convert the message object to a dictionary.

    Parameters:
    - msg: The message object to be converted.

    Returns:
    dict: The converted dictionary.
    """
    return msg

In [None]:
# JSON SERIALIZATION using JSONSerializer and Schema Registry

from confluent_kafka import Producer, Consumer, KafkaException
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.serialization import StringSerializer, SerializationContext, MessageField
from confluent_kafka.schema_registry.json_schema import JSONSerializer
import json
import avro.schema
import time

# Avro schema for the poem
poem_schema_avro_str = """
{
  "type": "record",
  "name": "Poem",
  "fields": [
    {"name": "type", "type": "string"},
    {"name": "message", "type": "string"}
  ]
}
"""

# Converting an Avro schema to a JSON schema involves transforming the Avro-specific syntax into a JSON schema format.

# Json schema for the poem
poem_schema_json_str = """
 {
  "$schema": "http://json-schema.org/draft-07/schema#",
  "type": "object",
  "title": "Poem",
  "description": "Poem schema description",
  "properties": {
    "type": {
      "type": "string"
    },
    "message": {
      "type": "string"
    }
  },
  "required": ["type", "message"]
}
"""

# Configure Schema Registry
schema_registry_conf = {'url': 'http://localhost:8081'}
schema_registry_client = SchemaRegistryClient(schema_registry_conf)

# Configure Kafka Producer
producer_config = {'bootstrap.servers': 'localhost:9092'}
producer = Producer(producer_config)

# STR Serializer 
string_serializer = StringSerializer('utf_8')

# JSON Serializer for the Poem schema
json_serializer = JSONSerializer(poem_schema_json_str, schema_registry_client)
# Create a JSONSerializer instance with your schema
json_serializer = JSONSerializer(poem_schema_json_str, obj_to_dict)


# Kafka topic to which you want to send the serialized poem
topic_name = 'kreg'

# Poem generator
poems = poem_generator()

try:
    while True:
        poem = next(poems)
        print(f"Type: {poem['type']}, Poem: {poem['message']}")

        # Serialize the poem dictionary
        serialized_poem = json_serializer(poem, SerializationContext(topic_name, MessageField.VALUE))
                    

        # Produce the serialized poem to the Kafka topic
        # key = string_serializer(str(poem['type'])),
        producer.produce(topic=topic_name, value=serialized_poem, key=poem['type'], callback=callback_report)
    
        
        # Introduce a delay between 1 to 4 seconds
        time.sleep(random.uniform(1, 4))

except KeyboardInterrupt:
    # Handle Ctrl+C gracefully
    print("Stopping Kafka producer.")
finally:
    # Flush the producer to ensure all messages are sent
    producer.flush()
    # Close the Kafka producer
    producer.close()



In [None]:
# Example of how you might use the JSONSerializer to serialize a poem dictionary

# Assuming poem dictionary like this
poem = {
    'type': 'Sonnet',
    'message': 'Shall I compare thee to a summer\'s day?'
}

from confluent_kafka import Producer
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.serialization import SerializationContext, MessageField
from confluent_kafka.schema_registry.json_schema import JSONSerializer
import json

# Define josn schema string
poem_schema_json_str = """
 {
  "$schema": "http://json-schema.org/draft-07/schema#",
  "type": "object",
  "title": "Poem",
  "description": "Poem schema description",
  "properties": {
    "type": {
      "type": "string"
    },
    "message": {
      "type": "string"
    }
  },
  "required": ["type", "message"]
}
"""

# Create a SchemaRegistryClient instance
# if you are running it locally, it might be something like http://localhost:8081.
schema_registry_url = "http://localhost:8081"   
schema_registry_client = SchemaRegistryClient({"url": schema_registry_url})



# Create a JSONSerializer instance with your schema
json_serializer = JSONSerializer(poem_schema_json_str, obj_to_dict)

import time

# Create Producer instance
# Set up Producer for producing messages
producer = Producer({"bootstrap.servers": "localhost:9092"})

# Kafka topic to which you want to send the serialized poem
topic_name = 'kreg'

# Poem generator
poems = poem_generator()

try:
    while True:
        poem = next(poems)
        print(f"Type: {poem['type']}, Poem: {poem['message']}")

        # Serialize the poem dictionary
        serialized_poem = json_serializer(poem, SerializationContext(topic_name, MessageField.VALUE))
                    

        # Produce the serialized poem to the Kafka topic
        producer.produce(topic=topic_name, value=serialized_poem, key=poem['type'], callback=callback_report)
        
        # Introduce a delay between 1 to 4 seconds
        time.sleep(random.uniform(1, 4))

except KeyboardInterrupt:
    # Handle Ctrl+C gracefully
    print("Stopping Kafka producer.")
finally:
    # Flush the producer to ensure all messages are sent
    producer.flush()
    # Close the Kafka producer
    producer.close()

In [23]:
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.schema_registry.json_schema import JSONDeserializer
from confluent_kafka.serialization import SerializationContext, MessageField
import json

# Define your schema string (replace with your actual schema)
schema_str = '{"type":"record",\
                "name":"Poem",\
                "fields":[\
                    {"name":"type","type":"string"},\
                    {"name":"message","type":"string"}\
                ] }'

# Define your custom from_dict function
def dict_to_obj(msg):
    return msg

# Create a JSONDeserializer instance with your schema and from_dict function
json_deserializer = JSONDeserializer(schema_str, from_dict=dict_to_obj)

# Create a DeserializingConsumer instance
# Create a SchemaRegistryClient instance (replace schema registry URL)
schema_registry_url = "http://localhost:8081"
schema_registry_client = SchemaRegistryClient({"url": schema_registry_url})

# Set up Consumer for consuming messages
consumer = Consumer({
        "bootstrap.servers": "localhost:9092",
        "group.id": "my-group",
        "auto.offset.reset": "earliest"
    })
topic_to_read = 'greg'
consumer.subscribe([topic_to_read], on_assign=assignment_callback)

while True:
        try:
            event = consumer.poll(1.0)
            if event is None:
                continue
            temp = json_deserializer(event.value(), SerializationContext(topic_to_read, MessageField.VALUE))
            if temp is not None:
                print(f'Latest temp in {temp.city} is {temp.reading} {temp.unit}.')

        except KeyboardInterrupt:
            break

consumer.close()

Assigned to greg, partition 0


In [None]:

from pyspark.sql import SparkSession
from pyspark.sql.avro import AvroSchema
from pyspark.sql.functions import from_avro

# Create SparkSession
spark = SparkSession.builder.appName("AvroConsumer").getOrCreate()

# Avro schema for the Poem (reuse from the producer example)
poem_avro_schema_str = """
{
  "type": "record",
  "name": "Poem",
  "fields": [
    {"name": "type", "type": "string"},
    {"name": "message", "type": "string"}
  ]
}
"""
poem_avro_schema = AvroSchema(poem_avro_schema_str)

In [None]:
# Read from Kafka topic in Avro format

# options
#  Use earliest to read all available data from the beginning
# Option to handle data loss due to Kafka topic deletion

df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "avro_poem_topic") \
    .option("startingOffsets", "earliest") \
    .option("failOnDataLoss", "false") \
    .load()


# Deserialize Avro data
df = df.selectExpr("CAST(key AS STRING)", "from_avro(value, '{}') AS poem".format(poem_avro_schema_str))

# Extract fields from the Avro structure
df = df.select("poem.type", "poem.message")

# Output to the console (you can modify this based on your use case)
query = df.writeStream.outputMode("append").format("console").start()

# Await termination (useful in a real application)
query.awaitTermination()

# Stop SparkSession
spark.stop()

In [34]:
import json

def serialize_poem(poem_dict):
    """
    Serialize a poem dictionary to a JSON-formatted string.

    Parameters:
    - poem_dict (dict): The dictionary representing a poem.

    Returns:
    str: JSON-formatted string.
    """
    return json.dumps(poem_dict)

def deserialize_poem(json_string):
    """
    Deserialize a JSON-formatted string to a poem dictionary.

    Parameters:
    - json_string (str): JSON-formatted string.

    Returns:
    dict: Dictionary representing a poem.
    """
    return json.loads(json_string)

# Producer Delivery Callback
def callback_report(err, event):
    """
    Reports the success or failure of a message delivery.
    Args:
        err (KafkaError): The error that occurred on None on success.
        msg (Message): The message that was produced or failed.
    """
    if err:
        print(f'Produce to topic {event.topic()} failed for event: {event.key()}')
        return
    else:
        max_length = 10

        val = event.value().decode('utf8')[:max_length] 
        key = event.key().decode('utf8')
        print(f'Messge: {val}, with Key: {key} produced to Topic:{event.topic()}  Part[{event.partition()}] at offset {event.offset()}.')

# Assign topic partitions to consumer
def assignment_callback(consumer, partitions):
    """ 
    Callback function invoked when a Kafka consumer is assigned to partitions.
    Parameters:
    - consumer (confluent_kafka.Consumer): The Kafka consumer instance.
    - partitions (list): List of assigned partitions.
    Returns: None
    """
    for p in partitions:
        print(f'Assigned to {p.topic}, partition {p.partition}')

import avro.schema
from avro.io import DatumReader, DatumWriter, BinaryEncoder
from io import BytesIO

# Avro serialization
def serialize_avro(data, schema):
    bytes_io = BytesIO()
    writer = DatumWriter(schema)
    encoder = BinaryEncoder(bytes_io)
    writer.write(data, encoder)
    avro_serialized_data = bytes_io.getvalue()
    # print(f"Avro Serialized Data: {bytes_io.getvalue()}")
    return avro_serialized_data


# Avro deserialization
def deserialize_avro(data, schema):
    reader = DatumReader(schema)
    bytes_io = BytesIO(data)
    decoder = BinaryDecoder(bytes_io)
    return reader.read(decoder)


In [None]:
from confluent_kafka import Producer
import json
import avro

# topic_name = input("Enter the topic name to write: ")

# Kafka topic name
topic_name = "gavro" 
topic_name = "gjson" 

# Poem generator
poems = poem_generator()

# Set up Producer for producing messages
producer_config = {
                "bootstrap.servers": "localhost:9092"
}
producer = Producer(producer_config)


# BASIC SERIALIZATION using json.dumps(poem)

# Kafka Procucer

from confluent_kafka import Producer
import json

# Configure Kafka Producer
producer_config = {'bootstrap.servers': 'localhost:9092'}
producer = Producer(producer_config)

# Poem dictionary
poem = {
    'type': 'Sonnet',
    'message': 'Shall I compare thee to a summer\'s day?'
}

# Kafka topic name
topic_name = "greg"

# Serialize and produce the message using JSON 
serialized_poem = serialize_poem(poem)
#serialized_poem = json.dumps(poem)

producer.produce(topic_name, key=poem['type'], value=serialized_poem, callback=callback_report)
producer.flush()


In [35]:

def produce_function(topic, serialization_option, data):
        
    # Check if the topic is empty and assign a default value if necessary
    # Strip leading and trailing whitespace, assign a default value if empty

    if serialization_option == 'n':
             
        if not topic:
            topic_deafult = 'gnone'
            topic = topic_deafult
            key = data["type"]
            value = data["message"]

    elif serialization_option == 's':
             
        if not topic:
            topic_deafult = 'gjson'
            topic = topic_deafult
            key = serialize_poem(data['key'])
            value = serialize_poem(data)
            
    elif serialization_option == 'a':
 
        if not topic:
            topic_deafult = 'gavro'
            topic = topic_deafult

            # Avro schema for the Poem
            poem_schema_str = """
            {
            "type": "record",
            "name": "Poem",
            "fields": [
                {"name": "type", "type": "string"},
                {"name": "message", "type": "string"}
            ]
            }
            """
            poem_avro_schema = avro.schema.parse(poem_schema_str)

            key = serialize_poem(data)
            value = serialize_avro(data, poem_avro_schema)

    else:
        topic_default = 'None'
        print("Error")
        return None, None, None
    
    return topic, key, value

# Get user input for serialisation method
serialization_option = input("Choose serialization option:\nn - None\ns - JSON\na - Avro\nEnter serialization option: ")

# Get user input for the topic name , strip leading and trailing whitespace
topic_name = input("Enter the topic name (Enter for Default): ").strip()

# Poem generator
poems = poem_generator()
poem = next(poems)
print(f"Type: {poem['type']}, Poem: {poem['message']}")

topic, key, value = produce_function(topic_name, serialization_option, poem)
# Your function logic here
print(f"Processing topic: {topic}")
print(f"Processing key: {key}")
print(f"Processing value: {value}")


Type: Limerick, Poem: Seem early ground history catch prepare.
Processing topic: gavro
Processing key: "Limerick"
Processing value: b'\x10LimerickPSeem early ground history catch prepare.'


In [None]:
def produce_de_function(serialization_option, data):
        
    # Check if the topic is empty and assign a default value if necessary
    # Strip leading and trailing whitespace, assign a default value if empty

    if serialization_option == 'n':
        key = data["type"]
        value = data["message"]

    elif serialization_option == 's':
        key = deserialize_poem(data['type'])
        value = deserialize_poem(data)
            
    elif serialization_option == 'a':
        # Avro schema for the Poem
        poem_schema_str = """
            {
            "type": "record",
            "name": "Poem",
            "fields": [
                {"name": "type", "type": "string"},
                {"name": "message", "type": "string"}
            ]
            }
            """
        poem_avro_schema = avro.schema.parse(poem_schema_str)

        key = deserialize_poem(data["type"])
        value = deserialize_avro(data, poem_avro_schema)

    else:
        topic_default = 'None'
        print("Error")
        return None, None
    
    return key, value

# Get user input for deserialisation method
deserialization_option = input("Choose de-serialization option:\nn - None\ns - JSON\na - Avro\nEnter serialization option: ")

# Get user input for the topic name , strip leading and trailing whitespace
topic_name = input("Enter the topic name (Enter for Default): ").strip()


key, value = produce_de_function(serialization_option, event)
# Your function logic here
print(f"Processing topic: {topic}")
print(f"Processing key: {key}")
print(f"Processing value: {value}")

In [None]:
def produce_message(producer, topic, key, value, serialization):

    if serialization == 'n':
        producer.produce(topic, key=key, value=value)

    elif serialization == 's':
        serialized_value = json.dumps(value)
        producer.produce(topic, key=key, value=serialized_value, value_serializer=lambda v: str(v).encode('utf-8'))
        
    elif serialization == 'a':
        schema = avro.schema.Parse(open('your_avro_schema_file.avsc', 'r').read())
        writer = DataFileWriter(open('data.avro', 'wb'), DatumWriter(), schema)
        writer.append(value)
        writer.close()
        avro_bytes = open('data.avro', 'rb').read()
        producer.produce(topic, key=key, value=avro_bytes, value_serializer=lambda v: v)