In [1]:
import pandas as pd
from kafka import KafkaProducer
from time import sleep
from json import dumps
from json import loads
import csv

Connect to Kafka

In [2]:
producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
                         value_serializer=lambda x: 
                         dumps(x).encode('utf-8'))

Start Streaming

In [3]:
SLEEP = 3

def make_serializable(d):
    serializable_dict = {}
    for key, value in d.items():
        if isinstance(value, (list, dict, str, int, float, bool, type(None))):
            serializable_dict[key] = value
        else:
            serializable_dict[key] = str(value)  # Convert non-serializable value to string
    return serializable_dict

def publish_to_kafka(file_path, producer=producer, chunk_size=1000):
    """ 
    Send each row to kafka mysql topic
    
    Uncomment either the environment code if using .env file or 
    variable code to set directly.
    
    SLEEP variable can be adjusted as desired. 1 is the minimum recommended setting. 
    Setting to 0 will crash the Kafka server unless a more robust EC2 setup is created.
    """
    with open(file_path, 'r') as file:
        reader = csv.DictReader(file)
        while True:
            chunk = []
            for _ in range(chunk_size):
                try:
                    chunk.append(next(reader))
                except StopIteration:
                    break
            if not chunk:
                break
            for row in chunk:
                # Convert row to JSON
                row = make_serializable(row)
                json_data = row

                # Publish row to Kafka
                producer.send('mysql', value=json_data)
                producer.flush()
                sleep(SLEEP)
    producer.close()

In [4]:
publish_to_kafka("data/Iowa_Liquor_Sales.csv")

KeyboardInterrupt: 

View Topics

Kafka Docker

1. `docker exec -it kafka1 bash`
1. `cd /usr/bin`
1. `kafka-topics --list --bootstrap-server localhost:9092`
1. `kafka-console-consumer --bootstrap-server localhost:9092 --topic mysql --from-beginning`

Manually flush data when done

In [4]:
producer.flush()

In [8]:
from kafka import KafkaAdminClient
from kafka.admin import NewPartitions, NewTopic

def clear_kafka_topic(topic_name, bootstrap_servers):
    admin_client = KafkaAdminClient(bootstrap_servers=bootstrap_servers)

    # Describe topic to get current offsets
    topic_metadata = admin_client.describe_topics(topic_names=[topic_name])
    partitions = topic_metadata[topic_name].partitions.values()

    # Set partition offsets to 0
    for partition in partitions:
        partition.offset = 0

    # Create new topic with same configuration and updated partition offsets
    new_topic = NewTopic(name=topic_name, num_partitions=len(partitions), replication_factor=1)
    new_topic.partitions = {partition.id: NewPartitions(partition.count, assignment=[0]) for partition in partitions}

    # Delete original topic
    admin_client.delete_topics([topic_name])

    # Recreate topic with updated offsets
    admin_client.create_topics([new_topic])

    admin_client.close()

# Example usage
clear_kafka_topic("mysql", ["kafka1:9092"])

NoBrokersAvailable: NoBrokersAvailable

Test

In [None]:
import pandas as pd

df = pd.read_csv("../data/Iowa_Liquor_Sales.csv", nrows=10)

print(df)

In [None]:
print(df.columns)

In [None]:
# sample output
'''
{'Invoice/Item Number': 'S29198800001', 'Date': '11/20/2015', 'Store Number': '2191', 'Store Name': 'Keokuk Spirits', 
'Address': '1013 MAIN', 'City': 'KEOKUK', 'Zip Code': '52632', 'Store Location': '1013 MAIN\nKEOKUK 52632\n(40.39978, -91.387531)', 
'County Number': '56', 'County': 'Lee', 'Category': '', 'Category Name': '', 'Vendor Number': '255', 'Vendor Name': 'Wilson Daniels Ltd.', 
'Item Number': '297', 'Item Description': 'Templeton Rye w/Flask', 'Pack': '6', 'Bottle Volume (ml)': '750', 'State Bottle Cost': '$18.09', 
'State Bottle Retail': '$27.14', 'Bottles Sold': '6', 'Sale (Dollars)': '$162.84', 'Volume Sold (Liters)': '4.50', 'Volume Sold (Gallons)': '1.19'}
'''

In [None]:
with open("../data/Iowa_Liquor_Sales.csv", 'r') as file:
    reader = csv.DictReader(file)
    while True:
        chunk = []
        for _ in range(1000):
            try:
                chunk.append(next(reader))
            except StopIteration:
                break
        if not chunk:
            break
        for row in chunk:
            print(row)