In [3]:
import csv
import os
from json import dumps
from time import sleep

import pandas as pd
import requests
from kafka import KafkaProducer

Connect to Kafka

In [None]:
# Shared Kafka producer for the notebook. Every message value is JSON-encoded
# and converted to UTF-8 bytes by the serializer before it is sent.
producer = KafkaProducer(
    bootstrap_servers=['100.25.219.224:9092'],
    value_serializer=lambda value: dumps(value).encode('utf-8'),
)

Start Streaming

In [None]:
# Endpoint of the MySQL API that receives each row. It is read from the
# MY_SQL_API_URL environment variable; to set it directly instead, replace the
# assignment below with e.g.
# MY_SQL_API_URL = 'http://your-MySQL-api-url/endpoint'
MY_SQL_API_URL = os.getenv('MY_SQL_API_URL')

# Seconds to pause after publishing each row.
SLEEP = 2

# Seconds to wait on the API before giving up on a row, so a stalled request
# cannot block the stream indefinitely.
API_TIMEOUT = 10


def publish_to_kafka(file_path, producer=producer, chunk_size=1000):
    """
    Send each row of a CSV file to the MySQL API and to the 'invoice' Kafka topic.

    Rows are streamed one at a time; the producer is flushed after every
    `chunk_size` rows and once more at the end, then closed.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to stream. The first line must be the header row.
    producer : KafkaProducer
        Producer used to publish rows. It must be configured with a JSON
        `value_serializer` (as the notebook's `producer` is), because each row
        is handed over as a dict and serialized by the producer itself.
        NOTE: the producer is closed before this function returns, so it cannot
        be reused for another call.
    chunk_size : int
        Number of rows published between producer flushes. Must be >= 1.

    Raises
    ------
    ValueError
        If `chunk_size` is smaller than 1.

    Notes
    -----
    * If MY_SQL_API_URL is not set, the API step is skipped and rows are only
      published to Kafka.
    * A failed API call for a row is reported and does not stop the stream.
    * SLEEP can be adjusted as desired. 1 is the minimum recommended setting.
      Setting it to 0 will crash the Kafka server unless a more robust EC2
      setup is created.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    if not MY_SQL_API_URL:
        print("MY_SQL_API_URL is not set; rows will be published to Kafka only.")

    # newline='' is what the csv module expects, so quoted fields containing
    # embedded line breaks are parsed as a single field.
    with open(file_path, 'r', newline='') as file:
        for row_number, row in enumerate(csv.DictReader(file), start=1):
            # Send row to MySQL API. `json=` serializes the dict itself; passing an
            # already-dumped string would double-encode it into a JSON string.
            if MY_SQL_API_URL:
                try:
                    response = requests.post(MY_SQL_API_URL, json=row, timeout=API_TIMEOUT)
                    response.raise_for_status()
                    print(f"API response: {response.json()}")
                except (requests.RequestException, ValueError) as e:
                    # ValueError covers a response body that is not valid JSON.
                    print(f"Error sending row to API: {e}")

            # Publish row to Kafka. The producer's value_serializer does the JSON
            # encoding, so the raw dict is passed (pre-encoded bytes would make
            # the serializer's dumps() call fail).
            producer.send('invoice', value=row)
            sleep(SLEEP)

            if row_number % chunk_size == 0:
                producer.flush()

    # Push out the final partial chunk, then release the producer (once).
    producer.flush()
    producer.close()

In [None]:
# Stream the Iowa liquor sales CSV, using the function's default producer and chunk_size.
publish_to_kafka("../data/Iowa_Liquor_Sales.csv")

Manually flush data when done

In [None]:
# Manual flush of the shared producer once streaming is done.
# NOTE(review): publish_to_kafka closes this producer before it returns — confirm flush() is still valid at this point.
producer.flush()

Test

In [6]:
# pandas is already imported as `pd` in the notebook's first cell, so it is not
# re-imported here. Only the first 10 rows are read: enough to list the CSV's
# column names without loading the whole file.
df = pd.read_csv("../data/Iowa_Liquor_Sales.csv", nrows=10)

print(df.columns)

Index(['Invoice/Item Number', 'Date', 'Store Number', 'Store Name', 'Address',
       'City', 'Zip Code', 'Store Location', 'County Number', 'County',
       'Category', 'Category Name', 'Vendor Number', 'Vendor Name',
       'Item Number', 'Item Description', 'Pack', 'Bottle Volume (ml)',
       'State Bottle Cost', 'State Bottle Retail', 'Bottles Sold',
       'Sale (Dollars)', 'Volume Sold (Liters)', 'Volume Sold (Gallons)'],
      dtype='object')


In [None]:
# Sample output: one CSV row printed as a dict (presumably as yielded by csv.DictReader in the
# test loop below — note every value is a string). Kept as a string literal for reference only.
'''
{'Invoice/Item Number': 'S29198800001', 'Date': '11/20/2015', 'Store Number': '2191', 'Store Name': 'Keokuk Spirits', 
'Address': '1013 MAIN', 'City': 'KEOKUK', 'Zip Code': '52632', 'Store Location': '1013 MAIN\nKEOKUK 52632\n(40.39978, -91.387531)', 
'County Number': '56', 'County': 'Lee', 'Category': '', 'Category Name': '', 'Vendor Number': '255', 'Vendor Name': 'Wilson Daniels Ltd.', 
'Item Number': '297', 'Item Description': 'Templeton Rye w/Flask', 'Pack': '6', 'Bottle Volume (ml)': '750', 'State Bottle Cost': '$18.09', 
'State Bottle Retail': '$27.14', 'Bottles Sold': '6', 'Sale (Dollars)': '$162.84', 'Volume Sold (Liters)': '4.50', 'Volume Sold (Gallons)': '1.19'}
'''

In [None]:
# Test of the chunked CSV read: pull rows in batches of up to 1000 and print them.
# NOTE(review): this prints every row of the file — consider stopping after the
# first batch if the CSV is large.
# newline='' is what the csv module expects, so quoted fields containing embedded
# line breaks (e.g. 'Store Location') are parsed as a single field.
with open("../data/Iowa_Liquor_Sales.csv", 'r', newline='') as file:
    reader = csv.DictReader(file)
    while True:
        # zip() stops as soon as range() runs out, before pulling another row
        # from the reader, so no row is dropped between batches.
        chunk = [row for _, row in zip(range(1000), reader)]
        if not chunk:
            break
        for row in chunk:
            print(row)