# Producer Notebook

This notebook contains the code for the producers that create the messages. The data sources are the Binance WebSocket API (https://binance-docs.github.io/apidocs/websocket_api/en/#change-log) and the Twitter API v2 recent-search endpoint.

## Imports

In [1]:
import json
import time
import uuid
from datetime import datetime

import requests
import websocket
from kafka import KafkaProducer, KafkaConsumer

# Twitter API v2 recent-search request: tweets tagged #crypto, #ethereum, #eth,
# #btc or #bitcoin, at most 100 per call, returning created_at, text and author_id.
url = "https://api.twitter.com/2/tweets/search/recent?query=%23crypto%20OR%20%23ethereum%20OR%20%23eth%20OR%20%23btc%20OR%20%23bitcoin&max_results=100&tweet.fields=created_at,text,author_id"

# secrets.env is parsed as a JSON document; the credential never appears in the notebook itself.
with open("secrets.env", "r") as f:
    secrets = json.load(f)

# Bearer token sent in the Authorization header of the Twitter requests.
bearerToken = secrets['bearerToken']

## Defining functions for producing and consuming messages, defining producer and consumer

In [2]:
# defining connection
# Bootstrap brokers of the Kafka cluster; the individual names stay available
# next to the combined list.
servers = ['broker1:9093', 'broker2:9095', 'broker3:9097']
server1, server2, server3 = servers

# One topic per data source.
binance_topic = "binance-ws"
twitter_topic = "twitter"

# using functions from example notebook
# connect_kafka_producer is defined further down in this cell, after consume_xy.
# A second, shadowed copy that used to sit here was dead code and has been removed.

def publish_message(producer_instance, topic_name, key, value):
    """Send one UTF-8 encoded key/value record to a Kafka topic and confirm delivery.

    Args:
        producer_instance: Producer object offering ``send`` (returning a
            future-like object with ``get``) and ``flush``; in this notebook a
            ``KafkaProducer``.
        topic_name: Name of the topic the record is written to.
        key: Record key as ``str``.
        value: Record value as ``str``.

    Returns:
        None. The outcome is reported on stdout. Any exception is caught and
        printed so that one failed record does not stop the calling loop.
    """
    try:
        key_bytes = key.encode('utf-8')
        value_bytes = value.encode('utf-8')
        future = producer_instance.send(topic_name, key=key_bytes, value=value_bytes)
        producer_instance.flush()
        # send() only queues the record; resolving the returned future is what
        # surfaces a delivery failure. Without it, success would be printed
        # even when the broker rejected the record.
        future.get(timeout=10)
        print('Message published successfully.')
    except Exception as ex:
        print('Exception in publishing message')
        print(str(ex))

def consume_xy(consumer, topic_name):
    """Drain a consumer and return its records as ``(key, value)`` tuples.

    Iterates ``consumer`` until it stops yielding records, then prints how
    many were read.

    Args:
        consumer: Iterable of records exposing ``key`` (``bytes`` or ``None``)
            and ``value`` attributes; in this notebook a ``KafkaConsumer``.
        topic_name: Not used by the function; kept so existing calls keep working.

    Returns:
        List of ``(key, value)`` tuples in consumption order. Keys are decoded
        as UTF-8; a record without a key yields ``None`` as its key.
    """
    messages = [
        # Records may carry no key; decoding None would abort the whole drain.
        (msg.key.decode('utf-8') if msg.key is not None else None, msg.value)
        for msg in consumer
    ]
    print(f"Consumed {len(messages)} messages from Kafka Cluster")
    return messages

def connect_kafka_producer(servers):
    """Create a ``KafkaProducer`` for the given bootstrap servers.

    Args:
        servers: Bootstrap server address(es), passed through as
            ``bootstrap_servers``.

    Returns:
        The connected producer, or ``None`` when creating it raised an
        ``Exception`` (the error is printed instead of propagated).
    """
    # Explicit returns instead of `return` inside `finally`: a return in
    # `finally` would also swallow KeyboardInterrupt / SystemExit.
    try:
        return KafkaProducer(bootstrap_servers=servers, api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka')
        print(str(ex))
        return None

# Options shared by both consumers: start at the earliest offset, decode each
# value as JSON, and use a 1000 ms consumer timeout.
_consumer_settings = dict(
    auto_offset_reset='earliest',
    bootstrap_servers=servers,
    api_version=(0, 10),
    value_deserializer=json.loads,
    consumer_timeout_ms=1000,
)

# setting up consumer for binance
binance_consumer = KafkaConsumer(binance_topic, **_consumer_settings)

# setting up new consumer for twitter messages
twitter_consumer = KafkaConsumer(twitter_topic, **_consumer_settings)

# single producer used for both topics
producer = connect_kafka_producer(servers)

## Adjusting websocket code to write data directly into Kafka Cluster.

In [7]:
# (key, decoded message) tuples appended by on_message; the verification cell
# below compares them with what is read back from the cluster.
test_array: list = []

def on_open(ws):
    """Announce the opened connection and subscribe to the ticker streams.

    Args:
        ws: Connection object; only its ``send`` method is used, to transmit
            the JSON-encoded SUBSCRIBE request.
    """
    print("Connection opened")

    # Ticker streams to subscribe to
    ticker_streams = [
        "btcusdt@ticker",
        "ethusdt@ticker",
        "maticusdt@ticker",
        "shibusdt@ticker",
        "solusdt@ticker",
    ]
    payload = json.dumps({"method": "SUBSCRIBE", "params": ticker_streams, "id": 1})
    ws.send(payload)

def on_message(ws, message):
    """Publish one raw websocket message to Kafka and record it for verification.

    Relies on the module-level names ``start``, ``producer``, ``binance_topic``
    and ``test_array``.

    Args:
        ws: Connection object; closed once the test time window has passed.
        message: Raw message text; it must be valid JSON because it is decoded
            before being recorded.
    """
    # close after 3 seconds for testing; the message that triggers the close
    # is still published and recorded below
    elapsed = datetime.now() - start
    if elapsed.seconds > 3:
        ws.close()

    # write data into cluster under a fresh random key
    record_key = str(uuid.uuid4())
    publish_message(producer, binance_topic, record_key, message)
    print(f"{datetime.now()}: writing message to cluster: {str(message)[:50]}...")

    # remember key and decoded payload for testing
    decoded = json.loads(message)
    test_array.append((record_key, decoded))

def on_close(ws, close_status_code=None, close_msg=None):
    """Report that the websocket connection has been closed.

    Args:
        ws: Connection object that was closed (unused).
        close_status_code: Close status code supplied by the websocket
            library, if any (unused).
        close_msg: Close reason supplied by the websocket library, if any
            (unused).

    The two optional parameters let the callback work whether it is invoked
    with one argument or with three; a one-argument-only callback raises
    TypeError when it is called with the status code and message.
    """
    print("Connection closed")

if __name__ == "__main__":
    # only for testing: reference time that on_message measures against
    start = datetime.now()

    # Initialize WebSocket connection with the three callbacks defined above
    callbacks = dict(on_open=on_open, on_message=on_message, on_close=on_close)
    ws = websocket.WebSocketApp("wss://stream.binance.com:9443/ws", **callbacks)

    # Start WebSocket connection; blocks until the connection ends
    ws.run_forever()

Connection opened
Message published successfully.
2023-03-01 14:42:00.563192: writing message to cluster: {"result":null,"id":1}...
Message published successfully.
2023-03-01 14:42:00.777560: writing message to cluster: {"e":"24hrTicker","E":1677681720795,"s":"BTCUSDT",...
Message published successfully.
2023-03-01 14:42:00.780052: writing message to cluster: {"e":"24hrTicker","E":1677681720796,"s":"ETHUSDT",...
Message published successfully.
2023-03-01 14:42:00.870126: writing message to cluster: {"e":"24hrTicker","E":1677681720846,"s":"SHIBUSDT"...
Message published successfully.
2023-03-01 14:42:00.905272: writing message to cluster: {"e":"24hrTicker","E":1677681720761,"s":"SOLUSDT",...
Message published successfully.
2023-03-01 14:42:01.270851: writing message to cluster: {"e":"24hrTicker","E":1677681721199,"s":"MATICUSDT...
Message published successfully.
2023-03-01 14:42:01.831673: writing message to cluster: {"e":"24hrTicker","E":1677681721796,"s":"BTCUSDT",...
Message publishe

### Testing produced and consumed messages

In [8]:
consumed_messages = consume_xy(binance_consumer, binance_topic)

# testing if write & read operations work as intended
# Compare the counts first: iterating only over the consumed messages would
# pass vacuously if fewer (or no) messages came back than were produced.
assert len(consumed_messages) == len(test_array), (
    f"Consumed {len(consumed_messages)} messages but produced {len(test_array)}."
)
for i, message in enumerate(consumed_messages):
    assert message[1] == test_array[i][1], f"Value mismatch at message {i}"
    assert message[0] == test_array[i][0], f"Key mismatch at message {i}"
print("Consumed messages match produced messages.")

Consumed 14 messages from Kafka Cluster
Consumed messages match produced messages.


## Twitter Producer

In [9]:
# (key, JSON string) tuples of every tweet written, for the verification cell below
test_array = []
start, timestamp_last_request = datetime.now(), datetime.now()


if __name__ == "__main__":
    # Poll for a bit more than 20 seconds in total.
    while (datetime.now() - start).seconds < 21:
        # time limit to stay within api regulations
        if (datetime.now() - timestamp_last_request).seconds > 10:
            # get tweets; the timeout keeps a stalled connection from hanging the cell
            response = requests.get(
                url=url,
                headers={'Authorization': f"Bearer {bearerToken}"},
                timeout=10,
            )
            # Surface HTTP errors (bad token, rate limit, ...) as such instead
            # of failing later with an unrelated KeyError.
            response.raise_for_status()
            # A response without a 'data' field (e.g. no matching tweets)
            # yields no records instead of raising KeyError.
            tweets = response.json().get('data', [])
            for tweet in tweets:
                # formatting for json serializing
                formatted_tweet = json.dumps(tweet)

                # write to cluster
                key = str(uuid.uuid4())
                publish_message(producer, twitter_topic, key, formatted_tweet)
                print(f"{datetime.now()}: writing message to cluster: {formatted_tweet[:50]}...")

                # append to test_array
                test_array.append((key, formatted_tweet))

            timestamp_last_request = datetime.now()
        else:
            # Wait instead of spinning at full CPU until the next request is due.
            time.sleep(0.5)

Message published successfully.
2023-03-01 14:42:16.975186: writing message to cluster: {"author_id": "896966032841097217", "edit_history_...
Message published successfully.
2023-03-01 14:42:16.976599: writing message to cluster: {"author_id": "1619566342511927299", "edit_history...
Message published successfully.
2023-03-01 14:42:16.977596: writing message to cluster: {"author_id": "1621023784244510725", "edit_history...
Message published successfully.
2023-03-01 14:42:16.978598: writing message to cluster: {"author_id": "1539171807755919361", "edit_history...
Message published successfully.
2023-03-01 14:42:16.979607: writing message to cluster: {"author_id": "1599686721297211392", "edit_history...
Message published successfully.
2023-03-01 14:42:16.980839: writing message to cluster: {"author_id": "1539533124073402368", "edit_history...
Message published successfully.
2023-03-01 14:42:16.981844: writing message to cluster: {"author_id": "1620671526726148097", "edit_history...
Messag

### Testing produced and consumed messages

In [10]:
consumed_messages = consume_xy(twitter_consumer, twitter_topic)

# testing if write & read operations work as intended
# Compare the counts first: iterating only over the consumed messages would
# pass vacuously if fewer (or no) messages came back than were produced.
assert len(consumed_messages) == len(test_array), (
    f"Consumed {len(consumed_messages)} messages but produced {len(test_array)}."
)
for i, message in enumerate(consumed_messages):
    # test_array holds the JSON string that was sent; the consumer already decoded it
    assert message[1] == json.loads(test_array[i][1]), f"Value mismatch at message {i}"
    assert message[0] == test_array[i][0], f"Key mismatch at message {i}"
print("Consumed messages match produced messages.")

Consumed 100 messages from Kafka Cluster
Consumed messages match produced messages.
