# Testing different producers

This notebook contains the code for the Producer which creates the messages. The data source is the websocket from binance (https://binance-docs.github.io/apidocs/websocket_api/en/#change-log) and the Twitter API.

## Imports

In [95]:
from kafka import KafkaProducer, KafkaConsumer
import json
import uuid
import websocket
from datetime import datetime
import requests

url = "https://api.twitter.com/2/tweets/search/recent?query=%23crypto%20OR%20%23ethereum%20OR%20%23eth%20OR%20%23btc%20OR%20%23bitcoin&max_results=100&tweet.fields=created_at,text,author_id"

with open("secrets.env", "r") as f:
    secrets = json.loads(str(f.read()))

bearerToken = secrets['bearerToken']

## Adjusting websocket code to write data directly into Kafka Cluster.

In [1]:
from kafka_functions import *
from datetime import datetime
import websocket
import uuid
import requests

url = "https://api.twitter.com/2/tweets/search/recent?query=%23crypto%20OR%20%23ethereum%20OR%20%23eth%20OR%20%23btc%20OR%20%23bitcoin&max_results=100&tweet.fields=created_at,text,author_id"

with open("secrets.env", "r") as f:
    secrets = json.loads(str(f.read()))

bearerToken = secrets['bearerToken']

In [2]:
test_array = []

def on_open(ws):
    print("Connection opened")
    # Subscribe to different ticker streams
    subscribe_message = {
        "method": "SUBSCRIBE",
        "params": [
            "btcusdt@ticker",
            "ethusdt@ticker",
            "maticusdt@ticker",
            "shibusdt@ticker",
            "solusdt@ticker"
        ],
        "id": 1
    }
    ws.send(json.dumps(subscribe_message))

def on_message(ws, message):
    # close after 3 seconds for testing
    if (datetime.now() - start).seconds > 7:
        ws.close()

    # write data into cluster
    key = str(uuid.uuid4())
    publish_message(binance_producer, binance_topic, key, json.loads(message))
    print(f"{datetime.now()}: writing message to cluster: {str(message)[:50]}...")
    
    # append data to array for testing
    test_array.append((key, json.loads(message)))

def on_close(ws):
    print("Connection closed")

binance_start = datetime.timestamp(datetime.now())

# consume messages from previous sessions
_ = consume_messages(binance_consumer)

if __name__ == "__main__":
    # Initialize WebSocket connection
    ws = websocket.WebSocketApp(
        "wss://stream.binance.com:9443/ws",
        on_open=on_open,
        on_message=on_message,
        on_close=on_close
    )
    # only for testing
    start = datetime.now()
    # Start WebSocket connection
    ws.run_forever()

Consumed 71 messages from Kafka Cluster
Connection opened
Using <function <lambda> at 0x7f5bc04001f0> as value serializer.
Using <function <lambda> at 0x7f5bc0400310> as key serializer.
Message published successfully.
2023-03-09 09:36:44.676626: writing message to cluster: {"result":null,"id":1}...
Using <function <lambda> at 0x7f5bc04001f0> as value serializer.
Using <function <lambda> at 0x7f5bc0400310> as key serializer.
Message published successfully.
2023-03-09 09:36:44.696084: writing message to cluster: {"e":"24hrTicker","E":1678354604586,"s":"SOLUSDT",...
Using <function <lambda> at 0x7f5bc04001f0> as value serializer.
Using <function <lambda> at 0x7f5bc0400310> as key serializer.
Message published successfully.
2023-03-09 09:36:45.265317: writing message to cluster: {"e":"24hrTicker","E":1678354604854,"s":"SHIBUSDT"...
Using <function <lambda> at 0x7f5bc04001f0> as value serializer.
Using <function <lambda> at 0x7f5bc0400310> as key serializer.
Message published successfully.


### Testing produced and consumed messages

In [4]:
# retrieving messages
consumed_messages = consume_messages(binance_consumer)

# testing if write & read operations work as intended
for i, message in enumerate(consumed_messages):
    assert message[1] == test_array[i][1]
    assert message[0].decode('utf-8') == test_array[i][0]
print("Consumed messages match produced messages.")

Consumed 765 messages from Kafka Cluster


AssertionError: 

## Twitter Producer

In [2]:
test_array = []
start, timestamp_last_request = datetime.now(), datetime.now()

twitter_start = datetime.timestamp(datetime.now())

# consume messages from other sessions
_ = consume_messages(twitter_consumer)

if __name__ == "__main__":
    while (datetime.now() - start).seconds < 21:
        # time limit to stay within api regulations
        if (datetime.now() - timestamp_last_request).seconds > 10:
            # get tweets
            tweets = json.loads(requests.get(url=url, headers={'Authorization': f"Bearer {bearerToken}"}).text)['data']
            for tweet in tweets:
                # formatting for json serializing
                formatted_tweet = json.dumps(tweet)
                
                # write to cluster
                key = str(uuid.uuid4())
                publish_message(twitter_producer, twitter_topic, key, formatted_tweet)
                print(f"{datetime.now()}: writing message to cluster: {formatted_tweet[:50]}...")
                
                # append to test_array
                test_array.append((key, formatted_tweet))
            
            timestamp_last_request = datetime.now()

Consumed 105 messages from Kafka Cluster
Using <function <lambda> at 0x7fa23abbee60> as value serializer.
Using <function <lambda> at 0x7fa23abbef80> as key serializer.
Message published successfully.
2023-03-09 09:40:36.529657: writing message to cluster: {"created_at": "2023-03-09T09:40:25.000Z", "id": "...
Using <function <lambda> at 0x7fa23abbee60> as value serializer.
Using <function <lambda> at 0x7fa23abbef80> as key serializer.
Message published successfully.
2023-03-09 09:40:36.531426: writing message to cluster: {"created_at": "2023-03-09T09:40:25.000Z", "id": "...
Using <function <lambda> at 0x7fa23abbee60> as value serializer.
Using <function <lambda> at 0x7fa23abbef80> as key serializer.
Message published successfully.
2023-03-09 09:40:36.533172: writing message to cluster: {"created_at": "2023-03-09T09:40:25.000Z", "id": "...
Using <function <lambda> at 0x7fa23abbee60> as value serializer.
Using <function <lambda> at 0x7fa23abbef80> as key serializer.
Message published suc

### Testing produced and consumed messages

In [3]:
# retrieve messages
consumed_messages = consume_messages(twitter_consumer)

# testing if write & read operations work as intended
for i, message in enumerate(consumed_messages):
    assert message[1] == test_array[i][1]
    assert message[0] == test_array[i][0]
print("Consumed messages match produced messages.")

Consumed 100 messages from Kafka Cluster
Consumed messages match produced messages.


In [5]:
messages = consume_messages(twitter_consumer)

Consumed 700 messages from Kafka Cluster
