In [37]:
import os
from river import datasets
from river import ensemble
from confluent_kafka import Producer,Consumer
import certifi
import time
import json
import uuid

## Define the Producer Parameters

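1. `feature_topic` - Name of the Kafka topic the records are published to (`features`)
2. `flush_size` - Number of records produced between flushes; because the publish loop sleeps for `sleep_time` (1 second) after each flush, this caps throughput at roughly `flush_size` records per second
3. `max_size` - Total number of data points published to the `features` topic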

In [42]:
## Publish parameters

# Kafka topic that receives the feature records.
feature_topic = "features"

# Number of records produced between two flushes. Together with sleep_time
# (set in the publish cell) this throttles how many feature records are
# written to the feature topic per second.
flush_size = 350

# Upper bound on the number of records taken from the dataset and published.
# 5000 keeps the demo short; a more typical size would be 100000.
# In practice the features arrive continuously.
max_size = 5000



In [39]:
# Kafka connection settings are read from the environment so that no secrets
# live in the notebook. A missing variable raises KeyError right here.
user = os.environ["KAFKA_USER_NAME"]
password = os.environ["KAFKA_PASSWORD"]
bsts = os.environ["KAFKA_BOOTSTRAP_SERVERS"]

### Configure the Kafka Producer



In [40]:
# Client configuration handed to the confluent_kafka Producer.
conf = {
    # Broker address list, taken from the environment.
    "bootstrap.servers": bsts,
    # Authenticate with SASL/PLAIN over a TLS connection, verifying the
    # broker certificate against the certifi CA bundle.
    "sasl.mechanism": "PLAIN",
    "security.protocol": "SASL_SSL",
    "ssl.ca.location": certifi.where(),
    "sasl.username": user,
    "sasl.password": password,
    # Batching knobs (see the librdkafka configuration reference).
    "batch.num.messages": 4000,
    "linger.ms": 100,
    # Identifier this client reports to the brokers.
    "client.id": "producer-icde-2023",
}
producer = Producer(conf)

### Publish messages to the Kafka Topic


In [41]:
# Publish the first max_size samples of river's MaliciousURL dataset to the
# feature topic as JSON records, throttled to one batch of flush_size records
# per sleep_time seconds.
sleep_time = 1  # seconds to pause after each periodic flush
dataset = datasets.MaliciousURL()
data = dataset.take(max_size)

cnt = 0  # stays 0 when the stream yields nothing, so the final print still works
st = time.time()
abs_st = time.time()
for cnt, (f, y) in enumerate(data, start=1):
    # One record per sample: a unique id, the features, the label as a
    # lower-cased string, and the wall-clock time at which it was built.
    d = {
        'id': str(uuid.uuid4()),
        'f': f,
        'y': str(y).lower(),
        'st': time.time(),
    }
    v = json.dumps(d).encode('utf-8')

    # Only produce() is guarded, and only against BufferError (raised when the
    # producer's local queue is full). Anything else -- a Kafka error, a
    # KeyboardInterrupt -- propagates instead of being misreported as a full
    # queue and causing the same record to be produced twice.
    try:
        producer.produce(feature_topic, value=v, key=str(cnt))
    except BufferError:
        print(f'Queue full, flushing {cnt}')
        producer.flush()
        end = time.time()
        print(f'flushing count - {cnt}, time taken in seconds- {end-st} ')
        # The queue has been drained; retry the record that was rejected.
        producer.produce(feature_topic, value=v, key=str(cnt))

    # Throttle: after every flush_size records, drain the queue and pause.
    # (The message text is kept as-is; this is a scheduled flush, the queue
    # is not necessarily full at this point.)
    if cnt % flush_size == 0:
        print(f'Queue full, flushing {cnt}')
        producer.flush()
        time.sleep(sleep_time)

# Deliver whatever is still queued after the last partial batch.
producer.flush()
end = time.time()
print(f'final flushing count - {cnt}, time taken in seconds- {end-abs_st} ')

Queue full, flushing 350
Queue full, flushing 700
Queue full, flushing 1050
Queue full, flushing 1400
Queue full, flushing 1750
Queue full, flushing 2100
Queue full, flushing 2450
Queue full, flushing 2800
Queue full, flushing 3150
Queue full, flushing 3500
Queue full, flushing 3850
Queue full, flushing 4200
Queue full, flushing 4550
Queue full, flushing 4900
final flushing count - 5000, time taken in seconds- 17.491112232208252 
