In [7]:
import os
from river import datasets
from confluent_kafka import Producer,Consumer
import certifi
import time
import json

In [19]:
user= os.environ['kafka_username']
password= os.environ['kafka_password']
bsts= os.environ['kafka_bootstrap_servers']

In [20]:
topic = 'malicious_url_events'

In [21]:
conf = {'bootstrap.servers': bsts,
            'sasl.mechanism': 'PLAIN',
            'security.protocol': 'SASL_SSL',
            'ssl.ca.location': certifi.where(),
            'sasl.username': user,
            'sasl.password': password,
            'batch.num.messages': 2048,
            #'queue.buffering.max.messages': 100,
            'linger.ms': 100,
            'client.id': 'producer-icde-2023'}
producer = Producer(conf)    

In [29]:
max_size=1000000
dataset = datasets.MaliciousURL()
data = dataset.take(max_size)


In [30]:
cnt = 0
st = time.time()
abs_st = time.time()
for f, y in data:
    cnt = cnt + 1    
    d = {}
    d['f']=f
    d['y']=str(y).lower()
    d['st']=time.time()  
            
    v= json.dumps(d).encode('utf-8')
    try:
        producer.produce(topic, value=v, key=str(cnt))
    except:
      print(f'Queue full, flushing {cnt}')
      producer.flush()
      producer.produce(topic, value=v, key=str(cnt))
    if cnt%10000==0:           
        end = time.time()
        print(f'flushing count - {cnt}, time taken in seconds- {end-st} ')        
        producer.flush()
        st = time.time()        
producer.flush()
end = time.time()
print(f'final flushing count - {cnt}, time taken in seconds- {end-abs_st} ')        

flushing count - 10000, time taken in seconds- 1.6444752216339111 
flushing count - 20000, time taken in seconds- 1.5919137001037598 
flushing count - 30000, time taken in seconds- 1.6054317951202393 
flushing count - 40000, time taken in seconds- 1.5896944999694824 
flushing count - 50000, time taken in seconds- 1.576538324356079 
flushing count - 60000, time taken in seconds- 1.6794867515563965 
flushing count - 70000, time taken in seconds- 1.6720309257507324 
flushing count - 80000, time taken in seconds- 1.6829123497009277 
flushing count - 90000, time taken in seconds- 1.598083734512329 
flushing count - 100000, time taken in seconds- 1.5724234580993652 
flushing count - 110000, time taken in seconds- 1.7231109142303467 
flushing count - 120000, time taken in seconds- 1.6440880298614502 
flushing count - 130000, time taken in seconds- 1.6821281909942627 
flushing count - 140000, time taken in seconds- 1.6320946216583252 
flushing count - 150000, time taken in seconds- 1.765151023

In [31]:
os.environ['PREDICTION_TOPIC_PREFIX']='malicious_url_predictions'
os.environ['PREDICTION_TOPIC_SUFFIX']='_8'
prediction_topic_prefix = os.environ['PREDICTION_TOPIC_PREFIX']
prediction_topic_suffix=os.environ['PREDICTION_TOPIC_SUFFIX']
PREDICTION_TOPIC=f'{prediction_topic_prefix}{prediction_topic_suffix}'

In [26]:
from river import metrics
auc = metrics.ROCAUC()
f1 = metrics.F1()
recall = metrics.MicroRecall()
predictions_consumer_conf = {'bootstrap.servers': bsts,
                     'sasl.username': user,
                     'sasl.password': password,
                     'sasl.mechanism': 'PLAIN',
                     'security.protocol': 'SASL_SSL',
                     'ssl.ca.location': certifi.where(),
                     'group.id': 'prediction-grp',
                     'enable.auto.commit': True,
                     'auto.offset.reset': 'latest'}
predictions_consumer = Consumer(predictions_consumer_conf)    
predictions_consumer.subscribe([PREDICTION_TOPIC])

In [None]:
{'bootstrap.servers': bsts,
                     'sasl.username': user,
                     'sasl.password': password,
                     'sasl.mechanism': 'PLAIN',
                     'security.protocol': 'SASL_SSL',
                     'ssl.ca.location': certifi.where(),
                     'group.id': 'prediction-grp',
                     'enable.auto.commit': True,
                     'auto.offset.reset': 'latest'}

In [15]:

durs = []
mem_usage = []
cnt = 0
while(cnt<10000:
    msg = predictions_consumer.poll(timeout=0.1)
    
    if msg is None: continue
    if msg.error():
        if msg.error().code() == KafkaError._PARTITION_EOF:
                sys.stderr.write('%% %s [%d] reached end at offset %d\n' %
                                         (msg.topic(), msg.partition(), msg.offset()))
    else:            
        message = json.loads(msg.value().decode("utf-8"))
        if(cnt%10000==0):
            print(message)
            
        cnt = cnt + 1
        score = message['score']
        y = message['y']
        durs.append(message['duration'])
        mem_usage.append(message['mem_usage'])
        auc = auc.update(y, score)
        f1 = f1.update(y, score)
        recall = recall.update(y, score)
            

{'y': True, 'score': True, 'duration': 180.6090109348297, 'mem_usage': 38928296}


KeyboardInterrupt: 

In [18]:
import statistics
mean = statistics.mean(durs)
median = statistics.median(durs)
max_dur = max(durs)
min_dur = min(durs)
avg_dur = min(durs)
total_records = len(durs)
memory_usage = statistics.mean(mem_usage)
print(max_dur)
print(min_dur)
print(avg_dur)
print(auc)
print(f1)
print(recall)
print(memory_usage)

306.9113175868988
180.6090109348297
180.6090109348297
ROCAUC: 86.20%
F1: 86.39%
MicroRecall: 86.20%
48231842.27287319
