In [1]:
import json
import os
import sys
import time

import certifi
from confluent_kafka import Consumer, KafkaError, Producer
from river import datasets

In [2]:
# Kafka connection settings are read from the environment so that no
# credentials are stored in the notebook; a missing variable raises KeyError.
user, password, bsts = (
    os.environ[name]
    for name in ('kafka_username', 'kafka_password', 'kafka_bootstrap_servers')
)

In [3]:
# Kafka topic that the producer loop publishes the labelled URL events to.
topic = "malicious_url_events"

In [4]:
# Producer configuration: SASL/PLAIN over TLS, verified against certifi's CA
# bundle, with batching settings for the bulk publish below.
conf = {
    # --- connection / authentication ---
    'bootstrap.servers': bsts,
    'sasl.mechanism': 'PLAIN',
    'security.protocol': 'SASL_SSL',
    'ssl.ca.location': certifi.where(),
    'sasl.username': user,
    'sasl.password': password,
    # --- batching ---
    'batch.num.messages': 2048,
    # 'queue.buffering.max.messages': 100,  (disabled)
    'linger.ms': 100,
    # --- identification ---
    'client.id': 'producer-icde-2023',
}
producer = Producer(conf)

In [5]:
# Take only the first `max_size` samples of the MaliciousURL dataset.
max_size = 10_000
dataset = datasets.MaliciousURL()
data = dataset.take(max_size)


In [6]:
# Publish every (features, label) pair from `data` to `topic` as a JSON
# message keyed by its 1-based sequence number, flushing and reporting
# progress every `flush_every` records.
flush_every = 1024      # records between explicit flushes / progress reports
cnt = 0                 # stays 0 if `data` yields nothing
st = time.time()        # start of the current batch
abs_st = time.time()    # start of the whole run
for cnt, (f, y) in enumerate(data, start=1):
    # 'st' is the send timestamp carried inside the message payload.
    d = {'f': f, 'y': str(y).lower(), 'st': time.time()}
    v = json.dumps(d).encode('utf-8')
    try:
        producer.produce(topic, value=v, key=str(cnt))
    except BufferError:
        # produce() raises BufferError when the local queue is full: drain
        # it and retry once. Any other exception is a real failure and is
        # allowed to propagate instead of being reported as "queue full".
        print(f'Queue full, flushing {cnt}')
        producer.flush()
        producer.produce(topic, value=v, key=str(cnt))
    if cnt % flush_every == 0:
        # `end` must be assigned before it is read; previously this raised
        # NameError on a fresh kernel when the first batch completed.
        end = time.time()
        print(f'flushing count - {cnt}, time taken in seconds- {end-st} ')
        producer.flush()
        time.sleep(1)   # pause between batches
        st = time.time()

# Deliver whatever is still queued before reporting the total run time.
producer.flush()
end = time.time()
print(f'final flushing count - {cnt}, time taken in seconds- {end-abs_st} ')

final flushing count - 10000, time taken in seconds- 13.317464590072632 


In [None]:
# The prediction topic name is "<prefix><suffix>". Both parts are written to
# the process environment first and then read back from it.
os.environ.update({
    'PREDICTION_TOPIC_PREFIX': 'malicious_url_predictions',
    'PREDICTION_TOPIC_SUFFIX': '_8',
})
prediction_topic_prefix = os.environ['PREDICTION_TOPIC_PREFIX']
prediction_topic_suffix = os.environ['PREDICTION_TOPIC_SUFFIX']
PREDICTION_TOPIC = prediction_topic_prefix + prediction_topic_suffix

In [None]:
from river import metrics

# Online metrics that the consume loop updates one prediction at a time.
auc = metrics.ROCAUC()
f1 = metrics.F1()
recall = metrics.MicroRecall()

# Consumer for the prediction topic: same SASL_SSL connection settings as the
# producer, plus consumer-group behaviour (auto-commit, start from latest).
predictions_consumer_conf = {
    'bootstrap.servers': bsts,
    'sasl.username': user,
    'sasl.password': password,
    'sasl.mechanism': 'PLAIN',
    'security.protocol': 'SASL_SSL',
    'ssl.ca.location': certifi.where(),
}
predictions_consumer_conf.update({
    'group.id': 'prediction-grp',
    'enable.auto.commit': True,
    'auto.offset.reset': 'latest',
})
predictions_consumer = Consumer(predictions_consumer_conf)
predictions_consumer.subscribe([PREDICTION_TOPIC])

In [None]:
# Preview of the consumer configuration. As the last expression of the cell
# this dict is rendered into the notebook output, so the password is masked
# to keep the credential out of saved or shared notebooks.
{'bootstrap.servers': bsts,
                     'sasl.username': user,
                     'sasl.password': '********',
                     'sasl.mechanism': 'PLAIN',
                     'security.protocol': 'SASL_SSL',
                     'ssl.ca.location': certifi.where(),
                     'group.id': 'prediction-grp',
                     'enable.auto.commit': True,
                     'auto.offset.reset': 'latest'}

In [None]:

# Read prediction messages until 10000 have been scored, collecting the
# per-record duration and memory usage and updating the online metrics.
durs = []        # message['duration'] of every consumed prediction
mem_usage = []   # message['mem_usage'] of every consumed prediction
cnt = 0
while cnt < 10000:
    msg = predictions_consumer.poll(timeout=0.1)
    if msg is None:
        # Nothing arrived within the poll timeout; try again.
        continue

    err = msg.error()
    if err:
        if err.code() == KafkaError._PARTITION_EOF:
            sys.stderr.write('%% %s [%d] reached end at offset %d\n' %
                             (msg.topic(), msg.partition(), msg.offset()))
        else:
            # Report other consumer errors instead of dropping them silently.
            sys.stderr.write(f'Consumer error: {err}\n')
        continue

    message = json.loads(msg.value().decode("utf-8"))
    if cnt % 1000 == 0:
        # Print a sample message every 1000 records.
        print(message)

    cnt = cnt + 1
    score = message['score']
    y = message['y']
    durs.append(message['duration'])
    mem_usage.append(message['mem_usage'])
    # Metrics are updated in place. The return value is deliberately not
    # assigned back: on river versions where update() returns None, doing so
    # would replace the metric object with None after the first record.
    # NOTE(review): assumes `y`/`score` already have the types these metrics
    # expect - confirm against the service that writes the prediction topic.
    auc.update(y, score)
    f1.update(y, score)
    recall.update(y, score)
            

In [None]:
import statistics

# Summarise the collected per-record durations and memory usage, then print
# them together with the streaming metrics.
mean = statistics.mean(durs)
median = statistics.median(durs)
max_dur = max(durs)
min_dur = min(durs)
avg_dur = mean  # previously min(durs), which printed the minimum twice
total_records = len(durs)
memory_usage = statistics.mean(mem_usage)
print(max_dur)
print(min_dur)
print(avg_dur)
print(auc)
print(f1)
print(recall)
print(memory_usage)

In [None]:
# Imports for the local river experiments in the cells below.
# `datasets`, `metrics` and `compose` are each imported twice in this cell.
from river import datasets
from river import metrics
from river import tree
from river import ensemble
from river import evaluate
from river import compose
from river import naive_bayes
# NOTE(review): this rebinds the name `time` from the module imported in the
# first cell to the function time.time. Once this cell has run, re-running
# the producer cell fails on `time.time()` / `time.sleep()`.
from time import time

from river import anomaly
from river import compose
from river import datasets
from river import metrics
from river import preprocessing

In [None]:
# Adaptive random forest classifier built with a fixed seed.
model = ensemble.AdaptiveRandomForestClassifier(
    leaf_prediction="mc",
    seed=8,
)

In [None]:
# Exploratory: list the names exposed by the `river.ensemble` module.
dir(ensemble)