In [10]:
import os

from river import datasets
from river import metrics
from river import tree
from river import ensemble
from river import evaluate
from river import compose
from river import naive_bayes
from river import anomaly
from river import compose
from river import datasets
from river import metrics
from river import preprocessing
from confluent_kafka import Producer,Consumer
import certifi
import time
import json
import pandas

In [11]:
user= os.environ['kafka_username']
password= os.environ['kafka_password']
bsts= os.environ['kafka_bootstrap_servers']

In [12]:
feature_topic = 'features_v1'
model = compose.Pipeline(preprocessing.MinMaxScaler(),anomaly.HalfSpaceTrees(seed=42))

In [13]:
def consume_messages(group_id,model,only_predict=False):    
    features_consumer_conf = {'bootstrap.servers': bsts,
                          'sasl.username': user,
                          'sasl.password': password,
                          'sasl.mechanism': 'PLAIN',
                          'security.protocol': 'SASL_SSL',
                          'ssl.ca.location': certifi.where(),
                          'group.id': group_id,
                          'enable.auto.commit': True,
                          'auto.commit.interval.ms':1000,         
                          'auto.offset.reset': 'latest'}
    features_consumer = Consumer(features_consumer_conf)  
    
    print(f'\nNow subscribing to features topic:{feature_topic}')
        
    features_consumer.subscribe([feature_topic])
    cnt = 0
    msg = None
    error_cnt = 0
    end_learn_ts = 0
    st_learn_ts = 0

    st_processing_time = 0
    
    learning_durations=[]
    prediction_durations=[]
    processing_durations = []
    score_and_truth = []
    mem_usage = []
    end_to_end_processing_durations = []
    while(True):           
        messages = features_consumer.consume(num_messages=10000,timeout=0.1)    
        if len(messages)==0: continue
        for msg in messages:
            if msg is None: continue
            if msg.error():
                error_cnt = error_cnt + 1
                if msg.error().code() == KafkaError._PARTITION_EOF:                    
                        if(error_cnt%1000==0):
                            print('error')
                            print(msg)
                        sys.stderr.write('%% %s [%d] reached end at offset %d\n' %
                                                 (msg.topic(), msg.partition(), msg.offset()))
            else:       
                try:         
                    msg_arrival_time = time.time()
                    message = json.loads(msg.value().decode("utf-8"))            
                    cnt = cnt + 1

                    f = message['f']
                    y = (message['y']=='true')              
                    msg_produce_ts = message['st']
                    if(cnt==1):
                        st_processing_time = time.time()

                    st_prediction_time = time.time()            
                    score = model_artifact.predict_one(f)
                    score_and_truth.append({'y':y,'score':score})
                    end_prediction_time = time.time()  
                    prediction_durations.append(end_prediction_time-st_prediction_time)

                    if not only_predict:
                        st_learn_ts = time.time()
                        model_artifact = model_artifact.learn_one(f,y)      
                        end_learn_ts = time.time()
                        learning_durations.append(end_learn_ts-st_learn_ts)            

                    msg_departure_time = time.time()
                    processing_durations.append(msg_departure_time-msg_arrival_time)
                    end_to_end_processing_durations.append(msg_departure_time-msg_produce_ts)
                    if(cnt%100==0):
                        mem_usage.append(model_artifact._raw_memory_usage)
                except Exception as  e:      
                    print(json.loads(msg.value().decode("utf-8")))
                    print(e, file=sys.stdout)
                    ignored = ignored + 1
                    print(f'ignored ={ignored} total = {cnt}')

    print('CLOSING')
    features_consumer.commit()
    features_consumer.close() 
    total_time = time.time() - st_processing_time
    return score_and_truth,processing_durations,end_to_end_processing_durations, prediction_durations, learning_durations,mem_usage,total_time

In [14]:
import statistics
from river import metrics

def print_time_results(durations, type_of_duration):
    if len(durations)==0:
        return
    mean = statistics.mean(durations)
    median = statistics.median(durations)
    max_dur = max(durations)
    min_dur = min(durations)  
    print(f'Type of durations : {type_of_duration} ' )
    print(f'\tAVG : {mean}')
    print(f'\MEDIAN : {median}')
    print(f'\MAX : {max_dur}')
    print(f'\MIN : {min_dur}')
    
def print_results(score_and_truth,processing_durations,end_to_end_processing_durations, prediction_durations, learning_durations,mem_usage,total_time):
    auc = metrics.ROCAUC()
    f1 = metrics.F1()
    recall = metrics.MicroRecall()
    for m in score:
        y = m['y']
        score = m['score']
        auc = auc.update(y,score)
        f1 = f1.update(y, score)
        recall = recall.update(y, score)
    
  
    total_records = len(durations)
    avg_memory_usage = statistics.mean(mem_usage)
    print(f'Messages consumed:{total_records},Total Cumulative Time: {total_time}')    
    print(f'AUC{auc}')
    print(f'F1 {f1}')
    print(f'RECALL {recall}')
    print(f'AVERAGE MEMORY USAGE {avg_memory_usage}')
    print_time_results(processing_durations,f'PROCESSING DURATIONS FOR {processing_durations}')
    print_time_results(prediction_durations,f'PREDICTION DURATIONS FOR {prediction_durations}')
    print_time_results(prediction_durations,f'LEARNING DURATIONS FOR {learning_durations}')
    print_time_results(processing_durations,f'END TO END PROCESSING DURATIONS FOR {processing_durations}')



In [15]:

group_id = 'HSFT_1'
only_predict=False
score_and_truth,processing_durations, end_to_end_processing_durations,prediction_durations, learning_durations,mem_usage,total_time = consume_messages(group_id,model,only_predict)
print_results(score_and_truth,processing_durations, end_to_end_processing_durations,prediction_durations, learning_durations,mem_usage,total_time)


Now subscribing to features topic:features_v1


KeyboardInterrupt: 