In [2]:
import os
from river import datasets
from river import metrics
from river import tree
from river import ensemble
from river import evaluate
from river import compose
from river import naive_bayes
from river import anomaly
from river import compose
from river import datasets
from river import metrics
from river import preprocessing
from confluent_kafka import Producer,Consumer
import certifi
import time
import json
import pandas

In [3]:
classifiers=[]
classifier='HoeffdingAdaptiveTreeClassifier'
classifiers.append(classifier)
classifier='SRPClassifierHAT'
classifiers.append(classifier)
classifier='SRPClassifierNaiveBayes'
classifiers.append(classifier)
classifier='AdaptiveRandomForestClassifier'
classifiers.append(classifier)
classifier='HalfSpaceTrees'
classifiers.append(classifier)
model_artifact = None

In [13]:
max_size=1000
dataset = datasets.MaliciousURL()
d_train=[]
d_predict=[]
for classifier in classifiers:
    data = dataset.take(max_size)
    if classifier=='HoeffdingAdaptiveTreeClassifier':
            model_artifact = tree.HoeffdingAdaptiveTreeClassifier(grace_period=100,  delta=1e-5, leaf_prediction='nb', nb_threshold=10,seed=0)
    elif classifier=='SRPClassifierHAT':
            model_artifact =  ensemble.SRPClassifier(
                                                   model=tree.HoeffdingAdaptiveTreeClassifier(grace_period=100,  delta=1e-5, leaf_prediction='nb', nb_threshold=10, seed=0), seed=42,
                                                )
    elif classifier=='SRPClassifierNaiveBayes':
            model_artifact = ensemble.SRPClassifier(
                                                 model=naive_bayes.BernoulliNB(alpha=1), seed=42,
                                              )
    elif classifier=='AdaptiveRandomForestClassifier':
            model_artifact = ensemble.AdaptiveRandomForestClassifier(leaf_prediction="mc")

    elif classifier=='HalfSpaceTrees':
            model_artifact = compose.Pipeline(preprocessing.MinMaxScaler(),anomaly.HalfSpaceTrees(seed=42))
    cnt = 0
    training_st_ts = time.time()
    for f, y in data:
        cnt = cnt + 1
        model_artifact = model_artifact.learn_one(f,y)
    training_end_ts = time.time()
    total_duration = training_end_ts-training_st_ts
    avg_duration = (total_duration/max_size)     
    d_train.append({'Classifier':classifier,
              'Avg Model Training Time(ms)': avg_duration * 1000,
              'Expected Train Time 1 million records (s)':avg_duration * 1000000,
              'Expected time 1 million (mins)': avg_ts * (1000000/60),
              'Max Training Throughput (sub-second-response) ': round(1/avg_duration)})
    
    prediction_st_ts = time.time()
    for f, y in data:
        cnt = cnt + 1
        model_artifact.predict_one(f)
    prediction_end_ts = time.time()
    total_duration = prediction_end_ts-prediction_st_ts
    avg_duration = (total_duration/max_size)     
    d_predict.append({'Classifier':classifier,
              'Avg Model Prediction Time(ms)': avg_duration * 1000,
              'Expected Prediction Time 1 million records (s)':avg_duration * 1000000,
              'Expected time 1 million (mins)': avg_ts * (1000000/60),
              'Max Prediction Throughput (sub-second-response) ': round(1/avg_duration)})



In [14]:
print('Training')
df_train = pandas.DataFrame(data=d_train)
display(df_train)

print('Prediction')
df_predict = pandas.DataFrame(data=d_predict)
display(df_predict)

Training


Unnamed: 0,Classifier,Avg Model Training Time(ms),Expected Train Time 1 million records (s),Expected time 1 million (mins),Max Training Throughput (sub-second-response)
0,HoeffdingAdaptiveTreeClassifier,1.817648,1817.648411,0.011921,550
1,SRPClassifierHAT,28.374372,28374.372005,0.011921,35
2,SRPClassifierNaiveBayes,4.335978,4335.977793,0.011921,231
3,AdaptiveRandomForestClassifier,1.022464,1022.464037,0.011921,978
4,HalfSpaceTrees,0.221211,221.211195,0.011921,4521


Prediction


Unnamed: 0,Classifier,Avg Model Prediction Time(ms),Expected Prediction Time 1 million records (s),Expected time 1 million (mins),Max Prediction Throughput (sub-second-response)
0,HoeffdingAdaptiveTreeClassifier,2.384186e-07,0.000238,0.011921,4194304000
1,SRPClassifierHAT,7.152557e-07,0.000715,0.011921,1398101333
2,SRPClassifierNaiveBayes,2.384186e-07,0.000238,0.011921,4194304000
3,AdaptiveRandomForestClassifier,2.384186e-07,0.000238,0.011921,4194304000
4,HalfSpaceTrees,4.768372e-07,0.000477,0.011921,2097152000


## Results for Maximum Sustainable Throughtput Calculations 
Maximum sustainable throughput is the number of records we can train/predict in a single thread in 1 second

|    | Classifier                      |   Avg Model Training Time(ms) |   Expected Train Time 1 million records (s) |   Expected time 1 million (mins) |   Max Training Throughput (sub-second-response)  |
|---:|:--------------------------------|------------------------------:|--------------------------------------------:|---------------------------------:|-------------------------------------------------:|
|  0 | HoeffdingAdaptiveTreeClassifier |                      1.81765  |                                    1817.65  |                        0.0119209 |                                              550 |
|  1 | SRPClassifierHAT                |                     28.3744   |                                   28374.4   |                        0.0119209 |                                               35 |
|  2 | SRPClassifierNaiveBayes         |                      4.33598  |                                    4335.98  |                        0.0119209 |                                              231 |
|  3 | AdaptiveRandomForestClassifier  |                      1.02246  |                                    1022.46  |                        0.0119209 |                                              978 |
|  4 | HalfSpaceTrees                  |                      0.221211 |                                     221.211 |                        0.0119209 |                                             4521 |

|    | Classifier                      |   Avg Model Prediction Time(ms) |   Expected Prediction Time 1 million records (s) |   Expected time 1 million (mins) |   Max Prediction Throughput (sub-second-response)  |
|---:|:--------------------------------|--------------------------------:|-------------------------------------------------:|---------------------------------:|---------------------------------------------------:|
|  0 | HoeffdingAdaptiveTreeClassifier |                     2.38419e-07 |                                      0.000238419 |                        0.0119209 |                                         4194304000 |
|  1 | SRPClassifierHAT                |                     7.15256e-07 |                                      0.000715256 |                        0.0119209 |                                         1398101333 |
|  2 | SRPClassifierNaiveBayes         |                     2.38419e-07 |                                      0.000238419 |                        0.0119209 |                                         4194304000 |\n|  3 | AdaptiveRandomForestClassifier  |                     2.38419e-07 |                                      0.000238419 |                        0.0119209 |                                         4194304000 |
|  4 | HalfSpaceTrees                  |                     4.76837e-07 |                                      0.000476837 |                        0.0119209 |                                         2097152000 |