# Incremental Learning 1: Two-Step Incremental Learning

1. Retrieve the DGA model (M1), which was trained on the full dataset minus sunburst, and retrieve the sunburst dataset (S).
2. Split sunburst dataset into an update dataset (U) and a test dataset (T).
3. Evaluate M1 on T.
4. Update M1 with U to obtain M2, and evaluate M2 on T.
5. Compare M1 and M2 accuracy on T.

In [31]:
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

from incremental_learning.config import datasets_dir, root_dir, jobs_dir
from incremental_learning.job import update, evaluate, Job
from incremental_learning.storage import download_dataset

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Retrieve DGA model (M1) and sunburst dataset (S)

In [5]:
# Name of the sunburst dataset; also used for the local CSV file name.
dataset_name = 'dga-data-ngrams-sunburst'

# Fetch the dataset, then read it and drop exact duplicate rows in one
# expression so that re-running this cell always rebuilds S from the file.
download_dataset(dataset_name)
sunburst_csv = datasets_dir / f'{dataset_name}.csv'
S = pd.read_csv(sunburst_csv).drop_duplicates()

# Baseline model job M1, restored from the jobs directory.
M1 = Job.from_file(source=jobs_dir / 'dga-v2-baseline-model')

## 2. Split the sunburst dataset into the update dataset (U) and the test dataset (T)

In [7]:
# Hold out 20% of the sunburst data as the test set T; the remaining 80% is
# the update set U. The seed is fixed so that U and T (and therefore the
# accuracies reported below) are identical on every Restart & Run All.
SPLIT_SEED = 42
U, T = train_test_split(S, test_size=0.2, random_state=SPLIT_SEED)

## 3. Evaluate model M1 on T

In [11]:
# Run the baseline model M1 over the held-out test set T.
eval_M1 = evaluate(dataset_name, T, M1)
success = eval_M1.wait_to_complete()
if not success:
    # Stop here instead of reading predictions from a failed job, which would
    # silently feed bad data into the accuracy comparison.
    # NOTE(review): assumes wait_to_complete() returns a falsy value on
    # failure, as the original check implies - confirm against the job API.
    raise RuntimeError('Evaluation failed')
y_M1 = eval_M1.get_predictions()

0,1
stderr,output
"/Users/thomasveasey/ml-cpp/build/distribution/platform/darwin-x86_64/controller.app/Contents/MacOS/data_frame_analyzer --input /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpgprbwv0s --config /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpe72zb8ms --output /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpr4wnt3pq --validElasticLicenseKeyConfirmed true --restore /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpziqghqe3; if [ $? -eq 0 ]; then echo ""Success""; else echo ""Failure""; fi Thomass-MBP-2:incremental_learning thomasveasey$ /Users/thomasveasey/ml-cpp/build/distribution/platform/darwin-x86_64/controller.app/Contents/MacOS/data_frame_analyzer --input /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpgprbwv0s --config /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpe72zb8ms --output /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpr4wnt3pq --validElasticLicenseKeyConfirmed true --restore /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpziqghqe3; if [ $? 
-eq 0 ]; then echo ""Success""; else echo ""Failure""; fi 2021-10-01 13:40:49,250471 UTC [39769] DEBUG /Users/thomasveasey/ml-cpp/bin/data_frame_analyzer/Main.cc@152 data_frame_analyzer (64 bit): Version 8.0.0-SNAPSHOT (Build 7eb21303b2f861) Copyright (c) 2021 Elasticsearch BV 2021-10-01 13:40:49,255019 UTC [39769] DEBUG /Users/thomasveasey/ml-cpp/lib/seccomp/CSystemCallFilter_MacOSX.cc@107 macOS sandbox initialized 2021-10-01 13:45:51,922718 UTC [39769] DEBUG /Users/thomasveasey/ml-cpp/lib/api/CDataFrameAnalyzer.cc@113 Received 135006 rows 2021-10-01 13:46:02,957529 UTC [39769] INFO /Users/thomasveasey/ml-cpp/bin/data_frame_analyzer/Main.cc@260 [{""name"":""E_DFTPMEstimatedPeakMemoryUsage"",""description"":""The upfront estimate of the peak memory training the predictive model would use"",""value"":541063378} ,{""name"":""E_DFTPMPeakMemoryUsage"",""description"":""The peak memory training the predictive model used"",""value"":226827486} ,{""name"":""E_DFTPMTimeToTrain"",""description"":""The time it took to train the predictive model"",""value"":2583} ] 2021-10-01 13:46:02,957587 UTC [39769] DEBUG /Users/thomasveasey/ml-cpp/bin/data_frame_analyzer/Main.cc@265 ML data frame analyzer exiting Success Thomass-MBP-2:incremental_learning thomasveasey$",",{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""0"",""predi ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""0"",""predi ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""0"",""predi ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""0"",""predi ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""0"",""predi ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""0"",""predi ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""0"",""predi 
,{""model_metadata"":{""total_feature_importance"":[],""hyperparameters"":[],""train_pr ,{""compressed_data_summarization"":{""doc_num"":0,""data_summarization"":""H4sIAAAAAAA ]"


Job succeeded


## 4. Update model M1 with U (M2) and evaluate on T

In [16]:
# Update the baseline model M1 with the update set U; the resulting job is M2.
M2 = update(dataset_name, U, M1)
# Block until the update job finishes. This is deliberately the last
# expression in the cell, so its return value is shown as the cell output.
# NOTE(review): the return value is not checked here, unlike in the
# evaluation cells - confirm whether a failed update should stop the notebook.
M2.wait_to_complete()

0,1
stderr,output
"/Users/thomasveasey/ml-cpp/build/distribution/platform/darwin-x86_64/controller.app/Contents/MacOS/data_frame_analyzer --input /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmp3c4s4_o9 --config /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpkag2q9ai --output /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpgv9yii2g --validElasticLicenseKeyConfirmed true --restore /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpmkxcja9d; if [ $? -eq 0 ]; then echo ""Success""; else echo ""Failure""; fi Thomass-MBP-2:incremental_learning thomasveasey$ /Users/thomasveasey/ml-cpp/build/distribution/platform/darwin-x86_64/controller.app/Contents/MacOS/data_frame_analyzer --input /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmp3c4s4_o9 --config /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpkag2q9ai --output /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpgv9yii2g --validElasticLicenseKeyConfirmed true --restore /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpmkxcja9d; if [ $? 
-eq 0 ]; then echo ""Success""; else echo ""Failure""; fi 2021-10-01 13:50:18,319596 UTC [40323] DEBUG /Users/thomasveasey/ml-cpp/bin/data_frame_analyzer/Main.cc@152 data_frame_analyzer (64 bit): Version 8.0.0-SNAPSHOT (Build 7eb21303b2f861) Copyright (c) 2021 Elasticsearch BV 2021-10-01 13:50:18,324109 UTC [40323] DEBUG /Users/thomasveasey/ml-cpp/lib/seccomp/CSystemCallFilter_MacOSX.cc@107 macOS sandbox initialized 2021-10-01 13:55:06,331589 UTC [40323] DEBUG /Users/thomasveasey/ml-cpp/lib/api/CDataFrameAnalyzer.cc@113 Received 136037 rows 2021-10-01 14:10:33,208892 UTC [40323] INFO /Users/thomasveasey/ml-cpp/bin/data_frame_analyzer/Main.cc@260 [{""name"":""E_DFTPMEstimatedPeakMemoryUsage"",""description"":""The upfront estimate of the peak memory training the predictive model would use"",""value"":543919940} ,{""name"":""E_DFTPMPeakMemoryUsage"",""description"":""The peak memory training the predictive model used"",""value"":660859361} ,{""name"":""E_DFTPMTimeToTrain"",""description"":""The time it took to train the predictive model"",""value"":915712} ] 2021-10-01 14:10:33,208943 UTC [40323] DEBUG /Users/thomasveasey/ml-cpp/bin/data_frame_analyzer/Main.cc@265 ML data frame analyzer exiting Success Thomass-MBP-2:incremental_learning thomasveasey$",",{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""1"",""predi ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""1"",""predi ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""1"",""predi ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""1"",""predi ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""1"",""predi ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""1"",""predi ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""1"",""predi 
,{""model_metadata"":{""total_feature_importance"":[],""hyperparameters"":[{""name"":""re ,{""compressed_data_summarization"":{""doc_num"":0,""data_summarization"":""H4sIAAAAAAA ]"


Job succeeded


1219.8928697109222

In [26]:
# Run the updated model M2 over the same held-out test set T.
eval_M2 = evaluate(dataset_name, T, M2)
success = eval_M2.wait_to_complete()
if not success:
    # Stop here instead of reading predictions from a failed job, which would
    # silently feed bad data into the accuracy comparison.
    # NOTE(review): assumes wait_to_complete() returns a falsy value on
    # failure, as the original check implies - confirm against the job API.
    raise RuntimeError('Evaluation failed')
y_M2 = eval_M2.get_predictions()

0,1
stderr,output
"/Users/thomasveasey/ml-cpp/build/distribution/platform/darwin-x86_64/controller.app/Contents/MacOS/data_frame_analyzer --input /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmp8prcsxnr --config /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpmtcxyog3 --output /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpokuzgk5e --validElasticLicenseKeyConfirmed true --restore /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpxhss2lrc; if [ $? -eq 0 ]; then echo ""Success""; else echo ""Failure""; fi Thomass-MBP-2:incremental_learning thomasveasey$ /Users/thomasveasey/ml-cpp/build/distribution/platform/darwin-x86_64/controller.app/Contents/MacOS/data_frame_analyzer --input /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmp8prcsxnr --config /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpmtcxyog3 --output /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpokuzgk5e --validElasticLicenseKeyConfirmed true --restore /var/folders/xn/wrmsjpr53z9g9pqrrz12p2t80000gn/T/tmpxhss2lrc; if [ $? 
-eq 0 ]; then echo ""Success""; else echo ""Failure""; fi 2021-10-01 14:19:54,183408 UTC [41861] DEBUG /Users/thomasveasey/ml-cpp/bin/data_frame_analyzer/Main.cc@152 data_frame_analyzer (64 bit): Version 8.0.0-SNAPSHOT (Build 7eb21303b2f861) Copyright (c) 2021 Elasticsearch BV 2021-10-01 14:19:54,188126 UTC [41861] DEBUG /Users/thomasveasey/ml-cpp/lib/seccomp/CSystemCallFilter_MacOSX.cc@107 macOS sandbox initialized 2021-10-01 14:23:39,850361 UTC [41861] DEBUG /Users/thomasveasey/ml-cpp/lib/api/CDataFrameAnalyzer.cc@113 Received 13947 rows 2021-10-01 14:23:47,320807 UTC [41861] INFO /Users/thomasveasey/ml-cpp/bin/data_frame_analyzer/Main.cc@260 [{""name"":""E_DFTPMEstimatedPeakMemoryUsage"",""description"":""The upfront estimate of the peak memory training the predictive model would use"",""value"":180296303} ,{""name"":""E_DFTPMPeakMemoryUsage"",""description"":""The peak memory training the predictive model used"",""value"":113607012} ,{""name"":""E_DFTPMTimeToTrain"",""description"":""The time it took to train the predictive model"",""value"":260} ] 2021-10-01 14:23:47,320860 UTC [41861] DEBUG /Users/thomasveasey/ml-cpp/bin/data_frame_analyzer/Main.cc@265 ML data frame analyzer exiting Success Thomass-MBP-2:incremental_learning thomasveasey$",",{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""1"",""predi ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""1"",""predi ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""1"",""predi ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""1"",""predi ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""1"",""predi ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""0"",""predi ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""1"",""predi 
,{""model_metadata"":{""total_feature_importance"":[],""hyperparameters"":[],""train_pr ,{""compressed_data_summarization"":{""doc_num"":0,""data_summarization"":""H4sIAAAAAAA ]"


Job succeeded


## 5. Compare M1 and M2 accuracy on T

In [45]:
# Ground-truth labels for T, taken from the model's dependent variable column.
# They are cast to str before comparison - presumably because the job returns
# its predictions as strings; TODO confirm.
y_true = pd.array(T[M1.dependent_variable].astype(str))

# Score both models against the same labels in one pass.
M1_accuracy, M2_accuracy = (
    accuracy_score(y_true, y_pred) for y_pred in (y_M1, y_M2)
)

print(f"M1 accuracy: {M1_accuracy}")
print(f"M2 accuracy: {M2_accuracy}")

M1 accuracy: 0.0
M2 accuracy: 0.9680232558139535
