In [1]:
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License
# 2.0 and the following additional limitation. Functionality enabled by the
# files subject to the Elastic License 2.0 may only be used in production when
# invoked by an Elasticsearch process with a license key installed that permits
# use of machine learning features. You may not use this file except in
# compliance with the Elastic License 2.0 and the foregoing additional
# limitation.

## Improving Malicious Macro Detection with False Positive Telemetry

1. Train a malicious macro detection model
2. Test the performance of this base model on the False Positives dataset
3. Incrementally update the base model with the False Positives dataset

In [4]:
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import datetime

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix

from incremental_learning.config import datasets_dir, root_dir, jobs_dir
from incremental_learning.job import update, evaluate, Job, train
from incremental_learning.storage import download_dataset, download_job

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
def get_predictions(model: Job):
    """Return the highest-scoring class label for every row scored by ``model``.

    Each item yielded by ``model.get_probabilities()`` is treated as a mapping
    from class label to score. For every row the key with the largest value is
    selected; on ties the first such key in the row's iteration order wins, as
    with the built-in ``max``.

    Returns a NumPy array with one selected label per row.
    """
    top_labels = []
    for class_scores in model.get_probabilities():
        top_labels.append(max(class_scores, key=class_scores.get))
    return np.array(top_labels)

In [6]:
# Make the base macro dataset available locally, read its CSV from the datasets
# directory and keep only the unique rows.
full_dataset_name = 'base_macro_model_bool'
download_dataset(full_dataset_name)
D = pd.read_csv(
    datasets_dir / '{}.csv'.format(full_dataset_name)
).drop_duplicates()

### Calculate class counts for malicious and benign macros

In [7]:
# Number of rows per value of the 'malicious' label in the base dataset.
D.loc[:, 'malicious'].value_counts()

False    423245
True     180167
Name: malicious, dtype: int64

### Split the base dataset into test and train

In [8]:
# Hold out 20% of the base dataset for testing. The seed is fixed so that the
# split, and every metric computed from it further down, is reproducible
# after a kernel restart.
base_train, base_test = train_test_split(D, test_size=0.2, random_state=42)

### Get the Telemetry (Update) Dataset
(Note: the update dataset consists only of benign examples, i.e. malicious=False for every row.)

In [10]:
# Make the false-positive telemetry dataset available locally, read its CSV
# from the datasets directory and keep only the unique rows.
update_dataset_name = 'telemetry_fp_full'
download_dataset(update_dataset_name)
U = pd.read_csv(
    datasets_dir / '{}.csv'.format(update_dataset_name)
).drop_duplicates()

2022-03-04 12:55:38,105 [I] incremental_learning >> File /root/data/configs/telemetry_fp_full.json does not exist.
2022-03-04 12:55:38,178 [I] incremental_learning >> Downloading configs/telemetry_fp_full.json from the Google storage bucket to /root/data/configs/telemetry_fp_full.json.
2022-03-04 12:55:38,310 [I] incremental_learning >> Retrieving datasets/telemetry_fp_full.csv from the Google storage bucket to /root/data/datasets/telemetry_fp_full.csv.


In [14]:
# Once duplicates are removed, only 1670 vectors remain available for the update.
U.shape[0]

1670

In [16]:
# Every example in the false-positive telemetry is benign, so the only label
# value expected here is malicious=False.
U.loc[:, 'malicious'].value_counts()

False    1670
Name: malicious, dtype: int64

### Split the update dataset 

In [24]:
# Hold out 20% of the telemetry dataset for testing the updated model. The
# seed is fixed so that the same rows land in train/test on every run and the
# reported accuracies are reproducible.
telemetry_train, telemetry_test = train_test_split(U, test_size=0.2, random_state=42)

### Train (or load a pre-trained) base macro model

In [17]:
# A pre-trained baseline job already exists, so download it from the cloud
# bucket and load it instead of training from scratch.
baseline_model_name = 'macro_baseline_model'
download_job(baseline_model_name)
# Reuse the name variable (rather than repeating the literal) so the job that
# is downloaded and the job that is loaded cannot drift apart.
job1 = Job.from_file(source=jobs_dir / baseline_model_name)

2022-03-04 14:17:10,315 [I] incremental_learning >> Downloading jobs/macro_baseline_model from the Google storage bucket to /root/data/jobs/macro_baseline_model.


In [13]:
# if you don't have a pre-trained base-macro model, uncomment the lines in cells below and run the job
# job1 = train(full_dataset_name, base_train)

session: job_lbket/tmp/tmpd2vim5eq	command:
/ml-cpp/bin/data_frame_analyzer --input /tmp/tmps0i_ccwo --config /tmp/tmpd2vim5eq --output /tmp/tmpuobx8o47 --validElasticLicenseKeyConfirmed true --persist /tmp/tmp3kxyyj34; if [ $? -eq 0 ]; then echo "Success"; else echo "Failure";  fi;


In [14]:
# job1.wait_to_complete()

0,1
stderr,output
",{""name"":""E_DFTPMPeakMemoryUsage"",""description"":""The peak memory training the pr edictive model used"",""value"":1052650400} ,{""name"":""E_DFTPMTimeToTrain"",""description"":""The time it took to train the predi ctive model"",""value"":4024706} ,{""name"":""E_DFTPMTrainedForestNumberTrees"",""description"":""The total number of tr ees in the trained forest"",""value"":574} ] 2022-03-03 23:31:01,709453 UTC [483] DEBUG Main.cc@265 ML data frame analyzer ex iting Success [root@ml-camilla-jupyter-mlcpp-large incremental_learning]#",",{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""True"",""pr ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""True"",""pr ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""False"",""p ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""False"",""p ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""False"",""p ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""False"",""p ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""malicious_prediction"":""True"",""pr ,{""model_metadata"":{""total_feature_importance"":[],""hyperparameters"":[{""name"":""do ,{""compressed_data_summarization"":{""doc_num"":0,""data_summarization"":""H4sIAAAAAAA ]"


Job succeeded


4052.1752047538757

In [19]:
# if you want to store the results of the job, please uncomment and run this line
# job1.store(jobs_dir / 'Macro_base')

### Check base model performance on base test set
This is a quick check to see how well the model does on its own test data before it is updated with telemetry.

In [20]:
# Start an evaluation of the baseline model (job1) on the held-out base test split.
eval_base_model_on_test = evaluate(
    full_dataset_name,
    base_test,
    job1,
    verbose=False,
)

In [21]:
# wait_to_complete() returns the elapsed time, which is interpreted as seconds
# here and printed as a timedelta.
elapsed_time = eval_base_model_on_test.wait_to_complete()
print(
    'Elapsed time {}'.format(
        datetime.timedelta(seconds=elapsed_time)
    )
)

Elapsed time 0:00:30.515148


In [22]:
# Predicted label per base test row, taken from the finished evaluation job.
y_base_test = get_predictions(model=eval_base_model_on_test)

In [23]:
# True labels of the base test split, read from the model's dependent-variable
# column and cast to strings (presumably to match the string labels returned
# by get_predictions -- confirm).
y_true_base = pd.array(
    base_test[job1.dependent_variable].astype(str)
)
base_test_accuracy = accuracy_score(y_true=y_true_base, y_pred=y_base_test)

In [24]:
# Accuracy of the baseline model's predictions on the base test split.
base_test_accuracy

0.7762899497029407

### Check base model performance on telemetry false positives
Please note that to evaluate the base model performance on the telemetry dataset, we will be using the whole telemetry dataset stored in the variable "U" and not the test/train splits of the telemetry dataset

In [25]:
# Evaluate the baseline model (job1) on the whole telemetry dataset U, wait for
# the job to finish, then collect the predicted label for every row.
eval_base_model_on_telemetry = evaluate(
    update_dataset_name,
    U,
    job1,
    verbose=False,
)
elapsed_time = eval_base_model_on_telemetry.wait_to_complete()
y_base_telemetry = get_predictions(model=eval_base_model_on_telemetry)

In [26]:
# True labels of the full telemetry dataset, read from the model's
# dependent-variable column and cast to strings before being compared with the
# baseline model's predictions.
y_true_telemetry = pd.array(
    U[job1.dependent_variable].astype(str)
)
base_telemetry_accuracy = accuracy_score(
    y_true=y_true_telemetry,
    y_pred=y_base_telemetry,
)

In [27]:
# Accuracy of the baseline model's predictions on the full telemetry dataset U.
base_telemetry_accuracy

0.7826347305389222

### Update baseline model with false positives

In [26]:
# Take the baseline model's hyperparameters, remove 'retrained_tree_eta', and
# pass the remainder as overrides when updating job1 with the telemetry
# training split.
# NOTE(review): the reason 'retrained_tree_eta' is dropped is not visible in
# this notebook -- confirm against the update() implementation.
hyperparameters = job1.get_hyperparameters()
hyperparameters.pop('retrained_tree_eta')
job2 = update(
    update_dataset_name,
    telemetry_train,
    job1,
    verbose=False,
    hyperparameter_overrides=hyperparameters,
)
elapsed_time = job2.wait_to_complete()


In [28]:
# uncomment this line if you want to store the model locally
# job2.store(jobs_dir / 'macro_updated_model')

### Evaluate Updated Telemetry Macro Model with Telemetry Test set

In [29]:
# Evaluate the updated model (job2) on the held-out telemetry test split,
# report how long the evaluation took, then collect the predicted labels.
eval_updated_model_on_telemetry = evaluate(
    update_dataset_name,
    telemetry_test,
    job2,
    verbose=False,
)
elapsed_time = eval_updated_model_on_telemetry.wait_to_complete()
print(
    'Elapsed time {}'.format(
        datetime.timedelta(seconds=elapsed_time)
    )
)
y_job2 = get_predictions(model=eval_updated_model_on_telemetry)

Elapsed time 0:00:05.787311


In [30]:
# True labels of the telemetry test split, read from the dependent-variable
# column and cast to strings.
y_true = pd.array(
    telemetry_test[job1.dependent_variable].astype(str)
)

In [31]:
# Accuracy of the updated model's predictions on the telemetry test split.
job2_accuracy = accuracy_score(y_true=y_true, y_pred=y_job2)

In [32]:
# Accuracy of the updated model (job2) on the telemetry test split.
job2_accuracy

1.0

### Evaluate Updated Macro Model on Base Model Test Set

In [40]:
# Evaluate the updated model (job2) on the base test split to check that the
# update did not degrade performance on the original data.
# The dataset name now matches the data being scored: base_test comes from the
# base dataset, so full_dataset_name is passed, exactly as in the earlier
# evaluation of job1 on base_test. The original passed update_dataset_name.
# NOTE(review): assumes evaluate() uses the name to look up dataset-specific
# settings -- confirm against its implementation.
eval_updated_model_on_base_test = evaluate(full_dataset_name, base_test, job2, verbose=False)
elapsed_time = eval_updated_model_on_base_test.wait_to_complete()
print('Elapsed time {}'.format(datetime.timedelta(seconds=elapsed_time)))
y_job2_base_test = get_predictions(eval_updated_model_on_base_test)

Elapsed time 0:00:16.145269


In [41]:
# Accuracy of the updated model's predictions on the base test split; the
# trailing bare name displays the value as the cell output.
updated_model_accuracy_on_base_test = accuracy_score(
    y_true=y_true_base,
    y_pred=y_job2_base_test,
)
updated_model_accuracy_on_base_test

0.7756353421774401