In [1]:
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License;
# you may not use this file except in compliance with the Elastic License.

# Scenario 1: Data summarization

1. Train model (M1) on the complete dataset (D1)

2. Generate a summarization dataset using some technique (D2)

3. Train a new model (M2) on D2
    1. Use the same best hyperparameters as for M1   
    2. **Identify a new set of best hyperparameters**
    
4. Compare M1 and M2
    1. Evaluation M1 and M2 on the complete dataset D1.
    2. Evaluate M1 and M2 on a grid and compute divergence
    3. Compare feature importance vectors for individual data points from M1 and M2 (should be very similar)
    4. Compare errors on the test dataset (D3)

**Notes:**
- We assume that generation of a summarization dataset is implemented. It can be written in Python for prototyping.
- 4 can be done with an inference pipeline and eland
- We need a simple way to query a model using inference pipeline (e.g. Python wrapper?)
- Evaluations can also be done with sklearn?
- What is the minimum amount of data we can get away with?


In [2]:
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import max_error, mean_absolute_error, mean_squared_error

from incremental_learning.misc import train, summarize, update, evaluate
from incremental_learning.config import datasets_dir, root_dir

## 1. Train model (M1) on the complete dataset (D1)

In [3]:
dataset_name = 'ccpp'
D1 = pd.read_csv(datasets_dir / '{}.csv'.format(dataset_name))
D1.drop_duplicates(inplace=True)

## sample 1000 rows just for test purposes!
D1 = D1.sample(100)

job1 = train(dataset_name, D1)

session: job_cyunr	command:
/home/valeriy/Documents/workspace/valeriy42/ml-cpp/build/distribution/platform/linux-x86_64/bin/data_frame_analyzer --input /tmp/tmpfx07d6__ --config /tmp/tmpfyw9hjjs --output /tmp/tmp_ektnwwe --persist /tmp/tmpt1_ks0s9; if [ $? -eq 0 ]; then echo "Success"; else echo "Failure";  fi;


In [4]:
job1.wait_to_complete()

0,1
stderr,output
"/home/valeriy/Documents/workspace/valeriy42/ml-cpp/build/distribution/platform/linux-x86_64/bin/data_frame_analyzer --input /tmp/tmpfx07d6__ --config /tmp/tmpfyw9hjjs --output /tmp/tmp_ektnwwe --persist /tmp/tmpt1_ks0s9; if [ $? -eq 0 ]; then echo ""Success""; else echo ""Failure""; fi (env) ✔ ~/Documents/workspace/valeriy42/ml-cpp/jupyter/notebooks/evaluation_scenarios [python-tests L|●2✚ 2…2⚑ 6] 14:59 $ /home/valeriy/Documents/workspace/valeriy42/ml-cpp/build/distribution/platform/linux-x86_64/bin/data_frame_analyzer --input /tmp/tmpfx07d6__ --config /tmp/tmpfyw9hjjs --output /tmp/tmp_ektnwwe --persist /tmp/tmpt1_ks0s9; if [ $? -eq 0 ]; then echo ""Success""; else echo ""Failure""; fi 2021-06-28 12:59:21,801795 UTC [21278] DEBUG Main.cc@145 data_frame_analyzer (64 bit): Version 8.0.0-SNAPSHOT (Build 0833c91c67e7db) Copyright (c) 2021 Elasticsearch BV 2021-06-28 12:59:21,801837 UTC [21278] DEBUG CProcessPriority_Linux.cc@33 Successfully increased OOM killer adjustment via /proc/self/oom_score_adj 2021-06-28 12:59:21,801847 UTC [21278] DEBUG CSystemCallFilter_Linux.cc@128 Seccomp BPF filters available 2021-06-28 12:59:21,802046 UTC [21278] DEBUG CSystemCallFilter_Linux.cc@154 Seccomp BPF installed 2021-06-28 12:59:21,809309 UTC [21278] DEBUG CDataFrameAnalyzer.cc@102 Received 9527 rows 2021-06-28 12:59:55,230202 UTC [21278] INFO CBoostedTreeImpl.cc@260 Exiting hyperparameter optimisation loop early 2021-06-28 12:59:58,053972 UTC [21278] INFO Main.cc@248 [{""name"":""E_DFTPMEstimatedPeakMemoryUsage"",""description"":""The upfront estimate of the peak memory training the predictive model would use"",""value"":27586043} ,{""name"":""E_DFTPMPeakMemoryUsage"",""description"":""The peak memory training the predictive model used"",""value"":14388224} ,{""name"":""E_DFTPMTimeToTrain"",""description"":""The time it took to train the predictive model"",""value"":35244} ,{""name"":""E_DFTPMTrainedForestNumberTrees"",""description"":""The total number of trees in the trained forest"",""value"":210} ] 2021-06-28 12:59:58,054004 UTC [21278] DEBUG Main.cc@253 ML data frame analyzer exiting Success (env) ✔ ~/Documents/workspace/valeriy42/ml-cpp/jupyter/notebooks/evaluation_scenarios [python-tests L|●2✚ 2…2⚑ 6] 14:59 $",",{""row_results"":{""checksum"":0,""results"":{""ml"":{""PE_prediction"":464.3738708496094 ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""PE_prediction"":468.8424682617187 ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""PE_prediction"":459.07763671875,"" ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""PE_prediction"":469.3031616210937 ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""PE_prediction"":429.9221496582031 ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""PE_prediction"":438.406982421875, ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""PE_prediction"":452.0242919921875 ,{""model_metadata"":{""total_feature_importance"":[],""hyperparameters"":[{""name"":""do ,{""compressed_data_summarization"":{""doc_num"":0,""data_summarization"":""H4sIAAAAAAA ]"


 /home/valeriy/Documents/workspace/valeriy42/ml-cpp/build/distribution/platform/linux-x86_64/bin/data_frame_analyzer --input /tmp/tmpfx07d6__ --config /tmp/tmpfyw9hjjs --output /tmp/tmp_ektnwwe --persist /tmp/tmpt1_ks0s9; if [ $? -eq 0 ]; then echo "Success"; else echo "Failure";  fi
(env) ✔ ~/Documents/workspace/valeriy42/ml-cpp/jupyter/notebooks/evaluation_scenarios [python-tests L|●2✚ 2…2⚑ 6]
14:59 $  /home/valeriy/Documents/workspace/valeriy42/ml-cpp/build/distribution/platform/linux-x86_64/bin/data_frame_analyzer --input /tmp/tmpfx07d6__ --config /tmp/tmpfyw9hjjs --output /tmp/tmp_ektnwwe --persist /tmp/tmpt1_ks0s9; if [ $? -eq 0 ]; then echo "Success"; else echo "Failure";  fi
2021-06-28 12:59:21,801795 UTC [21278] DEBUG Main.cc@145 data_frame_analyzer (64 bit): Version 8.0.0-SNAPSHOT (Build 0833c91c67e7db) Copyright (c) 2021 Elasticsearch BV
2021-06-28 12:59:21,801837 UTC [21278] DEBUG CProcessPriority_Linux.cc@33 Successfully increased OOM killer adjustment via /proc/self/oom_s

True

## 2. Generate a sumarization dataset

In [6]:
sampling_method = 'random'
D2 = summarize(dataset_name=dataset_name, dataset=D1,
              size=0.25, model_definition=job1.get_model_definition(), 
              method=sampling_method, verbose=False, 
              dependent_variable=job1.dependent_variable)

## 3. C. Incremetally train a new model

In [7]:
job2 = update(dataset_name, D2, job1)

session: job_bllod	command:
/home/valeriy/Documents/workspace/valeriy42/ml-cpp/build/distribution/platform/linux-x86_64/bin/data_frame_analyzer --input /tmp/tmpirm6gxj9 --config /tmp/tmpuu53ifdr --output /tmp/tmptt1nhpbl --restore /tmp/tmp5q0rnxem; if [ $? -eq 0 ]; then echo "Success"; else echo "Failure";  fi;


In [8]:
job2.wait_to_complete()

0,1
stderr,output
"/home/valeriy/Documents/workspace/valeriy42/ml-cpp/build/distribution/platform/linux-x86_64/bin/data_frame_analyzer --input /tmp/tmpirm6gxj9 --config /tmp/tmpuu53ifdr --output /tmp/tmptt1nhpbl --restore /tmp/tmp5q0rnxem; if [ $? -eq 0 ]; then echo ""Success""; else echo ""Failure""; fi (env) ✔ ~/Documents/workspace/valeriy42/ml-cpp/jupyter/notebooks/evaluation_scenarios [python-tests L|●2✚ 2…2⚑ 6] 15:01 $ /home/valeriy/Documents/workspace/valeriy42/ml-cpp/build/distribution/platform/linux-x86_64/bin/data_frame_analyzer --input /tmp/tmpirm6gxj9 --config /tmp/tmpuu53ifdr --output /tmp/tmptt1nhpbl --restore /tmp/tmp5q0rnxem; if [ $? -eq 0 ]; then echo ""Success""; else echo ""Failure""; fi 2021-06-28 13:01:00,038082 UTC [29239] DEBUG Main.cc@145 data_frame_analyzer (64 bit): Version 8.0.0-SNAPSHOT (Build 0833c91c67e7db) Copyright (c) 2021 Elasticsearch BV 2021-06-28 13:01:00,038123 UTC [29239] DEBUG CProcessPriority_Linux.cc@33 Successfully increased OOM killer adjustment via /proc/self/oom_score_adj 2021-06-28 13:01:00,038133 UTC [29239] DEBUG CSystemCallFilter_Linux.cc@128 Seccomp BPF filters available 2021-06-28 13:01:00,039183 UTC [29239] DEBUG CSystemCallFilter_Linux.cc@154 Seccomp BPF installed 2021-06-28 13:01:01,266902 UTC [29239] DEBUG CDataFrameAnalyzer.cc@102 Received 3333 rows 2021-06-28 13:01:02,263974 UTC [29239] INFO Main.cc@248 [{""name"":""E_DFTPMEstimatedPeakMemoryUsage"",""description"":""The upfront estimate of the peak memory training the predictive model would use"",""value"":23511218} ,{""name"":""E_DFTPMPeakMemoryUsage"",""description"":""The peak memory training the predictive model used"",""value"":7897926} ,{""name"":""E_DFTPMTimeToTrain"",""description"":""The time it took to train the predictive model"",""value"":280} ] 2021-06-28 13:01:02,264005 UTC [29239] DEBUG Main.cc@253 ML data frame analyzer exiting Success (env) ✔ ~/Documents/workspace/valeriy42/ml-cpp/jupyter/notebooks/evaluation_scenarios [python-tests L|●2✚ 2…2⚑ 6] 15:01 $",",{""row_results"":{""checksum"":0,""results"":{""ml"":{""PE_prediction"":440.5562133789062 ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""PE_prediction"":463.5453186035156 ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""PE_prediction"":470.7997436523437 ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""PE_prediction"":434.1181945800781 ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""PE_prediction"":434.8784790039062 ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""PE_prediction"":446.3300170898437 ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""PE_prediction"":444.5942993164062 ,{""model_metadata"":{""total_feature_importance"":[],""hyperparameters"":[{""name"":""do ,{""compressed_data_summarization"":{""doc_num"":0,""data_summarization"":""H4sIAAAAAAA ]"


 /home/valeriy/Documents/workspace/valeriy42/ml-cpp/build/distribution/platform/linux-x86_64/bin/data_frame_analyzer --input /tmp/tmpirm6gxj9 --config /tmp/tmpuu53ifdr --output /tmp/tmptt1nhpbl --restore /tmp/tmp5q0rnxem; if [ $? -eq 0 ]; then echo "Success"; else echo "Failure";  fi
(env) ✔ ~/Documents/workspace/valeriy42/ml-cpp/jupyter/notebooks/evaluation_scenarios [python-tests L|●2✚ 2…2⚑ 6]
15:01 $  /home/valeriy/Documents/workspace/valeriy42/ml-cpp/build/distribution/platform/linux-x86_64/bin/data_frame_analyzer --input /tmp/tmpirm6gxj9 --config /tmp/tmpuu53ifdr --output /tmp/tmptt1nhpbl --restore /tmp/tmp5q0rnxem; if [ $? -eq 0 ]; then echo "Success"; else echo "Failure";  fi
2021-06-28 13:01:00,038082 UTC [29239] DEBUG Main.cc@145 data_frame_analyzer (64 bit): Version 8.0.0-SNAPSHOT (Build 0833c91c67e7db) Copyright (c) 2021 Elasticsearch BV
2021-06-28 13:01:00,038123 UTC [29239] DEBUG CProcessPriority_Linux.cc@33 Successfully increased OOM killer adjustment via /proc/self/oom_s

True

## 4. A. Compare M1 and M2 on D1

In [9]:
y_true = D1[job1.dependent_variable]
y_M1 = job1.get_predictions()
eval_job = evaluate(dataset_name, D1, job2)
success = eval_job.wait_to_complete()
if not success:
    print('Evaluation failed')
y_M2 = eval_job.get_predictions()

Evaluation failed


In [10]:
def compute_metrics(ytrue, m1pred, m2pred):
    m1_mae = mean_absolute_error(ytrue, m1pred)
    m1_mse = mean_squared_error(ytrue, m1pred)
    m2_mae = mean_absolute_error(ytrue, m2pred)
    m2_mse = mean_squared_error(ytrue, m2pred)
    print("M1: MAE: {}\tMSE:{}".format(m1_mae, m1_mse))
    print("M2: MAE: {}\tMSE:{}".format(m2_mae, m2_mse))
    ax = sns.scatterplot(x=m1pred, y=m2pred)
    plt.xlabel('M1 predictions')
    plt.ylabel('M2 predictions')

In [None]:
compute_metrics(y_true, y_M1, y_M2)