In [1]:
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

from sklearn.model_selection import train_test_split
from collections import namedtuple
import numpy as np
import pandas as pd
from incremental_learning.storage import read_dataset, upload_job 
from incremental_learning.job import train, update, evaluate, Job
from incremental_learning.config import jobs_dir, logger
from sklearn.metrics import mean_squared_error
logger.setLevel("ERROR")
import time
import diversipy

In [2]:
# Minimal record type with exactly two fields: `config` and `run_logger`.
Run = namedtuple(typename="Run", field_names=("config", "run_logger"))

In [3]:
# Mock-up Run object: its configuration is laid out like the one the sacred
# library would provide, and the project logger is attached as run_logger.
run = Run(
    config={
        'threads': 8,
        'analysis': {
            'parameters': {
                'tree_topology_change_penalty': 0.0,
                'prediction_change_cost': 0.0,
                'data_summarization_fraction': 1.0,
            },
        },
    },
    run_logger=logger,
)

In [5]:
# Usual data wrangling. Training and update samples are drawn from train_dataset;
# the error is evaluated on the held-out test_dataset.

dataset_name = 'house'
original_dataset = read_dataset(dataset_name)

test_fraction = 0.2      # share of the original data held out for evaluation
training_fraction = 0.1  # share of train_dataset sampled for the baseline job
update_fraction = 0.1    # share of train_dataset that determines update_num_samples
random_seed = 42         # fixes the split and the baseline sample so reruns are comparable

# Seed both random draws: without random_state every kernel restart yields a
# different split/sample and the reported errors cannot be reproduced.
train_dataset, test_dataset = train_test_split(
    original_dataset, test_size=test_fraction, random_state=random_seed)
# Work on independent copies rather than on the objects returned by the split.
train_dataset = train_dataset.copy()
test_dataset = test_dataset.copy()

baseline_dataset = train_dataset.sample(frac=training_fraction, random_state=random_seed)
update_num_samples = int(train_dataset.shape[0]*update_fraction)

In [7]:
# Train the first (baseline) job on baseline_dataset and wait for it to complete.
# The value displayed below the cell is whatever wait_to_complete() returns
# (presumably the elapsed time in seconds -- TODO confirm).
job1 = train(dataset_name, baseline_dataset, run=run, verbose=False)
job1.wait_to_complete()

25.25991916656494

In [8]:
# Optionally store (and, if desired, upload) the job file; useful for expensive jobs.
# The job is stored under jobs_dir as "<dataset_name>_basejob".
job_name = "{}_basejob".format(dataset_name)
job_path = jobs_dir / job_name
job1.store(job_path)
# Uncomment to also upload the stored job file:
# upload_job(job_path)

True

In [9]:
def compute_test_error(job):
    """Return the mean squared error of ``job`` on the global ``test_dataset``.

    An evaluation job is started from ``job`` on ``test_dataset`` and awaited.
    Its predictions are then compared with the ``test_dataset`` column named by
    the evaluation job's ``dependent_variable``.
    """
    evaluation = evaluate(
        dataset_name=dataset_name,
        dataset=test_dataset,
        original_job=job,
        run=run,
        verbose=False,
    )
    evaluation.wait_to_complete()

    predicted = evaluation.get_predictions()
    actual = test_dataset[evaluation.dependent_variable]
    return mean_squared_error(actual, predicted)

In [10]:
def get_residuals(job):
    """Return the absolute residuals of ``job`` on the global ``train_dataset``.

    An evaluation job is started from ``job`` on ``train_dataset`` and awaited.
    The result is the absolute difference between the ``train_dataset`` column
    named by the evaluation job's ``dependent_variable`` and its predictions.
    """
    evaluation = evaluate(
        dataset_name=dataset_name,
        dataset=train_dataset,
        original_job=job,
        run=run,
        verbose=False,
    )
    evaluation.wait_to_complete()

    predicted = evaluation.get_predictions()
    actual = train_dataset[evaluation.dependent_variable]
    return np.abs(actual - predicted)

In [11]:
# Test error (MSE) of the initial job, before any update step.
print("mse: {:e}".format(compute_test_error(job1)))

mse: 1.279372e+09


In [None]:
# Conduct 3 update steps and evaluate the test error after each one.
job_prev = job1
for step in range(1,4):

    # Take the 3*update_num_samples training examples with the largest residuals
    # under the previous job, then diversify them down to update_num_samples so
    # that the update examples do not cluster in the same region.
    #
    # The residuals are attached to a temporary copy (assign) instead of being
    # written into train_dataset: if the step fails or is interrupted, no stray
    # 'indicator' column is left behind in the frame that get_residuals() later
    # hands to evaluate().
    largest = (
        train_dataset
        .assign(indicator=get_residuals(job_prev))
        .nlargest(n=3*update_num_samples, columns=['indicator'])
        .drop(columns=['indicator'])
    )
    # psa_select works on a plain array, so the selection is rebuilt into a
    # DataFrame with the original column names afterwards.
    dupdate = diversipy.subset.psa_select(largest.to_numpy(), update_num_samples)
    D_update = pd.DataFrame(data=dupdate, columns=largest.columns)

    # Each update starts from the job produced by the previous step.
    job_update = update(dataset_name=dataset_name, dataset=D_update, original_job=job_prev,
                        run=run, verbose=False, force=True)
    job_update.wait_to_complete()

    print("{}: {:e}".format(step, compute_test_error(job_update)))
    job_prev = job_update

1: 1.375783e+09
2: 1.425051e+09
