In [1]:
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

from sklearn.model_selection import train_test_split
from collections import namedtuple
import numpy as np
import pandas as pd
from incremental_learning.storage import read_dataset, upload_job 
from incremental_learning.job import train, update, evaluate, Job
from incremental_learning.config import jobs_dir, logger
from sklearn.metrics import mean_squared_error
# Set the project logger's level to ERROR (presumably to keep the notebook
# output free of lower-severity messages from the jobs below).
logger.setLevel("ERROR")
import time
import diversipy

In [2]:
# Lightweight container for what the job helpers receive as `run=`:
# a configuration dict and a logger.
Run = namedtuple("Run", "config run_logger")

In [3]:
# Settings object passed as `run=` to the train / evaluate / update calls in
# the cells below.
run = Run(
    config={
        "threads": 8,
        "analysis": {
            "parameters": {
                "tree_topology_change_penalty": 0.0,
                "prediction_change_cost": 0.0,
                "data_summarization_fraction": 1.0,
            },
        },
    },
    run_logger=logger,
)

In [4]:
# Name of the dataset to load; the same name is passed to every job call below.
dataset_name = "facebook"
original_dataset = read_dataset(dataset_name)

In [5]:
# test_fraction is the share of the full dataset held out for testing;
# training_fraction and update_fraction are shares of the remaining
# train_dataset used for the baseline sample and for each update batch.
test_fraction = 0.2
training_fraction = 0.1
update_fraction = 0.1
# Fixed seed so that Restart & Run All reproduces the same split and the same
# baseline sample (both calls draw randomly when no seed is given).
random_seed = 42

train_dataset, test_dataset = train_test_split(
    original_dataset, test_size=test_fraction, random_state=random_seed
)
# Work on independent copies of the split results (presumably so that later
# cells can add columns without pandas copy-of-slice warnings).
train_dataset = train_dataset.copy()
test_dataset = test_dataset.copy()

baseline_dataset = train_dataset.sample(frac=training_fraction, random_state=random_seed)
# Number of rows fed to the model in each incremental update step.
update_num_samples = int(train_dataset.shape[0]*update_fraction)

In [6]:
# Row count of the baseline training sample and size of each update batch.
len(baseline_dataset), update_num_samples

(48192, 48192)

In [7]:
# Train the baseline job on the baseline sample only.
job1 = train(dataset_name, baseline_dataset, run=run, verbose=False)
# Last expression of the cell, so its return value is shown as the cell output
# (presumably the elapsed time in seconds — TODO confirm).
job1.wait_to_complete()

207.33140349388123

In [8]:
# Store the baseline job under jobs_dir/<dataset_name>_basejob.
job_name = "{}_basejob".format(dataset_name)
job_path = jobs_dir / job_name
# Last expression of the cell: the return value of store() is displayed.
job1.store(job_path)
# Uploading the stored job is currently disabled.
# upload_job(job_path)

True

In [9]:
# Alternative to retraining: reload the baseline job stored at job_path.
# job1 = Job.from_file(job_path)

In [10]:
def compute_test_error(job):
    """Return the mean squared error of ``job`` on the held-out ``test_dataset``.

    Runs an evaluation of ``job`` against the module-level ``test_dataset``
    and compares the resulting predictions with the column named by the
    evaluation job's ``dependent_variable``.
    """
    evaluation = evaluate(
        dataset_name=dataset_name,
        dataset=test_dataset,
        original_job=job,
        run=run,
        verbose=False,
    )
    evaluation.wait_to_complete()
    y_pred = evaluation.get_predictions()
    y_true = test_dataset[evaluation.dependent_variable]
    return mean_squared_error(y_true, y_pred)

In [11]:
def get_residuals(job):
    """Return the absolute prediction error of ``job`` on each row of ``train_dataset``.

    Runs an evaluation of ``job`` against the module-level ``train_dataset``
    and compares the predictions with the column named by the evaluation
    job's ``dependent_variable``.

    Targets and predictions are paired by row position, the same way
    ``mean_squared_error`` pairs them in ``compute_test_error``. Subtracting a
    label-indexed prediction object directly from the target column would
    align on index labels instead, which is wrong whenever the prediction
    labels do not match the labels of ``train_dataset``.
    NOTE(review): the return type of ``get_predictions()`` is not visible
    here; if it is already a plain 1-D array this is equivalent to the direct
    subtraction.

    Returns the absolute residuals, indexed like the target column of
    ``train_dataset`` (assumes ``dependent_variable`` names a single column —
    TODO confirm).
    """
    dataset = train_dataset
    job_eval = evaluate(dataset_name=dataset_name, dataset=dataset, original_job=job, run=run, verbose=False)
    job_eval.wait_to_complete()
    predictions = job_eval.get_predictions()

    actual = dataset[job_eval.dependent_variable]
    # Strip any index from the predictions and flatten them so the subtraction
    # is purely positional; a length mismatch then raises instead of being
    # silently padded with NaN.
    predicted = np.asarray(predictions).reshape(-1)
    residuals = np.absolute(actual - predicted)
    return residuals

In [12]:
# Test-set MSE of the baseline job, before any incremental update.
baseline_mse = compute_test_error(job1)
print("mse: {:e}".format(baseline_mse))

mse: 4.477813e+02


In [13]:
# Incrementally update the model for three rounds. Each round feeds the
# current job the `update_num_samples` training rows with the largest
# absolute residuals under that job, then reports the test MSE.
job_prev = job1
for step in range(1, 4):
    # Rank rows on a temporary copy so the scratch 'indicator' column never
    # lands in the shared train_dataset. Adding and dropping it in place would
    # leave the column behind if the cell failed or was interrupted midway,
    # and it would then be passed to evaluate/update as a feature on re-run.
    largest = (
        train_dataset
        .assign(indicator=get_residuals(job_prev))
        .nlargest(n=update_num_samples, columns=['indicator'])
        .drop(columns=['indicator'])
    )
    D_update = largest

    # Alternative update-set selections, kept disabled for reference:
    #   random sample of the training data:
    #     D_update = train_dataset.sample(n=2000)
    #   subset chosen with diversipy:
    #     dupdate = diversipy.subset.psa_select(largest.to_numpy(), update_num_samples)
    #     D_update = pd.DataFrame(data=dupdate, columns=largest.columns)

    job_update = update(dataset_name=dataset_name, dataset=D_update, original_job=job_prev,
                        run=run, verbose=False, force=True)
    job_update.wait_to_complete()

    print("{}: {:e}".format(step, compute_test_error(job_update)))
    # The next round starts from the job that was just updated.
    job_prev = job_update

1: 3.811826e+02
2: 3.587182e+02
3: 3.723074e+02
