In [1]:
from pathlib import Path
import pandas as pd

from mylib import helper_funcs
from mylib import class_distributions

from mylib.pipelines import full_models
from mylib.pipelines import updating_pipeline

%load_ext autoreload
%autoreload 2

In [2]:
data_folder = Path('../../../data/AdultDataset/')
data_file = 'adult.data'
name_file = 'adult.names'

In [3]:
data = pd.read_csv(data_folder / data_file, delimiter=',', header=None)

In [4]:
# add the header manually
header = {0: "age", 1: "workclass", 2: "fnlwgt", 3: "education",
          4: "education-num", 5: "marital-status",
          6: "occupation", 7: "relationship", 8: "race", 9: "sex",
          10: "capital-gain", 11: "capital-loss", 12: 'hours-per-week',
          13: 'native-country', 14: 'income'}

data = data.rename(header, axis=1)

In [5]:
class_column = 'marital-status'

data = data.rename(columns={class_column: 'Class'})
data = helper_funcs.create_numbered_categories(data, 'Class')

for column in data.columns:
    if column not in ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']:
        data = helper_funcs.create_numbered_categories(data, column)

In [6]:
class_distributions.label_proportions(data['Class'])

1    0.459937
0    0.328092
2    0.136452
4    0.031479
6    0.030497
3    0.012837
5    0.000706
Name: Class, dtype: float64

In [7]:
data.to_csv(data_folder / 'AdultDataset.csv', index=False)

In [8]:
filepath = data_folder / 'AdultDataset.csv'
data_selection_method = 'entropy'
new_class_idx = 1
num_models = 3
num_round = 20
max_depth = 5

In [11]:
for training_method in ['continued_training', 'add_trees']:
    for sort_type in ['closest', 'furthest']:

        full_models.full_models(filepath,
                                training_method,
                                new_class_idx,
                                num_models,
                                num_round,
                                max_depth)
        
        updating_pipeline.updating_pipeline(filepath,
                                              training_method,
                                              new_class_idx,
                                              data_selection_method,
                                              sort_type,
                                              num_models,
                                              num_round,
                                              max_depth)

Training full models in preparation to add class 1 using the continued_training training method
Accuracy of full model on old data:  0.735513221495593
Accuracy of full model on new data:  0.9859330484330484
Accuracy of full model on full data:  0.8440042990941193
Adding class 1 with continued_training
Used data selection method: entropy. Sort type: closest
Current target proportion of old data in use: 0.1
Current target proportion of old data in use: 0.2
Current target proportion of old data in use: 0.30000000000000004
Current target proportion of old data in use: 0.4
Current target proportion of old data in use: 0.5
Current target proportion of old data in use: 0.6000000000000001
Current target proportion of old data in use: 0.7000000000000001
Current target proportion of old data in use: 0.8
Current target proportion of old data in use: 0.9
Training full models in preparation to add class 1 using the continued_training training method
Accuracy of full model on old data:  0.7355132214

In [None]:
largest_or_smallest_class = 'largest class'

for training_method in ['continued_training', 'add_trees']:
    batch_results = helper_funcs.unpack_batch_results("continued_training", largest_or_smallest_class)


    for sort_type in ['closest', 'furthest']:
        experiment_results = helper_funcs.unpack_results("continued_training", 'entropy', sort_type, largest_or_smallest_class)
        helper_funcs.plot_results("continued_training", experiment_results, batch_results, 'entropy', sort_type, largest_or_smallest_class, save=True)