In [1]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
import sklearn as skl
import xgboost as xgb

from mylib import class_distributions
from mylib import data_selection
from mylib import helper_funcs

from mylib.pipelines import full_models

%load_ext autoreload
%autoreload 2

In [2]:
# Locate the raw Forest Cover file relative to this notebook.
data_folder = Path("../../../data/ForestCoverDataset/")
file_to_open = data_folder / "covtype.data"

# The raw file is read without a header row, so pandas labels the columns by
# integer position; `header` maps each position to a readable name.
data = pd.read_csv(file_to_open, delimiter=",", header=None)

# Positions 0-9: the ten individually named feature columns.
header = dict(enumerate([
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]))

# Positions 10-13 and 14-53: the binary wilderness-area and soil-type columns,
# numbered from 1 in their names.
header.update({9 + area: f"Wilderness_Area_{area}" for area in range(1, 5)})
header.update({13 + soil: f"Soil_Type_{soil}" for soil in range(1, 41)})

# Position 54: the target label.
header[54] = "Class"

# Apply the names, shift the label by one so it becomes a 0-based index,
# and persist the prepared table next to the raw file.
data = (
    data
    .rename(columns=header)
    .assign(Class=lambda frame: frame["Class"] - 1)
)
data.to_csv(data_folder / 'ForestCoverDataset.csv', index=False)

In [3]:
# Reload the prepared CSV (this time with its header row) so `data` reflects
# exactly what is stored on disk.
data = pd.read_csv(data_folder.joinpath('ForestCoverDataset.csv'))

In [4]:
# Inspect how the samples are spread over the class labels. Left as the cell's
# last expression so the notebook renders the returned value.
class_distributions.label_proportions(data['Class'])

1    0.487599
0    0.364605
2    0.061537
6    0.035300
5    0.029891
4    0.016339
3    0.004728
Name: Class, dtype: float64

In [5]:
# Train the full models through the project pipeline and report the wall-clock
# time in seconds. `time` is imported in the notebook's import cell.

# Run configuration: every value below is passed to the pipeline call.
filepath = data_folder / 'ForestCoverDataset.csv'
training_method = 'continued_training'
new_class_idx = 1
num_models = 3
num_round = 10
# NOTE(review): this was previously set to 5, but the call hard-coded
# max_depth=3, so 5 was never used. 3 is the value that actually ran, so it is
# kept here; change it in this one place if a depth of 5 was intended.
max_depth = 3

start = time.perf_counter()
full_models.full_models(filepath,
                        training_method,
                        new_class_idx,
                        num_models,
                        num_round=num_round,
                        max_depth=max_depth)
end = time.perf_counter()

# Elapsed seconds for the whole pipeline call.
print(end-start)

Training full models in preparation to add class 1 using the continued_training training method
Accuracy of full model on old data:  0.631109812312388
Accuracy of full model on new data:  0.7950801444400125
Accuracy of full model on full data:  0.7107561766907912
220.67184063599962
